In [None]:
from pysentimiento import create_analyzer
import warnings
from transformers import pipeline
import pandas as pd
# NOTE(review): this suppresses every warning for the rest of the notebook
# (no category or module is given) — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Read the review data set from disk.
imdb100 = pd.read_csv("data/imdb100.csv")
# Count the rows belonging to each "id" and preview the first five counts.
rows_per_id = imdb100.groupby("id").size()
rows_per_id.head(5)

In [None]:
# Select every row whose "id" lies in the inclusive range 0-4.
# NOTE(review): "id" presumably identifies the review author (hence `top_auth`) — confirm.
id_in_range = (imdb100["id"] >= 0) & (imdb100["id"] <= 4)
selected = imdb100[id_in_range].sort_values(by="id")
# Keep only the title, text and rating columns and renumber the rows from 0.
top_auth = selected[["title", "text", "rating"]].reset_index(drop=True)
top_auth.head()

# Analysis  

### Sentiment
We can use the rating, together with sentiment analysis from BERTweet and similar pre-trained transformer models, to determine the sentiment of each review.

In [None]:
# Build the English pysentimiento analyzers: one for sentiment, one for emotion.
sent_analyzer, emt_analyzer = (
    create_analyzer(task=task_name, lang="en")
    for task_name in ("sentiment", "emotion")
)

In [None]:
# Spot-check row 0: show the review text next to both model predictions and the rating.
first_review = top_auth["text"].iloc[0]
print("For the review \n " + first_review + "\n")
print(f"The sentiment model predicted: {sent_analyzer.predict(first_review)}")
print(f"The rating was: {top_auth['rating'].iloc[0]} out of 10.0")
print(f"The emotion model predicted: {emt_analyzer.predict(first_review)}")

In [None]:
# Spot-check row 2: show the review text next to both model predictions and the rating.
third_review = top_auth["text"].iloc[2]
print("For the review \n " + third_review + "\n")
print(f"The sentiment model predicted: {sent_analyzer.predict(third_review)}")
print(f"The rating was: {top_auth['rating'].iloc[2]} out of 10.0")
print(f"The emotion model predicted: {emt_analyzer.predict(third_review)}")

In [None]:
def get_output(pred):
    """Return the ``output`` attribute of a prediction object.

    Args:
        pred: Any object exposing an ``output`` attribute (here, the value
            returned by an analyzer's ``predict``).

    Returns:
        Whatever is stored in ``pred.output``.
    """
    label = pred.output
    return label

In [None]:
# Predict the sentiment of every review text, then keep only each prediction's `output`.
sentiment_preds = top_auth['text'].apply(sent_analyzer.predict)
top_auth['sentiment'] = sentiment_preds.apply(get_output)

In [None]:
# Predict the emotion of every review text, then keep only each prediction's `output`.
emotion_preds = top_auth['text'].apply(emt_analyzer.predict)
top_auth['emotion'] = emotion_preds.apply(get_output)

In [None]:
# Preview the first five rows, now including the sentiment and emotion columns.
top_auth.head()

### Content
Here we perform named entity recognition (NER) to identify the important entities in the original review, which we can then pass on.

In [None]:
# English named-entity-recognition analyzer; the task is passed by keyword,
# matching how the sentiment and emotion analyzers are created.
ner_analyzer = create_analyzer(task="ner", lang="en")

In [None]:
# Work on an independent copy so the content features do not alter `top_auth`.
top_auth_sent = top_auth.copy(deep=True)

In [None]:
# Spot-check row 3: show the review text and the entities the NER model finds in it.
ner_sample = top_auth_sent["text"].iloc[3]
print("For the review \n " + ner_sample + "\n")
print(f"The NER model predicted: {ner_analyzer.predict(ner_sample).entities}")

In [None]:
# Run NER over every review text. Each cell stores the full object returned by
# `predict`, not only its `.entities`.
ner_predictions = top_auth_sent['text'].apply(ner_analyzer.predict)
top_auth_sent['NER'] = ner_predictions
top_auth_sent.head()

### Abstractive Text Summarization (ATS)

In [None]:
# Summarization pipeline backed by the "google/flan-t5-base" checkpoint.
summarizer_model = "google/flan-t5-base"
summarizer = pipeline("summarization", model=summarizer_model)

In [None]:
def summarize_text(text, max_length=100, min_length=30):
    """Summarize ``text`` with the notebook-level ``summarizer`` pipeline.

    Sampling is disabled (``do_sample=False``).

    Args:
        text: The text to summarize.
        max_length: Forwarded to the pipeline as ``max_length``.
            Defaults to 100, the value previously hard-coded.
        min_length: Forwarded to the pipeline as ``min_length``.
            Defaults to 30, the value previously hard-coded.

    Returns:
        The raw pipeline result, unchanged; pass it to ``get_summary`` to
        extract the summary string.
    """
    return summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
    )

def get_summary(text):
    """Extract the summary string from a summarizer result.

    Args:
        text: Despite its name, this is the summarizer's return value: an
            indexable whose first element is a mapping with a
            ``"summary_text"`` key.

    Returns:
        The value stored under ``"summary_text"`` in the first element.
    """
    first_result = text[0]
    return first_result["summary_text"]

In [None]:
# Try the summarizer on the first five reviews before running it on everything.
# `.copy()` makes `test_samples` an independent frame: adding a column to the
# bare result of `head()` is a chained assignment that pandas flags with
# SettingWithCopyWarning (hidden here by the global warnings filter).
test_samples = top_auth_sent.head(5).copy()
test_samples['summary'] = test_samples['text'].apply(summarize_text).apply(get_summary)

In [None]:
# Show the full text of the fifth sample review.
fifth_text = test_samples['text'].iloc[4]
print(fifth_text)

In [None]:
# Show the generated summary of the fifth sample review.
fifth_summary = test_samples['summary'].iloc[4]
print(fifth_summary)

In [None]:
# Summarize every review text and keep only the summary string of each result.
raw_summaries = top_auth_sent['text'].apply(summarize_text)
top_auth_sent['summary'] = raw_summaries.apply(get_summary)

### Stylometric
Here we collect the length (word count) of each review, so that the original author's style can be copied as closely as possible.


In [None]:
def get_length(text: str):
    """Count the whitespace-separated tokens in ``text``.

    Args:
        text: The string to measure.

    Returns:
        The number of pieces produced by ``text.split()``; 0 for an empty or
        whitespace-only string.
    """
    words = text.split()
    return len(words)

# Start the stylometric frame from an independent copy of the content frame.
top_auth_cont = top_auth_sent.copy()

# Word count of each review text (see `get_length`).
top_auth_cont['length'] = top_auth_sent['text'].apply(get_length)
# End the cell with the frame itself so the notebook renders it as a table,
# like the other preview cells, instead of printing its plain-text repr.
top_auth_cont.head(5)

In [None]:
# Persist the final feature frame as a pickle file.
# NOTE(review): the 'NER' column holds the objects returned by `ner_analyzer.predict`,
# so loading this file presumably requires their defining package — confirm.
output_path = 'data/top_auth_final.pkl'
top_auth_cont.to_pickle(output_path)