# Importing Data and Librairies

In [1]:
import pandas as pd 
from nltk import sent_tokenize
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import pipeline
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import time

In [2]:
data = pd.read_csv("data/plot_genres.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,movie_id,plot,wiki_id,movie_name,release_date,box_office,duration,language,country,...,Other,Propaganda,Religious,Romance,Science Fiction,Silent,Sports,Thriller,War,Western
0,0,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",23890098,Taxi Blues,1990-09-07,,110.0,Russian Language,"['France', 'Soviet Union', 'Russia']",...,0,0,0,0,0,0,0,0,0,0
1,1,31186339,The nation of Panem consists of a wealthy Capi...,31186339,The Hunger Games,2012-03-12,686533290.0,142.0,English Language,United States of America,...,0,0,0,0,0,0,0,0,0,0
2,2,20663735,Poovalli Induchoodan is sentenced for six yea...,20663735,Narasimham,2000-01-01,,175.0,Malayalam Language,India,...,0,0,0,0,0,0,0,0,0,0
3,3,2231378,"The Lemon Drop Kid , a New York City swindler,...",2231378,The Lemon Drop Kid,1951-03-08,2300000.0,91.0,English Language,United States of America,...,0,0,0,0,0,0,0,0,0,0
4,4,595909,Seventh-day Adventist Church pastor Michael Ch...,595909,A Cry in the Dark,1988-11-03,6908797.0,121.0,English Language,"['United States of America', 'Australia', 'New...",...,0,0,0,0,0,0,0,0,0,0


# Sentiment Analysis Models

In [3]:
as_algos = []
text_plot = data["plot"][0]
text_plot

"Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all."

## Model Selection

### HuggingFace

In [4]:
access_token = "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"

#### Moview Review Sentiment Analysis

- 190 downloads
- Dataset not available
- Accuracy: 0.950

https://huggingface.co/JamesH/Movie_review_sentiment_analysis_model?text=I+love+AutoTrain

In [5]:
model_1 = AutoModelForSequenceClassification.from_pretrained("JamesH/autotrain-third-project-1883864250", use_auth_token=access_token)
tokenizer_1 = AutoTokenizer.from_pretrained("JamesH/autotrain-third-project-1883864250", use_auth_token=access_token)

In [6]:
def sent_hf_1(x:str):
    """ Model 1 """
    inputs = tokenizer_1(x, return_tensors="pt")
    return model_1(**inputs)["logits"].tolist()[0]
print(sent_hf_1(text_plot))
as_algos.append(sent_hf_1)

[-0.11514227092266083, 0.0033993793185800314]


#### IMDb Sentiment Analysis

(+) IMDb Sentiment Analysis
(-) Low downloads

https://huggingface.co/pierric/autonlp-my-own-imdb-sentiment-analysis-2131817

In [7]:
model_2 = AutoModelForSequenceClassification.from_pretrained("pierric/autonlp-my-own-imdb-sentiment-analysis-2131817", use_auth_token=access_token)
tokenizer_2 = AutoTokenizer.from_pretrained("pierric/autonlp-my-own-imdb-sentiment-analysis-2131817", use_auth_token=access_token)

In [8]:
def sent_hf_2(x:str):
    """ Model 2 """
    inputs = tokenizer_2(x, return_tensors="pt")
    return model_2(**inputs)["logits"].tolist()[0]
print(sent_hf_2(text_plot))
as_algos.append(sent_hf_2)

[-3.0913257598876953, 3.3504817485809326]


#### SiEBERT - English-Language Sentiment Classification

(+) Global
(-) Not movie specific

https://huggingface.co/siebert/sentiment-roberta-large-english

In [9]:
sentiment_analysis = pipeline("sentiment-analysis",model="siebert/sentiment-roberta-large-english")
print(sentiment_analysis(text_plot))
as_algos.append(sentiment_analysis)

[{'label': 'POSITIVE', 'score': 0.9980297684669495}]


### Existing Librairies

#### Blob

- Rule Based
- Not good with Negation

In [10]:
TextBlob(data["plot"][0]).sentiment

Sentiment(polarity=0.2, subjectivity=0.6)

#### Vader

- VADER works better when it comes to negative sentiment classification.

In [11]:
sid_obj= SentimentIntensityAnalyzer()
print(sid_obj.polarity_scores("no slow motion camera")) 
as_algos.append(sid_obj.polarity_scores)

{'neg': 0.423, 'neu': 0.577, 'pos': 0.0, 'compound': -0.296}


## Impementation 

In [12]:
sent_f = sid_obj.polarity_scores
sentiment_df = data[["movie_name","plot","movie_id","wiki_id"]].copy()
# generating Sentiment Analysis Columns
sentiment_df["polarity_scores"] = sentiment_df["plot"].apply(sent_f)

In [13]:
for algo in as_algos:
    print(algo)
    now = time.time()
    sentiment_df["polarity_scores"] = sentiment_df["plot"].apply(sent_f)
    print("lasted: ",  str(now-time.time()))
sentiment_df = pd.concat([sentiment_df, pd.json_normalize(sentiment_df.polarity_scores)], axis=1).drop(["polarity_scores","plot"], axis=1)
sentiment_df.head()

<function sent_hf_1 at 0x7f7bd894ae50>
lasted:  -299.5952684879303
<function sent_hf_2 at 0x7f7bd67ef940>
lasted:  -306.1944274902344
<transformers.pipelines.text_classification.TextClassificationPipeline object at 0x7f7bd88e6fa0>
lasted:  -300.3594512939453
<bound method SentimentIntensityAnalyzer.polarity_scores of <vaderSentiment.vaderSentiment.SentimentIntensityAnalyzer object at 0x7f7bbfb679d0>>
lasted:  -297.87586641311646


Unnamed: 0,movie_name,movie_id,wiki_id,neg,neu,pos,compound
0,Taxi Blues,23890098,23890098,0.083,0.832,0.084,0.0083
1,The Hunger Games,31186339,31186339,0.132,0.791,0.077,-0.9941
2,Narasimham,20663735,20663735,0.178,0.704,0.119,-0.9867
3,The Lemon Drop Kid,2231378,2231378,0.111,0.786,0.103,-0.6127
4,A Cry in the Dark,595909,595909,0.108,0.825,0.067,-0.9538


In [14]:
sentiment_df.to_csv("data/plot_sentiment.csv", index = True)