# Sentiment Analysis Models comparison 


Note :  
need install tld, nltk, flair

In [1]:
import bz2
import json
import os.path

import pandas as pd
import time

In [2]:
#Sentiment analysis libraries
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

from textblob import TextBlob

from flair.models import TextClassifier
from flair.data import Sentence

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jonathan/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### Create sample set

In [3]:
data_path = "data/quotes-"
name_extension = "-domains.json.bz2"
path_2020 = data_path + "2020" + name_extension
print(path_2020)

data/quotes-2020-domains.json.bz2


In [4]:
dfs = [] 
#Read all dataset to extract sample dataset
for year in range(2015, 2020 + 1):
    file_name = data_path + str(year) + name_extension
    data = pd.read_json(file_name, lines=True, compression='bz2') 
    dfs.append(data) # append the data frame to the list

df = pd.concat(dfs, ignore_index=True) # concatenate all the data frames in the list.

samples = df.sample(n=50, random_state=1)

samples['quotation'].to_csv("sample_quotes.csv", index=True)
print("done")

In [3]:
#Load labelled quotations
test_sample = pd.read_csv("sample_quotes_labelled.csv")  

### Analysis

In [4]:
'''
print accuracy given predictions and the test set
'''
def accuracy(test, output):
    res = (test == output).sum()
    print("accuracy : " + str(res / len(test)) + " %")
    return res / len(test)

'''
print the mean square error given the prediction and the test set
'''
def mse(test, predictions):
    res = ((test - predictions)**2).sum() / len(test)
    print('mse : ' + str(res))
    return res

'''
given predictions between [-1, 1], return labels {-1, 0, 1}
uniform : if the range is equally splitted : 
-1 if [-1, -1/3]
0 if [-1/3, 1/3]
-1 if [1/3, 1]
'''
def classify(predictions, uniform=False):
    if uniform :
        return predictions.apply(lambda x: -1 if x < -1/3 else 0 if x < 1/3 else 1)   
    else :
        return round(predictions)

In [5]:
#sentiment analysis object initialization of each model
sia_nlkt = lambda  : SentimentIntensityAnalyzer()
sia_blob = lambda : None

def sia_flair():
    start_time = time.time()
    print("Loading flair model...")
    sia = TextClassifier.load('sentiment-fast')
    print("Loading model done in  %s seconds." % (time.time() - start_time))
    return sia

#sentiment analysis prediction of each model : x being the quote
pred_nlkt = lambda x, sia: sia.polarity_scores(x)["compound"]
pred_blob = lambda x, _: TextBlob(x).sentiment.polarity

def pred_flair(x, sia):
    sentence = Sentence(x)
    sia.predict(sentence,return_probabilities_for_all_classes=True, mini_batch_size=1)
    score = -1 * sentence.labels[0].score + sentence.labels[1].score
    return score

In [7]:
#List of models to iterate    
models_name = ["NLTK", "TextBlob", "Flair"]
models_sia = [sia_nlkt, sia_blob, sia_flair]
models_pred = [pred_nlkt, pred_blob, pred_flair]

for i in range(3):
    print(models_name[i] + ' Analysis')
    print("===========================")
    sia = models_sia[i]()
    start_time = time.time()
    pred = models_pred[i]
    #predict sentiment foreach quotation in the test set
    #-1 : negative, 0 : neutral, 1 positive
    predictions = test_sample.quotation.apply(lambda x : pred(x, sia))
    
    mse(test_sample["sentiment"], predictions)
    
    sentiments = classify(predictions)
    accuracy(test_sample["sentiment"], sentiments)
    
    print("Runtime :  %s seconds " % (time.time() - start_time))
    print()
    

NLTK Analysis
mse : 0.5337384212
accuracy : 0.54 %
Runtime :  0.02494192123413086 seconds 

TextBlob Analysis
mse : 0.5716744422026094
accuracy : 0.42 %
Runtime :  0.021116256713867188 seconds 

Flair Analysis
Loading flair model...
2021-12-17 21:40:24,166 loading file /Users/jonathan/.flair/models/sentiment-en-mix-ft-rnn_v8.pt
Loading model done in  7.427731990814209 seconds.
mse : 2.0846241593628605
accuracy : 0.16 %
Runtime :  0.3076648712158203 seconds 



### Note
The runtime comparison was made using CPU. Because the flair model uses neural networks internally, it will have greater performances on GPUs. For further analysis of Flair, please refer to the flairs_analysis.ipynb notebook.