### Import and installations

In [1]:
import numpy as np
import bz2
import json
import os.path
import pandas as pd


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install tld
!pip install flair
!pip install pyarrow

In [6]:
#TODO : change "/content/drive/MyDrive/Colab Notebooks/ada/"
path_files = "/content/drive/MyDrive/Colab Notebooks/ada/"

path_us = path_files + "us_data/Filtered data/"

### Tools

In [7]:
#Flair model using BERT
def sia_flair_1():
    start_time = time.time()
    print("Loading flair model...")
    sia = TextClassifier.load('en-sentiment')
    print("Loading model done in  %s seconds." % (time.time() - start_time))
    return sia

def pred_flair_1(x, sia, batch_size):
    x_np = x.to_numpy()
    sentences = list(np.vectorize(lambda a : Sentence(a))(x_np))
    sia.predict(sentences,return_probabilities_for_all_classes=True, mini_batch_size=batch_size)
    scores = [ -1 * sent.labels[0].score + sent.labels[1].score for sent in sentences]
    return np.array(scores)

#Flair model using RNN : Faster
def sia_flair_2():
    start_time = time.time()
    print("Loading flair model...")
    sia = TextClassifier.load('sentiment-fast')
    print("Loading model done in  %s seconds." % (time.time() - start_time))
    return sia

def pred_flair_2(x, sia, batch_size):
    x_np = x.to_numpy()
    sentences = list(np.vectorize(lambda a : Sentence(a))(x_np))
    sia.predict(sentences,return_probabilities_for_all_classes=True, mini_batch_size=batch_size)
    scores = [ -1 * sent.labels[1].score + sent.labels[0].score for sent in sentences]

    return np.array(scores)

In [8]:
'''
print accuracy given predictions and the test set
'''
def accuracy(test, output):
    res = (test == output).sum()
    print("accuracy : " + str(res / len(test)) + " %")
    return res / len(test)

'''
print the mean square error given the prediction and the test set
'''
def mse(test, predictions):
    res = ((test - predictions)**2).sum() / len(test)
    print('mse : ' + str(res))
    return res

'''
given predictions between [-1, 1], return labels {-1, 0, 1}
uniform : if the range is equally splitted : 
-1 if [-1, -1/3]
0 if [-1/3, 1/3]
-1 if [1/3, 1]
'''
def classify(predictions, uniform=False):
    if uniform :
        return predictions.vectorize(lambda x: -1 if x < -1/3 else 0 if x < 1/3 else 1)   
    else :
        return np.round(predictions)

### Preprocess

In [9]:
#Load test sample manually labelled
test_sample = pd.read_csv(path_us + "sample_quotes_labelled.csv")  

In [10]:
#Loading flair model BERT
sia1 = sia_flair_1()

Loading flair model...
2021-12-13 09:24:04,482 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert_4.pt not found in cache, downloading to /tmp/tmptaz8uy_m


100%|██████████| 265512723/265512723 [00:29<00:00, 8975861.41B/s]

2021-12-13 09:24:34,903 copying /tmp/tmptaz8uy_m to cache at /root/.flair/models/sentiment-en-mix-distillbert_4.pt





2021-12-13 09:24:35,426 removing temp file /tmp/tmptaz8uy_m
2021-12-13 09:24:35,464 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Loading model done in  56.46393418312073 seconds.


In [11]:
#Loading flair model RNN
sia2 = sia_flair_2()

Loading flair model...
2021-12-13 09:25:00,977 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-fasttext-rnn/sentiment-en-mix-ft-rnn_v8.pt not found in cache, downloading to /tmp/tmplbweqtss


100%|██████████| 1241977025/1241977025 [02:13<00:00, 9281532.97B/s]

2021-12-13 09:27:15,664 copying /tmp/tmplbweqtss to cache at /root/.flair/models/sentiment-en-mix-ft-rnn_v8.pt





2021-12-13 09:27:20,108 removing temp file /tmp/tmplbweqtss
2021-12-13 09:27:20,277 loading file /root/.flair/models/sentiment-en-mix-ft-rnn_v8.pt
Loading model done in  150.95562291145325 seconds.


### Accuracy comparison

In [14]:
models_name = ["Flair", "Flair_fast"]
models_sia = [sia1, sia2]
models_pred = [pred_flair_1, pred_flair_2]

for i in range(len(models_name)):
    print(models_name[i] + ' Analysis')
    print("===========================")

    pred = models_pred[i]
    #predict sentiment foreach quotation in the test set
    #-1 : negative, 0 : neutral, 1 positive
    predictions = pred(test_sample.quotation, models_sia[i], batch_size=16)
    
    mse(test_sample["sentiment"], predictions)
    
    sentiments = classify(predictions)
    accuracy(test_sample["sentiment"], sentiments)

    print()
    

Flair Analysis
mse : 0.4684054682271506
accuracy : 0.66 %

Flair_fast Analysis
mse : 0.5881031352434072
accuracy : 0.56 %



### Runtime comparison

Note : running on GPU

In [22]:
df_sample = pd.read_pickle(path_us + "us_2020.pkl.bz2",compression='bz2')[['quoteID', 'quotation']].sample(10000)
len(df_sample)

10000

In [23]:
sizes = [8, 16, 32, 64, 128]
for s in sizes: 
    print("mini-batch size : " + str(s))
    start_time = time.time()
    predictions = pred_flair_1(df_sample.quotation, sia1, batch_size=s)
    
    print("Runtime flair:  %s seconds " % (time.time() - start_time))
    
    start_time = time.time()
    predictions = pred_flair_2(df_sample.quotation, sia2, batch_size=s)
    print("Runtime flair_fast:  %s seconds " % (time.time() - start_time))
    print()

mini-batch size : 8
Runtime flair:  53.888055086135864 seconds 
Runtime flair_fast:  20.321967124938965 seconds 

mini-batch size : 16
Runtime flair:  48.28710865974426 seconds 
Runtime flair_fast:  18.333975076675415 seconds 

mini-batch size : 32
Runtime flair:  46.5600368976593 seconds 
Runtime flair_fast:  15.473497152328491 seconds 

mini-batch size : 64
Runtime flair:  46.0074577331543 seconds 
Runtime flair_fast:  15.76470422744751 seconds 

mini-batch size : 128
Runtime flair:  49.31395673751831 seconds 
Runtime flair_fast:  14.79665207862854 seconds 



### Discussion

Despite the good accuracy that the first Flair model gives us, its runtime with the computational power we have make its use not practicle.  
For example, the filtered dataset of 2017 has more than 1'000'000 quotes, to process it one should wait around 1 hour 30, and this is just for one year.  
For this reason we have chosen the Flair fast model that runs around 4 times faster.