In [1]:
# import the required libraries. Note, in order to be able to run this cell you must have installed the libraires listed 
# in the requirements.txt file

import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, InputExample, InputFeatures, pipeline
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
%matplotlib inline
import logging
from importlib import reload
from tqdm import tqdm
import collections
import json
import bz2


2021-11-12 17:09:41.687551: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-12 17:09:41.687589: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### Load the dataset
In the pipeline, the sentiment classification task is after filtering out irrelevant sentences. We thus need to load the dataset back into memory.

In [2]:
sentences = []
k = 0
with bz2.open('data/final_filtered.json.bz2', 'rb') as s_file:
    while True:
        try:
            d =  json.loads(next(s_file))
            # print(d['quotation'])
            sentences.append(d)
            k+=1
            if k == 1000:
                break
        except StopIteration:
            break

df = pd.DataFrame(sentences)

In [3]:
# Print the first few quotations as a sanity check that the data were loaded correctly
df['quotation'].head()

0    However, due to sharp decline in KG-D6 gas pro...
1    [ Young savers' ] parents and grandparents are...
2    However, we cannot share any specific details ...
3    HUD did not drop the complaint, but insisted t...
4    2014 was a milestone year for us on many front...
Name: quotation, dtype: object

In [4]:
# inspect the data to see if the loading worked as expected
df.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,urls,tokenized,cosine_similarity
0,2015-03-26-025269,"However, due to sharp decline in KG-D6 gas pro...",Piyush Goyal,[Q7199798],2015-03-26 10:02:46,1,[http://timesofindia.indiatimes.com/business/i...,howev due sharp declin kgd6 ga product could g...,0.251782
1,2015-10-28-001053,[ Young savers' ] parents and grandparents are...,Patrick Connolly,[Q7146267],2015-10-28 07:26:15,1,[http://gulfnews.com/business/sectors/features...,young saver parent grandpar use higher interes...,0.0
2,2015-02-19-025137,"However, we cannot share any specific details ...",David Kalisch,[Q26322384],2015-02-19 00:26:19,5,[http://www.smh.com.au/federal-politics/politi...,howev share specif detail stage we provid info...,0.170075
3,2015-05-16-013190,"HUD did not drop the complaint, but insisted t...",Dennis Wheeler,[Q55219988],2015-05-16 21:23:57,1,[http://adn.com/article/20150514/anchorage-cha...,hud drop complaint insist still issu code\n,0.28624
4,2015-02-13-000112,2014 was a milestone year for us on many front...,Mike Fries,[Q54861319],2015-02-13 13:01:44,1,[http://advanced-television.com/2015/02/13/lib...,2014 mileston year us mani front we increas pa...,0.280876


### BERT pre-trained sentiment classifier
The first model we try to employ in order to perform the sentiment classification of our unlabled set of quotations is the pre-trained BERT model. To do so we emply the transformers library which uses tensorflow and which gives direct access to the `bert-based-incased` model.

In [5]:
# load the pre-trained BERT model as well as the required tokenizer
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

2021-11-12 17:41:49.657272: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-11-12 17:41:49.657527: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-11-12 17:41:49.657701: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (noto.epfl.ch): /proc/driver/nvidia/version does not exist
2021-11-12 17:41:49.660042: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers 

In [6]:
# display the properties of the laoded BERT model as sanity check that the loading of the model functioned correctly
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Test if the tokenizer works
sample_txt = "Are we able to predict the stock market using the sentiment expressed by famous people? Adadelta-Q is here to answer your question"
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(f'  Sentence: {sample_txt}')
print(f'\n  Tokens: {tokens}')
print(f'\n  Token IDs: {token_ids}')

  Sentence: Are we able to predict the stock market using the sentiment expressed by famous people? Adadelta-Q is here to answer your question

  Tokens: ['are', 'we', 'able', 'to', 'predict', 'the', 'stock', 'market', 'using', 'the', 'sentiment', 'expressed', 'by', 'famous', 'people', '?', 'ada', '##del', '##ta', '-', 'q', 'is', 'here', 'to', 'answer', 'your', 'question']

  Token IDs: [2024, 2057, 2583, 2000, 16014, 1996, 4518, 3006, 2478, 1996, 15792, 5228, 2011, 3297, 2111, 1029, 15262, 9247, 2696, 1011, 1053, 2003, 2182, 2000, 3437, 2115, 3160]


In [8]:
#Try tokenization of one sentence in dataframe
sample_txt = df.iloc[0]['quotation']
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(f'  Sentence: {sample_txt}')
print(f'\n  Tokens: {tokens}')
print(f'\n  Token IDs: {token_ids}')

  Sentence: However, due to sharp decline in KG-D6 gas production not only could gas not be allocated, to new gas-based projects, but the commissioned capacity that KG-D6 gas allocation also get stranded,

  Tokens: ['however', ',', 'due', 'to', 'sharp', 'decline', 'in', 'kg', '-', 'd', '##6', 'gas', 'production', 'not', 'only', 'could', 'gas', 'not', 'be', 'allocated', ',', 'to', 'new', 'gas', '-', 'based', 'projects', ',', 'but', 'the', 'commissioned', 'capacity', 'that', 'kg', '-', 'd', '##6', 'gas', 'allocation', 'also', 'get', 'stranded', ',']

  Token IDs: [2174, 1010, 2349, 2000, 4629, 6689, 1999, 4705, 1011, 1040, 2575, 3806, 2537, 2025, 2069, 2071, 3806, 2025, 2022, 11095, 1010, 2000, 2047, 3806, 1011, 2241, 3934, 1010, 2021, 1996, 4837, 3977, 2008, 4705, 1011, 1040, 2575, 3806, 16169, 2036, 2131, 15577, 1010]


Since running the BERT model can be very costly in terms of running time, we decide to test its performance on the sentiment classification task, WITHOUT fine-tuning it, by using three artificial sentences whose polarities are relatively obvious (to a human) and span all the possible polarity categories (positive, negative, and neutral).

In [9]:
# Create the two artifical sentences
pred_sentences = ['The performance of that company was incredible, the revenue will be increasing constantly over the next years',
                  'Their financial stability has been questioned multiple times, they are on the edge of bankruptcy',
                  'The market will probably be unaffected by these events']

In [11]:
# Get the sentiment prediction using the pre_trained BERT model
tokenized_sentences = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
outputs = model(tokenized_sentences)

# use softmax to obtain values interpretable as probabilities
sentiments = tf.nn.softmax(outputs[0], axis=-1)
print(sentiments)

# print the obtained sentiment classification 
labels = ['Negative','Positive']
label = tf.argmax(sentiments, axis=1)
label = label.numpy()
for i in range(len(pred_sentences)):
    print(pred_sentences[i], ": \n", labels[label[i]])

tf.Tensor(
[[0.33168676 0.6683132 ]
 [0.32882562 0.67117435]
 [0.41989222 0.5801078 ]], shape=(3, 2), dtype=float32)
The performance of that company was incredible, the revenue will be increasing constantly over the next years : 
 Positive
Their financial stability has been questioned multiple times, they are on the edge of bankruptcy : 
 Positive
The market will probably be unaffected by these events : 
 Positive


We notice that even with very simple and relatively obvious sentences, the pre-trained BERT model does not seem to detect the correct polarity. This is not surprising since it has been shown in the literature that fine-tuning is a critical step in order to be able to employ this pre-trained classifier. A possible solution would be to manually label a certain amount of quotations that wil be used to fine-tune the BERT model. However, this soltuon would be very time-consuming, therefore we firt explore other unsupervised methods.

### VADER unsupervised sentiment classifier

VADER is a famous unsupervised sentiment classifier that can be imported from the NLTK library. It makes use of a lexicon where each word is associated with a sentiment score and the the sentence sentiment is obtained by aggregating the word-specific polarities. 
It is an interesting method to explore because it is very easy to implement and does not require any label as an input. On the other side, though, its sentiment classification results have been shown to be quite poor. Once again, to test this we use the artificial sentences we have create before.

In [12]:
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to /home/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [17]:
# initialize the VADER sentiment classifier
analyzer = SentimentIntensityAnalyzer()


def vader_sentiment_result(sent):
    scores = analyzer.polarity_scores(sent)
    if scores["neg"] > scores["pos"] and scores["neg"] > scores["pos"]:
        return 'negative'
    elif scores["pos"] > scores["neg"] and scores["pos"] > scores["neu"]:
        return 'positive'
    return 'neutral'

In [18]:
for sentence in pred_sentences:
    sentiment = vader_sentiment_result(sentence)
    print(sentence, ": \n", sentiment)

The performance of that company was incredible, the revenue will be increasing constantly over the next years : 
 neutral
Their financial stability has been questioned multiple times, they are on the edge of bankruptcy : 
 negative
The market will probably be unaffected by these events : 
 neutral


It can be noticed that VADER is able to detect the negative polarity expresed in one sentence, but also this model does not perform as well as we hoped for. Having as precise as possible sentiments is key to avoid biasing the results of the projects significnatly. We therefore look further for another solution.

### Zero-shot classifier

In the past years, a large attention has been dedicated to the so called "Zero-Shot NLP models", which are models pre-trained on different tasks but that are able to perform well on unseen and unlabled data without needing fine-tuning, which is what we need! We use the Transformers library which employs models from the Hugging Face hub. The models available in the Transformers library are trained using the Natural Language Inference (NLI) approach, and thus require for each sentence a premise and an hypothesis to be tested. In our case the hypothesis are the sentiment and thus we set the hypothesis to be "The sentiment of this quote is positive/neutral/negative". As before, we use the three artificial sentences previously created in order to have a first quick analysis of the classification perfromance of the model. 

In [None]:
# Intitialize the zer-shot classifier (it will use the default model robert-large-mnli)
classifier = pipeline("zero-shot-classification")

# Crete the hypothesis we want to use
hypotheses = "The sentiment of this quote is {}."

# Create the labels
the_labels = ["positive", "negative", "neutral"]

In [20]:
for sentence in pred_sentences:
    sentiment = classifier(sentence, the_labels, hypothesis_template = hypotheses, multi_label=True)
    print(sentiment, "\n", 'The predicted  sentiment is: ', sentiment['labels'][0])

{'sequence': 'The performance of that company was incredible, the revenue will be increasing constantly over the next years', 'labels': ['positive', 'neutral', 'negative'], 'scores': [0.9952166080474854, 0.3805881142616272, 0.0035103177651762962]} 
 The predicted  sentiment is:  positive
{'sequence': 'Their financial stability has been questioned multiple times, they are on the edge of bankruptcy', 'labels': ['negative', 'neutral', 'positive'], 'scores': [0.9933288097381592, 0.11964941024780273, 0.005131530575454235]} 
 The predicted  sentiment is:  negative
{'sequence': 'The market will probably be unaffected by these events', 'labels': ['positive', 'neutral', 'negative'], 'scores': [0.9269043207168579, 0.8344512581825256, 0.01127589587122202]} 
 The predicted  sentiment is:  positive


The results above show that this model is better than the pre-trained BERT and The VADER models. It is able to correctly detect the negative and positive sentiments. The sentence that has a more neutral senitment was categroized as positive. The more neutral sentence was categorized as positive, however we notice that the score assigned to the neutral class was also relatively high. Moreover, detecting neutral sentence has always been the most difficult part for sentiment classifiers since it is sometimes very hard to draw a fine line between was is positive/negative and what is neutral. It has been shown that the selection of the working of the hypothesis has an influence on the performance so we might need to compare different formulations as well.

Given the exeperiments above, we decide to explore fruther only the VADER and the Zero-SHot classifiers. To do so, we run both of them on 1000 random quotations from those filtered from 2015 and we then manually go through them to see which classifier resonates more with what a human would choose. The resulting dataframe is saved within `data` for possible future purposes. 

In [35]:
hypothesis_template = "The sentiment of this quote is {}."
the_labels = ["positive", "negative", "neutral"]

vader_sent = []
one_shot_sent = []

# For each sentence, predict the sentiment using both VADER and Zero-Shot and save the sentiment in a column of the dataframe
for idx, item in tqdm(df.iterrows()):
    vader_sent.append(vader_sentiment_result(item['quotation']))
    one_shot_sent.append(classifier(item['quotation'], the_labels, hypothesis_template = hypotheses, multi_label=True)['labels'][0])

df['vader sentiment'] = vader_sent
df['one-shot sentiment'] = one_shot_sent

1000it [2:43:14,  9.79s/it]


In [36]:
# sanity check, print the first few quotations
df.head()


Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,urls,tokenized,cosine_similarity,vader sentiment,one-shot sentiment
0,2015-03-26-025269,"However, due to sharp decline in KG-D6 gas pro...",Piyush Goyal,[Q7199798],2015-03-26 10:02:46,1,[http://timesofindia.indiatimes.com/business/i...,howev due sharp declin kgd6 ga product could g...,0.251782,neutral,negative
1,2015-10-28-001053,[ Young savers' ] parents and grandparents are...,Patrick Connolly,[Q7146267],2015-10-28 07:26:15,1,[http://gulfnews.com/business/sectors/features...,young saver parent grandpar use higher interes...,0.0,neutral,positive
2,2015-02-19-025137,"However, we cannot share any specific details ...",David Kalisch,[Q26322384],2015-02-19 00:26:19,5,[http://www.smh.com.au/federal-politics/politi...,howev share specif detail stage we provid info...,0.170075,neutral,negative
3,2015-05-16-013190,"HUD did not drop the complaint, but insisted t...",Dennis Wheeler,[Q55219988],2015-05-16 21:23:57,1,[http://adn.com/article/20150514/anchorage-cha...,hud drop complaint insist still issu code\n,0.28624,neutral,negative
4,2015-02-13-000112,2014 was a milestone year for us on many front...,Mike Fries,[Q54861319],2015-02-13 13:01:44,1,[http://advanced-television.com/2015/02/13/lib...,2014 mileston year us mani front we increas pa...,0.280876,neutral,positive


In [37]:
# For possible future purposes, we save the results 
compression_opts = dict(method='zip',
                        archive_name='sentiment_analysis_test.csv')  
df.to_csv('sentiment_analysis_test.zip', index=False,
          compression=compression_opts)  


Nevertheless, using the same 1000 quotations we used for testing VADER and using the default model roberta-large-mnli, we found a more intuitive and reasonable classification of the sentences' polarities. We will thus use this method to classify all the quotations. The running time will be significant (it took approximately 1 hour for 1000 sentences), but since we only need to do it once it is feasible.

After analyzing the results, we find that the Zero-shot model leads to a more intuitive and reasonable classification of the sentences' polarities and thus we decide to emply the Zero-shot classifier as our final method. It should be kept in mind that this sentiments are provided by an algorithm, thus we acknowledge that the results that will follow might be afected by possible biases created in this step. To (partially) avoid such biases we would need to do the sentiment classification ourselves, which would be unreasonable in the given time avaiable.

### Final Sentiment Detetion 

The next (and last) step is to run the same code we used above on all the quotations we would like to use fro the project. This step has not been performed yet because we noticed a bug in the filtering procedure and thus we need to rerun it.