In [1]:
# Import the required libraries. Note: in order to run this cell you must have installed the libraries listed
# in the requirements.txt file

import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, InputExample, InputFeatures
import matplotlib.pyplot as plt
%matplotlib inline
import logging
from importlib import reload
# from src import embeddings_filter
from tqdm import tqdm
import collections
import json
import bz2

2021-11-12 17:09:41.687551: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-12 17:09:41.687589: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### Load the dataset
In the pipeline, the sentiment classification task comes after irrelevant sentences have been filtered out. We thus need to load the filtered dataset back into memory.

In [28]:
# Upper bound on the number of quotation records read from the compressed file.
MAX_QUOTES = 1000

sentences = []
with bz2.open('data/final_filtered.json.bz2', 'rb') as s_file:
    # Each line of the file is parsed as one JSON record.
    # zip() against a range stops after MAX_QUOTES lines, or earlier at end of file,
    # without reading a line beyond the limit.
    for _, line in zip(range(MAX_QUOTES), s_file):
        sentences.append(json.loads(line))

# Number of records actually read (kept under its old name for any cell that reads it).
k = len(sentences)

df = pd.DataFrame(sentences)

In [29]:
# Sanity check: show the first few quotations to confirm the data were loaded correctly
df.loc[:, 'quotation'].head()

0    However, due to sharp decline in KG-D6 gas pro...
1    [ Young savers' ] parents and grandparents are...
2    However, we cannot share any specific details ...
3    HUD did not drop the complaint, but insisted t...
4    2014 was a milestone year for us on many front...
Name: quotation, dtype: object

In [31]:
# Look at the first five rows (all columns) to check that loading worked as expected
df.head(n=5)

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,urls,tokenized,cosine_similarity
0,2015-03-26-025269,"However, due to sharp decline in KG-D6 gas pro...",Piyush Goyal,[Q7199798],2015-03-26 10:02:46,1,[http://timesofindia.indiatimes.com/business/i...,howev due sharp declin kgd6 ga product could g...,0.251782
1,2015-10-28-001053,[ Young savers' ] parents and grandparents are...,Patrick Connolly,[Q7146267],2015-10-28 07:26:15,1,[http://gulfnews.com/business/sectors/features...,young saver parent grandpar use higher interes...,0.0
2,2015-02-19-025137,"However, we cannot share any specific details ...",David Kalisch,[Q26322384],2015-02-19 00:26:19,5,[http://www.smh.com.au/federal-politics/politi...,howev share specif detail stage we provid info...,0.170075
3,2015-05-16-013190,"HUD did not drop the complaint, but insisted t...",Dennis Wheeler,[Q55219988],2015-05-16 21:23:57,1,[http://adn.com/article/20150514/anchorage-cha...,hud drop complaint insist still issu code\n,0.28624
4,2015-02-13-000112,2014 was a milestone year for us on many front...,Mike Fries,[Q54861319],2015-02-13 13:01:44,1,[http://advanced-television.com/2015/02/13/lib...,2014 mileston year us mani front we increas pa...,0.280876


### BERT pre-trained sentiment classifier
The first model we employ to perform the sentiment classification of our unlabeled set of quotations is the pre-trained BERT model. To do so we use the transformers library, which uses TensorFlow and gives direct access to the `bert-base-uncased` model.

In [3]:
# Name of the pre-trained checkpoint shared by the tokenizer and the model
BERT_CHECKPOINT = "bert-base-uncased"

# Instantiate the tokenizer and the sequence-classification model from that checkpoint.
# NOTE(review): no sentiment-specific checkpoint is named here, so the classification
# head is presumably not trained for this task yet; confirm before trusting predictions.
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = TFBertForSequenceClassification.from_pretrained(BERT_CHECKPOINT)

2021-11-08 16:28:17.851254: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-11-08 16:28:17.851470: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-11-08 16:28:17.851584: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (noto.epfl.ch): /proc/driver/nvidia/version does not exist
2021-11-08 16:28:17.852519: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers 

In [4]:
# Display the properties of the loaded BERT model as a sanity check that the model was loaded correctly
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [5]:
# Check that the tokenizer works on a hand-written sentence
sample_txt = "Are we able to predict the stock market using the sentiment expressed by famous people? Adadelta-Q is here to answer your question"
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
# A single print call; the newline separator reproduces the line breaks of three separate prints
print(
    f'  Sentence: {sample_txt}',
    f'\n  Tokens: {tokens}',
    f'\n  Token IDs: {token_ids}',
    sep='\n',
)

  Sentence: Are we able to predict the stock market using the sentiment expressed by famous people? Adadelta-Q is here to answer your question

  Tokens: ['are', 'we', 'able', 'to', 'predict', 'the', 'stock', 'market', 'using', 'the', 'sentiment', 'expressed', 'by', 'famous', 'people', '?', 'ada', '##del', '##ta', '-', 'q', 'is', 'here', 'to', 'answer', 'your', 'question']

  Token IDs: [2024, 2057, 2583, 2000, 16014, 1996, 4518, 3006, 2478, 1996, 15792, 5228, 2011, 3297, 2111, 1029, 15262, 9247, 2696, 1011, 1053, 2003, 2182, 2000, 3437, 2115, 3160]


In [23]:
# Tokenize the first quotation of the dataframe to check the tokenizer on real data
sample_txt = df['quotation'].iloc[0]
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
# Join the three report lines with newlines so the printed text matches separate print calls
print('\n'.join([
    f'  Sentence: {sample_txt}',
    f'\n  Tokens: {tokens}',
    f'\n  Token IDs: {token_ids}',
]))

  Sentence: However, due to sharp decline in KG-D6 gas production not only could gas not be allocated, to new gas-based projects, but the commissioned capacity that KG-D6 gas allocation also get stranded,

  Tokens: ['however', ',', 'due', 'to', 'sharp', 'decline', 'in', 'kg', '-', 'd', '##6', 'gas', 'production', 'not', 'only', 'could', 'gas', 'not', 'be', 'allocated', ',', 'to', 'new', 'gas', '-', 'based', 'projects', ',', 'but', 'the', 'commissioned', 'capacity', 'that', 'kg', '-', 'd', '##6', 'gas', 'allocation', 'also', 'get', 'stranded', ',']

  Token IDs: [2174, 1010, 2349, 2000, 4629, 6689, 1999, 4705, 1011, 1040, 2575, 3806, 2537, 2025, 2069, 2071, 3806, 2025, 2022, 11095, 1010, 2000, 2047, 3806, 1011, 2241, 3934, 1010, 2021, 1996, 4837, 3977, 2008, 4705, 1011, 1040, 2575, 3806, 16169, 2036, 2131, 15577, 1010]


Since running the BERT model can be very costly in terms of running time, we decide to test its performance on the sentiment classification task, WITHOUT fine-tuning it, by using three artificial sentences whose polarities are relatively obvious (to a human) and span all the possible polarity categories (positive, negative, and neutral).

In [24]:
# Create the three artificial sentences, one per polarity (positive, negative, neutral)
positive_example = 'The performance of that company was incredible, the revenue will be increasing constantly over the next years'
negative_example = 'Their financial stability has been questioned multiple times, they are on the edge of bankruptcy'
neutral_example = 'The market will probably be unaffected by these events'

pred_sentences = [positive_example, negative_example, neutral_example]

In [8]:
# Tokenize the example sentences as one padded / truncated batch of TensorFlow tensors
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
# Softmax over the last axis of the first model output (presumably the logits) gives
# one probability per class for each sentence
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
print(tf_predictions)

labels = ['Negative','Positive']
# Index of the most probable class for each sentence, converted to a NumPy array
label = tf.argmax(tf_predictions, axis=1).numpy()
for sentence, class_idx in zip(pred_sentences, label):
    print(sentence, ": \n", labels[class_idx])

tf.Tensor(
[[0.39626688 0.60373306]
 [0.4094616  0.59053844]], shape=(2, 2), dtype=float32)
This was an awesome movie. I watch it twice my time watching this beautiful movie if I have known it was this good : 
 Positive
One of the worst movies of all time. I cannot believe I wasted two hours of my life for this movie : 
 Positive


In [9]:
import nltk
# Download the "vader_lexicon" resource; presumably required by the VADER analyzer
# created in a later cell (verify against the nltk documentation)
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to /home/nltk_data...


True

In [34]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Single module-level analyzer instance, reused by vader_sentiment_result on every call
analyzer = SentimentIntensityAnalyzer()

def vader_sentiment_result(sent):
    """Classify a sentence as 'negative', 'positive' or 'neutral' with VADER.

    The module-level ``analyzer`` scores the sentence, and the label is the
    polarity whose score is strictly greater than both of the other two.
    Ties, and sentences where the neutral score dominates, yield 'neutral'.

    Args:
        sent: The sentence (string) to classify.

    Returns:
        One of the strings 'negative', 'positive' or 'neutral'.
    """
    scores = analyzer.polarity_scores(sent)
    neg, neu, pos = scores["neg"], scores["neu"], scores["pos"]
    # The negative branch must beat BOTH other scores, mirroring the positive branch.
    # The previous code compared neg against pos twice and never against neu, so
    # neutral-dominated sentences were labelled negative.
    if neg > pos and neg > neu:
        return 'negative'
    if pos > neg and pos > neu:
        return 'positive'
    return 'neutral'

# labels = vader_sentiment_result(pred_sentences)

# for i in range(1000):
#     print(pred_sentences[i], ": \n", vader_sentiment_result(pred_sentences[i]))
# train_set["vader_result"] = train_set["review"].apply(lambda x: vader_sentiment_result(x))
# valid_set["vader_result"] = valid_set["review"].apply(lambda x: vader_sentiment_result(x))

In [14]:
from transformers import pipeline

# Pin the checkpoint explicitly instead of relying on the library's implicit default
# (the default this notebook was run with was roberta-large-mnli), so results stay
# reproducible if that default ever changes.
classifier = pipeline("zero-shot-classification", model="roberta-large-mnli")

# Candidate labels the zero-shot classifier chooses between
the_labels = ["positive", "negative", "neutral"]
# Try the classifier on the second artificial sentence
a_review = pred_sentences[1]

res = classifier(a_review, the_labels)

No model was supplied, defaulted to roberta-large-mnli (https://huggingface.co/roberta-large-mnli)


Downloading:   0%|          | 0.00/688 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at roberta-large-mnli.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Downloading:   0%|          | 0.00/878k [00:02<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [15]:
# Show the raw zero-shot classification result for the test sentence
print(res)


{'sequence': 'One of the worst movies of all time. I cannot believe I wasted two hours of my life for this movie', 'labels': ['negative', 'positive'], 'scores': [0.9947044253349304, 0.005295595154166222]}


In [35]:
# Template passed to the zero-shot classifier; "{}" is the slot for a candidate label
hypothesis_template = "The sentiment of this quote is {}."
the_labels = ["positive", "negative", "neutral"]

vader_sent = []
one_shot_sent = []

# Iterate over the quotation column directly (iterrows would build a full Series per row
# only to read one field) and give tqdm the total so it can show a progress bar and an ETA.
for quote in tqdm(df['quotation'], total=len(df)):
    vader_sent.append(vader_sentiment_result(quote))
    prediction = classifier(
        quote,
        the_labels,
        hypothesis_template=hypothesis_template,
        multi_label=True,
    )
    # Keep the first returned label (labels appear ordered by decreasing score; confirm
    # against the pipeline documentation).
    one_shot_sent.append(prediction['labels'][0])

df['vader sentiment'] = vader_sent
df['one-shot sentiment'] = one_shot_sent

1000it [2:43:14,  9.79s/it]


In [36]:
df.head()


Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,urls,tokenized,cosine_similarity,vader sentiment,one-shot sentiment
0,2015-03-26-025269,"However, due to sharp decline in KG-D6 gas pro...",Piyush Goyal,[Q7199798],2015-03-26 10:02:46,1,[http://timesofindia.indiatimes.com/business/i...,howev due sharp declin kgd6 ga product could g...,0.251782,neutral,negative
1,2015-10-28-001053,[ Young savers' ] parents and grandparents are...,Patrick Connolly,[Q7146267],2015-10-28 07:26:15,1,[http://gulfnews.com/business/sectors/features...,young saver parent grandpar use higher interes...,0.0,neutral,positive
2,2015-02-19-025137,"However, we cannot share any specific details ...",David Kalisch,[Q26322384],2015-02-19 00:26:19,5,[http://www.smh.com.au/federal-politics/politi...,howev share specif detail stage we provid info...,0.170075,neutral,negative
3,2015-05-16-013190,"HUD did not drop the complaint, but insisted t...",Dennis Wheeler,[Q55219988],2015-05-16 21:23:57,1,[http://adn.com/article/20150514/anchorage-cha...,hud drop complaint insist still issu code\n,0.28624,neutral,negative
4,2015-02-13-000112,2014 was a milestone year for us on many front...,Mike Fries,[Q54861319],2015-02-13 13:01:44,1,[http://advanced-television.com/2015/02/13/lib...,2014 mileston year us mani front we increas pa...,0.280876,neutral,positive


In [37]:
# Write the labelled dataframe as 'out.csv' inside the zip archive 'out.zip', omitting the index
compression_opts = {'method': 'zip', 'archive_name': 'out.csv'}
df.to_csv('out.zip', compression=compression_opts, index=False)