# Sentiment classifier
### Loading the libraries:

In [19]:
import pandas as pd
import os, random, glob, json
from sentence_transformers import SentenceTransformer
import nltk
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

### Loading in the MPQA files:

In [2]:
# Column layout of the tab-separated MPQA feature files (no header row in the files).
header_names = ["sentence_id", "token_id", "token", "lemma", "pos", "term_id", "pol/mod",
                "poldomain", "aspect_training", "entity", "property", "phrase_type", "sentiment"]


def tokens_to_text(tokens):
    """Join the tokens of one document into a single space-separated string.

    Missing values (None / NaN) and empty strings are skipped so that a blank
    cell in the feature file cannot break the join.
    """
    kept = []
    for tok in tokens:
        if tok is None or tok != tok:  # `tok != tok` is only true for NaN
            continue
        tok = str(tok)
        if tok:
            kept.append(tok)
    return " ".join(kept)


def sentiment_from_tags(tags):
    """Collapse per-token sentiment tags into one document-level label.

    Every "B-positive" tag counts +1 and every "B-negative" tag counts -1;
    all other tags are ignored. The sign of the sum decides the label and a
    sum of zero means "neutral".
    """
    score = 0
    for tag in tags:
        if tag == "B-positive":
            score += 1
        elif tag == "B-negative":
            score -= 1
    if score > 0:
        return "positive"
    if score < 0:
        return "negative"
    return "neutral"


# Sorted so that the document order (and everything derived from it) does not
# depend on the order in which the OS happens to list the directory.
files = sorted(glob.glob('en/en/mpqa/tab_feature_files/*.feat'))
empty_list = []   # the text of every document, in file order
final_list = []   # one {text: label} dict per document

for file in files:
    # dtype=str + keep_default_na=False keep tokens such as "NA" or "null" as
    # literal text instead of letting pandas convert them to NaN.
    frame = pd.read_csv(file, sep="\t", header=None, names=header_names,
                        dtype=str, keep_default_na=False)
    string_per_file = tokens_to_text(frame['token'].tolist())
    sentiment = sentiment_from_tags(frame['sentiment'].tolist())

    empty_list.append(string_per_file)
    final_list.append({string_per_file: sentiment})

print(len(files))

506


### Put the data in a dataframe:

In [3]:
# Flatten the per-file {text: label} dicts into two parallel lists.
text_list = [text for entry in final_list for text in entry]
label_list = [label for entry in final_list for label in entry.values()]

# One row per document: the joined token text and its derived sentiment label.
data_text = {'text': text_list, 'label': label_list}
df = pd.DataFrame(data=data_text)
# Show a preview only; the full frame stays available as `df`.
df.head(10)

Unnamed: 0,text,label
0,the Kimberley Provincial Hospital say it would...,negative
1,Beijing have complete the afforestation of @ca...,positive
2,Russian guard seize @card@ kg of heroin on Taj...,neutral
3,"Istanbul , Dec @card@ ( a.a ) - Rauf Denktas ,...",positive
4,"Tehran , Dec @card@ IRNA -- President Muhammad...",neutral
5,the Sulaymaniyah-based telecommunication compa...,neutral
6,Jakarta ( Agency ) : Indonesia will not follow...,neutral
7,"Tokyo , Dec. @card@ ( Yonhap ) -- Japanese def...",neutral
8,France 's Chirac talk to Pakistan 's Musharraf...,negative
9,"the Argentine TV station Todo Noticias ["" all ...",negative


### Embed the text as sentence embeddings (RoBERTa-based Sentence-BERT model):

In [4]:
# Number of leading sentences embedded per document; shorter documents are padded.
N_SENTENCES = 6

model = SentenceTransformer('roberta-large-nli-mean-tokens')
texts = df.text.to_list()

embedded_texts = []
for text in texts:
    # Fall back to the raw text when the tokenizer returns no sentence at all
    # (e.g. an empty document), so the padding below always has a first element.
    sentences = nltk.sent_tokenize(text) or [text]
    # Keep the first N_SENTENCES; pad short documents by repeating their first
    # sentence (a non-positive repeat count yields no padding).
    sentences = sentences[:N_SENTENCES] + [sentences[0]] * (N_SENTENCES - len(sentences))
    sentence_embeddings = model.encode(sentences)
    embedded_texts.append(sentence_embeddings)

I0325 08:39:25.686940  8196 SentenceTransformer.py:29] Load pretrained SentenceTransformer: roberta-large-nli-mean-tokens
I0325 08:39:25.686940  8196 SentenceTransformer.py:32] Did not find a / or \ in the name. Assume to download model from server
I0325 08:39:25.686940  8196 SentenceTransformer.py:68] Load SentenceTransformer from folder: C:\Users\ApotheekStiens/.cache\torch\sentence_transformers\public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_roberta-large-nli-mean-tokens.zip
I0325 08:39:25.771560  8196 configuration_utils.py:182] loading configuration file C:\Users\ApotheekStiens/.cache\torch\sentence_transformers\public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_roberta-large-nli-mean-tokens.zip\0_RoBERTa\config.json
I0325 08:39:25.771560  8196 configuration_utils.py:199] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2l

Batches: 100%|███████████████████████████████████| 1/1 [00:04<00:00,  4.43s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:03<00:00,  3.86s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:05<00:00,  5.23s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:04<00:00,  4.70s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:04<00:00,  4.88s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:02<00:00,  2.92s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:04<00:00,  4.13s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:03<00:00,  4.00s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:03<00:00,  3.35s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:03<00:00,  3.11s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:04<00:00,  4.17s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:04<00:00,  4.12s/it]
Batches: 100%|██████████████████████████

Batches: 100%|███████████████████████████████████| 1/1 [00:02<00:00,  2.27s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:03<00:00,  3.29s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:03<00:00,  3.37s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:03<00:00,  3.23s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:02<00:00,  2.62s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:02<00:00,  2.55s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:04<00:00,  4.22s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:02<00:00,  2.62s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:03<00:00,  3.63s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:04<00:00,  4.09s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:03<00:00,  3.39s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:02<00:00,  2.73s/it]
Batches: 100%|██████████████████████████

### Aggregate the embeddings:

In [5]:
# Two alternative document representations are built from the sentence
# embeddings of each text; either one can be fed to the classifiers (or to the
# train_test_split call further down).

# All sentence vectors of a text laid end to end in one long vector.
concatenated_list = [np.concatenate(doc_vectors) for doc_vectors in embedded_texts]

# Element-wise average of the sentence vectors of a text.
mean_list = [np.mean(doc_vectors, axis=0) for doc_vectors in embedded_texts]

### List the training labels:

In [6]:
# Gold labels of the training documents, in the same order as the rows of df.
training_labels = df['label'].tolist()

### Load the test file, then embed and aggregate its text:

In [7]:
# Sentences embedded per document; must match the window used for the training embeddings.
n_sentences = 6

dfs = pd.read_excel('./classify_only/sentiment_testset.xlsx', sheet_name='Sentiment SDG1')
# Headline followed by the article lead. A missing lead becomes '' so that it
# cannot turn the whole concatenated text into NaN.
dfs['text'] = dfs['Headline'].astype(str) + ' ' + dfs['First 5 sentences'].fillna('').astype(str)
# .copy() gives an independent frame, so columns added to it later are not
# written to a slice of the original sheet.
dfs = dfs[['text', 'Label']].copy()
test_text = dfs['text'].to_list()
test_y = dfs['Label'].to_list()

embedded_test = []
for text in test_text:
    # Fall back to the raw text if the tokenizer finds no sentence.
    sentences = nltk.sent_tokenize(text) or [text]
    # Truncate to n_sentences, or pad by repeating the first sentence.
    sentences = sentences[:n_sentences] + [sentences[0]] * (n_sentences - len(sentences))
    # Encode exactly once per document, after padding is complete.
    sentence_embeddings = model.encode(sentences)
    embedded_test.append(sentence_embeddings)

# One long vector per test document: its sentence embeddings laid end to end.
BERT_test_X = [np.concatenate(six_embeds) for six_embeds in embedded_test]

Batches: 100%|███████████████████████████████████| 1/1 [00:03<00:00,  3.02s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:02<00:00,  2.14s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:02<00:00,  2.29s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:03<00:00,  3.18s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:02<00:00,  2.79s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:04<00:00,  4.66s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:05<00:00,  5.40s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:06<00:00,  6.54s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:07<00:00,  7.86s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:02<00:00,  2.09s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:02<00:00,  2.96s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:02<00:00,  2.02s/it]
Batches: 100%|██████████████████████████

### Or split the training data in a test and training set:

In [15]:
# Optional alternative to the external test file: hold out a third of the
# training data as a test set. Disabled by default; uncomment the import and
# the split line to use it.
# NOTE(review): the classifier cells below train on concatenated_list /
# training_labels and evaluate on BERT_test_X / test_y, so they would have to
# be pointed at X_train / y_train and X_test / y_test for this split to matter.
#from sklearn.model_selection import train_test_split
# concatenated_list can be exchanged for mean_list
#X_train, X_test, y_train, y_test = train_test_split(concatenated_list, training_labels, test_size=0.33, random_state=42)

### Train different classifiers: Random Forest, Naive Bayes, Support Vector Machine and 3 variations of a Multi-layer perceptron:

In [26]:
# Every classifier is fitted on the concatenated training embeddings and then
# scored on the first test set. For each one we keep the predicted labels and a
# per-class confidence rounded to 3 decimals.

# Random forest, seeded so that repeated runs build the same trees.
rfc = RandomForestClassifier(n_estimators=10, random_state=0)
rfc.fit(concatenated_list, training_labels)
predictionsRF = rfc.predict(BERT_test_X)
confRF = np.round(rfc.predict_proba(BERT_test_X), 3).tolist()

# Gaussian naive Bayes.
gnb = GaussianNB()
gnb.fit(concatenated_list, training_labels)
predictionsNB = gnb.predict(BERT_test_X)
confNB = np.round(gnb.predict_proba(BERT_test_X), 3).tolist()

# Linear SVM. It has no predict_proba, so the confidence is the magnitude of
# the decision function (the sign is dropped, as before).
svm_var = LinearSVC(random_state=0, tol=1e-5, max_iter=10000)
svm_var.fit(concatenated_list, training_labels)
predictionsSVM = svm_var.predict(BERT_test_X)
confSVM = np.round(abs(svm_var.decision_function(BERT_test_X)), 3).tolist()

# Multi-layer perceptrons: same architecture, three different solvers.
clf_sgd = MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=(15,), random_state=1, max_iter=500)
clf_sgd.fit(concatenated_list, training_labels)
predictionsMLPsgd = clf_sgd.predict(BERT_test_X)
confMLPsgd = np.round(clf_sgd.predict_proba(BERT_test_X), 3).tolist()

clf_adam = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(15,), random_state=1, max_iter=500)
clf_adam.fit(concatenated_list, training_labels)
predictionsMLPadam = clf_adam.predict(BERT_test_X)
confMLPadam = np.round(clf_adam.predict_proba(BERT_test_X), 3).tolist()

clf_lbfgs = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,), random_state=1, max_iter=500)
clf_lbfgs.fit(concatenated_list, training_labels)
predictionsMLPlbfgs = clf_lbfgs.predict(BERT_test_X)
confMLPlbfgs = np.round(clf_lbfgs.predict_proba(BERT_test_X), 3).tolist()

### Print the different classification reports:

In [27]:
# (heading, predicted labels) pairs, listed in the order the reports are printed.
reports = [
    ("Naive Bayes (Gaussian):", predictionsNB),
    ("Random Forest:", predictionsRF),
    ("Support Vector Machine:", predictionsSVM),
    ("Multi layer perceptron (stochastic gradient descent):", predictionsMLPsgd),
    ("Multi layer perceptron (adam):", predictionsMLPadam),
    ("Multi layer perceptron (lbfgs):", predictionsMLPlbfgs),
]

# Each report compares one classifier's predictions with the gold labels test_y.
for heading, predicted in reports:
    print(heading)
    print(classification_report(test_y, predicted))



Naive Bayes (Gaussian):


  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

    negative       1.00      0.57      0.72        23
     neutral       0.00      0.00      0.00         0
    positive       0.68      0.76      0.72        17

   micro avg       0.65      0.65      0.65        40
   macro avg       0.56      0.44      0.48        40
weighted avg       0.87      0.65      0.72        40

Random Forest:
              precision    recall  f1-score   support

    negative       0.74      0.61      0.67        23
     neutral       0.00      0.00      0.00         0
    positive       0.73      0.47      0.57        17

   micro avg       0.55      0.55      0.55        40
   macro avg       0.49      0.36      0.41        40
weighted avg       0.73      0.55      0.63        40

Support Vector Machine:
              precision    recall  f1-score   support

    negative       0.85      0.74      0.79        23
     neutral       0.00      0.00      0.00         0
    positive       0.82      0.82   

### Preprocess the texts for the baseline:

In [20]:
# The baseline sees the same leading sentences as the embedding-based models.
n_sentences = 6

BOW_texts_list = []

for text in texts:
    new_text = []
    # The slice covers both cases: long texts are truncated, short ones kept whole.
    for sent in nltk.sent_tokenize(text)[:n_sentences]:
        sent = sent.lower()
        sent = re.sub(r'\W', ' ', sent)   # punctuation and symbols -> space
        sent = re.sub(r'\s+', ' ', sent)  # collapse runs of whitespace
        new_text.append(sent)
    BOW_texts_list.append(new_text)

# One string per text. Joining with a space guarantees that the last word of a
# sentence can never fuse with the first word of the next one.
concatenated_sents = [' '.join(item) for item in BOW_texts_list]

###  Create the model:

In [21]:
# NLTK's English stop-word list is excluded from the vocabulary.
english_stop_words = stopwords.words('english')
count_vec = CountVectorizer(stop_words=english_stop_words)

# Fit the vocabulary on the training texts and turn them into a
# document-term count matrix; this matrix is "the model" used by the baseline.
BOW_model = count_vec.fit_transform(concatenated_sents)

### Create and run the classifier and print the result:

In [22]:
# Vectorise the test texts with the vocabulary learned from the training data.
test_X2 = count_vec.transform(test_text)

# Baseline: linear SVM on bag-of-words counts.
BOW_classifier = LinearSVC(random_state=0, tol=1e-5)
BOW_classifier.fit(BOW_model, training_labels)
predicted_label = BOW_classifier.predict(test_X2)

# test_text still holds the first test set at this point, so its gold labels
# are test_y (test_y2 is only created further down for the second test set).
print(classification_report(test_y, predicted_label))

# Confidence = magnitude of the decision function on the matrix that was just
# classified (the original referenced an undefined name, test_X).
confBOW = np.round(abs(BOW_classifier.decision_function(test_X2)), 3).tolist()

              precision    recall  f1-score   support

    negative       0.43      0.26      0.32        23
     neutral       0.00      0.00      0.00         0
    positive       0.50      0.47      0.48        17

   micro avg       0.35      0.35      0.35        40
   macro avg       0.31      0.24      0.27        40
weighted avg       0.46      0.35      0.39        40



### Print the results in a tsv for error analysis:

In [24]:
# Columns appended to the test frame for error analysis, in output order.
result_columns = [
    ('SVM_predictions', predictionsSVM),
    ('SVM_confidence', confSVM),
    ('NN_predictions', predictionsMLPsgd),
    ('NN_confidence', confMLPsgd),
    ('Baseline_predictions', predicted_label),
]
for column_name, column_values in result_columns:
    dfs[column_name] = column_values

# Tab-separated so the file opens cleanly in a spreadsheet.
dfs.to_csv('results.tsv', sep='\t')

### Load the second test set, then create and concatenate the sentence embeddings:

In [31]:
# Sentences embedded per document; must match the window used for the training embeddings.
n_sentences = 6

dfs2 = pd.read_excel('SDG_12_articles.xlsx', sheet_name='Sentiment SDG 12')
dfs2 = dfs2[['text', 'sentiment']]
# NOTE: test_text and embedded_test are deliberately re-bound to the second
# test set here; the baseline cell further down reads test_text.
test_text = dfs2['text'].to_list()
test_y2 = dfs2['sentiment'].to_list()

embedded_test = []
for text in test_text:
    # Fall back to the raw text if the tokenizer finds no sentence.
    sentences = nltk.sent_tokenize(text) or [text]
    # Truncate to n_sentences, or pad by repeating the first sentence.
    sentences = sentences[:n_sentences] + [sentences[0]] * (n_sentences - len(sentences))
    # Encode exactly once per document, after padding is complete.
    sentence_embeddings = model.encode(sentences)
    embedded_test.append(sentence_embeddings)

# One long vector per test document: its sentence embeddings laid end to end.
BERT_test_X2 = [np.concatenate(six_embeds) for six_embeds in embedded_test]

Batches: 100%|███████████████████████████████████| 1/1 [00:03<00:00,  3.80s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:02<00:00,  2.38s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:02<00:00,  2.56s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:02<00:00,  2.77s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:03<00:00,  3.51s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:02<00:00,  2.97s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:01<00:00,  1.96s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:04<00:00,  4.86s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:02<00:00,  2.88s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:03<00:00,  3.49s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:04<00:00,  4.20s/it]
Batches: 100%|███████████████████████████████████| 1/1 [00:04<00:00,  4.72s/it]
Batches: 100%|██████████████████████████

### Run the earlier trained classifiers on the new test set:

In [32]:
def _rounded_abs(scores):
    """Return the absolute values of `scores`, rounded to 3 decimals, as nested lists."""
    return np.round(abs(scores), 3).tolist()


# The classifiers fitted earlier are only applied here, not refitted. The
# prediction and confidence variables of the first test set are overwritten.
predictionsRF = rfc.predict(BERT_test_X2)
confRF = _rounded_abs(rfc.predict_proba(BERT_test_X2))

predictionsNB = gnb.predict(BERT_test_X2)
confNB = _rounded_abs(gnb.predict_proba(BERT_test_X2))

predictionsSVM = svm_var.predict(BERT_test_X2)
confSVM = _rounded_abs(svm_var.decision_function(BERT_test_X2))

predictionsMLPsgd = clf_sgd.predict(BERT_test_X2)
confMLPsgd = _rounded_abs(clf_sgd.predict_proba(BERT_test_X2))

predictionsMLPadam = clf_adam.predict(BERT_test_X2)
confMLPadam = _rounded_abs(clf_adam.predict_proba(BERT_test_X2))

predictionsMLPlbfgs = clf_lbfgs.predict(BERT_test_X2)
confMLPlbfgs = _rounded_abs(clf_lbfgs.predict_proba(BERT_test_X2))

### Print the results:

In [33]:
# (heading, predicted labels) pairs, listed in the order the reports are printed.
reports_2 = [
    ("Naive Bayes (Gaussian):", predictionsNB),
    ("Random Forest:", predictionsRF),
    ("Support Vector Machine:", predictionsSVM),
    ("Multi layer perceptron (stochastic gradient descent):", predictionsMLPsgd),
    ("Multi layer perceptron (adam):", predictionsMLPadam),
    ("Multi layer perceptron (lbfgs):", predictionsMLPlbfgs),
]

# Each report compares one classifier's predictions with the gold labels test_y2.
for heading, predicted in reports_2:
    print(heading)
    print(classification_report(test_y2, predicted))

Naive Bayes (Gaussian):


  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

    negative       0.93      0.67      0.78        21
     neutral       0.00      0.00      0.00         0
    positive       0.94      0.74      0.83        39

   micro avg       0.72      0.72      0.72        60
   macro avg       0.62      0.47      0.54        60
weighted avg       0.93      0.72      0.81        60

Random Forest:
              precision    recall  f1-score   support

    negative       0.43      0.48      0.45        21
     neutral       0.00      0.00      0.00         0
    positive       0.83      0.51      0.63        39

   micro avg       0.50      0.50      0.50        60
   macro avg       0.42      0.33      0.36        60
weighted avg       0.69      0.50      0.57        60

Support Vector Machine:
              precision    recall  f1-score   support

    negative       0.52      0.52      0.52        21
     neutral       0.00      0.00      0.00         0
    positive       0.82      0.59   

### Run the baseline system on the second test set:

In [34]:
# Vectorise the second test set with the vocabulary learned from the training data.
test_X2 = count_vec.transform(test_text)

# Refit the baseline on the training counts so this cell does not depend on
# the earlier baseline cell having been run.
BOW_classifier = LinearSVC(random_state=0, tol=1e-5)
BOW_classifier.fit(BOW_model, training_labels)
predicted_label = BOW_classifier.predict(test_X2)
print(classification_report(test_y2, predicted_label))

# Confidence = magnitude of the decision function on the matrix that was just
# classified (the original referenced an undefined name, test_X).
confBOW = np.round(abs(BOW_classifier.decision_function(test_X2)), 3).tolist()

              precision    recall  f1-score   support

    negative       0.52      0.52      0.52        21
     neutral       0.00      0.00      0.00         0
    positive       0.81      0.54      0.65        39

   micro avg       0.53      0.53      0.53        60
   macro avg       0.44      0.35      0.39        60
weighted avg       0.71      0.53      0.60        60

