# Thesis 2020-2021: DeepMoji Model 

In this notebook, we will create a deepmoji model.

In [1]:
import pandas as pd
import numpy as np
import math

import matplotlib
import matplotlib.pyplot as plt

In [2]:
import csv
    
df_train = pd.read_csv('data_csv/hateval2019_en_train.csv')#, encoding='cp1252')
df_dev = pd.read_csv('data_csv/hateval2019_en_dev.csv')#, encoding='cp1252')

df_train_dev = df_train.append(df_dev, ignore_index=True)
df_train_dev = df_train_dev.drop(['TR', 'AG'], axis=1)

df_test = pd.read_csv('data_csv/hateval2019_en_test.csv')
df_test = df_test.drop(['TR', 'AG'], axis=1)

In [3]:
df_train_dev

Unnamed: 0,id,text,HS
0,201,"Hurray, saving us $$$ in so many ways @potus @...",1
1,202,Why would young fighting age men be the vast m...,1
2,203,@KamalaHarris Illegals Dump their Kids at the ...,1
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0
4,205,Orban in Brussels: European leaders are ignori...,0
...,...,...,...
9995,19196,@SamEnvers you unfollowed me? Fuck you pussy,0
9996,19197,@DanReynolds STFU BITCH! AND YOU GO MAKE SOME ...,1
9997,19198,"@2beornotbeing Honey, as a fellow white chick,...",0
9998,19199,I hate bitches who talk about niggaz with kids...,1


In [28]:
import re
from pattern.text.en import singularize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

tokenizer = TweetTokenizer()
stop_words = set(stopwords.words('english'))

# Create a function to clean the tweets
def cleanTxt(text):
    text = text.lower() # Convert everything to lower case
    text = re.sub(r'@[a-zA-Z0-9]+', '', text) # Remove @mentions
    text = re.sub(r'rt[\s]+', '', text) # Remove RT (retweet symbol)
    text = re.sub(r'&amp;', 'and', text) # Replace '&amp;' by 'and'
    text = re.sub(r'https?:\/\/\S+', '', text) # Remove hyper link  
    #text = re.sub(r'\d+', '0', text) # Replace all numbers by a zero
    text = " ".join([singularize(word) for word in tokenizer.tokenize(text) if word not in stop_words]) # Remove stopwords
    #text = " ".join([singularize(word) for word in text])
    text = re.sub(r'[^\w\s#]', ' ', text) # Remove all non-alphanumeric symbols (excluding whitespace and # characters)
    text = re.sub(r'\s+', ' ', text) # Replace multiple whitespaces by a single whitespace
    text = text.strip() # Remove whitespaces at the beginning and at the end
    
    return text

In [31]:
# Load (2304 long) deepmoji vectors (for first 5000 samples of training data) from csv file into variable encoding_part1

import pandas as pd

# write csv file
#df = pd.DataFrame(encoding)
#df.to_csv('encoding_part1.csv')

# read csv file
encoding_part1 = pd.read_csv("encoding_part1.csv")
del encoding_part1["Unnamed: 0"]
encoding_part1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2294,2295,2296,2297,2298,2299,2300,2301,2302,2303
0,0.0,-0.06376,-0.002055,0.0,0.00305,-0.005823,-0.000113,0.0,-0.001246,0.100377,...,0.050156,-0.014659,-0.021302,-0.014753,0.072366,-0.036926,-0.024684,-0.004294,0.003646,0.00124
1,-0.046231,0.000309,-0.002589,0.0,-0.058986,-0.001589,0.001765,-6.5e-05,-0.000862,0.001762,...,-0.007312,0.043984,0.060464,-0.040505,0.009673,-0.027391,-0.006929,0.027512,-0.026444,0.008858
2,-0.010508,-0.058681,-0.004275,0.0,-0.152366,-0.003675,0.000355,0.0,0.001535,0.008206,...,0.007031,0.011392,0.007648,-0.01465,0.026785,-0.033066,-0.005839,0.015446,-0.013252,0.005346
3,-0.074244,-0.009066,-0.006454,0.0,-0.012025,-0.030088,-1.3e-05,-0.000315,0.004916,0.010824,...,0.035436,0.025413,0.032776,0.036215,0.005092,0.008667,-0.023761,-0.001856,0.009813,-0.026586
4,-0.028176,-0.045293,-0.002111,0.0,-0.091202,-0.004682,9e-06,-0.000201,-0.008982,0.159621,...,0.052942,0.009692,0.016195,0.028719,0.073249,-0.058089,-0.037047,0.022545,-0.009876,-0.031878


In [45]:
encoding_part1.shape

(5000, 2304)

In [18]:
# Now fix (2304 long) deepmoji vectors for second 5000 samples of training data

""" Use torchMoji to encode texts into emotional feature vectors.
"""
from __future__ import print_function, division, unicode_literals
import json
from torchmoji.sentence_tokenizer import SentenceTokenizer
from torchmoji.model_def import torchmoji_feature_encoding 
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

maxlen = 30
batch_size = 32

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)
tokenized_second, _, _ = st.tokenize_sentences(df_train_dev.text[5000:])
#print("TOKENIZED:",tokenized)
print('Loading model from {}.'.format(PRETRAINED_PATH))
model = torchmoji_feature_encoding(PRETRAINED_PATH)
print(model)

print('Encoding texts..')
encoding_second = model(tokenized_second)

Tokenizing using dictionary from C:\Users\Admin\Desktop\Master thesis\Thesis_2021\deepmoji/model/vocabulary.json
Loading model from C:\Users\Admin\Desktop\Master thesis\Thesis_2021\deepmoji/model/pytorch_model.bin.
Loading weights for embed.weight
Loading weights for lstm_0.weight_ih_l0
Loading weights for lstm_0.weight_hh_l0
Loading weights for lstm_0.bias_ih_l0
Loading weights for lstm_0.bias_hh_l0
Loading weights for lstm_0.weight_ih_l0_reverse
Loading weights for lstm_0.weight_hh_l0_reverse
Loading weights for lstm_0.bias_ih_l0_reverse
Loading weights for lstm_0.bias_hh_l0_reverse
Loading weights for lstm_1.weight_ih_l0
Loading weights for lstm_1.weight_hh_l0
Loading weights for lstm_1.bias_ih_l0
Loading weights for lstm_1.bias_hh_l0
Loading weights for lstm_1.weight_ih_l0_reverse
Loading weights for lstm_1.weight_hh_l0_reverse
Loading weights for lstm_1.bias_ih_l0_reverse
Loading weights for lstm_1.bias_hh_l0_reverse
Loading weights for attention_layer.attention_vector
Ignoring we

In [36]:
encoding_part1.shape

(5000, 2304)

In [54]:
encoding_part2 = pd.DataFrame(encoding_second)
encoding_part2.columns = encoding_part1.columns
encoding_total = encoding_part1.append(encoding_part2)
print("Size:", encoding_total.shape)
encoding_total.head()

Size: (10000, 2304)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2294,2295,2296,2297,2298,2299,2300,2301,2302,2303
0,0.0,-0.06376,-0.002055,0.0,0.00305,-0.005823,-0.000113,0.0,-0.001246,0.100377,...,0.050156,-0.014659,-0.021302,-0.014753,0.072366,-0.036926,-0.024684,-0.004294,0.003646,0.00124
1,-0.046231,0.000309,-0.002589,0.0,-0.058986,-0.001589,0.001765,-6.5e-05,-0.000862,0.001762,...,-0.007312,0.043984,0.060464,-0.040505,0.009673,-0.027391,-0.006929,0.027512,-0.026444,0.008858
2,-0.010508,-0.058681,-0.004275,0.0,-0.152366,-0.003675,0.000355,0.0,0.001535,0.008206,...,0.007031,0.011392,0.007648,-0.01465,0.026785,-0.033066,-0.005839,0.015446,-0.013252,0.005346
3,-0.074244,-0.009066,-0.006454,0.0,-0.012025,-0.030088,-1.3e-05,-0.000315,0.004916,0.010824,...,0.035436,0.025413,0.032776,0.036215,0.005092,0.008667,-0.023761,-0.001856,0.009813,-0.026586
4,-0.028176,-0.045293,-0.002111,0.0,-0.091202,-0.004682,9e-06,-0.000201,-0.008982,0.159621,...,0.052942,0.009692,0.016195,0.028719,0.073249,-0.058089,-0.037047,0.022545,-0.009876,-0.031878


In [55]:
""" Use torchMoji to encode test samples into emotional feature vectors.
"""

with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)
tokenized_test, _, _ = st.tokenize_sentences(df_test.text)
model_test = torchmoji_feature_encoding(PRETRAINED_PATH)
encoding_test = model_test(tokenized_test)

Loading weights for embed.weight
Loading weights for lstm_0.weight_ih_l0
Loading weights for lstm_0.weight_hh_l0
Loading weights for lstm_0.bias_ih_l0
Loading weights for lstm_0.bias_hh_l0
Loading weights for lstm_0.weight_ih_l0_reverse
Loading weights for lstm_0.weight_hh_l0_reverse
Loading weights for lstm_0.bias_ih_l0_reverse
Loading weights for lstm_0.bias_hh_l0_reverse
Loading weights for lstm_1.weight_ih_l0
Loading weights for lstm_1.weight_hh_l0
Loading weights for lstm_1.bias_ih_l0
Loading weights for lstm_1.bias_hh_l0
Loading weights for lstm_1.weight_ih_l0_reverse
Loading weights for lstm_1.weight_hh_l0_reverse
Loading weights for lstm_1.bias_ih_l0_reverse
Loading weights for lstm_1.bias_hh_l0_reverse
Loading weights for attention_layer.attention_vector
Ignoring weights for output_layer.0.weight
Ignoring weights for output_layer.0.bias




In [56]:
X_train = encoding_total
X_test = encoding_test
y_train = df_train_dev.HS
y_test = df_test.HS

In [59]:
# Evaluate the DeepMoji model using Logistic Regression as the classifier (without normalizing input data)

import import_ipynb
import evaluate # here we import the local evaluate.ipynb jupyter notebook
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression().fit(X_train, y_train)
y_predict = logreg.predict(X_test)

df_test_deepm = df_test.copy()
df_test_deepm['HS'] = y_predict

# Create prediction file for the deepmoji_baseline
df_test_deepm[['id', 'HS']].to_csv('predictions/deepmoji_baseline.tsv', sep='\t', index=False, header=False)
df_test_deepm[['id', 'HS']].to_csv('input/res/en_a.tsv', sep='\t', index=False, header=False)

# Evaluate the result of the deepmoji_baseline
evaluate.write_eval("scores_deepmoji_baseline")

importing Jupyter notebook from evaluate.ipynb
taskA_fscore: 0.526420692167044
taskA_precision: 0.6063356553342925
taskA_recall: 0.581992337164751
taskA_accuracy: 0.54


In [62]:
from sklearn.metrics import f1_score

# Normalize the data via StandardScaler

from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Evaluate the DeepMoji model using Logistic Regression as the classifier (+ normalizing input data)

logreg_scaled = LogisticRegression().fit(X_train_scaled, y_train)
y_predict_scaled = logreg_scaled.predict(X_test_scaled)

df_test_deepmoji_scaled = df_test.copy()
df_test_deepmoji_scaled['HS'] = y_predict_scaled

# Create prediction file for the deepmoji_scaled
df_test_deepmoji_scaled[['id', 'HS']].to_csv('predictions/deepmoji_scaled.tsv', sep='\t', index=False, header=False)
df_test_deepmoji_scaled[['id', 'HS']].to_csv('input/res/en_a.tsv', sep='\t', index=False, header=False)

# Evaluate the result of the deepmoji_scaled
evaluate.write_eval("scores_deepmoji_scaled")

taskA_fscore: 0.5411640211640212
taskA_precision: 0.597527747726089
taskA_recall: 0.583264915161467
taskA_accuracy: 0.5483333333333333


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [63]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

f1 = make_scorer(f1_score , average='macro')
params = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'solver': ('newton-cg', 'lbfgs', 'liblinear'), 'penalty': ('l1', 'l2', 'elasticnet')}
grid = GridSearchCV(estimator=LogisticRegression(max_iter=500), param_grid=params, cv=5, scoring=f1, verbose=5, n_jobs=5)
grid.fit(X_train_scaled, y_train)
print("Best cross-validation score: ", grid.best_score_)
print("Best parameters: ", grid.best_params_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:    5.0s
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:   57.8s
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:  4.0min
[Parallel(n_jobs=5)]: Done 270 out of 270 | elapsed: 10.9min finished


Best cross-validation score:  0.693147302500796
Best parameters:  {'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'}


In [72]:
from sklearn.metrics import f1_score

# Evaluate the DeepMoji model using optimized Logistic Regression as the classifier (+ normalizing input data)

logreg_optimized = LogisticRegression(solver='liblinear').fit(X_train_scaled, y_train)
y_predict_optimized = logreg_optimized.predict(X_test_scaled)

df_test_deepmoji_optimized = df_test.copy()
df_test_deepmoji_optimized['HS'] = y_predict_optimized

# Create prediction file for the deepmoji_optimized
df_test_deepmoji_optimized[['id', 'HS']].to_csv('predictions/deepmoji_optimized.tsv', sep='\t', index=False, header=False)
df_test_deepmoji_optimized[['id', 'HS']].to_csv('input/res/en_a.tsv', sep='\t', index=False, header=False)

# Evaluate the result of the deepmoji_optimized
evaluate.write_eval("scores_deepmoji_optimized")

taskA_fscore: 0.545005086132736
taskA_precision: 0.5999845737177499
taskA_recall: 0.5860290093048715
taskA_accuracy: 0.5516666666666666


In [76]:
# Evaluate using Multinomial Naive Bayes as the classifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

mm_scaler = MinMaxScaler()
X_train_nb = mm_scaler.fit_transform(X_train_scaled)
X_test_nb = mm_scaler.transform(X_test_scaled)

nb = MultinomialNB().fit(X_train_nb, y_train)
y_predict_nb = nb.predict(X_test_nb)

# Create new test dataframe
df_test_deepmoji_nb = df_test.copy()
df_test_deepmoji_nb['HS'] = y_predict_nb

# Create prediction file for the deepmoji_nb
df_test_deepmoji_nb[['id', 'HS']].to_csv('predictions/deepmoji_nb.tsv', sep='\t', index=False, header=False)
df_test_deepmoji_nb[['id', 'HS']].to_csv('input/res/en_a.tsv', sep='\t', index=False, header=False)

# Evaluate the result of the deepmoji_nb
evaluate.write_eval("scores_deepmoji_nb")

taskA_fscore: 0.5368626086956522
taskA_precision: 0.5440589688278884
taskA_recall: 0.5449507389162562
taskA_accuracy: 0.5376666666666666


In [77]:
# Evaluate using Random Forest as the classifier

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier().fit(X_train_scaled, y_train)
y_predict_rf = rf.predict(X_test_scaled)

# Create new test dataframe
df_test_deepmoji_rf = df_test.copy()
df_test_deepmoji_rf['HS'] = y_predict_rf

# Create prediction file for the deepmoji_rf
df_test_deepmoji_rf[['id', 'HS']].to_csv('predictions/deepmoji_rf.tsv', sep='\t', index=False, header=False)
df_test_deepmoji_rf[['id', 'HS']].to_csv('input/res/en_a.tsv', sep='\t', index=False, header=False)

# Evaluate the result of the deepmoji_rf
evaluate.write_eval("scores_deepmoji_rf")

taskA_fscore: 0.5220857965880091
taskA_precision: 0.5917577968986676
taskA_recall: 0.5732484948002189
taskA_accuracy: 0.5336666666666666


## Now we will repeat the same things, but we will clean the data first

In [80]:
# Clean the text
df_train_dev['text_cleaned'] = df_train_dev['text'].apply(cleanTxt)
df_test['text_cleaned'] = df_test['text'].apply(cleanTxt)

In [82]:
df_train_dev

Unnamed: 0,id,text,HS,text_cleaned
0,201,"Hurray, saving us $$$ in so many ways @potus @...",1,hurray saving u many way #lockthemup #buildthe...
1,202,Why would young fighting age men be the vast m...,1,would young fighting age man vast majority one...
2,203,@KamalaHarris Illegals Dump their Kids at the ...,1,illegal dump kid border like road kill refuse ...
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0,ny time s nearly white s state pose s array pr...
4,205,Orban in Brussels: European leaders are ignori...,0,orban brussel european leader ignoring person ...
...,...,...,...,...
9995,19196,@SamEnvers you unfollowed me? Fuck you pussy,0,unfollowed fuck pussy
9996,19197,@DanReynolds STFU BITCH! AND YOU GO MAKE SOME ...,1,stfu bitch go make satanic music u illuminatus...
9997,19198,"@2beornotbeing Honey, as a fellow white chick,...",0,honey fellow white chick let tell need shut fu...
9998,19199,I hate bitches who talk about niggaz with kids...,1,hate bitch talk niggaz kid everybody cant find...
