# Thesis 2020-2021: Word2Vec (word embeddings)

In this notebook, we will create a Word2Vec model.

In [2]:
import pandas as pd
import numpy as np
import math

import matplotlib
import matplotlib.pyplot as plt

from gensim.models import Word2Vec

In [3]:
import re
from pattern.text.en import singularize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

tokenizer = TweetTokenizer()
stop_words = set(stopwords.words('english'))

# Create a function to clean the tweets
def cleanTxt(text):
    text = text.lower() # Convert everything to lower case
    text = re.sub(r'@[a-zA-Z0-9]+', '', text) # Remove @mentions
    text = re.sub(r'rt[\s]+', '', text) # Remove RT (retweet symbol)
    text = re.sub(r'&amp;', 'and', text) # Replace '&amp;' by 'and'
    text = re.sub(r'https?:\/\/\S+', '', text) # Remove hyper link  
    #text = re.sub(r'\d+', '0', text) # Replace all numbers by a zero
    text = " ".join([singularize(word) for word in tokenizer.tokenize(text) if word not in stop_words]) # Remove stopwords
    #text = " ".join([singularize(word) for word in text])
    text = re.sub(r'[^\w\s#]', ' ', text) # Remove all non-alphanumeric symbols (excluding whitespace and # characters)
    text = re.sub(r'\s+', ' ', text) # Replace multiple whitespaces by a single whitespace
    text = text.strip() # Remove whitespaces at the beginning and at the end
    
    return text

In [4]:
import csv
    
df_train = pd.read_csv('data/hateval2019_en_train.csv')
df_dev = pd.read_csv('data/hateval2019_en_dev.csv')

df_train_dev = df_train.append(df_dev, ignore_index=True)
df_train_dev = df_train_dev.drop(['TR', 'AG'], axis=1)

df_test = pd.read_csv('data/hateval2019_en_test.csv')
df_test = df_test.drop(['TR', 'AG'], axis=1)

# Clean the data

df_train_dev['text_cleaned'] = df_train_dev['text'].apply(cleanTxt)
df_test['text_cleaned'] = df_test['text'].apply(cleanTxt)
df_train_dev

Unnamed: 0,id,text,HS,text_cleaned
0,201,"Hurray, saving us $$$ in so many ways @potus @...",1,hurray saving u many way #lockthemup #buildthe...
1,202,Why would young fighting age men be the vast m...,1,would young fighting age man vast majority one...
2,203,@KamalaHarris Illegals Dump their Kids at the ...,1,illegal dump kid border like road kill refuse ...
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0,ny time s nearly white s state pose s array pr...
4,205,Orban in Brussels: European leaders are ignori...,0,orban brussel european leader ignoring person ...
...,...,...,...,...
9995,19196,@SamEnvers you unfollowed me? Fuck you pussy,0,unfollowed fuck pussy
9996,19197,@DanReynolds STFU BITCH! AND YOU GO MAKE SOME ...,1,stfu bitch go make satanic music u illuminatus...
9997,19198,"@2beornotbeing Honey, as a fellow white chick,...",0,honey fellow white chick let tell need shut fu...
9998,19199,I hate bitches who talk about niggaz with kids...,1,hate bitch talk niggaz kid everybody cant find...


In [9]:
X_train = df_train_dev.text
X_test = df_test.text
y_train = df_train_dev.HS
y_test = df_test.HS

In [10]:
train_sentences = [tokenizer.tokenize(s.lower()) for s in X_train]
test_sentences = [tokenizer.tokenize(s.lower()) for s in X_test]
train_sentences

[['Hurray',
  ',',
  'saving',
  'us',
  '$',
  '$',
  '$',
  'in',
  'so',
  'many',
  'ways',
  '@potus',
  '@realDonaldTrump',
  '#LockThemUp',
  '#BuildTheWall',
  '#EndDACA',
  '#BoycottNFL',
  '#BoycottNike'],
 ['Why',
  'would',
  'young',
  'fighting',
  'age',
  'men',
  'be',
  'the',
  'vast',
  'majority',
  'of',
  'the',
  'ones',
  'escaping',
  'a',
  'war',
  '&',
  'not',
  'those',
  'who',
  'cannot',
  'fight',
  'like',
  'women',
  ',',
  'children',
  ',',
  'and',
  'the',
  'elderly',
  '?',
  "It's",
  'because',
  'the',
  'majority',
  'of',
  'the',
  'refugees',
  'are',
  'not',
  'actually',
  'refugees',
  'they',
  'are',
  'economic',
  'migrants',
  'trying',
  'to',
  'get',
  'into',
  'Europe',
  '...',
  'https://t.co/Ks0SHbtYqn'],
 ['@KamalaHarris',
  'Illegals',
  'Dump',
  'their',
  'Kids',
  'at',
  'the',
  'border',
  'like',
  'Road',
  'Kill',
  'and',
  'Refuse',
  'to',
  'Unite',
  '!',
  'They',
  'Hope',
  'they',
  'get',
  'Amnes

In [1]:
train_sentences

NameError: name 'train_sentences' is not defined

In [11]:
model = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=1, workers=4)
model.wv.index_to_key
#vector = model.wv['saving']  # get numpy vector of a word
#sims = model.wv.most_similar('saving', topn=10)  # get other similar words


['.',
 'the',
 'to',
 ',',
 'a',
 'and',
 'you',
 'of',
 'in',
 '!',
 'is',
 'for',
 'I',
 'are',
 '?',
 'that',
 "'",
 'on',
 'not',
 '-',
 'it',
 '’',
 'with',
 ':',
 'your',
 'be',
 'bitch',
 '...',
 'all',
 '"',
 'have',
 'they',
 'refugees',
 'this',
 'women',
 '&',
 'from',
 'like',
 'who',
 'my',
 'The',
 'me',
 'their',
 'about',
 'by',
 'up',
 'at',
 'will',
 'as',
 'we',
 'immigrant',
 'but',
 'or',
 'them',
 'migrants',
 'just',
 'woman',
 'can',
 'get',
 'You',
 'our',
 'an',
 'was',
 'so',
 'do',
 'people',
 'men',
 'her',
 'rape',
 'out',
 'no',
 'if',
 's',
 'cunt',
 '/',
 'what',
 'more',
 'has',
 'go',
 '#BuildThatWall',
 'want',
 'When',
 '…',
 'when',
 'illegal',
 'immigration',
 'one',
 'hysterical',
 'how',
 'whore',
 '“',
 'know',
 'fuck',
 "don't",
 ')',
 'This',
 'ass',
 '”',
 'via',
 'fucking',
 'should',
 'immigrants',
 'If',
 'A',
 't',
 'back',
 '(',
 'he',
 'u',
 'she',
 '@realDonaldTrump',
 'U',
 'country',
 'We',
 'being',
 'now',
 'would',
 'hoe',
 'refu

In [76]:
vector = model.wv['woman']  # get numpy vector of a word
len(vector)

100

In [72]:
model.wv.most_similar('man', topn=10)  # get other similar words

[('woman', 0.9995855689048767),
 ('hysterical', 0.9995812177658081),
 ('say', 0.9995176792144775),
 ('one', 0.9994102716445923),
 ('thing', 0.9993219375610352),
 ('rape', 0.9993157386779785),
 ('think', 0.9993077516555786),
 ('white', 0.9992996454238892),
 ('said', 0.9992806315422058),
 ('assault', 0.9992477893829346)]

In [12]:
# Function that creates Word2Vec vectors for each sentences (by taking the average of the word2vecs of each word within sentence)

def create_word2vec_vectors(sentences):
    list_vectors = []
    for s in sentences:
        temp = np.zeros(100)
        if (len(s) == 0):
            list_vectors.append(list(temp))
            continue
        for word in s:
            if (word not in model.wv.index_to_key): continue # This condition is needed because there could be some unseen words within the test data
            temp = temp + model.wv[word]
        list_vectors.append(list(temp/len(s)))
    
    return list_vectors

In [13]:
train_vectors = create_word2vec_vectors(train_sentences)
print(np.shape(train_vectors))
test_vectors = create_word2vec_vectors(test_sentences)
print(np.shape(test_vectors))

(10000, 100)
(3000, 100)


In [106]:
# EXTRA: Dubble check if you calculated the vectors correctly

even_testen = model.wv[train_sentences[0][0]]
for w in train_sentences[0][1:]:
    even_testen = even_testen + model.wv[w]

(even_testen/len(train_sentences[0]))

array([-3.30899686e-01,  4.45880026e-01,  2.44529933e-01, -2.62317538e-01,
       -4.91707548e-02, -6.60078883e-01,  2.52522647e-01,  7.55166054e-01,
       -2.49226615e-02, -4.36568022e-01, -7.01646656e-02, -6.20993733e-01,
       -5.40604368e-02, -1.81502104e-01,  2.04280645e-01, -5.28267562e-01,
        1.32858917e-01, -6.20416760e-01,  1.33758217e-01, -6.45467162e-01,
        7.38661736e-02,  5.13638258e-02,  5.09726167e-01, -2.48283193e-01,
       -7.04475045e-01,  1.15412712e-01, -5.08204103e-01, -2.44777530e-01,
       -2.56729513e-01,  2.41511315e-01,  4.97765243e-01, -4.58144173e-02,
        1.16910435e-01, -1.21255159e-01,  6.57712296e-02,  5.36842942e-01,
       -8.95184278e-03, -3.08947057e-01, -2.84584582e-01, -6.42600954e-01,
        2.12115452e-01, -2.03791618e-01, -4.44506705e-02,  6.66731894e-02,
        5.39922476e-01, -8.10264498e-02,  1.34869786e-02,  1.00230351e-01,
        1.93458289e-01,  3.18110377e-01,  3.01467478e-01, -6.22003973e-02,
        6.80165440e-02, -

## Now we will evaluate the Word2Vec model using a LR model as classifier
- Evaluate without normalizing/scaling data
- Evaluate after normalizing/scaling data

In [14]:
# Evaluate the Word2Vec model using Logistic Regression as the classifier (without normalizing input data)

import import_ipynb
import evaluate # here we import the local evaluate.ipynb jupyter notebook
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression().fit(train_vectors, y_train)
y_predict = logreg.predict(test_vectors)

# Create new test dataframe
df_test_word2vec = df_test.copy()
df_test_word2vec['HS'] = y_predict

# Create prediction file for the word2vec
df_test_word2vec[['id', 'HS']].to_csv('predictions/word2vec.tsv', sep='\t', index=False, header=False)
df_test_word2vec[['id', 'HS']].to_csv('input/res/en_a.tsv', sep='\t', index=False, header=False)

# Evaluate the result of the word2vec
evaluate.write_eval("scores_word2vec")

importing Jupyter notebook from evaluate.ipynb
taskA_fscore: 0.5339701740911418
taskA_precision: 0.5464838899418338
taskA_recall: 0.5467159277504106
taskA_accuracy: 0.534


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
# Normalize the data via StandardScaler

from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(train_vectors)
train_vectors_scaled = scaler.transform(train_vectors)
test_vectors_scaled = scaler.transform(test_vectors)

#clf_scaled = LogisticRegression().fit(X_train_scaled, y_train)
#y_scaled_predict = clf_scaled.predict(X_test_scaled)

In [16]:
# Evaluate the bigram model using Logistic Regression as the classifier (+ normalizing input data)

import import_ipynb
import evaluate # here we import the local evaluate.ipynb jupyter notebook
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression().fit(train_vectors_scaled, y_train)
y_predict_scaled = logreg.predict(test_vectors_scaled)

# Create new test dataframe
df_test_word2vec_scaled = df_test.copy()
df_test_word2vec_scaled['HS'] = y_predict_scaled

# Create prediction file for the word2vec_scaled
df_test_word2vec_scaled[['id', 'HS']].to_csv('predictions/word2vec_scaled.tsv', sep='\t', index=False, header=False)
df_test_word2vec_scaled[['id', 'HS']].to_csv('input/res/en_a.tsv', sep='\t', index=False, header=False)

# Evaluate the result of the word2vec_scaled
evaluate.write_eval("scores_word2vec_scaled")

taskA_fscore: 0.4795013320434051
taskA_precision: 0.523644710307094
taskA_recall: 0.5202791461412151
taskA_accuracy: 0.48733333333333334


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

f1 = make_scorer(f1_score , average='macro')
params = {'C': [0.001, 0.01, 0.1, 1, 10], 'solver': ('newton-cg', 'lbfgs', 'liblinear'), 'penalty': ('l1', 'l2', 'elasticnet')}
grid = GridSearchCV(estimator=LogisticRegression(max_iter=500), param_grid=params, cv=5, scoring=f1, verbose=5, n_jobs=5)
grid.fit(train_vectors, y_train)
print("Best cross-validation score: ", grid.best_score_)
print("Best parameters: ", grid.best_params_)

Fitting 5 folds for each of 45 candidates, totalling 225 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:   37.7s
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:  4.4min
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed: 10.7min
[Parallel(n_jobs=5)]: Done 225 out of 225 | elapsed: 26.5min finished


Best cross-validation score:  0.662681034149031
Best parameters:  {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}


In [18]:
# Evaluate the Word2Vec model using Logistic Regression as the classifier (without normalizing input data)

import import_ipynb
import evaluate # here we import the local evaluate.ipynb jupyter notebook
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=10, penalty='l1', solver='liblinear').fit(train_vectors, y_train)
y_predict = logreg.predict(test_vectors)

# Create new test dataframe
df_test_word2vec = df_test.copy()
df_test_word2vec['HS'] = y_predict

# Create prediction file for the word2vec
df_test_word2vec[['id', 'HS']].to_csv('predictions/word2vec.tsv', sep='\t', index=False, header=False)
df_test_word2vec[['id', 'HS']].to_csv('input/res/en_a.tsv', sep='\t', index=False, header=False)

# Evaluate the result of the word2vec
evaluate.write_eval("scores_word2vec")

taskA_fscore: 0.47175108993309645
taskA_precision: 0.5279702994262985
taskA_recall: 0.522400109469075
taskA_accuracy: 0.48433333333333334




In [134]:
# Evaluate the Word2Vec model using optimized Logistic Regression as the classifier

import import_ipynb
import evaluate # here we import the local evaluate.ipynb jupyter notebook
from sklearn.linear_model import LogisticRegression

logreg_optimized = LogisticRegression(C=10, penalty='l1', solver='liblinear').fit(train_vectors, y_train)
y_predict_optimized = logreg_optimized.predict(test_vectors)

# Create new test dataframe
df_test_word2vec_optimized = df_test.copy()
df_test_word2vec_optimized['HS'] = y_predict_optimized

# Create prediction file for the word2vec_optimized
df_test_word2vec_optimized[['id', 'HS']].to_csv('predictions/word2vec_optimized.tsv', sep='\t', index=False, header=False)
df_test_word2vec_optimized[['id', 'HS']].to_csv('input/res/en_a.tsv', sep='\t', index=False, header=False)

# Evaluate the result of the word2vec_optimized
evaluate.write_eval("scores_word2vec")

taskA_fscore: 0.45887837800087117
taskA_precision: 0.526378220548468
taskA_recall: 0.5195812807881773
taskA_accuracy: 0.477




## Doc2Vec

In [19]:
train_sentences = [tokenizer.tokenize(s.lower()) for s in X_train]
test_sentences = [tokenizer.tokenize(s.lower()) for s in X_test]
train_sentences

[['hurray',
  ',',
  'saving',
  'us',
  '$',
  '$',
  '$',
  'in',
  'so',
  'many',
  'ways',
  '@potus',
  '@realdonaldtrump',
  '#lockthemup',
  '#buildthewall',
  '#enddaca',
  '#boycottnfl',
  '#boycottnike'],
 ['why',
  'would',
  'young',
  'fighting',
  'age',
  'men',
  'be',
  'the',
  'vast',
  'majority',
  'of',
  'the',
  'ones',
  'escaping',
  'a',
  'war',
  '&',
  'not',
  'those',
  'who',
  'cannot',
  'fight',
  'like',
  'women',
  ',',
  'children',
  ',',
  'and',
  'the',
  'elderly',
  '?',
  "it's",
  'because',
  'the',
  'majority',
  'of',
  'the',
  'refugees',
  'are',
  'not',
  'actually',
  'refugees',
  'they',
  'are',
  'economic',
  'migrants',
  'trying',
  'to',
  'get',
  'into',
  'europe',
  '...',
  'https://t.co/ks0shbtyqn'],
 ['@kamalaharris',
  'illegals',
  'dump',
  'their',
  'kids',
  'at',
  'the',
  'border',
  'like',
  'road',
  'kill',
  'and',
  'refuse',
  'to',
  'unite',
  '!',
  'they',
  'hope',
  'they',
  'get',
  'amnes

In [20]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

# Convert tokenized document into gensim formated tagged data
tagged_train = [TaggedDocument(d, [i]) for i, d in enumerate(train_sentences)]
tagged_test = [TaggedDocument(d, [i]) for i, d in enumerate(test_sentences)]
#tagged_data

In [35]:
len(tagged_train)

10000

In [22]:
## Train doc2vec model for train data
model = Doc2Vec(tagged_train, vector_size=20, window=2, min_count=1, workers=4, epochs = 100)
# Save trained doc2vec model
model.save("doc2vec.model")
## Load saved doc2vec model
model= Doc2Vec.load("doc2vec.model")
## Print model vocabulary
model.wv.vocab

AttributeError: The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4

In [44]:
model.dv[9999]

array([ 0.8800385 ,  0.4792079 ,  0.33585912,  0.16713983,  0.22360726,
        0.6228587 ,  0.44309443, -0.85718954, -0.93812525, -0.26766086,
        0.226529  , -0.29723215,  0.96531343,  0.20062225,  1.4908057 ,
        0.8850264 ,  1.5426282 , -2.0138817 , -0.49324414, -0.23406762],
      dtype=float32)

In [46]:
## Train doc2vec model for test data
model_test = Doc2Vec(tagged_test, vector_size=20, window=2, min_count=1, workers=4, epochs = 100)
# Save trained doc2vec model
model_test.save("doc2vec_test.model")
## Load saved doc2vec model
model_test= Doc2Vec.load("doc2vec_test.model")

In [58]:
model.dv

<gensim.models.keyedvectors.KeyedVectors at 0x245988f8fa0>

In [60]:
train_vectors_doc2vec = []
for i in range(len(model.dv)):
    train_vectors_doc2vec.append(model.dv[i])

In [68]:
test_vectors_doc2vec = []
for i in range(3000):
    test_vectors_doc2vec.append(model.infer_vector(test_sentences[i]))
    #test_vectors_doc2vec.append(model_test.dv[i])

In [66]:
model.infer_vector(test_sentences)

TypeError: sequence item 0: expected str instance, list found

In [70]:
# Evaluate the Doc2Vec model using Logistic Regression as the classifier (without normalizing input data)

import import_ipynb
import evaluate # here we import the local evaluate.ipynb jupyter notebook
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression().fit(train_vectors_doc2vec, y_train)
y_predict = logreg.predict(test_vectors_doc2vec)

# Create new test dataframe
df_test_doc2vec = df_test.copy()
df_test_doc2vec['HS'] = y_predict

# Create prediction file for the doc2vec
df_test_doc2vec[['id', 'HS']].to_csv('predictions/doc2vec.tsv', sep='\t', index=False, header=False)
df_test_doc2vec[['id', 'HS']].to_csv('input/res/en_a.tsv', sep='\t', index=False, header=False)

# Evaluate the result of the doc2vec
evaluate.write_eval("scores_doc2vec")

taskA_fscore: 0.5119115395863556
taskA_precision: 0.5336783804214841
taskA_recall: 0.5325944170771757
taskA_accuracy: 0.5126666666666667


In [78]:
# Normalize the data via StandardScaler

from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(train_vectors_doc2vec)
X_train_scaled = scaler.transform(train_vectors_doc2vec)
X_test_scaled = scaler.transform(test_vectors_doc2vec)

In [76]:
# Evaluate the Doc2Vec model using Logistic Regression as the classifier (+ normalizing input data)

import import_ipynb
import evaluate # here we import the local evaluate.ipynb jupyter notebook
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression().fit(X_train_scaled, y_train)
y_predict = logreg.predict(X_test_scaled)

# Create new test dataframe
df_test_doc2vec = df_test.copy()
df_test_doc2vec['HS'] = y_predict

# Create prediction file for the doc2vec
df_test_doc2vec[['id', 'HS']].to_csv('predictions/doc2vec.tsv', sep='\t', index=False, header=False)
df_test_doc2vec[['id', 'HS']].to_csv('input/res/en_a.tsv', sep='\t', index=False, header=False)

# Evaluate the result of the doc2vec
evaluate.write_eval("scores_doc2vec")

taskA_fscore: 0.5119115395863556
taskA_precision: 0.5336783804214841
taskA_recall: 0.5325944170771757
taskA_accuracy: 0.5126666666666667
