In [128]:
import pandas as pd
import numpy as np
from preprocessing import *
from sklearn import svm
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt


# Sequence Model Classification with Skip-Gram Embeddings

### Read the data

In [129]:
############## Training data ##############
# Load the raw training split from '../Dataset/train.csv' (UTF-8 encoded;
# presumably Arabic text with a stance label per row).
train = pd.read_csv('../Dataset/train.csv', encoding='utf-8')
# Run the project's preprocessing step. clean_data is not defined in this
# notebook; presumably it is provided by `from preprocessing import *`.
train = clean_data(train)
# Word2Vec expects every document as a list of tokens. A bare split() splits
# on any run of whitespace, so repeated, leading or trailing spaces do not
# produce empty-string "words" that would otherwise enter the vocabulary and
# receive their own embedding.
Train_X = [text.split() for text in train['text']]

# Stance labels, aligned row-by-row with Train_X.
Train_Y = train['stance']


### Train the skip-gram model

In [130]:
# Here we want to apply Skip-Gram model to the data
import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import common_texts

# Train a skip-gram (sg=1) Word2Vec model on the tokenised training text.
# min_count=1 keeps every token that occurs in the corpus.
# NOTE(review): with workers=4 the training order depends on thread
# scheduling, so the learned vectors can differ between runs; gensim documents
# workers=1 (plus a fixed PYTHONHASHSEED) as the requirement for a fully
# reproducible run.
model = Word2Vec(
    sentences=Train_X,
    vector_size=300,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
)
# Persist the trained model so it can be reloaded without retraining.
model.save("word2vec.model")
# Sanity check: print the embedding of the word 'السلام'. A direct lookup
# raises KeyError for a word that is not in the vocabulary, so check first.
probe_word = 'السلام'
if probe_word in model.wv.key_to_index:
    print(model.wv[probe_word])
else:
    print(f"'{probe_word}' is not in the Word2Vec vocabulary")

[ 0.00031788  0.04267718  0.10395221  0.06556573 -0.01839574 -0.11327147
  0.07991241  0.13209891 -0.0225396   0.02591489  0.02839504 -0.02254638
  0.02909214  0.054198   -0.03270419 -0.05826267  0.02614123 -0.01306893
 -0.00687586 -0.04866154 -0.09056102 -0.04129465  0.07997701  0.05743028
  0.09753807 -0.01187084 -0.09263406 -0.0430358   0.02595569 -0.08538613
 -0.00271649 -0.10501768  0.05235879  0.01491221 -0.01487462  0.00815292
 -0.01152762 -0.13246992 -0.02593228 -0.03179073 -0.05917642 -0.00933621
 -0.00244541 -0.0984536   0.03893042  0.02061627  0.03915191  0.00336432
 -0.04215703  0.12530503  0.0048435  -0.03009742 -0.09685623 -0.01080031
 -0.09422962  0.04628248  0.07838577 -0.03394053  0.01684492  0.01214107
 -0.01369861 -0.05872072 -0.00996804  0.04030126  0.03223577  0.04216023
  0.05538311 -0.03859715 -0.07914345 -0.02485975  0.02435783  0.05486184
  0.05725171 -0.07658459  0.02628965  0.01426283 -0.04362107 -0.00501695
 -0.03133858  0.03978909  0.01903093 -0.07668693  0

### Store the embeddings as features

In [132]:
# Build the embedded training set: every token of every document is replaced
# by its skip-gram vector, giving one list of vectors per document.
# The nested comprehension creates the new structure directly, so Train_X is
# left untouched and no throw-away deep copy of the corpus is required.
import copy  # no longer used here; kept in case another cell relies on it

Train_X_sg = [[model.wv[token] for token in sentence] for sentence in Train_X]
# Show the first document as vectors and as raw tokens for a visual check.
print(Train_X_sg[0])
print(Train_X[0])

[array([-0.00908448,  0.03593525,  0.09334985,  0.05917785, -0.02136003,
       -0.10266771,  0.07593789,  0.12242024, -0.02119845,  0.02208195,
        0.02598651, -0.02528062,  0.02117235,  0.05125579, -0.02658333,
       -0.05315313,  0.03104931, -0.02141244, -0.00241534, -0.04184745,
       -0.08122224, -0.0347661 ,  0.06870685,  0.05651869,  0.0950068 ,
       -0.00711186, -0.08438874, -0.03038562,  0.02174269, -0.08158953,
       -0.00110166, -0.0976202 ,  0.04653447,  0.01229683, -0.01702439,
        0.00804844, -0.01044922, -0.12273192, -0.01937069, -0.03327033,
       -0.05806782, -0.01465361, -0.00592494, -0.09228081,  0.03258205,
        0.01477603,  0.03470181,  0.00335374, -0.04135336,  0.11153469,
       -0.0019757 , -0.02522603, -0.09010148, -0.00802619, -0.09068236,
        0.03935246,  0.07110494, -0.03120872,  0.02002211,  0.00467425,
       -0.01299398, -0.05077413, -0.00607243,  0.03489696,  0.0263832 ,
        0.03939748,  0.05357209, -0.03631271, -0.0742158 , -0.0

### Train the model against the unbalanced data

In [5]:
# Classifier: support vector machine with a linear kernel.
# NOTE(review): Train_X_Tfidf, Test_X_Tfidf and Test_Y are not created in the
# visible part of this notebook - presumably they are left over from a TF-IDF
# experiment and should be replaced by features built from the skip-gram
# embeddings. Confirm before relying on "Restart Kernel -> Run All".
# degree and gamma are kept from the original call; scikit-learn ignores both
# when kernel='linear'.
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
# Fit the classifier on the training features.
SVM.fit(Train_X_Tfidf, Train_Y)
# Predict the labels of the validation/test features.
predictions_SVM = SVM.predict(Test_X_Tfidf)
# accuracy_score is documented as (y_true, y_pred); reported as a percentage.
print("SVM Accuracy Score -> ", accuracy_score(Test_Y, predictions_SVM) * 100)

SVM Accuracy Score ->  81.8


In [6]:
# Report per-class and macro-averaged F1 for the SVM predictions.
from sklearn.metrics import f1_score

# average=None yields one F1 value per class label.
per_class_f1 = f1_score(Test_Y, predictions_SVM, average=None)
print("F1 score for each class -> ", per_class_f1)
# average='macro' is the unweighted mean of the per-class F1 values.
macro_f1 = f1_score(Test_Y, predictions_SVM, average='macro')
print("Macro Average F1 score -> ", macro_f1)

F1 score for each class ->  [0.24444444 0.25301205 0.90137615]
Macro Average F1 score ->  0.46627754647540215


### Train the model against the balanced data

In [7]:
# Classifier: support vector machine with a linear kernel, trained on the
# balanced training set (presumably oversampled with SMOTE, going by the
# variable names).
# NOTE(review): SMOTE_Train_X_Tfidf, SMOTE_Train_Y, Test_X_Tfidf and Test_Y
# are not created in the visible part of this notebook - presumably they are
# left over from a TF-IDF experiment. Confirm before relying on
# "Restart Kernel -> Run All".
# degree and gamma are kept from the original call; scikit-learn ignores both
# when kernel='linear'.
SMOTE_SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
# Fit the classifier on the balanced training features.
SMOTE_SVM.fit(SMOTE_Train_X_Tfidf, SMOTE_Train_Y)
# Predict the labels of the validation/test features.
predictions_SMOTE_SVM = SMOTE_SVM.predict(Test_X_Tfidf)
# accuracy_score is documented as (y_true, y_pred); reported as a percentage.
print("SVM Accuracy Score -> ", accuracy_score(Test_Y, predictions_SMOTE_SVM) * 100)

SVM Accuracy Score ->  79.7


In [8]:
# Report per-class and macro-averaged F1 for the SVM trained on balanced data.
from sklearn.metrics import f1_score

# average=None yields one F1 value per class label.
smote_per_class_f1 = f1_score(Test_Y, predictions_SMOTE_SVM, average=None)
print("F1 score for each class -> ", smote_per_class_f1)
# The "whole data" figure is the macro average: the unweighted mean of the
# per-class F1 values.
smote_macro_f1 = f1_score(Test_Y, predictions_SMOTE_SVM, average='macro')
print("F1 score for the whole data -> ", smote_macro_f1)

F1 score for each class ->  [0.33898305 0.39516129 0.89106487]
F1 score for the whole data ->  0.5417364042170222
