In [2]:
import pandas as pd
import numpy as np
from preprocessing import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# SVM with TF-IDF

In [8]:
# ---------------- Training split ----------------
# Load '../Dataset/train.csv' and pass it through clean_data (imported from
# the local `preprocessing` module) before pulling out the model inputs.
train = clean_data(pd.read_csv('../Dataset/train.csv', encoding='utf-8'))
# Features are the 'text' column, labels are the 'stance' column.
Train_X, Train_Y = train['text'], train['stance']

# ---------------- Evaluation split ----------------
# '../Dataset/dev.csv' plays the role of the held-out set; it goes through
# the same cleaning step so both splits share one representation.
test = clean_data(pd.read_csv('../Dataset/dev.csv', encoding='utf-8'))
Test_X, Test_Y = test['text'], test['stance']


In [9]:
# Turn the cleaned text into TF-IDF feature matrices.
#
# The vocabulary and IDF weights are learned from the training text only,
# then the same fitted vectorizer is applied to the evaluation text, so no
# information from the evaluation split leaks into the features.

# Upper bound on vocabulary size: only the most frequent terms are kept.
MAX_TFIDF_FEATURES = 27000

Tfidf_vect = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
# fit_transform learns the vocabulary and vectorises Train_X in one pass
# (equivalent to fit() followed by transform() on the same data).
Train_X_Tfidf = Tfidf_vect.fit_transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)


### Apply SMOTE to the training data to balance the classes

In [10]:
# Balance the training classes with SMOTE. Only the training matrix is
# resampled; the evaluation data is left untouched.
import imblearn
from collections import Counter
from imblearn.over_sampling import SMOTE

# Record the imbalanced-learn version used for this run.
print(imblearn.__version__)
# Class distribution before resampling.
print(Counter(Train_Y))

# Fixed seed: SMOTE generates synthetic minority samples using random
# choices, so without a seed the balanced training set (and every score
# computed from a model trained on it) changes from run to run.
SMOTE_SEED = 42
oversample = SMOTE(random_state=SMOTE_SEED)
SMOTE_Train_X_Tfidf, SMOTE_Train_Y = oversample.fit_resample(Train_X_Tfidf, Train_Y)
# Class distribution after resampling.
print(Counter(SMOTE_Train_Y))


0.10.0
Counter({1: 5538, 0: 1012, -1: 438})
Counter({1: 5538, 0: 5538, -1: 5538})


### Train the model against the unbalanced data

In [11]:
# Baseline: linear-kernel SVM trained on the original (imbalanced) TF-IDF
# training matrix.
# `degree` and `gamma` are not passed: they only affect the 'poly', 'rbf'
# and 'sigmoid' kernels and have no effect on a linear kernel.
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X_Tfidf, Train_Y)
# Predict stance labels for the evaluation set.
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Accuracy as a percentage. Arguments follow the (y_true, y_pred) order
# used by the other metric calls in this notebook.
print("SVM Accuracy Score -> ",accuracy_score(Test_Y, predictions_SVM)*100)

SVM Accuracy Score ->  81.8


In [12]:
from sklearn.metrics import f1_score

# F1 for the SVM trained on the imbalanced data.
# average=None yields one score per stance label
# (presumably in sorted label order, i.e. -1, 0, 1 -- confirm).
per_class_f1 = f1_score(Test_Y, predictions_SVM, average=None)
print("F1 score for each class -> ", per_class_f1)
# average='macro' is the unweighted mean of the per-class scores.
macro_f1 = f1_score(Test_Y, predictions_SVM, average='macro')
print("Macro Average F1 score -> ", macro_f1)

F1 score for each class ->  [0.24444444 0.25301205 0.90137615]
Macro Average F1 score ->  0.46627754647540215


### Train the model against the balanced data

In [13]:
# Linear-kernel SVM trained on the SMOTE-balanced TF-IDF training matrix,
# for comparison with the model trained on the imbalanced data.
# `degree` and `gamma` are not passed: they only affect the 'poly', 'rbf'
# and 'sigmoid' kernels and have no effect on a linear kernel.
SMOTE_SVM = svm.SVC(C=1.0, kernel='linear')
SMOTE_SVM.fit(SMOTE_Train_X_Tfidf, SMOTE_Train_Y)
# Predict stance labels for the (unresampled) evaluation set.
predictions_SMOTE_SVM = SMOTE_SVM.predict(Test_X_Tfidf)
# Accuracy as a percentage. Arguments follow the (y_true, y_pred) order
# used by the other metric calls in this notebook.
print("SVM Accuracy Score -> ",accuracy_score(Test_Y, predictions_SMOTE_SVM)*100)

SVM Accuracy Score ->  80.0


In [14]:
from sklearn.metrics import f1_score

# F1 for the SVM trained on the SMOTE-balanced data.
# average=None yields one score per stance label
# (presumably in sorted label order, i.e. -1, 0, 1 -- confirm).
smote_per_class_f1 = f1_score(Test_Y, predictions_SMOTE_SVM, average=None)
print("F1 score for each class -> ", smote_per_class_f1)
# The "whole data" figure printed below is the macro average, i.e. the
# unweighted mean of the per-class scores.
smote_macro_f1 = f1_score(Test_Y, predictions_SMOTE_SVM, average='macro')
print("F1 score for the whole data -> ", smote_macro_f1)

F1 score for each class ->  [0.35294118 0.41295547 0.89106487]
F1 score for the whole data ->  0.552320504512887
