In [25]:
import pandas as pd
import numpy as np
# The wildcard import stays ahead of the explicit imports below so that any
# same-named symbol is rebound by the explicit import, exactly as before.
from preprocessing import *
from collections import Counter
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report


# SVM Classifier with TF-IDF

### Read the data

In [26]:
# ---------------------------- Training data ----------------------------
# Load the cleaned train and dev CSV splits (Arabic text) and stack them into
# one training frame with a fresh 0..n-1 index.
train1, train2 = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('../Dataset/cleaned_train.csv', '../Dataset/cleaned_dev.csv')
)
train = pd.concat([train1, train2], ignore_index=True)

# Input text plus the two label columns that are modelled further down.
Train_X, stance_Train_Y, cat_Train_Y = (
    train['text'],
    train['stance'],
    train['category'],
)

# ---------------------------- Testing data -----------------------------
# Only the text column is unpacked from the test file; the 'stance' and
# 'category' columns are filled in later from the model predictions.
test = pd.read_csv('../Dataset/cleaned_test_farasa.csv', encoding='utf-8')
Test_X = test['text']

In [27]:
# Preview the training text column; pandas abbreviates the printed Series,
# showing only the first and last rows plus its length and dtype.
print("Train_X: ", Train_X)

Train_X:  0       بيل غيتس يتلقى لقاح تصوير الابرة السيرنجة الدو...
1       وزير الصحة لحد اليوم وتحديدا هلأ بمؤتمروا الصح...
2       قولكن رح يكونو اد المسؤولية لبنان يوصل اللقاح ...
3       وزير الصحة فخر الدين قوجة يتلقى جرعة لقاح كورو...
4       وئام وهاب يشتم الدول الخليجية طلة اعلامية ويتس...
                              ...                        
7983    ينبغي للمعلمين يكونوا أوائل سيتاح الحصول لقاح ...
7984    عاجل دراسة بريطانية لقاح أسترازينيكا يوفر حماي...
7985    دبي تبدأ حملة تطعيم بلقاح شركة سينوفارم الصيني...
7986    یجب نجلس بحاجة إلی الذهاب لاعدائنا لسد احتیاجا...
7987    وسط ضجيج اللقاحات تسجل يوميا دول العالم وخصوصا...
Name: text, Length: 7988, dtype: object


### Build the TF-IDF features

In [28]:
# Learn the TF-IDF vocabulary and IDF weights from the training text only
# (max_features=27000), then map both splits into that same feature space.
# The test text is only transformed, never fitted on.

Tfidf_vect = TfidfVectorizer(max_features=27000)
# fit_transform is documented as equivalent to fit() followed by transform()
# on the same documents.
Train_X_Tfidf = Tfidf_vect.fit_transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)


## For Stance Classification

### Apply SMOTE to the training data to balance the classes

In [29]:
# Oversample the minority stance classes with SMOTE so that the three stance
# labels (-1, 0, 1) are equally represented in the training matrix.

# Record the imbalanced-learn version used for this run.
print(imblearn.__version__)

# Class distribution before resampling.
print(Counter(stance_Train_Y))

# SMOTE draws synthetic samples at random; fixing the seed keeps the resampled
# matrix, and therefore every model trained on it, reproducible across runs.
oversample = SMOTE(random_state=42)
SMOTE_Train_X_Tfidf, SMOTE_stance_Train_Y = oversample.fit_resample(Train_X_Tfidf, stance_Train_Y)

# Class distribution after resampling.
print(Counter(SMOTE_stance_Train_Y))


0.10.0
Counter({1: 6342, 0: 1138, -1: 508})
Counter({1: 6342, 0: 6342, -1: 6342})


### Classify without SMOTE

In [30]:
# Baseline stance model: a linear-kernel SVC trained on the original
# (non-resampled) TF-IDF matrix. fit() returns the estimator, so the fitted
# classifier is bound to `SVM` in a single statement.
# NOTE(review): scikit-learn documents `gamma` for the rbf/poly/sigmoid kernels,
# so it is presumably ignored with kernel='linear' - confirm before tuning it.
SVM = svm.SVC(kernel='linear', C=1.0, gamma='auto').fit(Train_X_Tfidf, stance_Train_Y)

# Predict a stance label for every row of the test set.
predictions_SVM = SVM.predict(Test_X_Tfidf)

In [31]:
# Tally how many test rows the baseline model assigned to each stance label.
baseline_stance_counts = Counter(predictions_SVM)
print(baseline_stance_counts)

Counter({1: 1925, 0: 48, -1: 27})


### Classify with SMOTE

In [32]:
# Stance model trained on the SMOTE-balanced matrix, with the same
# hyper-parameters as the baseline so the two runs are directly comparable.
# fit() returns the estimator, so the fitted classifier is bound in one step.
# NOTE(review): scikit-learn documents `gamma` for the rbf/poly/sigmoid kernels,
# so it is presumably ignored with kernel='linear' - confirm before tuning it.
SMOTE_SVM = svm.SVC(kernel='linear', C=1.0, gamma='auto').fit(
    SMOTE_Train_X_Tfidf,
    SMOTE_stance_Train_Y,
)

# Predict a stance label for every row of the (non-resampled) test set.
predictions_SMOTE_SVM = SMOTE_SVM.predict(Test_X_Tfidf)

In [33]:
# Tally how many test rows the SMOTE-trained model assigned to each stance label.
smote_stance_counts = Counter(predictions_SMOTE_SVM)
print(smote_stance_counts)

Counter({1: 1636, 0: 301, -1: 63})


In [34]:
# Attach the stance predictions of the SMOTE-trained model to the test frame.
# Nothing is written to disk in this cell; the CSV export happens at the end of
# the notebook, after the category column has been added as well.
test['stance'] = predictions_SMOTE_SVM

## For Category Classification

### Apply SMOTE to the training data to balance the classes

In [35]:
# Oversample the minority category classes with SMOTE so that all ten category
# labels are equally represented in the training matrix.
# NOTE: this rebinds SMOTE_Train_X_Tfidf, replacing the stance-resampled matrix
# with the category-resampled one; re-running the stance SMOTE model after this
# cell would pair it with the wrong features.

# Record the imbalanced-learn version used for this run.
print(imblearn.__version__)

# Class distribution before resampling.
print(Counter(cat_Train_Y))

# SMOTE draws synthetic samples at random; fixing the seed keeps the resampled
# matrix, and therefore every model trained on it, reproducible across runs.
oversample = SMOTE(random_state=42)
SMOTE_Train_X_Tfidf, SMOTE_cat_Train_Y = oversample.fit_resample(Train_X_Tfidf, cat_Train_Y)

# Class distribution after resampling.
print(Counter(SMOTE_cat_Train_Y))


0.10.0
Counter({'info_news': 4161, 'personal': 1153, 'celebrity': 1120, 'plan': 688, 'unrelated': 359, 'others': 184, 'requests': 132, 'rumors': 94, 'advice': 77, 'restrictions': 20})
Counter({'celebrity': 4161, 'info_news': 4161, 'personal': 4161, 'unrelated': 4161, 'plan': 4161, 'requests': 4161, 'others': 4161, 'rumors': 4161, 'advice': 4161, 'restrictions': 4161})


### Classify without SMOTE

In [36]:
# Baseline category model: a linear-kernel SVC trained on the original
# (non-resampled) TF-IDF matrix. This rebinds `SVM` and `predictions_SVM`,
# which held the stance baseline until now.
# NOTE(review): scikit-learn documents `gamma` for the rbf/poly/sigmoid kernels,
# so it is presumably ignored with kernel='linear' - confirm before tuning it.
SVM = svm.SVC(kernel='linear', C=1.0, gamma='auto').fit(Train_X_Tfidf, cat_Train_Y)

# Predict a category label for every row of the test set.
predictions_SVM = SVM.predict(Test_X_Tfidf)

In [37]:
# Tally how many test rows the baseline model assigned to each category.
baseline_category_counts = Counter(predictions_SVM)
print(baseline_category_counts)

Counter({'info_news': 1682, 'celebrity': 177, 'personal': 132, 'unrelated': 6, 'plan': 3})


### Classify with SMOTE

In [38]:
# Category model trained on the SMOTE-balanced matrix, with the same
# hyper-parameters as the baseline so the two runs are directly comparable.
# This rebinds `SMOTE_SVM` and `predictions_SMOTE_SVM`, which held the stance
# model and its predictions until now.
# NOTE(review): scikit-learn documents `gamma` for the rbf/poly/sigmoid kernels,
# so it is presumably ignored with kernel='linear' - confirm before tuning it.
SMOTE_SVM = svm.SVC(kernel='linear', C=1.0, gamma='auto').fit(
    SMOTE_Train_X_Tfidf,
    SMOTE_cat_Train_Y,
)

# Predict a category label for every row of the (non-resampled) test set.
predictions_SMOTE_SVM = SMOTE_SVM.predict(Test_X_Tfidf)


In [39]:
# Tally how many test rows the SMOTE-trained model assigned to each category.
smote_category_counts = Counter(predictions_SMOTE_SVM)
print(smote_category_counts)

Counter({'info_news': 1378, 'celebrity': 219, 'personal': 188, 'plan': 151, 'unrelated': 29, 'others': 29, 'requests': 5, 'rumors': 1})


In [41]:
# Attach the category predictions of the SMOTE-trained model to the test frame,
# then write the whole frame to CSV without the index column.
predictions_csv_path = '../Dataset/predictions_SVM_TFIDF.csv'
test['category'] = predictions_SMOTE_SVM
test.to_csv(predictions_csv_path, index=False)