In [1]:
import pandas as pd
import numpy as np
from preprocessing import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report


# SVM Classifier with TF-IDF

### Read the data

In [2]:
##############For Training Data#####################
# Read the data from the csv file named 'preprocessedData.csv' arabic data
train = pd.read_csv('../Dataset/cleaned_train.csv', encoding='utf-8')
# Unpack the data into text and stance
Train_X = train['text']
stance_Train_Y = train['stance']
cat_Train_Y = train['category']

##############For Testing Data#####################
test = pd.read_csv('../Dataset/cleaned_dev.csv', encoding='utf-8')
# Perform the data preprocessing
test = clean_data(test)
# Unpack the data into text, and stance
Test_X = test['text']
stance_Test_Y = test['stance']
cat_Test_Y = test['category']


### Build the TFIDF

In [3]:
# Here we want to calculate the TF-IDF score for each word in the corpus

Tfidf_vect = TfidfVectorizer(max_features=27000)
Tfidf_vect.fit(train['text'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)


## For Stance Classification

### Apply SMOTE to the training data to balance the classes

In [4]:
# Here we want to apply SMOTE to the data to balance the data against 3 classes
# check version number
import imblearn
print(imblearn.__version__)
# Count the number of each class
from collections import Counter
from imblearn.over_sampling import SMOTE
print(Counter(stance_Train_Y))
# transform the dataset
oversample = SMOTE()
SMOTE_Train_X_Tfidf, SMOTE_stance_Train_Y = oversample.fit_resample(Train_X_Tfidf, stance_Train_Y)
print(Counter(SMOTE_stance_Train_Y))


0.10.0
Counter({1: 5538, 0: 1012, -1: 438})
Counter({1: 5538, 0: 5538, -1: 5538})


### Classify without SMOTE

In [10]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
# What is the best kernel for SVM in the case of multi-class classification? - Quora
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,stance_Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print(classification_report(predictions_SVM, stance_Test_Y))

              precision    recall  f1-score   support

          -1       0.16      0.55      0.24        20
           0       0.17      0.53      0.25        40
           1       0.98      0.84      0.90       940

    accuracy                           0.82      1000
   macro avg       0.43      0.64      0.47      1000
weighted avg       0.93      0.82      0.86      1000



### Classify with SMOTE

In [11]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
# What is the best kernel for SVM in the case of multi-class classification? - Quora
SMOTE_SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SMOTE_SVM.fit(SMOTE_Train_X_Tfidf,SMOTE_stance_Train_Y)
# predict the labels on validation dataset
predictions_SMOTE_SVM = SMOTE_SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print(classification_report(predictions_SMOTE_SVM, stance_Test_Y))

              precision    recall  f1-score   support

          -1       0.30      0.41      0.35        51
           0       0.40      0.42      0.41       120
           1       0.90      0.88      0.89       829

    accuracy                           0.80      1000
   macro avg       0.54      0.57      0.55      1000
weighted avg       0.81      0.80      0.81      1000



## For Category Classification

### Apply SMOTE to the training data to balance the classes

In [12]:
# Here we want to apply SMOTE to the data to balance the data against 3 classes
# check version number
import imblearn
print(imblearn.__version__)
# Count the number of each class
from collections import Counter
from imblearn.over_sampling import SMOTE
print(Counter(cat_Train_Y))
# transform the dataset
oversample = SMOTE()
SMOTE_Train_X_Tfidf, SMOTE_cat_Train_Y = oversample.fit_resample(Train_X_Tfidf, cat_Train_Y)
print(Counter(SMOTE_cat_Train_Y))


0.10.0
Counter({'info_news': 3616, 'personal': 1025, 'celebrity': 975, 'plan': 606, 'unrelated': 323, 'others': 167, 'requests': 112, 'rumors': 79, 'advice': 67, 'restrictions': 18})
Counter({'celebrity': 3616, 'info_news': 3616, 'personal': 3616, 'unrelated': 3616, 'plan': 3616, 'requests': 3616, 'others': 3616, 'rumors': 3616, 'advice': 3616, 'restrictions': 3616})


### Classify without SMOTE

In [13]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
# What is the best kernel for SVM in the case of multi-class classification? - Quora
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,cat_Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print(classification_report(predictions_SVM, cat_Test_Y))

              precision    recall  f1-score   support

      advice       0.00      0.00      0.00         0
   celebrity       0.77      0.87      0.82       129
   info_news       0.91      0.68      0.78       735
      others       0.00      0.00      0.00         1
    personal       0.50      0.58      0.54       110
        plan       0.01      0.17      0.02         6
    requests       0.05      0.50      0.09         2
restrictions       0.00      0.00      0.00         0
      rumors       0.00      0.00      0.00         1
   unrelated       0.25      0.56      0.35        16

    accuracy                           0.68      1000
   macro avg       0.25      0.34      0.26      1000
weighted avg       0.83      0.68      0.74      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Classify with SMOTE

In [14]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
# What is the best kernel for SVM in the case of multi-class classification? - Quora
SMOTE_SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SMOTE_SVM.fit(SMOTE_Train_X_Tfidf,SMOTE_cat_Train_Y)
# predict the labels on validation dataset
predictions_SMOTE_SVM = SMOTE_SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print(classification_report(predictions_SMOTE_SVM, cat_Test_Y))

              precision    recall  f1-score   support

      advice       0.20      0.50      0.29         4
   celebrity       0.81      0.83      0.82       143
   info_news       0.72      0.72      0.72       547
      others       0.00      0.00      0.00         6
    personal       0.61      0.55      0.58       143
        plan       0.28      0.21      0.24       107
    requests       0.15      0.16      0.15        19
restrictions       0.50      0.33      0.40         3
      rumors       0.00      0.00      0.00         2
   unrelated       0.31      0.42      0.35        26

    accuracy                           0.63      1000
   macro avg       0.36      0.37      0.36      1000
weighted avg       0.64      0.63      0.63      1000

