In [2]:
# Standard library
from collections import Counter

# Third-party
import imblearn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn import model_selection, naive_bayes, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

# Local helpers (presumably the source of clean_data — confirm).
# NOTE(review): star import hides where names come from; prefer explicit names.
from preprocessing import *

# Naive Bayes Classifier with TFIDF

### Read the data

In [3]:
# ---------------- Training data ----------------
# Load the training split from '../Dataset/cleaned_train.csv'
# (presumably already preprocessed, judging by the file name — confirm).
train = pd.read_csv('../Dataset/cleaned_train.csv', encoding='utf-8')
# Split the frame into the input text and the two label columns.
Train_X, stance_Train_Y, cat_Train_Y = (
    train[column] for column in ('text', 'stance', 'category')
)

# ---------------- Evaluation (dev) data ----------------
# Load the dev split and run it through clean_data.
# NOTE(review): only the dev split is passed through clean_data here, although
# both files are named "cleaned_*" — confirm both splits get the same cleaning.
test = clean_data(pd.read_csv('../Dataset/cleaned_dev.csv', encoding='utf-8'))
# Same unpacking as for the training split.
Test_X, stance_Test_Y, cat_Test_Y = (
    test[column] for column in ('text', 'stance', 'category')
)


### Build the TF-IDF features

In [4]:
# Build TF-IDF features for the training and dev text.
# The vectorizer is fitted on the training text only and then reused to
# transform the dev text, so the dev split does not influence the vocabulary.
# max_features=27000 caps the vocabulary size.
Tfidf_vect = TfidfVectorizer(max_features=27000)

# fit_transform learns the vocabulary/IDF weights and vectorizes the training
# text in one pass; the previous fit(...) + transform(...) pair tokenized the
# same corpus twice (Train_X is train['text']).
Train_X_Tfidf = Tfidf_vect.fit_transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)


## For Stance Classification

### Apply SMOTE to the training data to balance the classes

In [5]:
# Oversample the minority stance classes with SMOTE so that the three stance
# labels (-1, 0, 1) are equally represented in the training set.
# Only the training features are resampled; the dev features are not touched.
# (Counter, imblearn and SMOTE are imported in the notebook's import cell.)

# Record the imbalanced-learn version used for this run.
print(imblearn.__version__)

# Class distribution before resampling.
print(Counter(stance_Train_Y))

# SMOTE draws synthetic samples at random: fix the seed so that
# Restart & Run All reproduces the same resampled training set.
oversample = SMOTE(random_state=42)
# NOTE(review): SMOTE_Train_X_Tfidf is reassigned by the category resampling
# cell further down, so the stance cells must be run before that one.
SMOTE_Train_X_Tfidf, SMOTE_stance_Train_Y = oversample.fit_resample(Train_X_Tfidf, stance_Train_Y)

# Class distribution after resampling.
print(Counter(SMOTE_stance_Train_Y))


0.10.0
Counter({1: 5538, 0: 1012, -1: 438})
Counter({1: 5538, 0: 5538, -1: 5538})


### Classify without SMOTE

In [8]:
# Baseline stance model: Gaussian Naive Bayes with default parameters, trained
# on the original (non-resampled) training set.
# .toarray() converts the TF-IDF matrices to dense arrays before they are
# handed to GaussianNB.
NB = GaussianNB().fit(Train_X_Tfidf.toarray(), stance_Train_Y)

# Predict the stance of every dev sample and print the per-class report.
predicted_NB = NB.predict(Test_X_Tfidf.toarray())
print(classification_report(y_true=stance_Test_Y, y_pred=predicted_NB))

              precision    recall  f1-score   support

          -1       0.19      0.21      0.20        70
           0       0.24      0.27      0.26       126
           1       0.84      0.82      0.83       804

    accuracy                           0.71      1000
   macro avg       0.42      0.43      0.43      1000
weighted avg       0.72      0.71      0.71      1000



In [9]:
# Overall accuracy (in percent) of the stance predictions on the dev set.
# accuracy_score's signature is (y_true, y_pred); pass the arguments in that order.
print("Naive Bayes Accuracy Score -> ",accuracy_score(stance_Test_Y, predicted_NB)*100)

# Spot-check the first dev sample: text, gold stance, predicted stance.
# predicted_NB is indexed by position, so the pandas Series are indexed by
# position as well (.iloc) instead of by label; the three values then always
# refer to the same row even if the index does not start at 0.
print(Test_X.iloc[0])
print(stance_Test_Y.iloc[0])
print(predicted_NB[0])

Naive Bayes Accuracy Score ->  70.6
حظر خامنئي المجرم شراء يعد مجزرة متعمدة بحق الشعب الإيراني نقل موقع مريم رجوي موقف رئيسة الجمهورية المنتخبة للمقاومة الإيرانية تصريحات خامنئي المجرم حول حظر استيراد لقاح كورونا الولايات المتحدة بريطانيا فرنسا
1
1


### Classify with SMOTE

In [11]:
# Stance model on the SMOTE-balanced training set: Gaussian Naive Bayes with
# default parameters.
# NOTE(review): SMOTE_Train_X_Tfidf must still hold the stance resampling here;
# the category section later reassigns the same name.
SMOTE_NB = GaussianNB().fit(SMOTE_Train_X_Tfidf.toarray(), SMOTE_stance_Train_Y)

# Evaluate on the (non-resampled) dev features and print the per-class report.
predicted_NB = SMOTE_NB.predict(Test_X_Tfidf.toarray())
print(classification_report(y_true=stance_Test_Y, y_pred=predicted_NB))

              precision    recall  f1-score   support

          -1       0.19      0.21      0.20        70
           0       0.24      0.27      0.26       126
           1       0.84      0.82      0.83       804

    accuracy                           0.71      1000
   macro avg       0.42      0.43      0.43      1000
weighted avg       0.72      0.71      0.71      1000



In [12]:
# Overall accuracy (in percent) of the SMOTE-trained stance model on the dev set.
# accuracy_score's signature is (y_true, y_pred); pass the arguments in that order.
print("Naive Bayes Accuracy Score -> ",accuracy_score(stance_Test_Y, predicted_NB)*100)

# Spot-check the first dev sample: text, gold stance, predicted stance.
# predicted_NB is indexed by position, so the pandas Series are indexed by
# position as well (.iloc) instead of by label.
print(Test_X.iloc[0])
print(stance_Test_Y.iloc[0])
print(predicted_NB[0])

Naive Bayes Accuracy Score ->  70.5
حظر خامنئي المجرم شراء يعد مجزرة متعمدة بحق الشعب الإيراني نقل موقع مريم رجوي موقف رئيسة الجمهورية المنتخبة للمقاومة الإيرانية تصريحات خامنئي المجرم حول حظر استيراد لقاح كورونا الولايات المتحدة بريطانيا فرنسا
1
1


## For Category Classification

### Apply SMOTE to the training data to balance the classes

In [12]:
# Oversample the minority categories with SMOTE so that every category label
# (10 of them in the training set) is equally represented.
# Only the training features are resampled; the dev features are not touched.
# (Counter, imblearn and SMOTE are imported in the notebook's import cell.)

# Record the imbalanced-learn version used for this run.
print(imblearn.__version__)

# Class distribution before resampling.
print(Counter(cat_Train_Y))

# SMOTE draws synthetic samples at random: fix the seed so that
# Restart & Run All reproduces the same resampled training set.
oversample = SMOTE(random_state=42)
# NOTE(review): this overwrites the SMOTE_Train_X_Tfidf produced by the stance
# resampling cell; the stance cells cannot be re-run correctly after this one.
SMOTE_Train_X_Tfidf, SMOTE_cat_Train_Y = oversample.fit_resample(Train_X_Tfidf, cat_Train_Y)

# Class distribution after resampling.
print(Counter(SMOTE_cat_Train_Y))


0.10.0
Counter({'info_news': 3616, 'personal': 1025, 'celebrity': 975, 'plan': 606, 'unrelated': 323, 'others': 167, 'requests': 112, 'rumors': 79, 'advice': 67, 'restrictions': 18})
Counter({'celebrity': 3616, 'info_news': 3616, 'personal': 3616, 'unrelated': 3616, 'plan': 3616, 'requests': 3616, 'others': 3616, 'rumors': 3616, 'advice': 3616, 'restrictions': 3616})


### Classify without SMOTE

In [13]:
# instantiate the model (using the default parameters)
NB = GaussianNB()

# fit the model with data
NB.fit(Train_X_Tfidf.toarray(), cat_Train_Y)

##perform classification and prediction on samples in tf_test
predicted_NB = NB.predict(Test_X_Tfidf.toarray())
print(classification_report(cat_Test_Y, predicted_NB))

              precision    recall  f1-score   support

      advice       0.38      0.30      0.33        10
   celebrity       0.72      0.57      0.64       145
   info_news       0.64      0.65      0.65       545
      others       0.07      0.06      0.06        17
    personal       0.36      0.39      0.37       128
        plan       0.15      0.20      0.17        82
    requests       0.12      0.10      0.11        20
restrictions       0.00      0.00      0.00         2
      rumors       0.00      0.00      0.00        15
   unrelated       0.38      0.31      0.34        36

    accuracy                           0.52      1000
   macro avg       0.28      0.26      0.27      1000
weighted avg       0.53      0.52      0.52      1000



In [14]:
# Overall accuracy (in percent) of the category predictions on the dev set.
# accuracy_score's signature is (y_true, y_pred); pass the arguments in that order.
print("Naive Bayes Accuracy Score -> ",accuracy_score(cat_Test_Y, predicted_NB)*100)

# Spot-check the first dev sample: text, gold category, predicted category.
# predicted_NB is indexed by position, so the pandas Series are indexed by
# position as well (.iloc) instead of by label.
print(Test_X.iloc[0])
print(cat_Test_Y.iloc[0])
print(predicted_NB[0])

Naive Bayes Accuracy Score ->  51.800000000000004
حظر خامنئي المجرم شراء يعد مجزرة متعمدة بحق الشعب الإيراني نقل موقع مريم رجوي موقف رئيسة الجمهورية المنتخبة للمقاومة الإيرانية تصريحات خامنئي المجرم حول حظر استيراد لقاح كورونا الولايات المتحدة بريطانيا فرنسا
info_news
info_news


### Classify with SMOTE

In [17]:
# Category model on the SMOTE-balanced training set: Gaussian Naive Bayes with
# default parameters.
# NOTE(review): SMOTE_Train_X_Tfidf must hold the category resampling here
# (the stance section assigns the same name earlier in the notebook).
SMOTE_NB = GaussianNB().fit(SMOTE_Train_X_Tfidf.toarray(), SMOTE_cat_Train_Y)

# Evaluate on the (non-resampled) dev features and print the per-class report.
predicted_NB = SMOTE_NB.predict(Test_X_Tfidf.toarray())
print(classification_report(y_true=cat_Test_Y, y_pred=predicted_NB))

              precision    recall  f1-score   support

      advice       0.38      0.30      0.33        10
   celebrity       0.75      0.57      0.65       145
   info_news       0.64      0.65      0.65       545
      others       0.07      0.06      0.06        17
    personal       0.36      0.39      0.37       128
        plan       0.16      0.22      0.18        82
    requests       0.12      0.10      0.11        20
restrictions       0.00      0.00      0.00         2
      rumors       0.00      0.00      0.00        15
   unrelated       0.38      0.31      0.34        36

    accuracy                           0.52      1000
   macro avg       0.29      0.26      0.27      1000
weighted avg       0.54      0.52      0.53      1000



In [18]:
# Overall accuracy (in percent) of the SMOTE-trained category model on the dev set.
# accuracy_score's signature is (y_true, y_pred); pass the arguments in that order.
print("Naive Bayes Accuracy Score -> ",accuracy_score(cat_Test_Y, predicted_NB)*100)

# Spot-check the first dev sample: text, gold category, predicted category.
# predicted_NB is indexed by position, so the pandas Series are indexed by
# position as well (.iloc) instead of by label.
print(Test_X.iloc[0])
print(cat_Test_Y.iloc[0])
print(predicted_NB[0])

Naive Bayes Accuracy Score ->  52.0
حظر خامنئي المجرم شراء يعد مجزرة متعمدة بحق الشعب الإيراني نقل موقع مريم رجوي موقف رئيسة الجمهورية المنتخبة للمقاومة الإيرانية تصريحات خامنئي المجرم حول حظر استيراد لقاح كورونا الولايات المتحدة بريطانيا فرنسا
info_news
info_news
