In [24]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB

# Naive Bayes Classifier with Bag Of Words

### Read The data

In [25]:
# ---------------- Training data ----------------
# Load the cleaned Arabic train and dev CSV splits; both are used for fitting.
train1, train2 = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('../Dataset/cleaned_train.csv', '../Dataset/cleaned_dev.csv')
)

# Stack the two splits into a single frame with a fresh 0..n-1 index
train = pd.concat([train1, train2], ignore_index=True)

# Input text plus the two label columns (stance and category)
Train_X = train['text']
stance_Train_Y = train['stance']
cat_Train_Y = train['category']

# ---------------- Test data ----------------
test = pd.read_csv('../Dataset/cleaned_test_farasa.csv', encoding='utf-8')
# Only the text column is unpacked from the test frame in this cell
Test_X = test['text']

In [26]:
# Preview the training texts (pandas abbreviates the long Series when printed)
train_preview_label = "Train_X: "
print(train_preview_label, Train_X)

Train_X:  0       بيل غيتس يتلقى لقاح تصوير الابرة السيرنجة الدو...
1       وزير الصحة لحد اليوم وتحديدا هلأ بمؤتمروا الصح...
2       قولكن رح يكونو اد المسؤولية لبنان يوصل اللقاح ...
3       وزير الصحة فخر الدين قوجة يتلقى جرعة لقاح كورو...
4       وئام وهاب يشتم الدول الخليجية طلة اعلامية ويتس...
                              ...                        
7983    ينبغي للمعلمين يكونوا أوائل سيتاح الحصول لقاح ...
7984    عاجل دراسة بريطانية لقاح أسترازينيكا يوفر حماي...
7985    دبي تبدأ حملة تطعيم بلقاح شركة سينوفارم الصيني...
7986    یجب نجلس بحاجة إلی الذهاب لاعدائنا لسد احتیاجا...
7987    وسط ضجيج اللقاحات تسجل يوميا دول العالم وخصوصا...
Name: text, Length: 7988, dtype: object


### Build the BOW 

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the bag-of-words vocabulary from the training texts only and build the
# training count matrix in the same pass.
count_vect = CountVectorizer()
Train_X_BOW = count_vect.fit_transform(Train_X).toarray()

# Encode the test texts with the vocabulary learned above (no re-fitting).
# Both matrices are converted from sparse to dense arrays via .toarray().
Test_X_BOW = count_vect.transform(Test_X).toarray()

## For Stance Classification

### Apply SMOTE to the training data to balance the classes

In [28]:
# Oversample the minority stance classes with SMOTE so that every stance label
# (-1, 0, 1) ends up with as many training rows as the majority class.
import imblearn
from collections import Counter
from imblearn.over_sampling import SMOTE

# Record the imbalanced-learn version used for this run
print(imblearn.__version__)

# Class distribution before resampling
print(Counter(stance_Train_Y))

# SMOTE synthesises new samples using random choices; pinning the seed makes a
# Restart & Run All reproduce the same resampled training set.
oversample = SMOTE(random_state=42)
SMOTE_Train_X_BOW, SMOTE_stance_Train_Y = oversample.fit_resample(Train_X_BOW, stance_Train_Y)

# Class distribution after resampling
print(Counter(SMOTE_stance_Train_Y))


0.10.0
Counter({1: 6342, 0: 1138, -1: 508})
Counter({1: 6342, 0: 6342, -1: 6342})


### Classify without SMOTE

In [29]:
# Baseline: Gaussian Naive Bayes with default parameters, trained on the
# original (not resampled) bag-of-words features and the stance labels.
NB = GaussianNB().fit(Train_X_BOW, stance_Train_Y)

# Predict a stance label for every row of the test bag-of-words matrix
predicted_NB = NB.predict(Test_X_BOW)

In [30]:
# Distribution of the predicted labels currently held in predicted_NB
stance_pred_counts = Counter(predicted_NB)
print(stance_pred_counts)

Counter({1: 1514, 0: 337, -1: 149})


In [31]:
# Store the stance predictions as a 'stance' column on the test frame.
# NOTE(review): when cells run top-to-bottom, predicted_NB at this point comes
# from the model trained WITHOUT SMOTE — confirm that is the intended one.
test['stance'] = predicted_NB

### Classify with SMOTE

In [16]:
# Same default Gaussian Naive Bayes, but trained on the SMOTE-resampled
# features and stance labels.
SMOTE_NB = GaussianNB().fit(SMOTE_Train_X_BOW, SMOTE_stance_Train_Y)

# Predict stance for the test bag-of-words matrix (the test set is not resampled)
SMOTE_predicted_NB = SMOTE_NB.predict(Test_X_BOW)

In [18]:
# Distribution of the stance labels predicted by the SMOTE-trained model
smote_stance_pred_counts = Counter(SMOTE_predicted_NB)
print(smote_stance_pred_counts)

Counter({1: 1514, 0: 337, -1: 149})


## For Category Classification

### Apply SMOTE to the training data to balance the classes

In [32]:
# Oversample the minority category classes with SMOTE so that every category
# label ends up with as many training rows as the largest class.
import imblearn
from collections import Counter
from imblearn.over_sampling import SMOTE

# Record the imbalanced-learn version used for this run
print(imblearn.__version__)

# Class distribution before resampling
print(Counter(cat_Train_Y))

# SMOTE synthesises new samples using random choices; pinning the seed makes a
# Restart & Run All reproduce the same resampled training set.
oversample = SMOTE(random_state=42)
# NOTE: this rebinds SMOTE_Train_X_BOW, replacing the stance-resampled features
# produced earlier in the notebook with the category-resampled ones.
SMOTE_Train_X_BOW, SMOTE_cat_Train_Y = oversample.fit_resample(Train_X_BOW, cat_Train_Y)

# Class distribution after resampling
print(Counter(SMOTE_cat_Train_Y))


0.10.0
Counter({'info_news': 4161, 'personal': 1153, 'celebrity': 1120, 'plan': 688, 'unrelated': 359, 'others': 184, 'requests': 132, 'rumors': 94, 'advice': 77, 'restrictions': 20})
Counter({'celebrity': 4161, 'info_news': 4161, 'personal': 4161, 'unrelated': 4161, 'plan': 4161, 'requests': 4161, 'others': 4161, 'rumors': 4161, 'advice': 4161, 'restrictions': 4161})


### Classify without SMOTE

In [20]:
# Baseline for the category task: default Gaussian Naive Bayes trained on the
# original (not resampled) bag-of-words features and the category labels.
# NB and predicted_NB are rebound here, replacing the stance model/predictions.
NB = GaussianNB().fit(Train_X_BOW, cat_Train_Y)

# Predict a category label for every row of the test bag-of-words matrix
predicted_NB = NB.predict(Test_X_BOW)

In [21]:
# Distribution of the predicted labels currently held in predicted_NB
category_pred_counts = Counter(predicted_NB)
print(category_pred_counts)

Counter({'info_news': 1239, 'personal': 282, 'celebrity': 180, 'plan': 144, 'advice': 47, 'unrelated': 42, 'others': 31, 'rumors': 14, 'requests': 14, 'restrictions': 7})


### Classify with SMOTE

In [33]:
# Default Gaussian Naive Bayes trained on the SMOTE-resampled features and
# category labels.
SMOTE_NB = GaussianNB().fit(SMOTE_Train_X_BOW, SMOTE_cat_Train_Y)

# Predict categories for the test matrix. The result is stored under the shared
# name predicted_NB, so from here on it holds the SMOTE model's predictions.
predicted_NB = SMOTE_NB.predict(Test_X_BOW)

In [34]:
# Distribution of the predicted labels currently held in predicted_NB
smote_category_pred_counts = Counter(predicted_NB)
print(smote_category_pred_counts)

Counter({'info_news': 1239, 'personal': 282, 'celebrity': 175, 'plan': 152, 'advice': 47, 'unrelated': 39, 'others': 31, 'requests': 15, 'rumors': 13, 'restrictions': 7})


In [36]:
# Store the category predictions currently held in predicted_NB as a
# 'category' column on the test frame.
test['category'] = predicted_NB

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first; exist_ok=True keeps re-runs from failing.
output_path = './output/NaiveBayes_BOW.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write the test frame (with the added prediction columns) without the index
test.to_csv(output_path, index=False)