In [14]:
from collections import Counter
from pathlib import Path

import imblearn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn import model_selection, naive_bayes, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

# Naive Bayes Classifier with TFIDF

### Read the data

In [15]:
############## Training data ##############
# The cleaned train and dev splits are read separately (UTF-8) and then
# stacked into a single training frame with a fresh 0..n-1 index.
train1, train2 = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('../Dataset/cleaned_train.csv', '../Dataset/cleaned_dev.csv')
)
train = pd.concat([train1, train2], ignore_index=True)

# Pull out the model input (text) and the two label columns (stance, category).
Train_X, stance_Train_Y, cat_Train_Y = (
    train[column] for column in ('text', 'stance', 'category')
)

############## Testing data ##############
# Only the text column of the test file is used as model input.
test = pd.read_csv('../Dataset/cleaned_test_farasa.csv', encoding='utf-8')
Test_X = test['text']

### Build the TF-IDF features

In [16]:
# Turn the raw text into TF-IDF feature vectors.
# The vocabulary and IDF weights are learned from the training text only and
# then reused to project the test text into the same feature space.

# Upper bound on the vocabulary size kept by the vectorizer.
MAX_TFIDF_FEATURES = 27000

Tfidf_vect = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)

# fit_transform learns the vocabulary and vectorizes the training text in a
# single pass; Train_X is the same column the vectorizer was previously fitted on.
Train_X_Tfidf = Tfidf_vect.fit_transform(Train_X)

# The test text is only transformed with the already-fitted vectorizer.
Test_X_Tfidf = Tfidf_vect.transform(Test_X)


## For Stance Classification

### Apply SMOTE to the training data to balance the classes

In [17]:
# Oversample the minority stance classes with SMOTE so that all three stance
# labels (-1, 0, 1) end up with as many samples as the majority class.
# Only the training features are resampled; the test features are not touched.
# (imblearn, Counter and SMOTE come from the notebook's import cell.)

# Record the imbalanced-learn version used for this run
print(imblearn.__version__)

# Class distribution before resampling
print(Counter(stance_Train_Y))

# A fixed random_state makes the synthetic samples, and therefore every
# model trained on them, identical from one run to the next.
oversample = SMOTE(random_state=42)
SMOTE_Train_X_Tfidf, SMOTE_stance_Train_Y = oversample.fit_resample(Train_X_Tfidf, stance_Train_Y)

# Class distribution after resampling
print(Counter(SMOTE_stance_Train_Y))


0.10.0
Counter({1: 6342, 0: 1138, -1: 508})
Counter({1: 6342, 0: 6342, -1: 6342})


### Classify without SMOTE

In [18]:
# Baseline stance model: Gaussian Naive Bayes with default settings, trained
# on the original (non-resampled) TF-IDF features.
# The TF-IDF matrices are converted to dense arrays with .toarray() before
# being handed to the classifier.
NB = GaussianNB().fit(Train_X_Tfidf.toarray(), stance_Train_Y)

# Stance predictions for every test document
predicted_NB = NB.predict(Test_X_Tfidf.toarray())

In [19]:
# Attach the stance predictions currently held in `predicted_NB` to the test frame.
# NOTE(review): `predicted_NB` is reassigned by several later cells, so this
# relies on top-to-bottom execution to pick up the no-SMOTE stance model's output.
test['stance'] = predicted_NB

### Classify with SMOTE

In [7]:
# Stance model trained on the SMOTE-balanced data: same default Gaussian
# Naive Bayes, but fitted on the resampled features and labels.
# NOTE(review): SMOTE_Train_X_Tfidf is rebound by the category SMOTE cell
# further down, so this cell must run before that one.
SMOTE_NB = GaussianNB().fit(SMOTE_Train_X_Tfidf.toarray(), SMOTE_stance_Train_Y)

# Stance predictions for every test document (replaces the previous `predicted_NB`)
predicted_NB = SMOTE_NB.predict(Test_X_Tfidf.toarray())

In [8]:
# How many test documents were assigned to each label in `predicted_NB`
predicted_label_counts = Counter(predicted_NB)
print(predicted_label_counts)

Counter({1: 1517, 0: 332, -1: 151})


## For Category Classification

### Apply SMOTE to the training data to balance the classes

In [20]:
# Oversample the minority category classes with SMOTE so that every category
# label ends up with as many samples as the majority class.
# Only the training features are resampled; the test features are not touched.
# (imblearn, Counter and SMOTE come from the notebook's import cell.)
# NOTE: this rebinds SMOTE_Train_X_Tfidf, replacing the stance-resampled
# features created earlier, so the SMOTE stance model has to be fitted
# before this cell runs.

# Record the imbalanced-learn version used for this run
print(imblearn.__version__)

# Class distribution before resampling
print(Counter(cat_Train_Y))

# A fixed random_state makes the synthetic samples, and therefore every
# model trained on them, identical from one run to the next.
oversample = SMOTE(random_state=42)
SMOTE_Train_X_Tfidf, SMOTE_cat_Train_Y = oversample.fit_resample(Train_X_Tfidf, cat_Train_Y)

# Class distribution after resampling
print(Counter(SMOTE_cat_Train_Y))


0.10.0
Counter({'info_news': 4161, 'personal': 1153, 'celebrity': 1120, 'plan': 688, 'unrelated': 359, 'others': 184, 'requests': 132, 'rumors': 94, 'advice': 77, 'restrictions': 20})
Counter({'celebrity': 4161, 'info_news': 4161, 'personal': 4161, 'unrelated': 4161, 'plan': 4161, 'requests': 4161, 'others': 4161, 'rumors': 4161, 'advice': 4161, 'restrictions': 4161})


### Classify without SMOTE

In [21]:
# Baseline category model: Gaussian Naive Bayes with default settings, trained
# on the original (non-resampled) TF-IDF features.
# This rebinds `NB`, which previously held the stance model.
NB = GaussianNB().fit(Train_X_Tfidf.toarray(), cat_Train_Y)

# Category predictions for every test document (replaces the previous `predicted_NB`)
predicted_NB = NB.predict(Test_X_Tfidf.toarray())

In [22]:
# Attach the category predictions currently held in `predicted_NB` to the
# test frame and export the whole frame as CSV (without the index column).
# NOTE(review): `predicted_NB` is reassigned by several cells, so this relies
# on top-to-bottom execution to pick up the no-SMOTE category model's output.
test['category'] = predicted_NB

# to_csv does not create missing folders, so make sure the target directory
# exists before writing.
output_path = Path('./output') / 'NaiveBayes_TFIDF.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
test.to_csv(output_path, index=False)

### Classify with SMOTE

In [12]:
# Category model trained on the SMOTE-balanced data: same default Gaussian
# Naive Bayes, but fitted on the resampled features and labels.
# This rebinds `SMOTE_NB`, which previously held the stance model.
SMOTE_NB = GaussianNB().fit(SMOTE_Train_X_Tfidf.toarray(), SMOTE_cat_Train_Y)

# Category predictions for every test document (replaces the previous `predicted_NB`)
predicted_NB = SMOTE_NB.predict(Test_X_Tfidf.toarray())

In [13]:
# How many test documents were assigned to each label in `predicted_NB`
predicted_label_counts = Counter(predicted_NB)
print(predicted_label_counts)

Counter({'info_news': 1249, 'personal': 278, 'plan': 170, 'celebrity': 144, 'unrelated': 47, 'advice': 47, 'others': 28, 'requests': 14, 'rumors': 13, 'restrictions': 10})
