In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import re
from nltk.stem.isri import ISRIStemmer
import nltk

## Pre-Processing Data

In [11]:
# Read the data from the csv file named 'preprocessedData.csv' arabic data
data = pd.read_csv('./Dataset/preprocessingData.csv', encoding='utf-8')
# Remove [ ] from each word in the data
# data['text'] = data['text'].str.replace('[^\w\s]', '') 

In [12]:
from farasa.stemmer import FarasaStemmer
stemmer = FarasaStemmer()

In [13]:

for i in range(len(data['text'])):
    # remove the Links
    newTweet=re.sub(r'http\S+', '', data['text'][i])
    # remove english letters
    newTweet=re.sub(r'[a-zA-Z]', '', newTweet)
    # remove numbers
    newTweet=re.sub(r'[0-9]', '', newTweet)
    # remove the arabic numbers
    newTweet=re.sub(r'[\u0660-\u0669]', '', newTweet)
    # remove the emails
    newTweet=re.sub(r'\S*@\S*\s?', '', newTweet)
    # remove the hashtags
    newTweet=re.sub(r'#\S+', '', newTweet)
    # remove the mentions
    newTweet=re.sub(r'@\S+', '', newTweet)
    # remove emojis
    RE_EMOJI = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    newTweet=RE_EMOJI.sub(r'', newTweet)
    # replace _ by whitespace
    newTweet=re.sub(r'_', ' ', newTweet)
    
    # remove the punctuations
    newTweet = re.sub('\W+',' ', newTweet)

    # remove duplicated whitespaces
    newTweet=re.sub(r'\s+', ' ', newTweet)

    # applay tokenization
    newTweet=newTweet.split()
    
    # remove stopwords
    stopwords_arabic = nltk.corpus.stopwords.words('arabic')
    stopwords_arabic.append('ال')
    newTweet=[word for word in newTweet if word not in stopwords_arabic]

    # join the words
    newTweet=' '.join(newTweet)

    # apply stemming
    # newTweet=stemmer.stem(newTweet)
    
    # add the new tweet to the list
    data['text'][i]=newTweet


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'][i]=newTweet


## SVM with TF-IDF

In [14]:
# Unpack the data into text, category and stance
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(data['text'],data['stance'],test_size=0.3)

In [15]:
# Here we want to calculate the TF-IDF score for each word in the corpus
Tfidf_vect = TfidfVectorizer(max_features=27000)


Tfidf_vect.fit(data['text'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)


# Store the TF-IDF score in a csv file
df = pd.DataFrame(Train_X_Tfidf.toarray())
df.to_csv('./Features/tf_idf.csv', index=False)

بيل غيتس يتلقى لقاح تصوير الابرة السيرنجة الدواء لابس بولو صيفي عز الشتاء يقول ان مزايا عمر عام انه مؤهل للحصول اللقاح يعنى يحتاج اللقاح عمره اصغر


## Apply SMOTE to the training data to balance the classes

In [16]:
# Here we want to apply SMOTE to the data to balance the data against 3 classes
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(Train_X_Tfidf, Train_Y)

## Train the model

In [17]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
# What is the best kernel for SVM in the case of multi-class classification? - Quora
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_res,y_res)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  79.87601335240821


In [13]:
# Calculate the F1 score for each class
from sklearn.metrics import f1_score
print("F1 score for each class -> ",f1_score(Test_Y, predictions_SVM, average=None))

F1 score for each class ->  [0.3853211  0.30674847 0.88156008]
