<a href="https://colab.research.google.com/github/ysfesr/YT-COMMENTS-ANALYSIS/blob/master/Arabic_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
data = pd.read_csv('drive/MyDrive/colab notebook/Arabe Sentiment Analysis/datasets/arabic_sentiment.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,0,اعترف ان بتس كانو شوي شوي يجيبو راسي لكن اليوم...
1,1,0,توقعت اذا جات داريا بشوفهم كاملين بس لي للحين ...
2,2,0,#الاهلي_الهلال اكتب توقعك لنتيجة لقاء الهلال و...
3,3,0,نعمة المضادات الحيوية . تضع قطرة💧مضاد بنسلين ع...
4,4,0,الدودو جايه تكمل علي 💔


In [3]:
stop_words = set(stopwords.words('arabic'))

def remove_diacritics(text):
    arabic_diacritics = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    text = re.sub(arabic_diacritics, '', str(text))
    return text

def remove_emoji(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

def clean_text(text):
    text = "".join([word for word in text if word not in string.punctuation])
    text = remove_emoji(text)
    text = remove_diacritics(text)
    tokens = word_tokenize(text)
    text = ' '.join([word for word in tokens if word not in stop_words])
    return text

In [4]:
data = data.drop('Unnamed: 0', axis=1)
data['cleaned_text'] = data['text'].apply(clean_text)
data.sample(5)

Unnamed: 0,label,text,cleaned_text
23431,1,سحب على مبلغ مالي 💰 لمتابعي #كشكول 👍🏻 المطلوب:...,سحب مبلغ مالي لمتابعي كشكول المطلوب شي رتويت ا...
24029,1,افف كيوت مرا,افف كيوت مرا
35487,1,اهبل مايعرفني باقي 😏,اهبل مايعرفني باقي
20589,0,اصطبحي وقولي يا صبح 😏,اصطبحي وقولي صبح
26301,1,لاعب #الهلال كنو في مباراة الامس: التمريرات ✅ ...,لاعب الهلال كنو مباراة الامس التمريرات ✅ دقة ا...


In [5]:
# split the data to tain and test data
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['label'], test_size=0.3, stratify=data['label'])

In [6]:
steps = [('vectorizer', TfidfVectorizer(min_df=0.0001, max_df=0.95, analyzer='word', lowercase=False)), ('cls', RandomForestClassifier())]
pipeline = Pipeline(steps) # define the pipeline object.
parameteres = {'vectorizer__max_features':[1000,3000,7000,10000]}
grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vectorizer',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=0.95,
                                                        max_features=None,
                                                        min_df=0.0001,
                                                        ngram_range=(1, 1),
                                             

In [7]:
print("best estimator: ",grid.best_estimator_)
print("best score: ", grid.best_score_)
print("best params: ", grid.best_params_)

best estimator:  Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=0.95,
                                 max_features=10000, min_df=0.0001,
                                 ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 to...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                           

In [8]:
y_pred = grid.predict(X_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3), round(recall, 3), round(accuracy, 3)))

Precision: 0.799 / Recall: 0.749 / Accuracy: 0.779


In [11]:
from sklearn.externals import joblib

model_save_name = 'sentiment_classifier.pkl'
path = F"drive/MyDrive/colab notebook/Arabe Sentiment Analysis/{model_save_name}"

joblib.dump(grid.best_estimator_, path)

['drive/MyDrive/colab notebook/Arabe Sentiment Analysis/sentiment_classifier.pkl']