**Mount Google Drive**

In [0]:
# Attach Google Drive to the Colab runtime so the project folder on Drive
# is reachable through the local filesystem.
from google.colab import drive

gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


**Change Working Directory**

In [0]:
# Switch the working directory to the project folder on the mounted Drive;
# the relative paths used later in this notebook ('AraVec/...',
# 'processed_data/...') are resolved against it.
%cd '/content/gdrive/My Drive/NLPMaster'

/content/gdrive/My Drive/NLPMaster


**Download needed packages**

In [0]:
!pip install tweet-preprocessor

Collecting tweet-preprocessor
  Downloading https://files.pythonhosted.org/packages/2a/f8/810ec35c31cca89bc4f1a02c14b042b9ec6c19dd21f7ef1876874ef069a6/tweet-preprocessor-0.5.0.tar.gz
Building wheels for collected packages: tweet-preprocessor
  Building wheel for tweet-preprocessor (setup.py) ... [?25l[?25hdone
  Created wheel for tweet-preprocessor: filename=tweet_preprocessor-0.5.0-cp36-none-any.whl size=7946 sha256=a02717088c7c9f1bdda6f9b3af5a603e7ac362ef439592d8b8e0278e549d88c2
  Stored in directory: /root/.cache/pip/wheels/1b/27/cc/49938e98a2470802ebdefae9d2b3f524768e970c1ebbe2dc4a
Successfully built tweet-preprocessor
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.5.0


**Import needed packages**

In [0]:
import AraTweet
import preprocessor as p
import re
import gensim
import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import OneHotEncoder
import pickle
#Local packages
from utilities import *

deleted_reviews.tsv




# **Prepare Dataset**

**Load Dataset**

In [0]:
# Build the project-local AraTweet reader and fetch the two parallel
# collections it returns: the raw tweet texts and their labels.
AraSent = AraTweet.AraTweet()
raw_body, rating = AraSent.read_clean_reviews()


deleted_reviews.tsv


## Preprocessing_01

In this section we strip URLs, emojis, smileys, numbers and hashtags from the tweets. Two lists are produced:<br>

*   body_with_hashtag: Tweets with the '#' sign deleted and '_' replaced by spaces, so the words of each hashtag are kept.
*   body_without_hashtag: Tweets with hashtags deleted entirely.





**Prepare Filter**

In [0]:
# Kinds of tokens that every later p.clean() call should strip from a tweet.
tweet_filter_options = (
    p.OPT.URL,      # URLs
    p.OPT.EMOJI,    # emojis
    p.OPT.HASHTAG,  # hashtags
    p.OPT.SMILEY,   # smileys
    p.OPT.NUMBER,   # digits / numbers
)
p.set_options(*tweet_filter_options)

**Keep Hashtag**

In [0]:
# Variant that keeps the hashtag wording: drop the '#' sign and turn '_' into
# a space first, so the hashtag becomes plain words before p.clean() runs.
# A new list is built; raw_body itself is left untouched.
body_with_hashtag = [
    p.clean(tweet.replace("#", "").replace("_", " "))
    for tweet in raw_body
]
# Preview the first five cleaned tweets.
body_with_hashtag[0:5]

['بعد استقالة رئيس المحكمة الدستورية ننتظر استقالة رئيس القضاء السودان',
 'أهنئ الدكتور أحمد جمال الدين، القيادي بحزب مصر، بمناسبة صدور أولى روايته',
 'البرادعي يستقوى بامريكا مرةاخرى و يرسل عصام العريان الي واشنطن شئ مقرف',
 'الحرية والعدالة | شاهد الآن: ليلة الاتحادية أول فيلم استقصائي يتناول أسرار و كواليس تعرض لأول مرة حول حقيقة',
 'الوالدة لو اقولها بخاطري حشيشة تضحك بس من اقولها ملل الله وكيلك تعطيني محاضرة عن الفسق والفجور بجنوب الشيشان ليه كذا يانبع الحنان']

**Exclude Hashtag**

In [0]:
# Variant that drops hashtags completely: p.clean() is applied to the raw
# text, so hashtags are handled by the filter options configured earlier.
# A new list is built; raw_body itself is left untouched.
body_without_hashtag = list(map(p.clean, raw_body))
# Preview the first five cleaned tweets.
body_without_hashtag[0:5]

['بعد استقالة رئيس ننتظر استقالة',
 'أهنئ الدكتور أحمد جمال الدين، القيادي بحزب مصر، بمناسبة صدور أولى روايته',
 'البرادعي يستقوى بامريكا مرةاخرى و يرسل عصام العريان الي واشنطن شئ مقرف',
 '| شاهد الآن: أول فيلم استقصائي يتناول أسرار و كواليس تعرض لأول مرة حول حقيقة',
 'الوالدة لو اقولها بخاطري حشيشة تضحك بس من اقولها ملل الله وكيلك تعطيني محاضرة عن الفسق والفجور بجنوب الشيشان كذا يانبع الحنان']

## Preprocessing_02

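Replace every character that is not an Arabic letter or a digit (including punctuation marks) with a space, then normalize the text with `clean_str`.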

In [0]:
# Negated character class: matches anything that is NOT an ASCII digit or a
# code point in the listed Arabic ranges.
only_arabic_and_digits_re = r'[^0-9\u0621-\u064a\ufb50-\ufdff\ufe70-\ufefc]'

# For both tweet variants: blank out every non-matching character, then pass
# the result through clean_str (from the AraVec utilities; per the original
# note it removes tashkeel and elongation).
body_without_hashtag_arabic = [
    clean_str(re.sub(only_arabic_and_digits_re, ' ', tweet))
    for tweet in body_without_hashtag
]
body_with_hashtag_arabic = [
    clean_str(re.sub(only_arabic_and_digits_re, ' ', tweet))
    for tweet in body_with_hashtag
]

## Preprocessing_03

Tokenizing using NLTK

In [0]:
# NLTK tweet tokenizer, created with its default settings; it is reused for
# every tweet in the tokenization cell that follows.
tokenizer = TweetTokenizer()

In [0]:
# Split every cleaned tweet into a list of tokens, one token list per tweet,
# for both variants. The cleaned string lists are not modified.
body_without_hashtag_arabic_tokens = [
    tokenizer.tokenize(tweet) for tweet in body_without_hashtag_arabic
]
body_with_hashtag_arabic_tokens = [
    tokenizer.tokenize(tweet) for tweet in body_with_hashtag_arabic
]

# Words Representation

In [0]:
# Load the pre-trained AraVec Word2Vec model from the project folder (the
# path is relative to the working directory set at the top of the notebook).
aravec_model_path = 'AraVec/full_grams_cbow_300_twitter/full_grams_cbow_300_twitter.mdl'
t_model = gensim.models.Word2Vec.load(aravec_model_path)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# Dimensions of the embedding tensors allocated in the next cell.
max_seq_len = 50  # token slots reserved per tweet
wv_dim = 300      # word-vector length (presumably the 300 in the model name; confirm)
n_sample = len(body_with_hashtag_arabic_tokens)  # one row per tweet

In [0]:
# One zero-initialised (tweet, token position, vector component) tensor per
# tweet variant; positions that are never written stay all-zero.
embedding_shape = (n_sample, max_seq_len, wv_dim)
x_hashtag = np.zeros(embedding_shape)
x_nohashtag = np.zeros(embedding_shape)

In [0]:
def _fill_embeddings(token_lists, word_vectors, target):
    """Write the word vector of every known token into ``target`` in place.

    Parameters
    ----------
    token_lists : sequence of token lists, one list per tweet.
    word_vectors : lookup supporting ``token in word_vectors`` and
        ``word_vectors[token]`` (here: ``t_model.wv``).
    target : array indexed as ``target[tweet, position, :]``; modified in place.

    Tokens missing from ``word_vectors`` keep their position, and that slot is
    simply left as it was (all zeros for a freshly allocated tensor). Tokens
    beyond the tensor's sequence length are dropped: without this truncation a
    tweet with more tokens than slots raised an IndexError.
    """
    max_len = target.shape[1]
    for row, tokens in enumerate(token_lists):
        for col, token in enumerate(tokens[:max_len]):
            if token in word_vectors:
                target[row, col, :] = word_vectors[token]


# t_model.wv is passed directly (not bound to a notebook-level name) so that
# deleting t_model later does not leave another reference to the vectors.
_fill_embeddings(body_with_hashtag_arabic_tokens, t_model.wv, x_hashtag)
_fill_embeddings(body_without_hashtag_arabic_tokens, t_model.wv, x_nohashtag)

In [0]:
# The embedding tensors are filled, so the model is no longer needed: drop
# the name so the object can be garbage-collected.
del t_model

# Explore & Split Dataset 

In [0]:
# Class distribution of the full label set: distinct labels and how many
# times each one occurs.
uniqueValues, occurCount = np.unique(rating, return_counts=True)

for caption, values in (("Unique Values", uniqueValues),
                        ("Occurrence Count", occurCount)):
    print("{} :  {}".format(caption, values))

Unique Values :  ['NEG' 'NEUTRAL' 'OBJ' 'POS']
Occurrence Count :  [1684  832 6691  799]


In [0]:
# Hold out 20% of the tweets (hashtag-words variant) as a test set. The split
# is stratified on the labels and uses a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x_hashtag,
    rating,
    test_size=0.2,
    random_state=0,
    stratify=rating,
)

In [0]:
# Class distribution of the training labels after the split.
uniqueValues_Y_train, occurCount_Y_train = np.unique(y_train, return_counts=True)

for caption, values in (("Unique Values", uniqueValues_Y_train),
                        ("Occurrence Count", occurCount_Y_train)):
    print("{} :  {}".format(caption, values))

Unique Values :  ['NEG' 'NEUTRAL' 'OBJ' 'POS']
Occurrence Count :  [1347  666 5352  639]


**Resolve unbalanced data**

In [0]:
# Oversample the training set with a fixed seed. The 3-D tensor is flattened
# to one row per tweet for fit_resample and restored to
# (tweet, position, component) afterwards.
ros = RandomOverSampler(random_state=0)

flat_feature_count = max_seq_len * wv_dim
X_train_flat = X_train.reshape((-1, flat_feature_count))
X_resampled_flat, y_resampled = ros.fit_resample(X_train_flat, y_train)

X_resampled = X_resampled_flat.reshape((-1, max_seq_len, wv_dim))

In [0]:
# Class distribution of the training labels after oversampling.
uniqueValues_resampled, occurCount_resampled = np.unique(y_resampled, return_counts=True)

for caption, values in (("Unique Values", uniqueValues_resampled),
                        ("Occurrence Count", occurCount_resampled)):
    print("{} :  {}".format(caption, values))

Unique Values :  ['NEG' 'NEUTRAL' 'OBJ' 'POS']
Occurrence Count :  [5352 5352 5352 5352]


**Save data representation vectors**

In [0]:
# Persist the resampled training data and the untouched test split, one
# pickle file per object, under processed_data/ (relative to the working
# directory). Files are written in the order listed.
pickle_targets = (
    ('processed_data/X_resampled.pkl', X_resampled),
    ('processed_data/y_resampled.pkl', y_resampled),
    ('processed_data/X_test.pkl', X_test),
    ('processed_data/y_test.pkl', y_test),
)

for pickle_path, payload in pickle_targets:
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)