## Text preprocessing

### Install required libraries

In [1]:
# !pip install -U nltk
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/fatemehnadi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/fatemehnadi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
#from imblearn.over_sampling import  SMOTE


import random 
import numpy as np
import nltk
import pandas as pd
import codecs
import tqdm
import re

from nltk import FreqDist
import itertools

from collections.abc import Sequence
#from nltk.lm.preprocessing import padded_everygram_pipeline
#from nltk.lm import MLE
from nltk.tokenize.treebank import TreebankWordDetokenizer

from nltk import word_tokenize
#from nltk.lm import MLE, Laplace
#from nltk.lm.preprocessing import pad_both_ends, padded_everygram_pipeline



from __future__ import unicode_literals
from hazm import *

### read file

In [3]:
data = pd.read_csv("./Snappfood - Sentiment Analysis.csv" , on_bad_lines='skip' , delimiter='\t')

data = data[['comment', 'label_id']] # remove Unamed column
data = data.dropna()                            # drop nan
data['label_id'] = data['label_id'].astype(int) # convert to int

# show data
data.head()

Unnamed: 0,comment,label_id
0,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح,1
1,قرار بود ۱ ساعته برسه ولی نیم ساعت زودتر از مو...,0
2,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...,1
3,عالللی بود همه چه درست و به اندازه و کیفیت خوب...,0
4,شیرینی وانیلی فقط یک مدل بود.,0


In [4]:
data.groupby(['label_id'])['label_id'].count()

label_id
0    34916
1    34564
Name: label_id, dtype: int64

### Sampling

In [5]:
def sampling_balanced(data, sample_size=0.1):
    
    # shuffle the DataFrame rows & return all rows
    data = data.sample(frac = 1)
    data.reset_index(drop=True, inplace=True) 
    
    data_pos = data[data['label_id']==0]
    data_neg = data[data['label_id']==1]
    
    no_pos = int(np.floor((data.groupby(['label_id'])['label_id'].count()[0] * sample_size)))
    no_neg = int(np.floor((data.groupby(['label_id'])['label_id'].count()[1] * sample_size)))
    
    
    data_pos_train = data_pos[:no_pos]
    data_neg_train = data_neg[:no_neg]
    
    
    data_pos_test = data_pos[no_pos+1:]
    data_neg_test = data_neg[no_neg+1:]
    
    # train
    df_train = pd.concat([data_pos_train, data_neg_train])
    df_train.reset_index(drop=True, inplace=True) # reset index
    
    
    # test
    df_test = pd.concat([data_pos_test, data_neg_test])
    df_test.reset_index(drop=True, inplace=True) # reset index
    
    
    
    return df_train, df_test 

In [6]:
sample_data, _  = sampling_balanced(data,sample_size=0.1)

### preprocess persian text

In [7]:
#!pip install hazm

Load stop words

In [8]:
def stopwords_list(stopwords_file):
    with open(stopwords_file, 'r', encoding='utf-8') as file:
        stopwords = file.read().split('\n')
    return stopwords

Remove and change some characters

In [9]:
def replace_chars(comment):
    comment = re.sub(r'[^\w\s]', ' ', comment) 
    comment = re.sub(r'\d+', ' ', comment)  
    comment = re.sub(r'[a-zA-Z]', ' ', comment)
    comment = re.sub(r'[0-9]+', '', comment)
    comment = re.sub(r'[يى]', 'ی', comment) 
    comment = re.sub(r'[ك]', 'ک', comment) 
    comment = re.sub(r'[گ]', 'گ', comment)  
    comment = re.sub(r'\s+', ' ', comment) 
    #comment = re.sub(r'[۰-۹a-zA-Z]+', '', comment)  # remove extra spaces
    return comment

Preprocess a single comment

In [10]:
def preprocess_comment(comment):
    
    norm_comment = replace_chars(comment)
    
    # num_removed = remove_numbers(norm_comment)
    # Normalize the comment
    normalizer = Normalizer()
    norm_comment = normalizer.normalize(norm_comment)
    words = word_tokenize(norm_comment)
    stop_words = stopwords_list("./short_stop_words.txt")
    '''
    my_stopwords = ['هم', 'که','آن', 'را' , 'برای','مثلا']
    default_stopwords = hazm.stopwords_list() + my_stopwords
    
    my_stopwords = ['عالی', 'بد', 'زیادی', 'بهتر', 'همه', 'باعث', 'معمولی', 'خیلی','خوب', 'یک','بی','ی']
    stopwords = [word for word in default_stopwords if word not in my_stopwords]

    '''

    filtered_words = [word for word in words if word not in stop_words and not bool(re.search('[\W_]', word))]

    lemmatizer = Lemmatizer()
    farsi_words = []
    for word in filtered_words:
        if not bool(re.search('[a-zA-Z]', word)):
            farsi_words.append(lemmatizer.lemmatize(word))

    return ' '.join(farsi_words)

Apply preprocess on data

In [11]:
from hazm import *

'''# Tokenize words
def tokenize_words(comment):
    return word_tokenize(comment)'''



# Preprocess a single comment


# Preprocess a list of comments
def preprocess_comments(comments):
    preprocessed_comments = []
    for comment in comments:
        preprocessed_comment = preprocess_comment(comment)
        preprocessed_comments.append(preprocessed_comment)
    return preprocessed_comments



In [12]:

comments = sample_data['comment'].tolist()
labels = sample_data['label_id'].tolist()
preprocessed_comments = preprocess_comments(comments)
preprocessed_df = pd.DataFrame({'comment': preprocessed_comments, 'label_id': labels})


split dataset to test and train

## Part1 :  TF-IDF

### TF

In [13]:
def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
        
    return tfDict

In [14]:
def get_TF(df):

    bagOfw = df.comment.apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0)
    bagOfw

    uniqueWords =  set((bagOfw.index).to_list())
    print("No. of uniqueWords: ",len(uniqueWords))

    numOfWords_list = [] 
    bagOfWords_list = []

    
    tf_list = [] # The frequency of each word in each document is stored in a dictionary

    for i in range(0,df.shape[0]):

        bagOfWords = str(df.iloc[i]['comment']).split(' ')

        numOfWords = dict.fromkeys(uniqueWords, 0)
        for word in bagOfWords:
            numOfWords[word] += 1

        tf = computeTF(numOfWords, bagOfWords)

        bagOfWords_list.append(bagOfWords)
        numOfWords_list.append(numOfWords)
        tf_list.append(tf)

    return tf_list
    

In [15]:
tf_list = get_TF(preprocessed_df)

No. of uniqueWords:  6255


### IDF

In [16]:
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
        
    return idfDict

In [17]:
idfs = computeIDF(tf_list)

### Tf_IDF

In [18]:
def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf

In [19]:
tfidf_list = []

for i in range(0,preprocessed_df.shape[0]):
    tfidf = computeTFIDF(tf_list[i], idfs)
    tfidf_list.append(tfidf)

tfidf_df = pd.DataFrame(tfidf_list)

In [20]:
final_TFIDF = pd.concat([sample_data, tfidf_df], axis=1)
final_TFIDF.head()

Unnamed: 0,comment,label_id,Unnamed: 3,تضمین,ساخت,کلاخمیربود,صداقتتون,خورمش,بیکران,نارنجک,...,چهارتادونه,ممنونممم,دونفرسیرمیکنه,خشکه,بیکن,ناکافی,زغال,انجا,پرانتز,قیمتا
0,تاخیر خییییییلی زیاد … خوشمزه بود,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,بنده در عرض پنج دقیقه دو تا سفارش از هایپر میو...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,بسیار عالی، بسته بندی خوب و حرفه‌ای، تنوع غذای...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,کیفیت غذا قبلا بهتر بود,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,خوب بود فقط خیلی قطور بود که باعث میشد دل آدم ...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


split the data into training and test sets

In [21]:
X = tfidf_df
y =  preprocessed_df['label_id']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

Naive Bayes Classifier

calculate F1 score, precision, recall, and accuracy

In [22]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

clf = GaussianNB()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.57      0.82      0.67       345
           1       0.68      0.38      0.49       350

    accuracy                           0.60       695
   macro avg       0.62      0.60      0.58       695
weighted avg       0.62      0.60      0.58       695



## Part 2 : PPMI

remove empty comments

In [23]:
preprocessed_df = preprocessed_df[preprocessed_df['comment'].str.strip() != '']

In [24]:
def get_ppmi(df):
    
    words = list(set([word.lower() for sentence in df['comment'] for word in sentence.split()])) # unique words

    #  dictionary:  word frequencies for each sentence
    freq_dict = {}
    for i, sentence in enumerate(df['comment']):
        freq_dict[i] = {}
        for word in sentence.split():
            word = word.lower()
            if word in freq_dict[i]:
                freq_dict[i][word] += 1
            else:
                freq_dict[i][word] = 1

    
    matrix = [] # |comment| * |V|
    for i in range(len(df)):
        row = []
        for word in words:
            if word in freq_dict[i]:
                row.append(freq_dict[i][word])
            else:
                row.append(0)
        matrix.append(row)
    matrix_df = pd.DataFrame(matrix, columns=words)
    
    matrixVV = np.dot(matrix_df.T,matrix_df) # |V|*|V|
    total_words = sum(sum(matrixVV))
    
    matrixVV = matrixVV/total_words # to reach possibility
    matrix_pmi = matrixVV # |V|*|V|


    
    row_sums = np.sum(matrixVV, axis=1) # sum of each row 
    col_sums = np.sum(matrixVV, axis=0) # sum of each column

    matrixVV_with_row_sums = np.append(matrixVV, np.atleast_2d(row_sums).T, axis=1) # add to MatrixVV
    col_sums = np.append(col_sums, np.sum(row_sums))
    col_sums_2d = np.reshape(col_sums, (1, col_sums.shape[0]))
    matrixVV_with_row_sums = np.append(matrixVV_with_row_sums, col_sums_2d, axis=0)


    # print the result
    # print(matrixVV_with_row_sums)

    return matrixVV_with_row_sums, matrix_pmi, words


In [25]:
x, y, words = get_ppmi(preprocessed_df)

calulate pmi for each word in vocab

p(t1,t2)/(p(t1)*p(t2))

In [26]:
for i in range(0,x.shape[0]-1):
    for j in range(0,y.shape[1]-1):
        res = x[i][j]/(x[i][-1]*x[-1][j])
        if res !=0:
            pmi = np.log(res)
            #print(pmi)
            if pmi>0:
                y[i][j] = pmi
            else:
                y[i][j] = 0
        else:
            y[i][j] = 0
    
ppmi_df = pd.DataFrame(y, columns=words, index=words)
ppmi_df

Unnamed: 0,تضمین,ساخت,کلاخمیربود,صداقتتون,خورمش,بیکران,نارنجک,کارکنانش,شدت,گلابی,...,چهارتادونه,ممنونممم,دونفرسیرمیکنه,خشکه,بیکن,ناکافی,زغال,انجا,پرانتز,قیمتا
تضمین,10.921983,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
ساخت,0.000000,7.512487,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
کلاخمیربود,0.000000,0.000000,8.149394,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
صداقتتون,0.000000,0.000000,0.000000,8.416457,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
خورمش,0.000000,0.000000,0.000000,0.000000,8.028145,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ناکافی,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,7.338464,0.000000,0.000000,0.000000,0.000000
زغال,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,8.971618,0.000000,0.000000,0.000000
انجا,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,8.724758,0.000000,0.000000
پرانتز,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,3.937845,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,8.898781,0.000000


create a vector for each vocab for each comment by mean on vectors of words of a sentence.

In [27]:
def get_pmi(df, ppmi_df):
    
    result = pd.DataFrame()
    
    for index, sen in df.iterrows():
        text = sen['comment']
    
        tokens = hazm.word_tokenize(text)
        n = len(tokens)
        l = []
        for word in tokens:
            xx = ppmi_df[str(word)]
            xx = xx.values
            l.append(xx)
        
        l = sum(l)/n
        l = l.reshape(1, -1)
        ll = pd.DataFrame(l, columns=words)
        ll["label_id"] = sen['label_id']
        result = pd.concat([result,ll], axis=0)
    return result

In [28]:
# ! pip install hazm

In [29]:
import hazm

df_ppmi = get_pmi(preprocessed_df, ppmi_df)

In [30]:
df_ppmi
sum(df_ppmi.iloc[13])
y = df_ppmi.label_id
X = df_ppmi.drop('label_id', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

Naive Bayes Classifier

In [31]:
clf = GaussianNB()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.61      0.72       359
           1       0.68      0.91      0.78       331

    accuracy                           0.76       690
   macro avg       0.78      0.76      0.75       690
weighted avg       0.79      0.76      0.75       690

