In [1]:
from __future__ import unicode_literals
from hazm import *
import pandas as pd
import numpy as np
import collections

In [2]:
train_data = pd.read_csv ('comment_train.csv')
test_data = pd.read_csv ('comment_test.csv')

In [3]:
recommend_train = train_data[(train_data['recommend'] == 'recommended')]
recommend_count = recommend_train.count()['title']
not_recommend_train = train_data[(train_data['recommend'] == 'not_recommended')]
not_recommend_count = not_recommend_train.count()['title']

normalizer = Normalizer()
stemmer = Stemmer()
lemmatizer = Lemmatizer()

def normalize_str(str):
    data = []
    normalizer = Normalizer()
    data = word_tokenize(normalizer.normalize(str))

    for i in range(len(data)):
        data[i] = lemmatizer.lemmatize(stemmer.stem(data[i]))
    
    return data

In [4]:
def calc_freq(data):
    count = dict(collections.Counter(data))
    return count    

def normalize_data(flag):    
    rec_words = []
    not_rec_words = []

    for index,row in recommend_train.iterrows():
        if flag == 1: rec_words += normalize_str(row['title'] + ' ' + row['comment'])
        else: rec_words += word_tokenize(row['title'] + ' ' + row['comment'])
        
    for index,row in not_recommend_train.iterrows():
        if flag == 1: not_rec_words += normalize_str(row['title'] + ' ' + row['comment'])
        else: not_rec_words += word_tokenize(row['title'] + ' ' + row['comment'])
        
    rec_data = calc_freq(rec_words)
    not_rec_data = calc_freq(not_rec_words)

    test = test_data.copy()

    for index,row in test.iterrows():
        if flag == 1: temp = normalize_str(row['title'] + ' ' + row['comment'])
        else: temp = word_tokenize(row['title'] + ' ' + row['comment'])
        for i in temp:
            if i not in rec_data: rec_data[i] = 0
            if i not in not_rec_data: not_rec_data[i] = 0
                
    return rec_data,not_rec_data,test


In [5]:
def additive_smoothing(rec_data,not_rec_data):
    rec = {}
    not_rec = {}
    for i in rec_data:
        rec[i] = rec_data[i] + 1
    for i in not_rec_data:
        not_rec[i] = not_rec_data[i] + 1
    return rec, not_rec

In [6]:
def sum_dict(dic):
    sum = 0
    for i in dic: 
        sum = sum + dic[i]
    return sum

def rec_or_not(test_,rec_data,not_rec_data,flag):
    rec_words_count = sum_dict(rec_data)
    not_rec_words_count = sum_dict(not_rec_data)
    for index,row in test_.iterrows():
        rec_prob = recommend_count/(recommend_count+not_recommend_count)
        not_rec_prob = not_recommend_count/(recommend_count+not_recommend_count)
        if flag == 1: temp = normalize_str(row['title'] + ' ' + row['comment'])
        else: temp = word_tokenize(row['title'] + ' ' + row['comment'])
        for i in temp:
            rec_prob *= (rec_data[i]/rec_words_count)
            not_rec_prob *= (not_rec_data[i]/not_rec_words_count)
            if rec_prob >= not_rec_prob: row['recommend'] = 'recommended'
            else: row['recommend'] = 'not_recommended'
    return test_

In [7]:
def accuracy(test):
    wrong = []
    right = 0
    for i in range(test.count()['title']):
        if test['recommend'][i] == test_data['recommend'][i] : right += 1
        else: wrong.append((test['title'][i],test['comment'][i],test['recommend'][i]))
    return right/test.count()['title'],wrong

def precision(test):
    right = 0
    for i in range(test.count()['title']):
        if test['recommend'][i] == test_data['recommend'][i] and test['recommend'][i] == 'recommended': right += 1
    return right/(test[(test['recommend'] == 'recommended')].count()['title'])

def recall(test):
    right = 0
    for i in range(test.count()['title']):
        if test['recommend'][i] == test_data['recommend'][i] and test['recommend'][i] == 'recommended': right += 1
    return right/(test_data[(test_data['recommend'] == 'recommended')].count()['title'])

def f1(pre,rec):
    return (2*pre*rec)/(pre+rec)

In [8]:
#pre process and additive smoothing
rec_data_1,not_rec_data_1,test_1 = normalize_data(1)

both_rec, both_not_rec = additive_smoothing(rec_data_1,not_rec_data_1)

filled_test_both = rec_or_not(test_1,both_rec,both_not_rec,1)

acc_1,wrong = accuracy(filled_test_both)
prec_1 = precision(filled_test_both)
rec_1 = recall(filled_test_both)
f1_1 = f1(prec_1,rec_1)

print("pre process and additive smoothing:")
print ("accuracy : ",acc_1)
print ("precision: ",prec_1)
print ("recall : ",rec_1)
print ("F1 : ",f1_1)

pre process and additive smoothing:
accuracy :  0.9175
precision:  0.8938679245283019
recall :  0.9475
F1 :  0.9199029126213593


In [9]:
#additive smoothing
rec_data_2,not_rec_data_2,test_2 = normalize_data(0)

additive_rec, additive_not_rec = additive_smoothing(rec_data_2,not_rec_data_2)

filled_test_additive = rec_or_not(test_2,additive_rec,additive_not_rec,0)

acc_2,r = accuracy(filled_test_additive)
prec_2 = precision(filled_test_additive)
rec_2 = recall(filled_test_additive)
f1_2 = f1(prec_2,rec_2)

print("additive smoothing:")
print ("accuracy : ",acc_2)
print ("precision: ",prec_2)
print ("recall : ",rec_2)
print ("F1 : ",f1_2)

additive smoothing:
accuracy :  0.91375
precision:  0.8857808857808858
recall :  0.95
F1 :  0.916767189384801


In [10]:
#pre process
rec_data_3,not_rec_data_3,test_3 = normalize_data(1)

filled_test_pre = rec_or_not(test_3,rec_data_3,not_rec_data_3,1)

acc_3,r = accuracy(filled_test_pre)
prec_3 = precision(filled_test_pre)
rec_3 = recall(filled_test_pre)
f1_3 = f1(prec_3,rec_3)

print("pre process:")
print ("accuracy : ",acc_3)
print ("precision: ",prec_3)
print ("recall : ",rec_3)
print ("F1 : ",f1_3)

pre process:
accuracy :  0.86
precision:  0.7950819672131147
recall :  0.97
F1 :  0.8738738738738738


In [11]:
#none
rec_data_4,not_rec_data_4,test_4 = normalize_data(0)

filled_test_none = rec_or_not(test_4,rec_data_4,not_rec_data_4,0)

acc_4,r = accuracy(filled_test_none)
prec_4 = precision(filled_test_none)
rec_4 = recall(filled_test_none)
f1_4 = f1(prec_4,rec_4)

print("none:")
print ("accuracy : ",acc_4)
print ("precision: ",prec_4)
print ("recall : ",rec_4)
print ("F1 : ",f1_4)

none:
accuracy :  0.85625
precision:  0.7844311377245509
recall :  0.9825
F1 :  0.872364039955605


In [12]:
df = pd.DataFrame(wrong, columns =['title', 'comment', 'recommend']) 
df

Unnamed: 0,title,comment,recommend
0,وری گود,تازه خریدم یه مدت کار بکنه مشخص میشه کیفیت قطعاتش,not_recommended
1,دستگاه خیلی ضعیف,من این فیس براس چند روز یپش به دستم رسید و الا...,recommended
2,خوب ولی کارایی محدود,مدل 46MM به دست شما نخواهد رسید و به جای آن مد...,recommended
3,نقد پس از خرید,سلام ، راحت شدم از کابل شارژ ، توصیه میشود به ...,not_recommended
4,نقد منصفانه,من تو تخفیف ویژه 5 تا خریدم و همشون رو هم تست ...,recommended
...,...,...,...
61,عدم بسته بندی و تحویل مناسب,خیلی نردبان خوبیه خیلی بدرد بخوره تنها نکته من...,not_recommended
62,ظاهر خراب,شکل و ظاهر محصول که خیلی خط و خش داشت و پایینش...,recommended
63,پیشنهاد نمیدم,من دوسه ماهی هست این کفشدازردیجی گرفتم متاسفان...,not_recommended
64,SanDisk,سلام وقتتون بخیر\r\nاولش که من این USB رو خرید...,recommended
