In [9]:
import pandas as pd
import numpy as np
import string
import regex as re
import random
import nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
import spacy
nlp = spacy.load('en_core_web_sm')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [3]:
# Load the review table (it carries the `english_comment` column used below)
# and the filtered sales table (it carries `price` and the English category name).
data_reviews, data = (
    pd.read_csv(csv_path)
    for csv_path in ("English_Reviews.csv", "./Dataset/Filtered_data.csv")
)

In [4]:
# Row counts per English category name; value_counts() is already sorted
# descending, so the first ten entries are the ten largest categories.
data['product_category_name_english'].value_counts().iloc[:10]

bed_bath_table           11087
health_beauty             9641
sports_leisure            8624
furniture_decor           8315
computers_accessories     7801
housewares                6945
watches_gifts             5968
telephony                 4517
garden_tools              4336
auto                      4226
Name: product_category_name_english, dtype: int64

In [6]:
# Keep only rows whose comment is not the literal 'emp' (presumably the
# placeholder for an empty comment — confirm), then renumber the index 0..n-1.
data_reviews = (
    data_reviews
    .loc[data_reviews['english_comment'].ne('emp')]
    .reset_index(drop=True)
)

In [7]:
# Number of reviews left after the filter above.
data_reviews.shape[0]

41739

In [10]:
# Build the `clean_comment` column: lower-cased, digit- and punctuation-free,
# stop-word-free, lemmatised text for every review.
#
# NOTE(review): NLTK's English stop list contains negations such as 'not' and
# 'no', so they are removed before the VADER scoring further down — confirm
# this is intended.

# Loop-invariant helpers, built once instead of once per review.
_strip_table = str.maketrans('', '', string.digits + string.punctuation)
_stop_word_set = set(stop_words)  # set membership instead of scanning the list per word


def _clean_comment(text):
    """Return the normalised, lemmatised form of one review comment.

    Steps: drop U+FFFD replacement characters, lower-case, split on
    whitespace, delete digits and ASCII punctuation from every word, drop
    stop words, then lemmatise. Each remaining word is passed to spaCy on its
    own (no sentence context); words emptied by the stripping step yield no
    tokens and disappear.
    """
    raw_words = text.replace('�', '').lower().split()
    stripped = [word.translate(_strip_table) for word in raw_words]
    kept = [word for word in stripped if word not in _stop_word_set]
    return ' '.join(token.lemma_ for word in kept for token in nlp(word))


# Map over the column itself: the previous loop read `.values[i]` with the
# index *label* as a position, which is only correct on a freshly reset index.
data_reviews['clean_comment'] = data_reviews['english_comment'].map(_clean_comment)

In [12]:
# Score every cleaned comment with VADER and derive a binary label from it.
sid = SentimentIntensityAnalyzer()

# Full VADER result dict ('neg', 'neu', 'pos', 'compound') per review.
data_reviews['scores'] = data_reviews['clean_comment'].map(sid.polarity_scores)

# Pull the single 'compound' value out of each dict.
data_reviews['compound'] = data_reviews['scores'].map(lambda scores: scores['compound'])

# Non-negative compound -> 'pos' (so a compound of exactly 0 counts as 'pos'),
# anything else -> 'neg'.
data_reviews['comp_score'] = np.where(data_reviews['compound'] >= 0, 'pos', 'neg')

data_reviews.head(2)

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,english_comment,clean_comment,scores,compound,comp_score
0,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,21/04/2017 00:00,21/04/2017 22:02,I received well before the stipulated deadline.,receive well stipulate deadline,"{'neg': 0.0, 'neu': 0.588, 'pos': 0.412, 'comp...",0.2732,pos
1,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parab�����������������������������������������...,01/03/2018 00:00,02/03/2018 10:26,Congratulations lannister shops I loved to buy...,congratulation lannister shop love buy interne...,"{'neg': 0.0, 'neu': 0.353, 'pos': 0.647, 'comp...",0.9001,pos


In [13]:
# Order items, product attributes, and the category-name translation table.
data_order, data_cat_name, data_cat_name_english = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./Dataset/olist_order_items_dataset.csv",
        "./Dataset/olist_products_dataset.csv",
        "./Dataset/product_category_name_translation.csv",
    )
)

In [14]:
# Chain of inner joins: order items -> reviews (by order) -> products (by
# product) -> English category name (by original category name).
data_orderid = data_order.merge(data_reviews, on='order_id')
data_order_prodid = data_orderid.merge(data_cat_name, on='product_id')
data_prodcat_review = data_order_prodid.merge(data_cat_name_english, on='product_category_name')
data_prodcat_review.head(2)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,review_id,review_score,review_comment_title,...,comp_score,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name_english
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29,97ca439bc427b48bc1cd7177abe71365,5,,...,pos,cool_stuff,58.0,598.0,4.0,650.0,28.0,9.0,14.0,cool_stuff
1,130898c0987d1801452a8ed92a670612,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-07-05 02:44:11,55.9,17.96,b11cba360bbe71410c291b764753d37f,5,,...,pos,cool_stuff,58.0,598.0,4.0,650.0,28.0,9.0,14.0,cool_stuff


In [15]:
# Rows labelled 'neg' by the compound-score rule, and how many there are.
neg_reviews = data_prodcat_review.loc[data_prodcat_review['comp_score'].eq('neg')]
neg_reviews.shape[0]

4453

In [16]:
# Share (in percent) of the summed `price` taken by each of the ten
# categories with the largest summed `price`.
total_sales = data['price'].sum()
costliest = (
    data
    .groupby('product_category_name_english')['price']
    .sum()
    .sort_values(ascending=False)
    .iloc[:10]
)
costliest.div(total_sales).mul(100)

product_category_name_english
health_beauty            9.412407
watches_gifts            8.986177
bed_bath_table           7.756310
sports_leisure           7.396061
computers_accessories    6.793716
furniture_decor          5.457622
housewares               4.725536
cool_stuff               4.711356
auto                     4.432905
garden_tools             3.617572
Name: price, dtype: float64

In [17]:
# Bag-of-words vectoriser that the per-category cells below re-fit each time:
# English stop words are dropped, and terms found in fewer than 2 documents or
# in more than 95% of documents are ignored.
cv = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

## Category bed_bath_table

In [18]:
# Fit a 10-topic LDA model on the negative bed_bath_table reviews and print
# the ten highest-weighted words of every topic.
dtm = cv.fit_transform(neg_reviews[neg_reviews['product_category_name_english'] == 'bed_bath_table']['clean_comment'])
LDA = LatentDirichletAllocation(n_components=10,random_state=42)
LDA.fit(dtm)
# Fetch the vocabulary once. The getter used to be called for every printed
# word, and `get_feature_names()` was removed in scikit-learn 1.2; fall back
# to it only on releases that predate `get_feature_names_out()`.
feature_names = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 10 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last ten positions hold the largest weights.
    print([feature_names[i] for i in topic.argsort()[-10:]])
    print('\n')

THE TOP 10 WORDS FOR TOPIC #0
['pay', 'want', 'wait', 'store', 'order', 'buy', 'curtain', 'product', 'miss', 'receive']


THE TOP 10 WORDS FOR TOPIC #1
['wrong', 'ask', 'color', 'size', 'bad', 'baratheon', 'material', 'low', 'quality', 'product']


THE TOP 10 WORDS FOR TOPIC #2
['success', 'problem', 'say', 'request', 'deadline', 'mail', 'time', 'deliver', 'delivery', 'delay']


THE TOP 10 WORDS FOR TOPIC #3
['send', 'buy', 'disappointed', 'come', 'product', 'problem', 'different', 'miss', 'time', 'purchase']


THE TOP 10 WORDS FOR TOPIC #4
['lack', 'cloth', 'glass', 'receive', 'come', 'unit', 'buy', 'want', 'product', 'pay']


THE TOP 10 WORDS FOR TOPIC #5
['problem', 'like', 'delivery', 'office', 'receive', 'post', 'product', 'freight', 'pay', 'buy']


THE TOP 10 WORDS FOR TOPIC #6
['fabric', 'bed', 'len', 'quilt', 'come', 'receive', 'game', 'buy', 'product', 'miss']


THE TOP 10 WORDS FOR TOPIC #7
['miss', 'come', 'disappointed', 'separate', 'gray', 'send', 'merchandise', 'color', '

In [43]:
cat = 'bed_bath_table'
# Complaint vocabulary checked against this category's negative reviews
# (apparently picked from the topic words printed above).
# Fixed: 'dissapointed' was misspelled and could never match the lemma
# 'disappointed'; 'deliver' was listed twice.
negative_words = ['product', 'deliver', 'wait', 'buy', 'miss', 'wrong', 'bad', 'low', 'problem', 'request', 'deadline', 'time',
        'delay', 'disappointed', 'send', 'purchase', 'lack', 'want', 'separate']
def issues(cat, *w):
    """Count, per pattern, the negative reviews of one category that match it.

    Parameters
    ----------
    cat : str
        Value compared against `product_category_name_english` in the global
        `neg_reviews` frame.
    *w : str
        Patterns passed to `Series.str.match` (regular expressions).

    Returns
    -------
    dict
        Maps each pattern to: (reviews matching it) minus, for every pattern
        listed after it, (reviews matching both). When a pattern is repeated,
        the value of its last occurrence wins.

    NOTE(review): `str.match` anchors at the start of the string, so only
    reviews whose cleaned comment *begins* with the pattern are counted. If
    "mentions the word anywhere" was intended, `str.contains` is the call to
    use — confirm before changing, since it alters every count.
    """
    neg_cat = neg_reviews[neg_reviews['product_category_name_english'] == cat]
    comments = neg_cat['clean_comment']
    # One boolean mask per pattern, computed once; previously each mask was
    # re-evaluated for every (i, j) pair. `== True` maps missing comments to False.
    masks = [comments.str.match(pattern) == True for pattern in w]
    d = {}
    for i, pattern in enumerate(w):
        t = int(masks[i].sum())
        # Subtract the overlap with every pattern that comes later in the list.
        for later_mask in masks[i + 1:]:
            t -= int((masks[i] & later_mask).sum())
        d[pattern] = t
    return d

def filter_words(dic, min_count=10):
    """Return the keys of `dic` whose count exceeds `min_count`, largest first.

    Parameters
    ----------
    dic : dict
        Maps a word to its count (as produced by `issues`).
    min_count : int, optional
        Exclusive lower bound on the count; defaults to 10, the threshold
        that was previously hard-coded.

    Returns
    -------
    list
        Keys ordered by descending count; keys with equal counts keep their
        order from `dic` (the sort is stable).
    """
    ranked = sorted(dic.items(), key=lambda item: item[1], reverse=True)
    return [word for word, count in ranked if count > min_count]

def output(cat, *words):
    """Write one CSV per pattern with the matching negative reviews of a category.

    Parameters
    ----------
    cat : str
        Value compared against `product_category_name_english` in the global
        `neg_reviews` frame; also used in the file name.
    *words : str
        Patterns passed to `Series.str.match` (anchored at the start of the
        cleaned comment); each one also becomes part of its file name.

    Files are written to './Result/<cat>_<word>.csv'. The directory is
    created when missing, so the call no longer fails on a fresh checkout.
    """
    import os  # function-scope import keeps this cell self-contained

    os.makedirs('./Result/', exist_ok=True)
    # The category filter does not depend on the word, so apply it once.
    a = neg_reviews[neg_reviews['product_category_name_english'] == cat]
    for word in words:
        b = a[a['clean_comment'].str.match(word)==True]
        b.to_csv('./Result/' + cat + '_' + word + '.csv')

# Count start-of-comment matches for every entry of `negative_words`, keep the
# words with more than 10 matches, and export one CSV per kept word.
dic = issues(cat, *negative_words) 
words = filter_words(dic)
output(cat,*words)

## Category furniture_decor

In [44]:
# Fit a 10-topic LDA model on the negative furniture_decor reviews and print
# the ten highest-weighted words of every topic.
dtm1 = cv.fit_transform(neg_reviews[neg_reviews['product_category_name_english'] == 'furniture_decor']['clean_comment'])
LDA1 = LatentDirichletAllocation(n_components=10,random_state=42)
LDA1.fit(dtm1)
# Fetch the vocabulary once. The getter used to be called for every printed
# word, and `get_feature_names()` was removed in scikit-learn 1.2; fall back
# to it only on releases that predate `get_feature_names_out()`.
feature_names = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
for index,topic in enumerate(LDA1.components_):
    print(f'THE TOP 10 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last ten positions hold the largest weights.
    print([feature_names[i] for i in topic.argsort()[-10:]])
    print('\n')

THE TOP 10 WORDS FOR TOPIC #0
['date', 'defective', 'address', 'answer', 'wait', 'correct', 'come', 'upset', 'deliver', 'product']


THE TOP 10 WORDS FOR TOPIC #1
['door', 'complain', 'buy', 'chandelier', 'receive', 'problem', 'come', 'site', 'photo', 'product']


THE TOP 10 WORDS FOR TOPIC #2
['ask', 'answer', 'want', 'wrong', 'store', 'buy', 'receive', 'miss', 'come', 'product']


THE TOP 10 WORDS FOR TOPIC #3
['unfortunately', 'luminary', 'disappointed', 'complain', 'receive', 'crumple', 'purchase', 'buy', 'delay', 'come']


THE TOP 10 WORDS FOR TOPIC #4
['pay', 'receive', 'open', 'come', 'product', 'arrive', 'complaint', 'buy', 'curtain', 'miss']


THE TOP 10 WORDS FOR TOPIC #5
['come', 'merchandise', 'miss', 'purchase', 'unit', 'freight', 'product', 'receive', 'pay', 'buy']


THE TOP 10 WORDS FOR TOPIC #6
['purchase', 'know', 'touch', 'make', 'pay', 'deliver', 'difficult', 'delay', 'store', 'product']


THE TOP 10 WORDS FOR TOPIC #7
['send', 'color', 'invoice', 'possible', 'qualit

In [45]:
cat = 'furniture_decor'
# Complaint vocabulary checked against this category's negative reviews
# (apparently picked from the topic words printed above).
# Fixed: 'arrive' was listed twice.
negative_words = ['crumple', 'complain', 'buy', 'defective', 'unfortunately', 'deliver', 'product', 'pay', 'problem', 'manage', 'miss', 
         'send', 'pendant', 'receive', 'request', 'delivery', 'want', 'wait', 'time', 'arrive', 'break', 
         'delay', 'complaint', 'cancel', 'disappointed', 'difficult', 'different', 'low', 'bad', 'wrong', 'date']

# Count start-of-comment matches for every entry of `negative_words`, keep the
# words with more than 10 matches, and export one CSV per kept word.
dic = issues(cat, *negative_words)
words = filter_words(dic)
output(cat,*words)

## Category computers_accessories

In [46]:
# Fit a 10-topic LDA model on the negative computers_accessories reviews and
# print the ten highest-weighted words of every topic.
dtm2 = cv.fit_transform(neg_reviews[neg_reviews['product_category_name_english'] == 'computers_accessories']['clean_comment'])
LDA2 = LatentDirichletAllocation(n_components=10,random_state=42)
LDA2.fit(dtm2)
# Fetch the vocabulary once. The getter used to be called for every printed
# word, and `get_feature_names()` was removed in scikit-learn 1.2; fall back
# to it only on releases that predate `get_feature_names_out()`.
feature_names = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
for index,topic in enumerate(LDA2.components_):
    print(f'THE TOP 10 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last ten positions hold the largest weights.
    print([feature_names[i] for i in topic.argsort()[-10:]])
    print('\n')

THE TOP 10 WORDS FOR TOPIC #0
['delivery', 'miss', 'lannister', 'cancel', 'paint', 'deliver', 'send', 'damage', 'come', 'product']


THE TOP 10 WORDS FOR TOPIC #1
['arrive', 'xl', 'receive', 'leave', 'cartridge', 'mail', 'hp', 'buy', 'disappointed', 'printer']


THE TOP 10 WORDS FOR TOPIC #2
['despite', 'cable', 'correct', 'recommend', 'supplier', 'defect', 'come', 'cancel', 'product', 'purchase']


THE TOP 10 WORDS FOR TOPIC #3
['post', 'want', 'complain', 'receive', 'delivery', 'pay', 'toner', 'day', 'product', 'problem']


THE TOP 10 WORDS FOR TOPIC #4
['fake', 'buy', 'falsify', 'delivery', 'request', 'deliver', 'cartridge', 'miss', 'come', 'product']


THE TOP 10 WORDS FOR TOPIC #5
['different', 'email', 'store', 'arrive', 'wait', 'problem', 'send', 'product', 'delivery', 'delay']


THE TOP 10 WORDS FOR TOPIC #6
['black', 'buy', 'product', 'request', 'deliver', 'color', 'cartridge', 'wrong', 'xl', 'come']


THE TOP 10 WORDS FOR TOPIC #7
['ask', 'purchase', 'wrong', 'original', 'pay

In [47]:
cat = 'computers_accessories'
# Complaint vocabulary for this category. It must be bound to `negative_words`
# because that is the name the `issues(...)` call reads; it was previously
# assigned to `words`, so the list from the preceding category was analysed
# instead. The duplicate 'request' entry is also removed.
negative_words = ['arrive', 'wrong', 'damage', 'deliver', 'product', 'request', 'disappointed', 'miss', 'cancel', 'delivery', 'delay', 
         'waste', 'remove', 'time', 'post', 'different', 'low', 'pay', 'faulty', 'want', 'defect', 'defective', 'bad', 
         'ill']
# Count start-of-comment matches for every entry of `negative_words`, keep the
# words with more than 10 matches, and export one CSV per kept word.
dic = issues(cat, *negative_words)
words = filter_words(dic)
output(cat,*words)

## Category health_beauty

In [48]:
# Fit a 10-topic LDA model on the negative health_beauty reviews and print
# the ten highest-weighted words of every topic.
dtm3 = cv.fit_transform(neg_reviews[neg_reviews['product_category_name_english'] == 'health_beauty']['clean_comment'])
LDA3 = LatentDirichletAllocation(n_components=10,random_state=42)
LDA3.fit(dtm3)
# Fetch the vocabulary once. The getter used to be called for every printed
# word, and `get_feature_names()` was removed in scikit-learn 1.2; fall back
# to it only on releases that predate `get_feature_names_out()`.
feature_names = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
for index,topic in enumerate(LDA3.components_):
    print(f'THE TOP 10 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last ten positions hold the largest weights.
    print([feature_names[i] for i in topic.argsort()[-10:]])
    print('\n')

THE TOP 10 WORDS FOR TOPIC #0
['kit', 'expensive', 'miss', 'make', 'purchase', 'product', 'pay', 'request', 'disappointed', 'receive']


THE TOP 10 WORDS FOR TOPIC #1
['brush', 'want', 'company', 'unfortunately', 'invoice', 'send', 'arrive', 'use', 'box', 'bad']


THE TOP 10 WORDS FOR TOPIC #2
['arrive', 'little', 'ml', 'receive', 'miss', 'color', 'kit', 'unit', 'product', 'buy']


THE TOP 10 WORDS FOR TOPIC #3
['end', 'receive', 'device', 'work', 'difficult', 'buy', 'unfortunately', 'product', 'miss', 'scissor']


THE TOP 10 WORDS FOR TOPIC #4
['loose', 'vicete', 'lack', 'miss', 'scissor', 'come', 'complain', 'quality', 'deliver', 'product']


THE TOP 10 WORDS FOR TOPIC #5
['color', 'buy', 'deliver', 'send', 'miss', 'site', 'receive', 'wrong', 'come', 'product']


THE TOP 10 WORDS FOR TOPIC #6
['package', 'deliver', 'freight', 'store', 'lannister', 'miss', 'receive', 'pay', 'buy', 'product']


THE TOP 10 WORDS FOR TOPIC #7
['arrive', 'pay', 'purchase', 'abusive', 'value', 'try', 'long

In [49]:
cat = 'health_beauty'
# Complaint vocabulary for this category. It must be bound to `negative_words`
# because that is the name the `issues(...)` call reads; it was previously
# assigned to `words`, so a list from an earlier category was analysed
# instead. The typo 'arrrive' is corrected to 'arrive'.
# NOTE(review): 'prica' also looks like a typo (possibly 'price') — confirm.
negative_words = ['wait', 'deliver', 'receive', 'wrong', 'product', 'leak', 'arrive', 'bad', 'delivery', 'delay', 'prica', 'miss', 'defect', 
         'deadline', 'time', 'defective', 'different', 'abusive', 'try', 'long', 'cancel', 'unfortunately']
# Count start-of-comment matches for every entry of `negative_words`, keep the
# words with more than 10 matches, and export one CSV per kept word.
dic = issues(cat, *negative_words)
words = filter_words(dic)
output(cat,*words)

## Category sports_leisure

In [50]:
# Fit a 10-topic LDA model on the negative sports_leisure reviews and print
# the ten highest-weighted words of every topic.
dtm4 = cv.fit_transform(neg_reviews[neg_reviews['product_category_name_english'] == 'sports_leisure']['clean_comment'])
LDA4 = LatentDirichletAllocation(n_components=10,random_state=42)
LDA4.fit(dtm4)
# Fetch the vocabulary once. The getter used to be called for every printed
# word, and `get_feature_names()` was removed in scikit-learn 1.2; fall back
# to it only on releases that predate `get_feature_names_out()`.
feature_names = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
for index,topic in enumerate(LDA4.components_):
    print(f'THE TOP 10 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last ten positions hold the largest weights.
    print([feature_names[i] for i in topic.argsort()[-10:]])
    print('\n')

THE TOP 10 WORDS FOR TOPIC #0
['jump', 'think', 'buy', 'dissatisfied', 'kit', 'freight', 'receive', 'product', 'miss', 'pay']


THE TOP 10 WORDS FOR TOPIC #1
['inside', 'unfortunately', 'come', 'box', 'tube', 'dot', 'polka', 'miss', 'product', 'lack']


THE TOP 10 WORDS FOR TOPIC #2
['expectation', 'stark', 'make', 'delay', 'commitment', 'targaryen', 'ill', 'horrible', 'lot', 'seller']


THE TOP 10 WORDS FOR TOPIC #3
['miss', 'need', 'error', 'send', 'product', 'want', 'buy', 'request', 'disappointed', 'cancel']


THE TOP 10 WORDS FOR TOPIC #4
['fact', 'deliver', 'buy', 'know', 'delivery', 'cancel', 'make', 'miss', 'say', 'product']


THE TOP 10 WORDS FOR TOPIC #5
['request', 'solve', 'purchase', 'buy', 'envelope', 'penalty', 'deadline', 'deliver', 'product', 'problem']


THE TOP 10 WORDS FOR TOPIC #6
['buy', 'come', 'complain', 'day', 'office', 'post', 'arrive', 'product', 'delay', 'delivery']


THE TOP 10 WORDS FOR TOPIC #7
['slam', 'kg', 'medicine', 'request', 'buy', 'send', 'produc

In [51]:
cat = 'sports_leisure'
# Complaint vocabulary for this category. It must be bound to `negative_words`
# because that is the name the `issues(...)` call reads; it was previously
# assigned to `words`, so a list from an earlier category was analysed instead.
negative_words = ['dissatisfied', 'miss', 'product', 'complain', 'delay', 'expect', 'deliver', 'confess', 'disappoint', 'error', 
         'disappointed', 'cancel', 'penalty', 'delivery', 'wrong', 'post', 'defect', 'problem', 'low', 'lack', 'rage']
# Count start-of-comment matches for every entry of `negative_words`, keep the
# words with more than 10 matches, and export one CSV per kept word.
dic = issues(cat, *negative_words)
words = filter_words(dic)
output(cat,*words)

## Category watches_gifts

In [52]:
# Fit a 10-topic LDA model on the negative watches_gifts reviews and print
# the ten highest-weighted words of every topic.
dtm5 = cv.fit_transform(neg_reviews[neg_reviews['product_category_name_english'] == 'watches_gifts']['clean_comment'])
LDA5 = LatentDirichletAllocation(n_components=10,random_state=42)
LDA5.fit(dtm5)
# Fetch the vocabulary once. The getter used to be called for every printed
# word, and `get_feature_names()` was removed in scikit-learn 1.2; fall back
# to it only on releases that predate `get_feature_names_out()`.
feature_names = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
for index,topic in enumerate(LDA5.components_):
    print(f'THE TOP 10 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last ten positions hold the largest weights.
    print([feature_names[i] for i in topic.argsort()[-10:]])
    print('\n')

THE TOP 10 WORDS FOR TOPIC #0
['request', 'office', 'post', 'stark', 'dissatisfied', 'wait', 'deadline', 'disappointed', 'product', 'delivery']


THE TOP 10 WORDS FOR TOPIC #1
['buy', 'post', 'office', 'watch', 'send', 'miss', 'pay', 'defect', 'come', 'product']


THE TOP 10 WORDS FOR TOPIC #2
['recommend', 'term', 'negative', 'receive', 'buy', 'merchandise', 'unfortunately', 'delivery', 'problem', 'delay']


THE TOP 10 WORDS FOR TOPIC #3
['bracelet', 'cancellation', 'ask', 'speak', 'email', 'mistake', 'lack', 'delivery', 'send', 'pay']


THE TOP 10 WORDS FOR TOPIC #4
['miss', 'appear', 'little', 'request', 'email', 'clock', 'watch', 'buy', 'deliver', 'product']


THE TOP 10 WORDS FOR TOPIC #5
['wrong', 'letter', 'come', 'deliver', 'request', 'defective', 'warn', 'mail', 'receive', 'product']


THE TOP 10 WORDS FOR TOPIC #6
['post', 'office', 'come', 'ill', 'unit', 'happen', 'need', 'know', 'miss', 'buy']


THE TOP 10 WORDS FOR TOPIC #7
['request', 'pay', 'ask', 'return', 'buy', 'arriv

In [53]:
cat = 'watches_gifts'
# Complaint vocabulary for this category. It must be bound to `negative_words`
# because that is the name the `issues(...)` call reads; it was previously
# assigned to `words`, so a list from an earlier category was analysed instead.
negative_words = ['deadline', 'wait', 'disappointed', 'post', 'miss', 'cancel', 'delivery', 'product', 'low', 'change', 'wrong', 
         'defective', 'unfortunately', 'deliver', 'problem', 'warn', 'request', 'delay', 'exchange', 'defect', 'time', 'return',
         'bad', 'badly', 'dissatisfied']
# Count start-of-comment matches for every entry of `negative_words`, keep the
# words with more than 10 matches, and export one CSV per kept word.
dic = issues(cat, *negative_words)
words = filter_words(dic)
output(cat,*words)