In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import nltk
from nltk.corpus import stopwords
from textblob import Word, TextBlob
import seaborn as sns
from wordcloud import WordCloud
import glob

# gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS

# spacy
import spacy
from nltk.corpus import stopwords

# viz
import pyLDAvis
import pyLDAvis.gensim_models

In [2]:
# add stop words
# nltk.download('stopwords')
# nltk.download('wordnet')
# stop_words = stopwords.words('english')
# stop_words.append('be')
# stop_words.append('do')
# stop_words.append('very')
# stop_words.append('have')

In [3]:
# using these parts of speech result in the best results
def lemmatization(texts, allowed_postags=["NOUN","ADJ","VERB","ADV"]):
    nlp = spacy.load("en_core_web_sm", disable=["parser","ner"])
    texts_out = []
    for text in texts:
        doc = nlp(text)
        new_text = []
        for token in doc:
            if token.pos_ in allowed_postags:
                new_text.append(token.lemma_)
        final = " ". join(new_text)
        texts_out.append(final)
    return texts_out

In [4]:
trans_reviews = pd.read_csv('olist_order_reviews_dataset_with_translation.csv',error_bad_lines=False)



  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
trans_reviews = trans_reviews[trans_reviews['review_comment_message_english'].notna()]
trans_reviews

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,review_comment_title_english,review_comment_message_english
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06,,I received well before the stipulated deadline.
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53,,Congratulations LANNISTER stores I loved to bu...
9,8670d52e15e00043ae7de4c01cc2fe06,b9bf720beb4ab3728760088589c62129,4,recomendo,aparelho eficiente. no site a marca do aparelh...,2018-05-22 00:00:00,2018-05-23 16:45:47,I recommend,efficient device. on the site the mark of the ...
12,4b49719c8a200003f700d3d986ea1a19,9d6f15f95d01e79bd1349cc208361f09,4,,"Mas um pouco ,travando...pelo valor ta Boa.\r\n",2018-02-16 00:00:00,2018-02-20 10:52:22,,"But a little, catching ... for the value is go..."
15,3948b09f7c818e2d86c9a546758b2335,e51478e7e277a83743b6f9991dbfa3fb,5,Super recomendo,"Vendedor confiável, produto ok e entrega antes...",2018-05-23 00:00:00,2018-05-24 03:00:01,Super recommend,"Reliable seller, product ok and delivery befor..."
...,...,...,...,...,...,...,...,...,...
99205,98fffa80dc9acbde7388bef1600f3b15,d398e9c82363c12527f71801bf0e6100,4,,para este produto recebi de acordo com a compr...,2017-11-29 00:00:00,2017-11-30 15:52:51,,For this product I received according to the p...
99208,df5fae90e85354241d5d64a8955b2b09,509b86c65fe4e2ad5b96408cfef9755e,5,,Entregou dentro do prazo. O produto chegou em ...,2018-02-07 00:00:00,2018-02-19 19:47:23,,Delivered within the deadline. The product has...
99215,a709d176f59bc3af77f4149c96bae357,d5cb12269711bd1eaf7eed8fd32a7c95,3,,"O produto não foi enviado com NF, não existe v...",2018-05-19 00:00:00,2018-05-20 21:51:06,,"The product was not sent with NF, there is no ..."
99221,b3de70c89b1510c4cd3d0649fd302472,55d4004744368f5571d1f590031933e4,5,,"Excelente mochila, entrega super rápida. Super...",2018-03-22 00:00:00,2018-03-23 09:10:43,,"Excellent backpack, super fast delivery. Super..."


In [6]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
trans_reviews['review_comment_message_english'] = trans_reviews['review_comment_message_english'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [7]:
trans_reviews['review_comment_message_english']

3                     I received well stipulated deadline.
4        Congratulations LANNISTER stores I loved buy i...
9        efficient device. site mark device printed 3de...
12                    But little, catching ... value good.
15              Reliable seller, product ok delivery time.
                               ...                        
99205      For product I received according purchase held!
99208    Delivered within deadline. The product reached...
99215    The product sent NF, sale without NF, I'm sure...
99221    Excellent backpack, super fast delivery. Super...
99223    My product arrived I already return, defective...
Name: review_comment_message_english, Length: 40950, dtype: object

In [8]:
trans_reviews['review_comment_message_english'] = trans_reviews['review_comment_message_english'].astype(str)
trans_reviews['processed_reviews'] = lemmatization(trans_reviews['review_comment_message_english'])

In [9]:
# remove stop words
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return final

trans_reviews['processed_reviews'] = gen_words(trans_reviews['processed_reviews'])
trans_reviews['processed_reviews']

3                     [receive, well, stipulate, deadline]
4        [congratulation, store, love, buy, internet, s...
9        [efficient, device, site, mark, device, print,...
12                            [little, catch, value, good]
15         [reliable, seller, product, ok, delivery, time]
                               ...                        
99205           [product, receive, accord, purchase, hold]
99208    [deliver, deadline, product, reach, perfect, c...
99215    [product, send, sale, be, sure, be, wait, send...
99221    [excellent, backpack, super, fast, delivery, s...
99223    [product, arrive, already, return, defective, ...
Name: processed_reviews, Length: 40950, dtype: object

In [10]:
# bigrams and trigrams, so we can recognise a collection of words
bigram_phrases = gensim.models.Phrases(trans_reviews['processed_reviews'], min_count=5,threshold=50)
trigram_phrases = gensim.models.Phrases(bigram_phrases[trans_reviews['processed_reviews']],threshold=50)

bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    return ([bigram[doc] for doc in texts])

def make_trigrams(texts):
    return ([trigram[bigram[doc]] for doc in texts])

data_bigrams = make_bigrams(trans_reviews['processed_reviews'])
data_bigrams_trigrams = make_trigrams(data_bigrams)

In [11]:
# TF-IDF removal, remove low value words
from gensim.models import TfidfModel

id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03
words = []
words_missing_in_tfidf = []
for i in range(0, len(corpus)):
    bow = corpus[i]
    low_value_words = []
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id, value in bow]
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    drops = low_value_words + words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids]
    
    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
    corpus[i] = new_bow

In [41]:
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(13, 1),
  (14, 1),
  (15, 1),
  (16, 3),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1)],
 [(23, 1), (24, 1), (25, 1), (26, 1)],
 [(27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1)],
 [(1, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1)],
 [(38, 1)],
 [(12, 1), (39, 1)],
 [(40, 2), (41, 1)],
 [(0, 1),
  (27, 1),
  (29, 1),
  (37, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1)],
 [(53, 1), (54, 1), (55, 1), (56, 1)],
 [(4, 1), (57, 1), (58, 1), (59, 1)],
 [(0, 1),
  (1, 1),
  (4, 1),
  (8, 1),
  (27, 1),
  (29, 1),
  (33, 1),
  (42, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 2),
  (64, 1),
  (65, 1)],
 [(31, 1), (32, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1)],
 [(72, 1)],
 [(24, 1)],
 [(9, 1), (30, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1)],
 [(70, 1), (7

In [12]:
# train the topic model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha="auto")

In [13]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model,corpus,id2word,mds="mmds",R=20)
vis

  default_term_info = default_term_info.sort_values(


In [53]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=trans_reviews['review_comment_message_english']):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    review_score = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=trans_reviews['review_comment_message_english'])

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,6.0,0.2772,"product, good, deadline, deliver, arrive, reco...",
1,1,6.0,0.2424,"product, good, deadline, deliver, arrive, reco...",
2,2,8.0,0.3161,"time, site, package, seller, correct, ok, pack...",
3,3,6.0,0.2772,"product, good, deadline, deliver, arrive, reco...",I received well stipulated deadline.
4,4,6.0,0.2587,"product, good, deadline, deliver, arrive, reco...",Congratulations LANNISTER stores I loved buy i...
5,5,6.0,0.233,"product, good, deadline, deliver, arrive, reco...",
6,6,6.0,0.2704,"product, good, deadline, deliver, arrive, reco...",
7,7,6.0,0.26,"product, good, deadline, deliver, arrive, reco...",
8,8,6.0,0.2503,"product, good, deadline, deliver, arrive, reco...",
9,9,1.0,0.2909,"purchase, store, make, already, give, far, lon...",efficient device. site mark device printed 3de...


In [54]:
df_dominant_topic

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,6.0,0.2772,"product, good, deadline, deliver, arrive, reco...",
1,1,6.0,0.2424,"product, good, deadline, deliver, arrive, reco...",
2,2,8.0,0.3161,"time, site, package, seller, correct, ok, pack...",
3,3,6.0,0.2772,"product, good, deadline, deliver, arrive, reco...",I received well stipulated deadline.
4,4,6.0,0.2587,"product, good, deadline, deliver, arrive, reco...",Congratulations LANNISTER stores I loved buy i...
...,...,...,...,...,...
64976,99205,,,,For product I received according purchase held!
64977,99208,,,,Delivered within deadline. The product reached...
64978,99215,,,,"The product sent NF, sale without NF, I'm sure..."
64979,99221,,,,"Excellent backpack, super fast delivery. Super..."


In [55]:
trans_reviews

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,review_comment_title_english,review_comment_message_english,processed_reviews
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06,,I received well stipulated deadline.,"[receive, well, stipulate, deadline]"
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53,,Congratulations LANNISTER stores I loved buy i...,"[congratulation, store, love, buy, internet, s..."
9,8670d52e15e00043ae7de4c01cc2fe06,b9bf720beb4ab3728760088589c62129,4,recomendo,aparelho eficiente. no site a marca do aparelh...,2018-05-22 00:00:00,2018-05-23 16:45:47,I recommend,efficient device. site mark device printed 3de...,"[efficient, device, site, mark, device, print,..."
12,4b49719c8a200003f700d3d986ea1a19,9d6f15f95d01e79bd1349cc208361f09,4,,"Mas um pouco ,travando...pelo valor ta Boa.\r\n",2018-02-16 00:00:00,2018-02-20 10:52:22,,"But little, catching ... value good.","[little, catch, value, good]"
15,3948b09f7c818e2d86c9a546758b2335,e51478e7e277a83743b6f9991dbfa3fb,5,Super recomendo,"Vendedor confiável, produto ok e entrega antes...",2018-05-23 00:00:00,2018-05-24 03:00:01,Super recommend,"Reliable seller, product ok delivery time.","[reliable, seller, product, ok, delivery, time]"
...,...,...,...,...,...,...,...,...,...,...
99205,98fffa80dc9acbde7388bef1600f3b15,d398e9c82363c12527f71801bf0e6100,4,,para este produto recebi de acordo com a compr...,2017-11-29 00:00:00,2017-11-30 15:52:51,,For product I received according purchase held!,"[product, receive, accord, purchase, hold]"
99208,df5fae90e85354241d5d64a8955b2b09,509b86c65fe4e2ad5b96408cfef9755e,5,,Entregou dentro do prazo. O produto chegou em ...,2018-02-07 00:00:00,2018-02-19 19:47:23,,Delivered within deadline. The product reached...,"[deliver, deadline, product, reach, perfect, c..."
99215,a709d176f59bc3af77f4149c96bae357,d5cb12269711bd1eaf7eed8fd32a7c95,3,,"O produto não foi enviado com NF, não existe v...",2018-05-19 00:00:00,2018-05-20 21:51:06,,"The product sent NF, sale without NF, I'm sure...","[product, send, sale, be, sure, be, wait, send..."
99221,b3de70c89b1510c4cd3d0649fd302472,55d4004744368f5571d1f590031933e4,5,,"Excelente mochila, entrega super rápida. Super...",2018-03-22 00:00:00,2018-03-23 09:10:43,,"Excellent backpack, super fast delivery. Super...","[excellent, backpack, super, fast, delivery, s..."


In [None]:
review_score_1

In [None]:
trans_reviews['review_comment_message_english'].isna().sum()

In [None]:
allWords = ' '.join([reviews for reviews in review_score_1['review_comment_message_english']])
wordCloud = WordCloud(width=500, height=300, random_state=21, max_font_size=119).generate(allWords)

plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
trans_reviews.dropna(subset = ['review_comment_message_english'], inplace=True)

In [None]:
trans_reviews['polarity'] = trans_reviews['review_comment_message_english'].apply(lambda x: TextBlob(x).sentiment[0])
trans_reviews['subjectivity'] = trans_reviews['review_comment_message_english'].apply(lambda x: TextBlob(x).sentiment[1])

In [None]:
trans_reviews.head()

In [None]:
trans_reviews.head()

In [None]:
sns.displot(trans_reviews['polarity'], kde=False,bins=40) # kde=False removes kernel density line

In [None]:
allWords = ' '.join([reviews for reviews in trans_reviews['review_comment_message_english']])
wordCloud = WordCloud(width=500, height=300, random_state=21, max_font_size=119).generate(allWords)

plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()
                     

In [None]:
reviews = pd.read_csv('olist_order_reviews_dataset.csv')

In [None]:
reviews

In [None]:
sns.displot(reviews['review_score'], kde=False,bins=5)