# The task here was to perform sentiment analysis and determine the reasons customers gave a high, neutral, or low rating.

In [None]:
# importing all the libraries
import pandas as pd
import numpy as np
from textblob import TextBlob
import nltk, re, string, collections
from nltk.util import ngrams

In [None]:
# Read the edited reviews CSV into a DataFrame
reviews_csv_path = '/content/reviews_edited.csv'
df = pd.read_csv(reviews_csv_path)

In [None]:
# Display the loaded DataFrame as the cell output to inspect its columns and a sample of rows
df

Unnamed: 0.1,Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,translated_review
0,0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59,
1,1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13,
2,2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24,
3,3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06,I received it well before the stipulated deadl...
4,4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53,Congratulations lannister stores I loved shopp...
...,...,...,...,...,...,...,...,...,...
99219,99219,574ed12dd733e5fa530cfd4bbf39d7c9,2a8c23fee101d4d5662fa670396eb8da,5,,,2018-07-07 00:00:00,2018-07-14 17:18:30,
99220,99220,f3897127253a9592a73be9bdfdf4ed7a,22ec9f0669f784db00fa86d035cf8602,5,,,2017-12-09 00:00:00,2017-12-11 20:06:42,
99221,99221,b3de70c89b1510c4cd3d0649fd302472,55d4004744368f5571d1f590031933e4,5,,"Excelente mochila, entrega super rápida. Super...",2018-03-22 00:00:00,2018-03-23 09:10:43,"Excellent backpack, super fast delivery. I hig..."
99222,99222,1adeb9d84d72fe4e337617733eb85149,7725825d039fc1f0ceb7635e3f7d9206,4,,,2018-07-01 00:00:00,2018-07-02 12:59:13,


In [None]:
# Label every review from its score: above 3 is 'Good', exactly 3 is 'Neutral', below 3 is 'Bad'
scores = df['review_score']
sentiment_masks = {
    'Good': scores > 3,
    'Neutral': scores == 3,
    'Bad': scores < 3,
}
for sentiment_label, score_mask in sentiment_masks.items():
    df.loc[score_mask, 'Sentiment'] = sentiment_label

# Lower-case the translated text so words differing only in capitalisation are counted together
df['translated_review'] = df['translated_review'].str.lower()

## Good Sentiments

In [29]:
# Collect the translated text of every 'Good' review, then drop the missing entries
x = df.query("Sentiment=='Good'")["translated_review"].tolist()
# pd.notna tests for a genuinely missing value (NaN/None). Comparing str(value) to 'nan'
# would also discard a review that literally reads "nan" and would let None through
# to the later .split() call.
cleanedList_good = [review for review in x if pd.notna(review)]

In [30]:
# Break every 'Good' review into whitespace-separated words and gather them in one flat list
word_list_good = []
for review_text in cleanedList_good:
    word_list_good.extend(review_text.split())

In [31]:
# Build the 7-grams one review at a time with NLTK's ngrams helper.
# Running ngrams over the flattened word list would create phrases that start in one
# review and end in the next, unrelated one; reviews shorter than 7 words yield nothing.
# The result is a lazy generator that can be consumed only once.
grams_good = (
    gram
    for review_text in cleanedList_good
    for gram in ngrams(review_text.split(), 7)
)

In [32]:
# Tally how many times each n-gram occurs. The result is a collections.Counter (Python standard library) keyed by the n-gram tuple
gramFreq_good = collections.Counter()
gramFreq_good.update(grams_good)

In [33]:
# Keep the 10 most frequent n-grams as a list of (n-gram tuple, count) pairs in 'c'
c = gramFreq_good.most_common(n=10)

In [34]:
# Turn the top n-grams into (phrase, count, sentiment) rows and save them as a CSV file.
# Each n-gram in 'c' is a tuple of words; join it into a single space-separated phrase.
ngram_rows = [(' '.join(words), count, 'Positive') for words, count in c]
ngram_table = pd.DataFrame(ngram_rows, columns=['Item', 'Count', 'Sentiment'])

csv_file_path = 'counter_data.csv'

# Write the file from scratch (header + rows). to_csv quotes any phrase that contains a
# comma, so every row keeps exactly three fields matching the header. Building the rows
# with plain names also avoids rebinding the built-ins `dict` and `list`.
ngram_table.to_csv(csv_file_path, index=False)

print(f'Dictionary data has been saved to {csv_file_path}')

Dictionary data has been saved to counter_data.csv


In [35]:
# Show the 20 most frequent words in 'Positive' reviews.
# Series.value_counts replaces the deprecated top-level pd.value_counts, and building the
# Series straight from the list avoids an intermediate fixed-width NumPy string array.
pd.Series(word_list_good).value_counts().head(20)

the          16706
i            14165
and           8036
it            7655
product       7569
very          5992
of            4370
arrived       4343
was           4096
to            3716
good          3505
recommend     3436
on            3398
delivery      3182
is            3077
great         2755
delivered     2713
in            2606
a             2569
ahead         2549
dtype: int64

## Neutral Condition

In [36]:
# Collect the translated text of every 'Neutral' review, then drop the missing entries
x = df.query("Sentiment=='Neutral'")["translated_review"].tolist()
# pd.notna tests for a genuinely missing value (NaN/None). Comparing str(value) to 'nan'
# would also discard a review that literally reads "nan" and would let None through
# to the later .split() call.
cleaned_List_neutral = [review for review in x if pd.notna(review)]

In [37]:
# Break every 'Neutral' review into whitespace-separated words and gather them in one flat list
word_list_neutral = []
for review_text in cleaned_List_neutral:
    word_list_neutral.extend(review_text.split())

In [38]:
# Build the 7-grams one review at a time with NLTK's ngrams helper.
# Running ngrams over the flattened word list would create phrases that start in one
# review and end in the next, unrelated one; reviews shorter than 7 words yield nothing.
# The result is a lazy generator that can be consumed only once.
grams_neutral = (
    gram
    for review_text in cleaned_List_neutral
    for gram in ngrams(review_text.split(), 7)
)

In [39]:
# Tally how many times each n-gram occurs. The result is a collections.Counter (Python standard library) keyed by the n-gram tuple
gramFreq_neutral = collections.Counter()
gramFreq_neutral.update(grams_neutral)

In [40]:
# Keep the 10 most frequent n-grams as a list of (n-gram tuple, count) pairs in 'c'
c = gramFreq_neutral.most_common(n=10)

In [41]:
# Turn the top n-grams into (phrase, count, sentiment) rows and append them to the CSV file.
# Each n-gram in 'c' is a tuple of words; join it into a single space-separated phrase.
ngram_rows = [(' '.join(words), count, 'Neutral') for words, count in c]
ngram_table = pd.DataFrame(ngram_rows, columns=['Item', 'Count', 'Sentiment'])

csv_file_path = 'counter_data.csv'

# Open the CSV file in append mode so rows already in the file are kept
with open(csv_file_path, 'a', newline='') as csvfile:
    # In append mode the position starts at the end of the file, so tell() == 0 means the
    # file is empty; only then is a header needed. Otherwise a second header line would
    # land in the middle of the data.
    needs_header = csvfile.tell() == 0
    # to_csv quotes any phrase that contains a comma, so each row stays at three fields.
    # Building the rows with plain names also avoids rebinding the built-ins `dict` and `list`.
    ngram_table.to_csv(csvfile, index=False, header=needs_header)

print(f'Dictionary data has been saved to {csv_file_path}')

Dictionary data has been saved to counter_data.csv


In [42]:
# Show the 20 most frequent words in 'Neutral' reviews.
# Series.value_counts replaces the deprecated top-level pd.value_counts, and building the
# Series straight from the list avoids an intermediate fixed-width NumPy string array.
pd.Series(word_list_neutral).value_counts().head(20)

the         4953
i           2849
it          1780
and         1325
to          1282
product     1224
was         1084
a           1051
but          980
is           858
of           697
not          685
in           666
for          540
received     522
on           514
with         505
very         458
that         409
only         400
dtype: int64

## Negative Condition

In [43]:
# Collect the translated text of every 'Bad' review, then drop the missing entries
x = df.query("Sentiment=='Bad'")["translated_review"].tolist()
# pd.notna tests for a genuinely missing value (NaN/None). Comparing str(value) to 'nan'
# would also discard a review that literally reads "nan" and would let None through
# to the later .split() call.
cleaned_List_bad = [review for review in x if pd.notna(review)]

In [44]:
# Break every 'Bad' review into whitespace-separated words and gather them in one flat list
word_list_bad = []
for review_text in cleaned_List_bad:
    word_list_bad.extend(review_text.split())

In [45]:
# Build the 7-grams one review at a time with NLTK's ngrams helper.
# Running ngrams over the flattened word list would create phrases that start in one
# review and end in the next, unrelated one; reviews shorter than 7 words yield nothing.
# The result is a lazy generator that can be consumed only once.
grams_bad = (
    gram
    for review_text in cleaned_List_bad
    for gram in ngrams(review_text.split(), 7)
)

In [46]:
# Tally how many times each n-gram occurs. The result is a collections.Counter (Python standard library) keyed by the n-gram tuple
gramFreq_bad = collections.Counter()
gramFreq_bad.update(grams_bad)

In [47]:
# Keep the 10 most frequent n-grams as a list of (n-gram tuple, count) pairs in 'c'
c = gramFreq_bad.most_common(n=10)

In [48]:
# Turn the top n-grams into (phrase, count, sentiment) rows and append them to the CSV file.
# Each n-gram in 'c' is a tuple of words; join it into a single space-separated phrase.
ngram_rows = [(' '.join(words), count, 'Negative') for words, count in c]
ngram_table = pd.DataFrame(ngram_rows, columns=['Item', 'Count', 'Sentiment'])

csv_file_path = 'counter_data.csv'

# Open the CSV file in append mode so rows already in the file are kept
with open(csv_file_path, 'a', newline='') as csvfile:
    # In append mode the position starts at the end of the file, so tell() == 0 means the
    # file is empty; only then is a header needed. Otherwise a second header line would
    # land in the middle of the data.
    needs_header = csvfile.tell() == 0
    # to_csv quotes any phrase that contains a comma, so each row stays at three fields.
    # Building the rows with plain names also avoids rebinding the built-ins `dict` and `list`.
    ngram_table.to_csv(csvfile, index=False, header=needs_header)

print(f'Dictionary data has been saved to {csv_file_path}')

Dictionary data has been saved to counter_data.csv


In [49]:
# Show the 20 most frequent words in 'Negative' reviews.
# Series.value_counts replaces the deprecated top-level pd.value_counts, and building the
# Series straight from the list avoids an intermediate fixed-width NumPy string array.
pd.Series(word_list_bad).value_counts().head(20)

the         16062
i           12519
and          6662
to           4959
product      4789
it           4685
a            4174
not          4112
was          3341
received     2788
of           2409
for          2362
is           2329
my           2137
in           2004
they         1668
bought       1631
but          1613
have         1604
on           1588
dtype: int64