## Customer Sentiment Analysis 

In [2]:
import numpy as np  
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
from textblob import TextBlob
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
import cufflinks as cf
%matplotlib inline
from plotly.offline import init_notebook_mode,iplot
init_notebook_mode(connected=True)
cf.go_offline()

import plotly.graph_objs as go
from plotly.subplots  import make_subplots


import warnings
warnings.filterwarnings('ignore')
warnings.warn("this will not show")

pd.set_option('display.max_columns',None)

In [3]:
# Load the review dataset; "amazon.csv" is resolved relative to the notebook's working directory
df = pd.read_csv("amazon.csv") 


In [4]:
# Order the reviews by the wilson_lower_bound column, highest first.
df = df.sort_values(by="wilson_lower_bound", ascending=False)
# Drop the 'Unnamed: 0' column (presumably a row index saved into the CSV).
# A non-inplace drop with errors="ignore" keeps this cell re-runnable: the
# previous inplace drop raised KeyError on a second run because the column
# had already been removed.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [5]:
# function to handle missing values
def missing_value_analysis(df):
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with a 'Missing Values' column (null count)
        and a 'Ratio' column (percentage of rows that are null, rounded to
        2 decimals). Columns without any nulls are left out.
    """
    cols_with_na = []
    for name in df.columns:
        if df[name].isnull().sum() > 0:
            cols_with_na.append(name)

    # Count nulls once and derive both the absolute and the relative view.
    null_counts = df[cols_with_na].isnull().sum()
    counts_sorted = null_counts.sort_values(ascending=True)
    pct_sorted = (null_counts / df.shape[0] * 100).sort_values(ascending=True)

    return pd.concat(
        [counts_sorted, np.round(pct_sorted, 2)],
        axis=1,
        keys=['Missing Values', 'Ratio'],
    )




# function to describe the dataset
def check_dataframe(df,head=5,tail=5):
    """Print a quick structural overview of ``df``.

    Sections are printed in this order: shape, column dtypes, missing-value
    summary (from ``missing_value_analysis``), number of fully duplicated
    rows, and selected quantiles of the numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    head, tail : int, optional
        Currently unused; kept so existing calls that pass them keep working.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print('SHAPE'.center(82,'~'))
    print("ROWS : {}".format(df.shape[0]))
    print('COls : {}'.format(df.shape[1]))
    print("TYPES".center(82,'~'))
    print(df.dtypes)
    print("".center(82,'~'))
    print(missing_value_analysis(df))
    print("DUPLICATED VALUES".center(82,"~"))
    print(df.duplicated().sum())
    # Restore the section header that was commented out together with the old
    # df.quantile(...) call, so the quantile table is labelled again.
    print("QUANTILES".center(82, '~'))
    # Quantiles are only defined for numeric data; calling quantile on the
    # whole frame fails with a TypeError on the string columns.
    numeric_cols = df.select_dtypes(include=[np.number])
    print(numeric_cols.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

    
check_dataframe(df )

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~SHAPE~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ROWS : 4915
COls : 11
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~TYPES~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
reviewerName             object
overall                   int64
reviewText               object
reviewTime               object
day_diff                  int64
helpful_yes               int64
helpful_no                int64
total_vote                int64
score_pos_neg_diff        int64
score_average_rating    float64
wilson_lower_bound      float64
dtype: object
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
              Missing Values  Ratio
reviewerName               1   0.02
reviewText                 1   0.02
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DUPLICATED VALUES~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~QUANTILES~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [None]:
# unique values 
def check_class(df):
    """Count the distinct values in every column of ``df``.

    Returns a DataFrame with one row per column: 'Variable' holds the column
    name and 'classes' its number of unique values. Rows are ordered from the
    most to the fewest unique values and re-indexed from 0.
    """
    unique_counts = [df[name].nunique() for name in df.columns]
    summary = pd.DataFrame({'Variable': df.columns, 'classes': unique_counts})
    return (
        summary
        .sort_values('classes', ascending=False)
        .reset_index(drop=True)
    )

check_class(df)

In [None]:
# Colour palette used for both the bars and the pie slices.
constraints = ['#B34D22','#EBE00C','#1FEB0C','#0C92EB','#EB0CD5']


def catagorical_variable_summary(df,col_name):
    """Render a count bar chart and a percentage pie chart for one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    col_name : str
        Name of the categorical column to summarise; also used as the figure
        title and the trace name.

    Returns
    -------
    None
        The figure is displayed with ``iplot``.
    """
    # Frequency table, computed once and shared by both traces.
    counts = df[col_name].value_counts()

    bar_trace = go.Bar(
        x=[str(label) for label in counts.index],
        y=counts.values.tolist(),
        text=counts.values.tolist(),
        textfont=dict(size=14),
        name=col_name,
        textposition='auto',
        showlegend=False,
        marker=dict(color=constraints, line=dict(color="#DBE6EC", width=1)),
    )

    pie_trace = go.Pie(
        labels=counts.keys(),
        values=counts.values,
        textfont=dict(size=18),
        textposition='auto',
        showlegend=False,
        name=col_name,
        marker=dict(colors=constraints),
    )

    title_spec = {
        "text": col_name,
        "y": 0.9,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    }

    # Left panel is a cartesian plot, right panel a domain (pie) plot.
    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=('Countplot','Percentage'),
        specs=[[{'type':'xy'},{'type':'domain'}]],
    )
    fig.add_trace(bar_trace, row=1, col=1)
    fig.add_trace(pie_trace, row=1, col=2)
    fig.update_layout(title=title_spec, template='plotly_white')
    iplot(fig)


    

In [None]:
catagorical_variable_summary(df,'overall')

* 3,922 customers (79.8% of the 4,915 reviews) have awarded Amazon items a rating of 5, indicating that they are generally happy with the service they have received.

In [None]:
# preview the first rows of the review column
df.reviewText.head()

In [None]:
# pick one review as a worked example: a fixed key (2031), not a random draw
# NOTE(review): presumably resolved as an index label rather than a position — confirm
review_example = df.reviewText[2031]
review_example

In [None]:
# replace every character that is not an ASCII letter (digits, punctuation, whitespace) with a space
review_example = re.sub("[^a-zA-Z]",' ',review_example)
review_example

In [None]:
# lowercase the text and split it on whitespace; review_example becomes a list of words
review_example = review_example.lower().split()
review_example

In [None]:
# Clean the 'reviewText' column: keep letters only, then lowercase.
def rt(x):
    """Replace every non-letter character in ``x`` with a space.

    Missing values are returned as an empty string. Passing them through
    ``str(x)`` would turn NaN into the literal review text "nan".
    """
    if pd.isna(x):
        return ""
    return re.sub("[^a-zA-Z]", ' ', str(x))


df["reviewText"] = df["reviewText"].map(rt).str.lower()

# preview the cleaned frame
df.head()

In [None]:
# VADER analyzer from the standalone vaderSentiment package.
# NOTE(review): this re-binds SentimentIntensityAnalyzer and shadows the
# nltk.sentiment.vader import made in the first cell; consider keeping a
# single import at the top of the notebook.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# TextBlob sentiment for every review. TextBlob(...).sentiment is a
# (polarity, subjectivity) pair; collecting the pairs in a list avoids
# building one pandas Series per row.
blob_sentiments = [TextBlob(text).sentiment for text in df['reviewText']]
df['polarity'] = [s.polarity for s in blob_sentiments]
df['subjectivity'] = [s.subjectivity for s in blob_sentiments]

analyzer = SentimentIntensityAnalyzer()


def _vader_label(text):
    """Label ``text`` 'Negative', 'Positive' or 'Neutral'.

    The label comes from comparing VADER's 'neg' and 'pos' scores; equal
    scores give 'Neutral'.
    """
    score = analyzer.polarity_scores(text)
    neg, pos = score['neg'], score['pos']
    if neg > pos:
        return "Negative"
    if pos > neg:
        return "Positive"
    return "Neutral"


# One column assignment instead of writing df.loc[index, 'sentiment'] row by
# row inside an iterrows() loop.
df['sentiment'] = df['reviewText'].map(_vader_label)

In [None]:
df.head()  # checking sentiment of customers for product reviews

In [None]:
catagorical_variable_summary(df,'sentiment')

* 81.3% of reviews are labelled Positive by the VADER-based sentiment step above, indicating that customers are satisfied with Amazon's service.