# Restaurants review analysis

# Problem statement
1. Clean data in Excel
2. Analyze ratings vs cuisines
3. Use NLP to do sentiment analysis on reviews
4. Perform ANOVA for cuisine comparison
5. Build visuals in Power BI

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Load the raw reviews CSV (relative path, resolved against the current working directory)
df = pd.read_csv("Restaurant_reviews.csv")
# Display the loaded frame; pandas truncates the rendering of large frames
df

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures,7514
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5,"1 Review , 2 Followers",5/25/2019 15:54,0,2447.0
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5,"3 Reviews , 2 Followers",5/25/2019 14:20,0,
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5,"2 Reviews , 3 Followers",5/24/2019 22:54,0,
3,Beyond Flavours,Swapnil Sarkar,Soumen das and Arun was a great guy. Only beca...,5,"1 Review , 1 Follower",5/24/2019 22:11,0,
4,Beyond Flavours,Dileep,Food is good.we ordered Kodi drumsticks and ba...,5,"3 Reviews , 2 Followers",5/24/2019 21:37,0,
...,...,...,...,...,...,...,...,...
9995,Chinese Pavilion,Abhishek Mahajan,Madhumathi Mahajan Well to start with nice cou...,3,"53 Reviews , 54 Followers",6/5/2016 0:08,0,
9996,Chinese Pavilion,Sharad Agrawal,This place has never disappointed us.. The foo...,4.5,"2 Reviews , 53 Followers",6/4/2016 22:01,0,
9997,Chinese Pavilion,Ramandeep,"Bad rating is mainly because of ""Chicken Bone ...",1.5,"65 Reviews , 423 Followers",6/3/2016 10:37,3,
9998,Chinese Pavilion,Nayana Shanbhag,I personally love and prefer Chinese Food. Had...,4,"13 Reviews , 144 Followers",5/31/2016 17:22,0,


# Part 1: Preprocessing

In [3]:
# Column dtypes, non-null counts and memory usage of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Restaurant  10000 non-null  object 
 1   Reviewer    9962 non-null   object 
 2   Review      9955 non-null   object 
 3   Rating      9962 non-null   object 
 4   Metadata    9962 non-null   object 
 5   Time        9962 non-null   object 
 6   Pictures    10000 non-null  int64  
 7   7514        1 non-null      float64
dtypes: float64(1), int64(1), object(6)
memory usage: 625.1+ KB


In [4]:
# Number of missing values in each column
df.isnull().sum()

Unnamed: 0,0
Restaurant,0
Reviewer,38
Review,45
Rating,38
Metadata,38
Time,38
Pictures,0
7514,9999


In [5]:
# Column names of the raw frame
df.columns

Index(['Restaurant', 'Reviewer', 'Review', 'Rating', 'Metadata', 'Time',
       'Pictures', '7514'],
      dtype='object')

In [6]:
# Per-column dtypes before any conversion (Rating and Time are still object here)
df.dtypes

Unnamed: 0,0
Restaurant,object
Reviewer,object
Review,object
Rating,object
Metadata,object
Time,object
Pictures,int64
7514,float64


In [7]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()

np.int64(36)

In [8]:
# Summary statistics for the numeric columns (Pictures and '7514' at this stage)
df.describe()

Unnamed: 0,Pictures,7514
count,10000.0,1.0
mean,0.7486,2447.0
std,2.570381,
min,0.0,2447.0
25%,0.0,2447.0
50%,0.0,2447.0
75%,0.0,2447.0
max,64.0,2447.0


In [9]:
# Preview the first five rows
df.head()

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures,7514
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5,"1 Review , 2 Followers",5/25/2019 15:54,0,2447.0
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5,"3 Reviews , 2 Followers",5/25/2019 14:20,0,
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5,"2 Reviews , 3 Followers",5/24/2019 22:54,0,
3,Beyond Flavours,Swapnil Sarkar,Soumen das and Arun was a great guy. Only beca...,5,"1 Review , 1 Follower",5/24/2019 22:11,0,
4,Beyond Flavours,Dileep,Food is good.we ordered Kodi drumsticks and ba...,5,"3 Reviews , 2 Followers",5/24/2019 21:37,0,


In [10]:
# Number of rows (reviews) recorded for each restaurant
df['Restaurant'].value_counts()

Unnamed: 0_level_0,count
Restaurant,Unnamed: 1_level_1
Beyond Flavours,100
Paradise,100
Flechazo,100
Shah Ghouse Hotel & Restaurant,100
Over The Moon Brew Company,100
...,...
Desi Bytes,100
Hyderabadi Daawat,100
Zega - Sheraton Hyderabad Hotel,100
Triptify,100


In [11]:
# Remove the stray '7514' column (df.info() reports a single non-null value in it)
# and the rows that lack a reviewer name or review text.
# errors='ignore' keeps this cell re-runnable: with the original in-place drop a
# second execution raised KeyError because the column was already gone.
# .copy() yields an independent frame, so later column assignments on df are
# not treated by pandas as writes to a slice of the previous frame.
df = (
    df
    .drop(columns='7514', errors='ignore')
    .dropna(subset=['Reviewer', 'Review'])
    .copy()
)

In [12]:
# Rating is read as text; recode the literal 'Like' to 1.5, then cast the column to float.
# NOTE(review): the reason for choosing 1.5 is not shown in this notebook — confirm it is intended.
df['Rating'] = df['Rating'].replace({'Like': 1.5}).astype(float)

In [13]:
# Parse the Time strings (e.g. "5/25/2019 15:54") into datetime64 values;
# no explicit format is passed, so pandas infers it from the data
df['Time'] = pd.to_datetime(df['Time'])

In [14]:
# Split the free-text Metadata column (e.g. "3 Reviews , 2 Followers") into two
# integer columns. Each count is extracted with its own pattern: the previous
# combined pattern required BOTH parts to be present, so an entry carrying only
# one of them (e.g. just "1 Review") would fail to match and lose both counts.
# The guard keeps the cell re-runnable once Metadata has been dropped.
if 'Metadata' in df.columns:
    # Entries without a match become 0; the captured digit strings are cast to int
    df['no_of_reviews'] = (
        df['Metadata'].str.extract(r'(\d+)\s+Reviews?', expand=False).fillna(0).astype(int)
    )
    df['followers'] = (
        df['Metadata'].str.extract(r'(\d+)\s+Followers?', expand=False).fillna(0).astype(int)
    )

    # The raw text is no longer needed once both counts are extracted
    df = df.drop(columns='Metadata')

In [15]:
# Check dtypes after the conversions (Rating is now float64, Time is datetime64[ns])
df.dtypes

Unnamed: 0,0
Restaurant,object
Reviewer,object
Review,object
Rating,float64
Time,datetime64[ns]
Pictures,int64
no_of_reviews,int64
followers,int64


# Exploratory Data Analysis (EDA)

In [16]:
# Column set after preprocessing (Metadata replaced by no_of_reviews and followers)
df.columns

Index(['Restaurant', 'Reviewer', 'Review', 'Rating', 'Time', 'Pictures',
       'no_of_reviews', 'followers'],
      dtype='object')

In [17]:
#VADER lexicon
# Download NLTK's 'vader_lexicon' resource before constructing the analyzer
nltk.download('vader_lexicon')

# Single analyzer instance, reused by get_sentiment for every review
sia = SentimentIntensityAnalyzer()

#classify sentiment
def get_sentiment(text):
    """Label a piece of text from its compound polarity score.

    The input is converted with str() and scored by the module-level
    analyzer ``sia``.

    Returns
    -------
    str
        'Positive' when the compound score is >= 0.05,
        'Negative' when it is <= -0.05, otherwise 'Neutral'.
    """
    compound = sia.polarity_scores(str(text))['compound']
    if compound <= -0.05:
        return 'Negative'
    if compound >= 0.05:
        return 'Positive'
    return 'Neutral'

# Score every review and store the resulting label
df['Sentiment'] = df['Review'].map(get_sentiment)
# Preview the first five reviews next to their labels
df[['Review', 'Sentiment']].head()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Unnamed: 0,Review,Sentiment
0,"The ambience was good, food was quite good . h...",Positive
1,Ambience is too good for a pleasant evening. S...,Positive
2,A must try.. great food great ambience. Thnx f...,Positive
3,Soumen das and Arun was a great guy. Only beca...,Positive
4,Food is good.we ordered Kodi drumsticks and ba...,Positive


In [20]:
# Confirm the Sentiment column is now part of the frame
df.columns

Index(['Restaurant', 'Reviewer', 'Review', 'Rating', 'Time', 'Pictures',
       'no_of_reviews', 'followers', 'Sentiment'],
      dtype='object')

In [21]:
from scipy import stats
import numpy as np

# Keep only rows that have a rating, a sentiment label and review text.
# pandas may flag a filtered result as a copy of the previous frame; .copy()
# makes it an independent frame so the column assignments that follow are
# unambiguous (the corresponding SettingWithCopyWarning would otherwise be
# hidden by the notebook-wide warnings filter).
df = df.dropna(subset=['Rating', 'Sentiment', 'Review']).copy()

# Review length in whitespace-separated words, computed with vectorised string methods
df['Review_Length'] = df['Review'].astype(str).str.split().str.len()

# Create Rating Category for Chi-Square
def categorize_rating(r):
    """Bucket a numeric rating into 'Low', 'Medium' or 'High'.

    Parameters
    ----------
    r : float
        Rating value; None/NaN is treated as missing.

    Returns
    -------
    str or float
        'Low' for r <= 2, 'Medium' for 2 < r <= 4, 'High' for r > 4.
        A missing rating yields NaN instead of a label.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # rating would fall through both tests and be labelled 'High'.
    if pd.isna(r):
        return np.nan
    if r <= 2:
        return 'Low'
    if r <= 4:
        return 'Medium'
    return 'High'

# Label every review with its rating bucket
df['Rating_Category'] = df['Rating'].apply(categorize_rating)

# One array per sentiment class: ratings, and review lengths
grouped_rating = [ratings.values for _, ratings in df.groupby('Sentiment')['Rating']]
grouped_length = [lengths.values for _, lengths in df.groupby('Sentiment')['Review_Length']]

# One-way ANOVA across the sentiment classes, for Rating and for Review_Length
anova_rating_result = stats.f_oneway(*grouped_rating)
anova_length_result = stats.f_oneway(*grouped_length)

# Chi-square test on the Sentiment x Rating_Category contingency counts
contingency_table = pd.crosstab(index=df['Sentiment'], columns=df['Rating_Category'])
chi2_result = stats.chi2_contingency(contingency_table)

# Show all three results together
(anova_rating_result, anova_length_result, chi2_result)

(F_onewayResult(statistic=np.float64(4270.411879395762), pvalue=np.float64(0.0)),
 F_onewayResult(statistic=np.float64(179.90367779432316), pvalue=np.float64(1.7703864603389193e-77)),
 Chi2ContingencyResult(statistic=np.float64(4750.7177375433785), pvalue=np.float64(0.0), dof=4, expected_freq=array([[ 717.57207433,  447.49231542,  668.93561025],
        [ 291.48920141,  181.77850326,  271.73229533],
        [2885.93872426, 1799.72918132, 2690.33209442]])))