# EDA and Preprocessing

In [1]:
#Import Libraries

import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.feature_extraction import text
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification, make_regression
from sklearn.preprocessing import StandardScaler

import re
import requests
import time
import regex as re
from collections import Counter

from wordcloud import WordCloud, STOPWORDS

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# import cleaned datasets

data = pd.read_csv("../Data/yelp_restaurant_reviews.csv")

In [4]:
''''''''''''''''''''''''''''data.columns

SyntaxError: EOF while scanning triple-quoted string literal (3268441044.py, line 1)

In [5]:
# drop columns not needed
to_drop = ['address', 'postal_code', 'latitude', 'longitude']
df = data.drop(to_drop, axis=1)

In [6]:
df.columns

Index(['business_id', 'name', 'city', 'state', 'stars', 'attributes',
       'categories', 'user_id', 'review_stars', 'text', 'date'],
      dtype='object')

In [7]:
df.shape

(3566966, 11)

In [8]:
df.dropna(inplace=True)

In [9]:
df.shape

(3551890, 11)

In [10]:
df['state'].value_counts()

NV     1172468
AZ     1010516
ON      471356
NC      235474
OH      197762
PA      175843
QC      120073
WI       78930
AB       53737
IL       23564
SC       12021
NY          91
NM          14
NE           8
XWY          8
AR           7
VT           5
CA           4
WA           3
CON          3
BC           3
Name: state, dtype: int64

In [11]:
df_ON = df[df['state'].str.contains('ON')]

In [12]:
csv_ON = "../Data/ON_reviews.csv"
df_ON.to_csv(csv_ON, index=False)

In [13]:
df_ON.shape

(471359, 11)

In [14]:
df_ON.head()

Unnamed: 0,business_id,name,city,state,stars,attributes,categories,user_id,review_stars,text,date
0,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,Mississauga,ON,2.5,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...",2K62MJ4CJ19L8Tp5pRfjfQ,3.0,My girlfriend and I went for dinner at Emerald...,2017-01-27 21:54:30
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,Mississauga,ON,2.5,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...",A0kENtCCoVT3m7T35zb2Vg,3.0,We've always been there on a Sunday so we were...,2013-06-24 23:11:30
2,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,Mississauga,ON,2.5,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...",SuOLY03LW5ZcnynKhbTydA,3.0,"***No automatic doors, not baby friendly!*** I...",2016-01-04 12:59:22
3,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,Mississauga,ON,2.5,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...",lymyUak6KNcNKoDbK87MiQ,1.0,"Horrible service,\nI went there tonight with m...",2014-05-09 02:38:43
4,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,Mississauga,ON,2.5,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...",6vU0I5XgCv9OQHZ76rV6qw,4.0,One of the gauges of a good Chinese restaurant...,2011-03-21 14:39:55


In [15]:
ON_cities = df_ON['city'].value_counts().head(20)
ON_cities

Toronto          303058
Mississauga       35653
Markham           33280
Richmond Hill     16493
Scarborough       12548
North York        11943
Vaughan           11050
Brampton           8263
Etobicoke          7969
Thornhill          5772
Oakville           2996
Newmarket          2954
Pickering          2559
Ajax               2216
Whitby             1996
Woodbridge         1898
Aurora             1867
Unionville         1358
Concord            1113
East York           883
Name: city, dtype: int64

In [16]:
Toronto = df_ON[df_ON['city'].str.contains('Toronto')]

In [17]:
Markham = df_ON[df_ON['city'].str.contains('Markham')]

In [18]:
Mississauga = df_ON[df_ON['city'].str.contains('Mississauga')]

In [19]:
GTA_Top3 = pd.concat([Toronto, Markham, Mississauga], axis = 0)

In [20]:
GTA_Top3.head()

Unnamed: 0,business_id,name,city,state,stars,attributes,categories,user_id,review_stars,text,date
575,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,Toronto,ON,3.0,"{'WiFi': ""u'no'"", 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...",fd25NWbvLdNFJV_MoAljIg,4.0,Pretty solid vegan spot. Had a rice bowl there...,2017-10-11 00:28:10
576,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,Toronto,ON,3.0,"{'WiFi': ""u'no'"", 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...",AyxTBqPJjYqlY55vfNfB2Q,5.0,"What a great spot for a delicious, healthy lun...",2015-01-19 01:40:08
577,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,Toronto,ON,3.0,"{'WiFi': ""u'no'"", 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...",-KvhDluq5GhtiafY4xK2Sw,2.0,I really do love Bolt Fresh Bar. I've gone the...,2018-05-30 14:04:50
578,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,Toronto,ON,3.0,"{'WiFi': ""u'no'"", 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...",75DUiVsWkGZRW--zySVORA,4.0,"The vibe: Clean, bright vegetarian take out!\n...",2014-02-25 19:39:12
579,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,Toronto,ON,3.0,"{'WiFi': ""u'no'"", 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...",z23KjDT_pUtORz4ptHnUAg,2.0,I went here once to get the protein bowl and i...,2017-05-27 00:14:14


In [21]:
GTA_Top3.shape

(372028, 11)

In [22]:
GTA_Top3['business_id'].nunique()

6604

In [23]:
GTA_Top3['name'].value_counts()

Pai Northern Thai Kitchen    2177
Banh Mi Boys                 1636
Khao San Road                1467
KINKA IZAKAYA ORIGINAL       1425
KINTON RAMEN                 1288
                             ... 
Corks Beer & Wine Bars          3
Sushi Express                   3
Memories of Africa              3
Xin Yi Garden                   3
Habibi Lounge                   3
Name: name, Length: 5100, dtype: int64

In [24]:
GTA_Top3["text"].duplicated().value_counts()

False    371326
True        702
Name: text, dtype: int64

In [25]:
GTA_Top3.drop_duplicates(subset=['text'], inplace=True)

In [26]:
# Number of unique users in the GTA area

n_users = len(GTA_Top3.user_id.unique())
n_users 

91031

In [27]:
GTA_Top3['date'] = pd.to_datetime(GTA_Top3['date']).dt.date

In [28]:
GTA_Top3['date'] = pd.to_datetime(GTA_Top3['date']).dt.normalize()

In [29]:
# extract year from date column and drop original date column
GTA_Top3[['year']] = pd.DataFrame(GTA_Top3.date.dt.year)

GTA_Top3.drop(['date'], axis =1, inplace = True)
GTA_Top3.head()

Unnamed: 0,business_id,name,city,state,stars,attributes,categories,user_id,review_stars,text,year
575,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,Toronto,ON,3.0,"{'WiFi': ""u'no'"", 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...",fd25NWbvLdNFJV_MoAljIg,4.0,Pretty solid vegan spot. Had a rice bowl there...,2017
576,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,Toronto,ON,3.0,"{'WiFi': ""u'no'"", 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...",AyxTBqPJjYqlY55vfNfB2Q,5.0,"What a great spot for a delicious, healthy lun...",2015
577,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,Toronto,ON,3.0,"{'WiFi': ""u'no'"", 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...",-KvhDluq5GhtiafY4xK2Sw,2.0,I really do love Bolt Fresh Bar. I've gone the...,2018
578,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,Toronto,ON,3.0,"{'WiFi': ""u'no'"", 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...",75DUiVsWkGZRW--zySVORA,4.0,"The vibe: Clean, bright vegetarian take out!\n...",2014
579,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,Toronto,ON,3.0,"{'WiFi': ""u'no'"", 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...",z23KjDT_pUtORz4ptHnUAg,2.0,I went here once to get the protein bowl and i...,2017


In [30]:
GTA_Top3["year"].value_counts()

2017    76337
2018    68473
2016    67469
2015    55229
2014    36517
2013    23743
2012    16965
2011    13194
2010     8103
2009     4172
2008     1123
2007        1
Name: year, dtype: int64

In [31]:
GTA_clean = GTA_Top3[GTA_Top3['year'] >= 2014]

print(GTA_clean.shape)
GTA_clean.head()

(304025, 11)


Unnamed: 0,business_id,name,city,state,stars,attributes,categories,user_id,review_stars,text,year
575,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,Toronto,ON,3.0,"{'WiFi': ""u'no'"", 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...",fd25NWbvLdNFJV_MoAljIg,4.0,Pretty solid vegan spot. Had a rice bowl there...,2017
576,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,Toronto,ON,3.0,"{'WiFi': ""u'no'"", 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...",AyxTBqPJjYqlY55vfNfB2Q,5.0,"What a great spot for a delicious, healthy lun...",2015
577,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,Toronto,ON,3.0,"{'WiFi': ""u'no'"", 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...",-KvhDluq5GhtiafY4xK2Sw,2.0,I really do love Bolt Fresh Bar. I've gone the...,2018
578,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,Toronto,ON,3.0,"{'WiFi': ""u'no'"", 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...",75DUiVsWkGZRW--zySVORA,4.0,"The vibe: Clean, bright vegetarian take out!\n...",2014
579,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,Toronto,ON,3.0,"{'WiFi': ""u'no'"", 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...",z23KjDT_pUtORz4ptHnUAg,2.0,I went here once to get the protein bowl and i...,2017


In [32]:
GTA_clean.to_csv('../Data/GTA_clean.csv', index = False)

#### on the retail dataset
X = retail['post']
y = retail['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    stratify=y,
                                                    random_state=42)

# Fit & transform the vectorizer 
Xcvec_train = cvec.fit_transform(X_train)

# Transform
Xcvec_test = cvec.transform(X_test)