# Import Data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style(style='whitegrid')
import matplotlib.pyplot as plt

In [2]:
#sets the default options for viewing pandas dataframes
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 200)
#pd.set_option('display.width', 100)
pd.set_option('display.max_info_columns', 50)

In [3]:
path = '/Users/dmitriykats/Documents/SpringBoard/Springboard/Capstone2/true_review/data/'

In [4]:
df = pd.read_csv(f'{path}/raw/restaurant_reviews.csv', parse_dates=['date'])

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
df.columns

Index(['user_id', 'business_id', 'rev_stars', 'date', 'text', 'useful',
       'funny', 'cool', 'bus_id', 'name', 'neighborhood', 'address', 'city',
       'state', 'postal_code', 'latitude', 'longitude', 'bus_stars',
       'review_count', 'is_open', 'categories'],
      dtype='object')

In [6]:
#drop the bus_id columns, it's a duplicated of business_id
df = df.drop(columns='bus_id')

In [7]:
df['weekday'] = df.date.apply(lambda x: x.weekday())

In [8]:
#add a new colomn which calculates the number of characters in a given review
df['text length'] = df['text'].apply(len)

In [9]:
#add a year column
df['year'] = df.date.apply(lambda x: x.year)

In [10]:
scottsdale = df[df.city == 'Scottsdale']

In [11]:
print(f'Number of Users: {scottsdale.user_id.unique().shape[0]}')
print(f'Number of Restaurants: {scottsdale.business_id.unique().shape[0]}')
print(f'Number of Reviews: {scottsdale.text.unique().shape[0]}')

Number of Users: 76011
Number of Restaurants: 1322
Number of Reviews: 173062


### Looking at Checkins

In [12]:
df_check = pd.read_csv(f'{path}/external/yelp_checkin.csv')

In [13]:
df_check.head()

Unnamed: 0,business_id,weekday,hour,checkins
0,3Mc-LxcqeguOXOVT_2ZtCg,Tue,0:00,12
1,SVFx6_epO22bZTZnKwlX7g,Wed,0:00,4
2,vW9aLivd4-IorAfStzsHww,Tue,14:00,1
3,tEzxhauTQddACyqdJ0OPEQ,Fri,19:00,1
4,CEyZU32P-vtMhgqRCaXzMA,Tue,17:00,1


### Users and their friends

In [14]:
df_friends = pd.read_csv(f'{path}/external/yelp_user.csv', parse_dates=['yelping_since'])

In [15]:
df_friends.head()

Unnamed: 0,user_id,name,review_count,yelping_since,friends,useful,funny,cool,fans,elite,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,JJ-aSuM4pCFPdkfoZ34q0Q,Chris,10,2013-09-24,"0njfJmB-7n84DlIgUByCNw, rFn3Xe3RqHxRSxWOU19Gpg...",0,0,0,0,,3.7,0,0,0,0,0,0,0,0,0,0,0
1,uUzsFQn_6cXDh6rPNGbIFA,Tiffy,1,2017-03-02,,0,0,0,0,,2.0,0,0,0,0,0,0,0,0,0,0,0
2,mBneaEEH5EMyxaVyqS-72A,Mark,6,2015-03-13,,0,0,0,0,,4.67,0,0,0,0,0,0,0,0,0,0,0
3,W5mJGs-dcDWRGEhAzUYtoA,Evelyn,3,2016-09-08,,0,0,0,0,,4.67,0,0,0,0,0,0,0,0,0,0,0
4,4E8--zUZO1Rr1IBK4_83fg,Lisa,11,2012-07-16,,4,0,0,0,,3.45,0,0,0,0,0,0,0,0,0,1,0


In [16]:
df_friends.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1326100 entries, 0 to 1326099
Data columns (total 22 columns):
user_id               1326100 non-null object
name                  1325603 non-null object
review_count          1326100 non-null int64
yelping_since         1326100 non-null datetime64[ns]
friends               1326100 non-null object
useful                1326100 non-null int64
funny                 1326100 non-null int64
cool                  1326100 non-null int64
fans                  1326100 non-null int64
elite                 1326100 non-null object
average_stars         1326100 non-null float64
compliment_hot        1326100 non-null int64
compliment_more       1326100 non-null int64
compliment_profile    1326100 non-null int64
compliment_cute       1326100 non-null int64
compliment_list       1326100 non-null int64
compliment_note       1326100 non-null int64
compliment_plain      1326100 non-null int64
compliment_cool       1326100 non-null int64
compliment_funny 

In [17]:
#filter the friends dataframe to only include users with friends
df_friends['friends'] = df_friends.friends.apply(lambda x: np.nan if x == 'None' else x)
df_friends = df_friends.dropna()

In [18]:
df_friends.head()

Unnamed: 0,user_id,name,review_count,yelping_since,friends,useful,funny,cool,fans,elite,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,JJ-aSuM4pCFPdkfoZ34q0Q,Chris,10,2013-09-24,"0njfJmB-7n84DlIgUByCNw, rFn3Xe3RqHxRSxWOU19Gpg...",0,0,0,0,,3.7,0,0,0,0,0,0,0,0,0,0,0
17,h5ERTYn2vQ1QbjTZvfWPaA,Bobby,3,2017-01-07,"jYiZnueCr7gVq9T34xoa7g, yFLXGdY6rpHt7hRiwEFMag...",0,0,0,0,,3.33,0,0,0,0,0,0,0,0,0,0,0
18,jYnkJR3T8yCERXywoVhWYA,Hugo,48,2010-07-06,"hkXekeW_Jj6mIy8r8N7r1Q, dQDpV-VUtwYGqHznuRV-yw...",15,6,2,3,,3.73,2,0,0,0,0,1,1,1,1,2,0
44,fV8Yr0c5tFQTQ2SRRJHXHw,Michelle,50,2007-08-22,"HDb4fBWIAQ-foS8qLJty9w, x0hBZsmBTYxhjjx0MShz1A...",43,12,5,1,2009,3.96,2,4,0,0,0,9,7,7,7,4,0
74,aw973Pm1nrTbRjP4zY9B9g,Kenny,762,2008-09-23,"Cit5yho-DqotA0BnXHErTQ, bm2DqfP4P454FjEtCbZdkQ...",174,151,67,27,"2014, 2016, 2013, 2015, 2010, 2012, 2011",3.6,20,4,2,7,0,16,52,47,47,14,1


In [19]:
#Further filter the dataframe to only include users with over 200 reviews
df_power_user = df_friends[df_friends.review_count > 200]

In [20]:
df_power_user['num_friends'] = df_power_user.friends.apply(lambda x: x.count(',') + 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [21]:
df_power_user['year'] = df_power_user.yelping_since.apply(lambda x: x.year)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [22]:
#list of users with over 200 reviews and friends
power_user = list(df_power_user.user_id.unique())

### Dataframe for Scottsdale, with power users only

In [23]:
df.head()

Unnamed: 0,user_id,business_id,rev_stars,date,text,useful,funny,cool,name,neighborhood,address,city,state,postal_code,latitude,longitude,bus_stars,review_count,is_open,categories,weekday,text length,year
0,DUfWxxewcJgGcc1Y189XSQ,ZcAoKSDT6BNcWsn3o4-WVA,2,2017-01-07,Cool place packed to the rafters. Menu is pret...,1,0,0,"""Me & Julio""",,"""2784 S Fish Hatchery Rd""",Fitchburg,WI,53711,43.005728,-89.427154,3.0,102,1,Restaurants;Mexican;Tex-Mex;American (Traditio...,5,369,2017
1,oAJf-_URH4QBMs2ZMpjUyA,9VhZRkmyoEho58vmnMh_Hg,3,2016-04-11,"I've had mixed experiences at this Montana's, ...",0,0,0,"""Montana's BBQ & Bar""",,"""8735 Yonge St""",Richmond Hill,ON,L4C 6Z1,43.840823,-79.428224,3.0,49,1,Comfort Food;Restaurants;American (Traditional...,0,524,2016
2,SOb8RgcqBGbwlq9LZXy3fQ,IgOaTag1zhIXndxngP0M3w,1,2016-09-21,Die Angaben hier sind leider nicht mehr ganz k...,0,0,0,"""Restaurant Ganesha""",,"""Auberlenstr. 40""",Fellbach,BW,70736,48.818648,9.270925,3.0,8,1,Restaurants;Indian,2,125,2016
3,Iwe-xN6dnugdZ0KiQjO6Hg,Fvd8qeJCXFbMYQk45S7iBQ,5,2014-06-30,My family came in while waiting for our time a...,0,0,0,"""Las Fuentes Mexican Grill""",,"""13621 N Litchfield Rd""",Surprise,AZ,85374,33.607472,-112.355941,3.5,151,1,Mexican;Restaurants,0,256,2014
4,VSCpdfEtjdWjTbBVKMb5eA,mF2EW3twSrFPmT_RVV1-Qg,2,2011-11-18,I would not recommend this place for Chinese f...,3,5,0,"""House of Hunan""",,"""18 Public Sq""",Medina,OH,44256,41.138544,-81.864299,3.0,83,1,Restaurants;Chinese,4,992,2011


In [24]:
sdl_df = df[(df.city == 'Scottsdale') & (df.user_id.isin(power_user))]

In [25]:
sdl_df.head()

Unnamed: 0,user_id,business_id,rev_stars,date,text,useful,funny,cool,name,neighborhood,address,city,state,postal_code,latitude,longitude,bus_stars,review_count,is_open,categories,weekday,text length,year
284676,771OWzbzelsEeSlx8QsfsQ,orMlHMLyHSldYgzfFTaeoA,4,2011-07-15,I really like this place. I have been numerous...,1,0,0,"""Eddie's House""",,"""7042 E Indian School Rd""",Scottsdale,AZ,85251,33.495235,-111.929214,3.5,317,0,Seafood;American (New);Restaurants,4,509,2011
284680,Pj9YpaP0T7A_5S_PT8IpNg,OtVNeY6IfUh3CIOCzmfg3A,5,2012-08-15,Greasewood Flat is hugely popular with winter ...,4,3,2,"""Greasewood Flat""",,"""27375 N Alma School Pkwy""",Scottsdale,AZ,85255,33.731797,-111.846131,4.0,123,0,Burgers;Restaurants;Hot Dogs;Nightlife;Bars,2,1463,2012
284687,7M1zIE6OzpySDlqLU6MnEg,is2RonWgyENNKOprcXQK6w,5,2011-11-04,3 times in 2 weeks should tell you thy this pl...,3,2,2,"""Osha Thai Cafe""",,"""10953 N Frank Lloyd Wright Blvd, Ste 102""",Scottsdale,AZ,85259,33.585577,-111.834405,4.0,114,1,Salad;Restaurants;Soup;Thai,4,394,2011
284690,9uE0smG2bwgkI95RPj0lPQ,sKrlmbrZWCyLIgiMihCPqw,5,2015-11-16,I still love Hula's! Came here for a celebrat...,1,0,1,"""Hula's Modern Tiki Scottsdale""",,"""7213 E 1st Ave""",Scottsdale,AZ,85251,33.493853,-111.925625,4.0,509,1,Hawaiian;Restaurants;Pan Asian,0,560,2015
284691,iN7T3vQoC3v_MPs2TzH5Zw,ghRXtBi--SW8uWQhrijBjw,4,2013-11-25,"In the daytime, Old Town is a cool destination...",3,0,3,"""Old Town Scottsdale""",,"""""",Scottsdale,AZ,85251,33.498629,-111.92244,4.0,106,1,Shopping Centers;Transportation;Public Service...,0,1027,2013


In [26]:
print(f'Number of Users: {sdl_df.user_id.unique().shape[0]}')
print(f'Number of Restaurants: {sdl_df.business_id.unique().shape[0]}')
print(f'Number of Reviews: {sdl_df.text.unique().shape[0]}')

Number of Users: 3119
Number of Restaurants: 1210
Number of Reviews: 24738


In [27]:
stops = ['Restaurants', 'Food', 'Nightlife']

In [28]:
sdl_df['split_categories'] = sdl_df.categories.apply(lambda x: x.split(';'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [29]:
sdl_df.head()

Unnamed: 0,user_id,business_id,rev_stars,date,text,useful,funny,cool,name,neighborhood,address,city,state,postal_code,latitude,longitude,bus_stars,review_count,is_open,categories,weekday,text length,year,split_categories
284676,771OWzbzelsEeSlx8QsfsQ,orMlHMLyHSldYgzfFTaeoA,4,2011-07-15,I really like this place. I have been numerous...,1,0,0,"""Eddie's House""",,"""7042 E Indian School Rd""",Scottsdale,AZ,85251,33.495235,-111.929214,3.5,317,0,Seafood;American (New);Restaurants,4,509,2011,"[Seafood, American (New), Restaurants]"
284680,Pj9YpaP0T7A_5S_PT8IpNg,OtVNeY6IfUh3CIOCzmfg3A,5,2012-08-15,Greasewood Flat is hugely popular with winter ...,4,3,2,"""Greasewood Flat""",,"""27375 N Alma School Pkwy""",Scottsdale,AZ,85255,33.731797,-111.846131,4.0,123,0,Burgers;Restaurants;Hot Dogs;Nightlife;Bars,2,1463,2012,"[Burgers, Restaurants, Hot Dogs, Nightlife, Bars]"
284687,7M1zIE6OzpySDlqLU6MnEg,is2RonWgyENNKOprcXQK6w,5,2011-11-04,3 times in 2 weeks should tell you thy this pl...,3,2,2,"""Osha Thai Cafe""",,"""10953 N Frank Lloyd Wright Blvd, Ste 102""",Scottsdale,AZ,85259,33.585577,-111.834405,4.0,114,1,Salad;Restaurants;Soup;Thai,4,394,2011,"[Salad, Restaurants, Soup, Thai]"
284690,9uE0smG2bwgkI95RPj0lPQ,sKrlmbrZWCyLIgiMihCPqw,5,2015-11-16,I still love Hula's! Came here for a celebrat...,1,0,1,"""Hula's Modern Tiki Scottsdale""",,"""7213 E 1st Ave""",Scottsdale,AZ,85251,33.493853,-111.925625,4.0,509,1,Hawaiian;Restaurants;Pan Asian,0,560,2015,"[Hawaiian, Restaurants, Pan Asian]"
284691,iN7T3vQoC3v_MPs2TzH5Zw,ghRXtBi--SW8uWQhrijBjw,4,2013-11-25,"In the daytime, Old Town is a cool destination...",3,0,3,"""Old Town Scottsdale""",,"""""",Scottsdale,AZ,85251,33.498629,-111.92244,4.0,106,1,Shopping Centers;Transportation;Public Service...,0,1027,2013,"[Shopping Centers, Transportation, Public Serv..."


In [30]:
sdl_df['split_categories'] = sdl_df.split_categories.apply(lambda x: [i for i in x if i not in stops])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [31]:
sdl_df.head()

Unnamed: 0,user_id,business_id,rev_stars,date,text,useful,funny,cool,name,neighborhood,address,city,state,postal_code,latitude,longitude,bus_stars,review_count,is_open,categories,weekday,text length,year,split_categories
284676,771OWzbzelsEeSlx8QsfsQ,orMlHMLyHSldYgzfFTaeoA,4,2011-07-15,I really like this place. I have been numerous...,1,0,0,"""Eddie's House""",,"""7042 E Indian School Rd""",Scottsdale,AZ,85251,33.495235,-111.929214,3.5,317,0,Seafood;American (New);Restaurants,4,509,2011,"[Seafood, American (New)]"
284680,Pj9YpaP0T7A_5S_PT8IpNg,OtVNeY6IfUh3CIOCzmfg3A,5,2012-08-15,Greasewood Flat is hugely popular with winter ...,4,3,2,"""Greasewood Flat""",,"""27375 N Alma School Pkwy""",Scottsdale,AZ,85255,33.731797,-111.846131,4.0,123,0,Burgers;Restaurants;Hot Dogs;Nightlife;Bars,2,1463,2012,"[Burgers, Hot Dogs, Bars]"
284687,7M1zIE6OzpySDlqLU6MnEg,is2RonWgyENNKOprcXQK6w,5,2011-11-04,3 times in 2 weeks should tell you thy this pl...,3,2,2,"""Osha Thai Cafe""",,"""10953 N Frank Lloyd Wright Blvd, Ste 102""",Scottsdale,AZ,85259,33.585577,-111.834405,4.0,114,1,Salad;Restaurants;Soup;Thai,4,394,2011,"[Salad, Soup, Thai]"
284690,9uE0smG2bwgkI95RPj0lPQ,sKrlmbrZWCyLIgiMihCPqw,5,2015-11-16,I still love Hula's! Came here for a celebrat...,1,0,1,"""Hula's Modern Tiki Scottsdale""",,"""7213 E 1st Ave""",Scottsdale,AZ,85251,33.493853,-111.925625,4.0,509,1,Hawaiian;Restaurants;Pan Asian,0,560,2015,"[Hawaiian, Pan Asian]"
284691,iN7T3vQoC3v_MPs2TzH5Zw,ghRXtBi--SW8uWQhrijBjw,4,2013-11-25,"In the daytime, Old Town is a cool destination...",3,0,3,"""Old Town Scottsdale""",,"""""",Scottsdale,AZ,85251,33.498629,-111.92244,4.0,106,1,Shopping Centers;Transportation;Public Service...,0,1027,2013,"[Shopping Centers, Transportation, Public Serv..."


In [33]:
type(sdl_df.split_categories.iloc[0])

list

In [34]:
sdl_df['string_categories'] = sdl_df.split_categories.apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [35]:
sdl_df.head()

Unnamed: 0,user_id,business_id,rev_stars,date,text,useful,funny,cool,name,neighborhood,address,city,state,postal_code,latitude,longitude,bus_stars,review_count,is_open,categories,weekday,text length,year,split_categories,string_categories
284676,771OWzbzelsEeSlx8QsfsQ,orMlHMLyHSldYgzfFTaeoA,4,2011-07-15,I really like this place. I have been numerous...,1,0,0,"""Eddie's House""",,"""7042 E Indian School Rd""",Scottsdale,AZ,85251,33.495235,-111.929214,3.5,317,0,Seafood;American (New);Restaurants,4,509,2011,"[Seafood, American (New)]",Seafood American (New)
284680,Pj9YpaP0T7A_5S_PT8IpNg,OtVNeY6IfUh3CIOCzmfg3A,5,2012-08-15,Greasewood Flat is hugely popular with winter ...,4,3,2,"""Greasewood Flat""",,"""27375 N Alma School Pkwy""",Scottsdale,AZ,85255,33.731797,-111.846131,4.0,123,0,Burgers;Restaurants;Hot Dogs;Nightlife;Bars,2,1463,2012,"[Burgers, Hot Dogs, Bars]",Burgers Hot Dogs Bars
284687,7M1zIE6OzpySDlqLU6MnEg,is2RonWgyENNKOprcXQK6w,5,2011-11-04,3 times in 2 weeks should tell you thy this pl...,3,2,2,"""Osha Thai Cafe""",,"""10953 N Frank Lloyd Wright Blvd, Ste 102""",Scottsdale,AZ,85259,33.585577,-111.834405,4.0,114,1,Salad;Restaurants;Soup;Thai,4,394,2011,"[Salad, Soup, Thai]",Salad Soup Thai
284690,9uE0smG2bwgkI95RPj0lPQ,sKrlmbrZWCyLIgiMihCPqw,5,2015-11-16,I still love Hula's! Came here for a celebrat...,1,0,1,"""Hula's Modern Tiki Scottsdale""",,"""7213 E 1st Ave""",Scottsdale,AZ,85251,33.493853,-111.925625,4.0,509,1,Hawaiian;Restaurants;Pan Asian,0,560,2015,"[Hawaiian, Pan Asian]",Hawaiian Pan Asian
284691,iN7T3vQoC3v_MPs2TzH5Zw,ghRXtBi--SW8uWQhrijBjw,4,2013-11-25,"In the daytime, Old Town is a cool destination...",3,0,3,"""Old Town Scottsdale""",,"""""",Scottsdale,AZ,85251,33.498629,-111.92244,4.0,106,1,Shopping Centers;Transportation;Public Service...,0,1027,2013,"[Shopping Centers, Transportation, Public Serv...",Shopping Centers Transportation Public Service...


In [36]:
sdl_df.to_csv('sdl_data.csv', encoding='utf-8')