# Feature selection for Final Dataset

In [1]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from datetime import datetime
import re
from collections import Counter

# User Features

In [2]:
user_df = pd.read_csv("../data/user.tsv", sep='\t', low_memory=False)
user_df.head()

Unnamed: 0,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,...,cool,elite,fans,friends,funny,name,review_count,useful,user_id,yelping_since
0,3.98,2957,272,2957,3314,31,206,1300,1497,1353,...,18667,201020112012,424,"gZpsf8ZCCtksN702b0xvcg, 8U8Z7cyA7bwMZVLQ2Cmktw...",15521,Tram,2467,19997,0QeJC2inz6P-OVzROU_LNw,2009-03-10 13:55:45
1,3.94,61,6,61,67,1,1,16,6,37,...,377,20072008200920102011,28,"Pkq6CszRZ6zjqJjjrm8f5g, T32n_y9R2lucMf9P2Lc8Rw...",163,Valarie,250,561,jJlYRJXPdIolKbuqGiZ-CQ,2006-05-04 21:15:47
2,3.55,1,0,1,2,0,0,4,1,3,...,33,2011201220132014,4,"riS5TJ4iyQK0L9g4QlhO8w, 4rU8Yyfa9dfUiMqfnX5jdg...",26,Yvonne,249,120,jQlrCll7ygS1ZS1vyEi94A,2010-09-29 06:45:52
3,4.21,5,1,5,1,0,1,1,0,2,...,58,,7,"rgEqPUDzo3zkq1ociN_Xqw, FEXWp5u90I88mbILkTLlvQ...",42,Jenn,63,125,DyQpYGALYV-WmKfb5rThGw,2011-01-14 21:18:08
4,3.4,44,0,44,15,1,5,18,2,14,...,226,201120122013201420152016,18,"c5XbGsVMiyC6RYGutY_cZg, CnQCgS1GIffv7l6vGgMvYA...",182,Asina,263,666,an-ijVpGQxgiOBX9qTBOyQ,2010-12-01 19:40:25


Remove users who never gave reviews

In [3]:
user_df = user_df[user_df['review_count'] > 0]

In [4]:
nb_user = len(user_df)
nb_user

1636115

In [5]:
NAs = dict()
for i in range(len(user_df.columns)):
    NAs[user_df.columns[i]] = sum(pd.isna(user_df[user_df.columns[i]]))*100/nb_user
    
for key in sorted(NAs, key=NAs.get, reverse=True):
    print('NA for {:35s}: {:.2f}%'.format(key, NAs[key]))

NA for elite                              : 95.67%
NA for name                               : 0.00%
NA for average_stars                      : 0.00%
NA for compliment_cool                    : 0.00%
NA for compliment_cute                    : 0.00%
NA for compliment_funny                   : 0.00%
NA for compliment_hot                     : 0.00%
NA for compliment_list                    : 0.00%
NA for compliment_more                    : 0.00%
NA for compliment_note                    : 0.00%
NA for compliment_photos                  : 0.00%
NA for compliment_plain                   : 0.00%
NA for compliment_profile                 : 0.00%
NA for compliment_writer                  : 0.00%
NA for cool                               : 0.00%
NA for fans                               : 0.00%
NA for friends                            : 0.00%
NA for funny                              : 0.00%
NA for review_count                       : 0.00%
NA for useful                             : 0.00%

Add seniority feature

In [6]:
user_df['yelping_since'] = list(map(lambda d: datetime.strptime(d, '%Y-%m-%d %H:%M:%S'), user_df['yelping_since']))

In [7]:
user_df['seniority'] = list(map(lambda d: int((datetime.strptime('2019-12-31', '%Y-%m-%d')-d).days), user_df['yelping_since']))

Add number of friends features

In [8]:
user_df['nb_friends'] = list(map(lambda d: len(d.split(',')), user_df['friends']))

Remove elite and generic columns

In [9]:
user_df.drop(['elite','name', 'yelping_since', 'friends'], axis=1, inplace=True)

In [10]:
user_df.head()

Unnamed: 0,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,compliment_profile,compliment_writer,cool,fans,funny,review_count,useful,user_id,seniority,nb_friends
0,3.98,2957,272,2957,3314,31,206,1300,1497,1353,206,1175,18667,424,15521,2467,19997,0QeJC2inz6P-OVzROU_LNw,3947,1703
1,3.94,61,6,61,67,1,1,16,6,37,4,5,377,28,163,250,561,jJlYRJXPdIolKbuqGiZ-CQ,4988,366
2,3.55,1,0,1,2,0,0,4,1,3,0,1,33,4,26,249,120,jQlrCll7ygS1ZS1vyEi94A,3379,30
3,4.21,5,1,5,1,0,1,1,0,2,0,0,58,7,42,63,125,DyQpYGALYV-WmKfb5rThGw,3272,623
4,3.4,44,0,44,15,1,5,18,2,14,1,25,226,18,182,263,666,an-ijVpGQxgiOBX9qTBOyQ,3316,332


In [11]:
user_describe = user_df.describe(percentiles=[.25, .5, .75, 0.9, 0.95, 0.99])
user_describe

Unnamed: 0,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,compliment_profile,compliment_writer,cool,fans,funny,review_count,useful,seniority,nb_friends
count,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0
mean,3.681354,3.000206,0.1833374,3.000206,2.143368,0.08095947,0.3240047,1.463122,1.091794,2.991392,0.2065613,1.128603,21.1816,1.435496,17.37816,22.13325,39.62589,2204.301,45.81048
std,1.151272,85.93341,12.5863,85.93341,74.03846,10.88376,13.5676,62.86987,94.43429,90.66626,16.20711,31.97,381.726,15.24304,314.4796,75.32858,446.8724,946.3905,135.0935
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,411.0,1.0
25%,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1480.0,1.0
50%,3.89,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,2108.0,2.0
75%,4.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,15.0,13.0,2890.0,39.0
90%,5.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,13.0,2.0,13.0,45.0,48.0,3475.0,128.0
95%,5.0,3.0,0.0,3.0,1.0,0.0,1.0,2.0,1.0,3.0,0.0,2.0,36.0,4.0,33.0,89.0,110.0,3850.0,218.0
99%,5.0,31.0,2.0,31.0,19.0,1.0,5.0,17.0,7.0,27.0,2.0,15.0,294.0,26.0,236.0,302.0,587.0,4523.0,536.0


In [12]:
columns_to_remove = []
for i in range(len(user_describe.columns)):
    q0 = user_describe[user_describe.columns[i]]['min']
    q90 = user_describe[user_describe.columns[i]]['90%']
    if q0 == q90:
        columns_to_remove.append(user_describe.columns[i])
columns_to_remove

['compliment_cute',
 'compliment_hot',
 'compliment_list',
 'compliment_more',
 'compliment_photos',
 'compliment_profile',
 'compliment_writer']

In [13]:
user_df.drop(columns_to_remove, axis=1, inplace=True)

In [14]:
user_df.head()

Unnamed: 0,average_stars,compliment_cool,compliment_funny,compliment_note,compliment_plain,cool,fans,funny,review_count,useful,user_id,seniority,nb_friends
0,3.98,2957,2957,1300,1353,18667,424,15521,2467,19997,0QeJC2inz6P-OVzROU_LNw,3947,1703
1,3.94,61,61,16,37,377,28,163,250,561,jJlYRJXPdIolKbuqGiZ-CQ,4988,366
2,3.55,1,1,4,3,33,4,26,249,120,jQlrCll7ygS1ZS1vyEi94A,3379,30
3,4.21,5,5,1,2,58,7,42,63,125,DyQpYGALYV-WmKfb5rThGw,3272,623
4,3.4,44,44,18,14,226,18,182,263,666,an-ijVpGQxgiOBX9qTBOyQ,3316,332


In [15]:
user_df.describe(percentiles=[.25, .5, .75, 0.9, 0.95, 0.99])

Unnamed: 0,average_stars,compliment_cool,compliment_funny,compliment_note,compliment_plain,cool,fans,funny,review_count,useful,seniority,nb_friends
count,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0,1636115.0
mean,3.681354,3.000206,3.000206,1.463122,2.991392,21.1816,1.435496,17.37816,22.13325,39.62589,2204.301,45.81048
std,1.151272,85.93341,85.93341,62.86987,90.66626,381.726,15.24304,314.4796,75.32858,446.8724,946.3905,135.0935
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,411.0,1.0
25%,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1480.0,1.0
50%,3.89,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,2108.0,2.0
75%,4.6,0.0,0.0,0.0,0.0,3.0,0.0,3.0,15.0,13.0,2890.0,39.0
90%,5.0,1.0,1.0,1.0,1.0,13.0,2.0,13.0,45.0,48.0,3475.0,128.0
95%,5.0,3.0,3.0,2.0,3.0,36.0,4.0,33.0,89.0,110.0,3850.0,218.0
99%,5.0,31.0,31.0,17.0,27.0,294.0,26.0,236.0,302.0,587.0,4523.0,536.0


In [16]:
user_features = list(user_df.columns)
user_features

['average_stars',
 'compliment_cool',
 'compliment_funny',
 'compliment_note',
 'compliment_plain',
 'cool',
 'fans',
 'funny',
 'review_count',
 'useful',
 'user_id',
 'seniority',
 'nb_friends']

# Business features

In [17]:
business_df = pd.read_csv("../data/business.tsv", sep='\t', low_memory=False)
business_df.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
0,4 E University Dr,"{'RestaurantsTakeOut': 'True', 'RestaurantsPri...",6KgGE8B1RsR7jc9R5nuH0Q,"American (Traditional), Restaurants",Tempe,,0,33.422192,-111.939615,Ruby Tuesday,85281,9,2.5,AZ
1,5588 Yonge Street,"{'RestaurantsPriceRange2': '2', 'RestaurantsAt...",emyCP3Ry2SbpNrwRAtm9PQ,"Restaurants, Hot Pot, Korean, Asian Fusion",North York,"{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",1,43.77941,-79.415798,Pyung Won House,M2N 5S2,66,2.5,ON
2,617 Mount Pleasant Road,"{'BusinessParking': ""{'garage': False, 'street...",YH8Bn-7pLR-SFR8MCgQj1w,"Bakeries, Food, Mediterranean, French, Restaur...",Toronto,"{'Tuesday': '8:0-18:0', 'Wednesday': '8:0-18:0...",1,43.704166,-79.388212,Jules Cafe Patisserie,M4S 2M5,44,3.5,ON
3,5969 State Rd,,RLyqeVI4a-019BRK-9IhzQ,"Shoe Repair, Local Services",Cleveland,,1,41.400652,-81.710151,Angelo's Cobbler Shoppe,44134,3,5.0,OH
4,296 Rue Champlain,"{'OutdoorSeating': 'True', 'RestaurantsReserva...",OY1kLAhs9I6Ix4wUmSNAfQ,"Pubs, Gastropubs, Restaurants, Nightlife, Bars",Saint-Jean-sur-Richelieu,"{'Tuesday': '11:0-3:0', 'Wednesday': '11:0-3:0...",1,45.310162,-73.252416,Morgane Bistro & Pub,J3B 6W2,4,4.0,QC


Remove business which don't reviews

In [18]:
business_df = business_df[business_df['review_count'] > 0]

In [19]:
nb_business = len(business_df)
nb_business

191609

In [20]:
NAs = dict()
for i in range(len(business_df.columns)):
    NAs[business_df.columns[i]] = sum(pd.isna(business_df[business_df.columns[i]]))*100/nb_business
    
for key in sorted(NAs, key=NAs.get, reverse=True):
    print('NA for {:35s}: {:.2f}%'.format(key, NAs[key]))

NA for hours                              : 23.27%
NA for attributes                         : 14.97%
NA for address                            : 3.99%
NA for postal_code                        : 0.34%
NA for categories                         : 0.25%
NA for city                               : 0.00%
NA for business_id                        : 0.00%
NA for is_open                            : 0.00%
NA for latitude                           : 0.00%
NA for longitude                          : 0.00%
NA for name                               : 0.00%
NA for review_count                       : 0.00%
NA for stars                              : 0.00%
NA for state                              : 0.00%


In [21]:
business_df.dropna(inplace=True)

In [22]:
business_df.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
1,5588 Yonge Street,"{'RestaurantsPriceRange2': '2', 'RestaurantsAt...",emyCP3Ry2SbpNrwRAtm9PQ,"Restaurants, Hot Pot, Korean, Asian Fusion",North York,"{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",1,43.77941,-79.415798,Pyung Won House,M2N 5S2,66,2.5,ON
2,617 Mount Pleasant Road,"{'BusinessParking': ""{'garage': False, 'street...",YH8Bn-7pLR-SFR8MCgQj1w,"Bakeries, Food, Mediterranean, French, Restaur...",Toronto,"{'Tuesday': '8:0-18:0', 'Wednesday': '8:0-18:0...",1,43.704166,-79.388212,Jules Cafe Patisserie,M4S 2M5,44,3.5,ON
4,296 Rue Champlain,"{'OutdoorSeating': 'True', 'RestaurantsReserva...",OY1kLAhs9I6Ix4wUmSNAfQ,"Pubs, Gastropubs, Restaurants, Nightlife, Bars",Saint-Jean-sur-Richelieu,"{'Tuesday': '11:0-3:0', 'Wednesday': '11:0-3:0...",1,45.310162,-73.252416,Morgane Bistro & Pub,J3B 6W2,4,4.0,QC
6,1216 East Washington Ave,"{'Caters': 'False', 'RestaurantsTakeOut': 'Tru...",J0P152h7wimvdJ-aV0QLug,"Food, Coffee & Tea",Madison,"{'Monday': '6:0-19:0', 'Tuesday': '6:0-19:0', ...",1,43.086294,-89.368132,Stone Creek Coffee,53703,38,4.5,WI
8,2541 N Cherry Rd,"{'RestaurantsTakeOut': 'True', 'BikeParking': ...",pvXdMR9tcQlwXcXJLllmPg,"Automotive, Convenience Stores, Gas Stations, ...",Rock Hill,"{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",0,34.971428,-80.990601,Kangaroo Express,29733,3,4.0,SC


In [23]:
def unnest_dictionary(nested_dico):
    unnested_dico = nested_dico.copy()
    
    finished = False
    while not finished:
        finished = True

        keys = list(unnested_dico.keys())

        for key in keys:

            attributes = str(unnested_dico[key])
            if re.match(r'{.*:.*}', attributes):

                # Sub dico with the keys
                key_dico = eval(attributes)
                sub_dico = {'{}_{}'.format(key, elt): str(key_dico[elt]) for elt in key_dico}

                # Update output dictionary
                unnested_dico.pop(key, None)
                unnested_dico = {**unnested_dico, **sub_dico}

                # Since we found at least one value which one a dictionary we set finished
                # to false to do another loop and the check if the value that was unnested
                # could be even more unnested
                finished = False

    unnested_dico = {key: re.sub(r"[a-z]'(.*)'", r'\1', str(unnested_dico[key])) for key in unnested_dico}
    unnested_dico = {key: re.sub(r"'", '', unnested_dico[key]) for key in unnested_dico}
    return unnested_dico

In [24]:
nested_key = 'attributes'
business_df[nested_key] = list(map(lambda x: unnest_dictionary(eval(str(x))), business_df[nested_key].values))

In [25]:
nested_key = 'hours'
business_df[nested_key] = list(map(lambda x: unnest_dictionary(eval(str(x))), business_df[nested_key].values))

In [26]:
business_df.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
1,5588 Yonge Street,"{'RestaurantsPriceRange2': '2', 'RestaurantsAt...",emyCP3Ry2SbpNrwRAtm9PQ,"Restaurants, Hot Pot, Korean, Asian Fusion",North York,"{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",1,43.77941,-79.415798,Pyung Won House,M2N 5S2,66,2.5,ON
2,617 Mount Pleasant Road,"{'RestaurantsAttire': 'casual', 'RestaurantsGo...",YH8Bn-7pLR-SFR8MCgQj1w,"Bakeries, Food, Mediterranean, French, Restaur...",Toronto,"{'Tuesday': '8:0-18:0', 'Wednesday': '8:0-18:0...",1,43.704166,-79.388212,Jules Cafe Patisserie,M4S 2M5,44,3.5,ON
4,296 Rue Champlain,"{'OutdoorSeating': 'True', 'RestaurantsReserva...",OY1kLAhs9I6Ix4wUmSNAfQ,"Pubs, Gastropubs, Restaurants, Nightlife, Bars",Saint-Jean-sur-Richelieu,"{'Tuesday': '11:0-3:0', 'Wednesday': '11:0-3:0...",1,45.310162,-73.252416,Morgane Bistro & Pub,J3B 6W2,4,4.0,QC
6,1216 East Washington Ave,"{'Caters': 'False', 'RestaurantsTakeOut': 'Tru...",J0P152h7wimvdJ-aV0QLug,"Food, Coffee & Tea",Madison,"{'Monday': '6:0-19:0', 'Tuesday': '6:0-19:0', ...",1,43.086294,-89.368132,Stone Creek Coffee,53703,38,4.5,WI
8,2541 N Cherry Rd,"{'RestaurantsTakeOut': 'True', 'BikeParking': ...",pvXdMR9tcQlwXcXJLllmPg,"Automotive, Convenience Stores, Gas Stations, ...",Rock Hill,"{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",0,34.971428,-80.990601,Kangaroo Express,29733,3,4.0,SC


In [27]:
business_df = pd.concat([business_df.drop(['attributes'], axis=1), business_df['attributes'].apply(pd.Series)], axis=1)

In [28]:
business_df = pd.concat([business_df.drop(['hours'], axis=1), business_df['hours'].apply(pd.Series)], axis=1)

In [29]:
nb_business = len(business_df)
nb_business

123933

In [30]:
NAs = dict()
for i in range(len(business_df.columns)):
    NAs[business_df.columns[i]] = sum(pd.isna(business_df[business_df.columns[i]]))*100/nb_business

In [31]:
for key in sorted(NAs, key=NAs.get, reverse=True):
    print('NA for {:35s}: {:.2f}%'.format(key, NAs[key]))

NA for BestNights                         : 99.99%
NA for DietaryRestrictions                : 99.99%
NA for RestaurantsCounterService          : 99.99%
NA for Open24Hours                        : 99.99%
NA for BYOB                               : 99.98%
NA for HairSpecializesIn                  : 99.98%
NA for DietaryRestrictions_dairy-free     : 99.96%
NA for DietaryRestrictions_gluten-free    : 99.96%
NA for DietaryRestrictions_vegan          : 99.96%
NA for DietaryRestrictions_kosher         : 99.96%
NA for DietaryRestrictions_halal          : 99.96%
NA for DietaryRestrictions_soy-free       : 99.96%
NA for DietaryRestrictions_vegetarian     : 99.96%
NA for AgesAllowed                        : 99.91%
NA for Music                              : 99.86%
NA for GoodForMeal                        : 99.85%
NA for Ambience                           : 99.67%
NA for Corkage                            : 99.57%
NA for HairSpecializesIn_straightperms    : 99.38%
NA for HairSpecializesIn_africa

In [32]:
d = dict(Counter(business_df['categories']))
categories = list(set(business_df['categories'].values))
values = [d[x] for x in categories]
category_count = pd.DataFrame.from_dict({'Category': categories, 'Nb': values})

In [33]:
category_count.head()

Unnamed: 0,Category,Nb
0,"French, Italian, Restaurants",4
1,"Nightlife, Breweries, Food, Bars, Beer Gardens...",1
2,"Fashion, Men's Clothing, Accessories, Women's ...",1
3,"Lounges, Adult Entertainment, Strip Clubs, Nig...",1
4,"Nightlife, Art Galleries, Music Venues, Person...",1


In [34]:
len(category_count['Nb'])

72082

# Review Features

In [35]:
review_df = pd.read_csv("../data/review.tsv", sep='\t', low_memory=False)
review_df.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,039QG6ks4UYNkMA3mcII2A,0.0,2012-05-28 16:42:56,0.0,qVBtM3H5xFUMweX1qGsOnA,5.0,My boyfriend and I were really looking forward...,8.0,sThIuQa4EE8WocfxVFWMlA
1,vx4YAA02Qz6khRD1fZ1MFA,1.0,2012-06-25 23:45:43,0.0,s8x0OQNXxIlhepz1m_ofjg,3.0,Not bad at all. The standard dishes - chinese...,2.0,llyK5_o-6L9H5QG2rcROmA
2,k2b3niokS_tosjah_rzCPw,5.0,2017-02-19 06:21:58,2.0,gi_-cTGatAcR5Ko_AI1Paw,4.0,I ordered the Heart Shaped pizza and meatball ...,4.0,uc9ITBuspRFkl-S3Bo90dg
3,77h11eWv6HKJAgojLx8G4w,0.0,2017-01-02 05:04:15,0.0,Bs-HZwbbqeWpgzl5T1_QyA,4.0,"I knew coming here, there would be a long line...",0.0,EIRBAYXCV3647N3ejEwkIA
4,-BJVR_DO5r-MfQ2tiszeOg,1.0,2010-08-21 06:53:33,0.0,VRDNOiSG6h2tYXPKwpf75w,5.0,Wow...no reviews on here yet? I am surprised.....,9.0,Mq8Su0PZC8D4EaJrLJOhYw


In [36]:
nb_reviews = len(review_df)
NAs = dict()
for i in range(len(review_df.columns)):
    NAs[review_df.columns[i]] = sum(pd.isna(review_df[review_df.columns[i]]))*100/nb_reviews
    
for key in sorted(NAs, key=NAs.get, reverse=True):
    print('NA for {:35s}: {:.6f}%'.format(key, NAs[key]))

NA for useful                             : 0.000568%
NA for user_id                            : 0.000568%
NA for text                               : 0.000404%
NA for funny                              : 0.000374%
NA for review_id                          : 0.000374%
NA for stars                              : 0.000374%
NA for cool                               : 0.000180%
NA for date                               : 0.000180%
NA for business_id                        : 0.000000%


In [37]:
review_df.dropna(inplace=True)

In [38]:
review_df['label'] = list(map(lambda x: int(int(x) == 5), review_df['stars'].values))
review_df = review_df[['review_id', 'business_id', 'user_id', 'label']]

In [39]:
review_df.head()

Unnamed: 0,review_id,business_id,user_id,label
0,qVBtM3H5xFUMweX1qGsOnA,039QG6ks4UYNkMA3mcII2A,sThIuQa4EE8WocfxVFWMlA,1
1,s8x0OQNXxIlhepz1m_ofjg,vx4YAA02Qz6khRD1fZ1MFA,llyK5_o-6L9H5QG2rcROmA,0
2,gi_-cTGatAcR5Ko_AI1Paw,k2b3niokS_tosjah_rzCPw,uc9ITBuspRFkl-S3Bo90dg,0
3,Bs-HZwbbqeWpgzl5T1_QyA,77h11eWv6HKJAgojLx8G4w,EIRBAYXCV3647N3ejEwkIA,0
4,VRDNOiSG6h2tYXPKwpf75w,-BJVR_DO5r-MfQ2tiszeOg,Mq8Su0PZC8D4EaJrLJOhYw,1


In [40]:
nb_reviews = len(review_df)
nb_reviews

6684885

# Train Validation Test Split

In [41]:
def dataset_split(proba, train, val):
    if proba < train:
        return 'train'
    elif proba < train + val:
        return 'validation'
    else:
        return 'test'

In [106]:
np.random.seed(42)

In [107]:
nb_users = len(user_df)
user_df['user_dataset'] = np.random.random(size=nb_users)
user_df['user_dataset'] = list(map(lambda p: dataset_split(p, 0.5, 0.3), user_df['user_dataset']))

In [108]:
nb_business = len(business_df)
business_df['business_dataset'] = np.random.random(size=nb_business)
business_df['business_dataset'] = list(map(lambda p: dataset_split(p, 0.55, 0.2), business_df['business_dataset']))

In [109]:
df = pd.merge(review_df, user_df, on='user_id', how='inner')
df = pd.merge(df, business_df, on='business_id', how='inner')

In [110]:
df['dataset'] = df['user_dataset'] == df['business_dataset']

In [111]:
df['dataset'] = df['user_dataset'] * list(map(int, df['dataset']))

In [112]:
df = df[df['dataset'] != '']
df.head()

Unnamed: 0,review_id,business_id,user_id,label,average_stars,compliment_cool,compliment_funny,compliment_note,compliment_plain,cool,...,RestaurantsCounterService,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,business_dataset,dataset
0,qVBtM3H5xFUMweX1qGsOnA,039QG6ks4UYNkMA3mcII2A,sThIuQa4EE8WocfxVFWMlA,1,4.68,1,1,0,0,10,...,,9:0-19:0,9:0-19:0,9:0-19:0,9:0-19:0,9:0-19:0,9:0-19:0,10:0-16:0,validation,validation
3,32afZohbF1u3y7MeiusVHg,039QG6ks4UYNkMA3mcII2A,LHKSDD_2JJrsP5LyklZBJQ,1,5.0,1,1,1,1,2,...,,9:0-19:0,9:0-19:0,9:0-19:0,9:0-19:0,9:0-19:0,9:0-19:0,10:0-16:0,validation,validation
6,EHBzcIp3HRNMzCsrRQmVlw,039QG6ks4UYNkMA3mcII2A,NsHeN5dKRmYfJTmTxMITmQ,1,3.9,103,103,44,32,764,...,,9:0-19:0,9:0-19:0,9:0-19:0,9:0-19:0,9:0-19:0,9:0-19:0,10:0-16:0,validation,validation
10,MyHcKIdPm8utBElSIwB-Dw,039QG6ks4UYNkMA3mcII2A,51F0hQKX8I4bdJFiovcC4A,1,4.58,1,1,0,0,7,...,,9:0-19:0,9:0-19:0,9:0-19:0,9:0-19:0,9:0-19:0,9:0-19:0,10:0-16:0,validation,validation
18,wynw5xnlkc74SuZJwrGOng,039QG6ks4UYNkMA3mcII2A,PnP3VHtR3SZTdKC_TgrVag,0,3.5,1,1,0,0,9,...,,9:0-19:0,9:0-19:0,9:0-19:0,9:0-19:0,9:0-19:0,9:0-19:0,10:0-16:0,validation,validation


In [113]:
print('='*80)
print('RAW REVIEWS DATASET DESCRIPTION')
nb_reviews = len(review_df)
pos_reviews = sum(review_df['label'])
neg_revews = nb_reviews - pos_reviews

print('Total reviews: {:,d}'.format(nb_reviews))
print('Positive reviews: {:,d} ({:.2f}%)'.format(pos_reviews, 100*pos_reviews/nb_reviews))
print('Negative reviews: {:,d} ({:.2f}%)'.format(neg_revews, 100*neg_revews/nb_reviews))
print('='*80)

print('REVIEWS DATASET AFTER SPLIT DESCRIPTION')
nb_reviews = len(df)
pos_reviews = sum(df['label'])
neg_revews = nb_reviews - pos_reviews
train_reviews = len(df[df['dataset'] == 'train'])
validation_reviews = len(df[df['dataset'] == 'validation'])
test_reviews = len(df[df['dataset'] == 'test'])

print('Total reviews: {:,d} - {:.2f}% of total reviews'.format(nb_reviews, 100*nb_reviews/len(review_df)))
print('Positive reviews: {:,d} ({:.2f}%)'.format(pos_reviews, 100*pos_reviews/nb_reviews))
print('Negative reviews: {:,d} ({:.2f}%)'.format(neg_revews, 100*neg_revews/nb_reviews))
print()
print('Train reviews: {:,d} ({:.2f}%)'.format(train_reviews, 100*train_reviews/nb_reviews))
print('Validation reviews: {:,d} ({:.2f}%)'.format(validation_reviews, 100*validation_reviews/nb_reviews))
print('Test reviews: {:,d} ({:.2f}%)'.format(test_reviews, 100*test_reviews/nb_reviews))
print('='*80)

print('TRAIN REVIEWS DATASET DESCRIPTION')
nb_reviews = len(df[df['dataset'] == 'train'])
pos_reviews = sum(df[df['dataset'] == 'train']['label'])
neg_revews = nb_reviews - pos_reviews

print('Total reviews: {:,d}'.format(nb_reviews))
print('Positive reviews: {:,d} ({:.2f}%)'.format(pos_reviews, 100*pos_reviews/nb_reviews))
print('Negative reviews: {:,d} ({:.2f}%)'.format(neg_revews, 100*neg_revews/nb_reviews))
print('='*80)

print('VALIDATION REVIEWS DATASET DESCRIPTION')
nb_reviews = len(df[df['dataset'] == 'validation'])
pos_reviews = sum(df[df['dataset'] == 'validation']['label'])
neg_revews = nb_reviews - pos_reviews

print('Total reviews: {:,d}'.format(nb_reviews))
print('Positive reviews: {:,d} ({:.2f}%)'.format(pos_reviews, 100*pos_reviews/nb_reviews))
print('Negative reviews: {:,d} ({:.2f}%)'.format(neg_revews, 100*neg_revews/nb_reviews))
print('='*80)

print('TEST REVIEWS DATASET DESCRIPTION')
nb_reviews = len(df[df['dataset'] == 'test'])
pos_reviews = sum(df[df['dataset'] == 'test']['label'])
neg_revews = nb_reviews - pos_reviews

print('Total reviews: {:,d}'.format(nb_reviews))
print('Positive reviews: {:,d} ({:.2f}%)'.format(pos_reviews, 100*pos_reviews/nb_reviews))
print('Negative reviews: {:,d} ({:.2f}%)'.format(neg_revews, 100*neg_revews/nb_reviews))

RAW REVIEWS DATASET DESCRIPTION
Total reviews: 6,684,885
Positive reviews: 2,932,663 (43.87%)
Negative reviews: 3,752,222 (56.13%)
REVIEWS DATASET AFTER SPLIT DESCRIPTION
Total reviews: 2,239,584 - 33.50% of total reviews
Positive reviews: 989,418 (44.18%)
Negative reviews: 1,250,166 (55.82%)

Train reviews: 1,597,389 (71.33%)
Validation reviews: 353,648 (15.79%)
Test reviews: 288,547 (12.88%)
TRAIN REVIEWS DATASET DESCRIPTION
Total reviews: 1,597,389
Positive reviews: 703,455 (44.04%)
Negative reviews: 893,934 (55.96%)
VALIDATION REVIEWS DATASET DESCRIPTION
Total reviews: 353,648
Positive reviews: 156,437 (44.24%)
Negative reviews: 197,211 (55.76%)
TEST REVIEWS DATASET DESCRIPTION
Total reviews: 288,547
Positive reviews: 129,526 (44.89%)
Negative reviews: 159,021 (55.11%)
