In [1]:
import numpy as np
import pandas as pd
# import json
import ast

In [2]:
# b_data holds the Yelp business dataset.
# The file is read with lines = True (one JSON record per line); without it
# pandas raises a "Trailing data" ValueError.
business_json_path = '../../data_projects/data/yelp/yelp_dataset/yelp_academic_dataset_business.json'

b_data = pd.read_json(business_json_path, lines = True)

In [3]:
# Keep only the business identifier and its attributes; all other columns are dropped in place.
columns_to_keep = ['business_id', 'attributes']
columns_to_drop = b_data.columns.difference(columns_to_keep)

b_data.drop(columns = columns_to_drop, inplace = True)

In [4]:
# Display the frame to confirm that only 'business_id' and 'attributes' remain.
b_data

Unnamed: 0,business_id,attributes
0,Pns2l4eNsfO8kk83dixA6A,{'ByAppointmentOnly': 'True'}
1,mpf3x-BjTdTEA3yCZrAYPw,{'BusinessAcceptsCreditCards': 'True'}
2,tUFrWirKiKi_TAnsVWINQQ,"{'BikeParking': 'True', 'BusinessAcceptsCredit..."
3,MTSW4McQd7CbVtyjqoe9mw,"{'RestaurantsDelivery': 'False', 'OutdoorSeati..."
4,mWMc6_wTdE0EUBKIGXDVfA,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc..."
...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,"{'ByAppointmentOnly': 'False', 'RestaurantsPri..."
150342,c8GjPIOTGVmIemT7j5_SyQ,"{'BusinessAcceptsCreditCards': 'True', 'Restau..."
150343,_QAMST-NrQobXduilWEqSw,"{'RestaurantsPriceRange2': '1', 'BusinessAccep..."
150344,mtGm22y5c2UHNXDFAjaPNw,"{'BusinessParking': '{'garage': False, 'street..."


In [5]:
# Expand each business's attribute dictionary into one column per attribute.
attribute_records = b_data['attributes'].tolist()
att_df = pd.json_normalize(attribute_records)

# Summarise the resulting columns and their non-null counts.
att_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 39 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   ByAppointmentOnly           42339 non-null   object
 1   BusinessAcceptsCreditCards  119765 non-null  object
 2   BikeParking                 72638 non-null   object
 3   RestaurantsPriceRange2      85314 non-null   object
 4   CoatCheck                   5584 non-null    object
 5   RestaurantsTakeOut          59857 non-null   object
 6   RestaurantsDelivery         56282 non-null   object
 7   Caters                      40127 non-null   object
 8   WiFi                        56914 non-null   object
 9   BusinessParking             91085 non-null   object
 10  WheelchairAccessible        28953 non-null   object
 11  HappyHour                   15171 non-null   object
 12  OutdoorSeating              48802 non-null   object
 13  HasTV                       4

### Deciding Which Columns to Remove

We will drop some columns because they convey little information relative to the number of businesses they apply to.

In [6]:
# Attribute columns to remove from att_df (dropped in place).
low_information_columns = [
  'HairSpecializesIn',
  'DietaryRestrictions',
  'Open24Hours',
  'AgesAllowed',
  'RestaurantsCounterService',
  'CoatCheck',
]

att_df.drop(columns = low_information_columns, inplace = True)

In [7]:
# Print every column holding more than three distinct non-null values.
# Such columns may contain values that are not booleans.

for col in att_df.columns:
    distinct_value_count = att_df[col].nunique()
    if distinct_value_count > 3:
        print (col)

RestaurantsPriceRange2
WiFi
BusinessParking
Alcohol
RestaurantsAttire
Ambience
NoiseLevel
GoodForMeal
Smoking
Music
BestNights
BYOBCorkage


In [8]:
# Explore the values of a single column (change column_string to inspect another one).
# This shows whether a column's values are JSON-like strings or carry a leading 'u'.
# Columns that have leading 'u': Alcohol, NoiseLevel, WiFi, RestaurantsAttire, Smoking, BYOBCorkage
# Columns that have JSON values: BusinessParking, Ambience, GoodForMeal, Music, BestNights

column_string = 'Ambience'

non_null_values = att_df[column_string].dropna()
print (non_null_values.value_counts())

{'touristy': False, 'hipster': False, 'romantic': False, 'divey': False, 'intimate': False, 'trendy': False, 'upscale': False, 'classy': False, 'casual': False}      6717
{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': False}      5181
{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': True}       4269
{'romantic': False, 'intimate': False, 'touristy': False, 'hipster': False, 'divey': False, 'classy': False, 'trendy': False, 'upscale': False, 'casual': True}       4199
{'romantic': False, 'intimate': False, 'touristy': False, 'hipster': False, 'divey': False, 'classy': False, 'trendy': False, 'upscale': False, 'casual': False}      3297
                                                                                                                                                 

In [9]:
# If a column has a 'u' as the first character, we will need to remove the 'u'
# Look at the first character of the 'Alcohol' entry in row 3 as an example.

att_df.loc[3, 'Alcohol'][0]

'u'

In [10]:
# Slicing from position 1 drops the leading 'u'.
# This cell only previews the sliced value; nothing is assigned back to att_df here.

att_df.loc[3, 'Alcohol'][1:]

"'none'"

In [11]:
# Cleans the leading u's from the following columns

leading_u_columns = ['Alcohol', 'NoiseLevel', 'WiFi', 'RestaurantsAttire', 'Smoking', 'BYOBCorkage']


def _strip_u_prefix(value):
  """Return `value` without a leading string-prefix `u`.

  Only strings that start with `u` immediately followed by a quote
  (e.g. "u'none'") are changed; the result keeps the quotes ("'none'").
  Anything else - NaN, None, empty strings, or text that merely begins
  with the letter u - is returned untouched.
  """
  if isinstance(value, str) and value[:2] in ("u'", 'u"'):
    return value[1:]
  return value


# Apply the cleanup one whole column at a time instead of one cell at a time.
for col in leading_u_columns:
  att_df[col] = att_df[col].map(_strip_u_prefix)

In [12]:
# Verify the cleanup: print the remaining distinct values of each cleaned column.

for col in leading_u_columns:
  remaining_values = att_df[col].dropna().value_counts()
  print (col, remaining_values)

Alcohol 'none'             20910
'full_bar'         15992
'beer_and_wine'     6249
None                  38
Name: Alcohol, dtype: int64
NoiseLevel 'average'      26188
'quiet'         7634
'loud'          2932
'very_loud'     1200
None              39
Name: NoiseLevel, dtype: int64
WiFi 'free'    34414
'no'      21831
'paid'      619
None         50
Name: WiFi, dtype: int64
RestaurantsAttire 'casual'    38344
'dressy'      803
'formal'       70
None           38
Name: RestaurantsAttire, dtype: int64
Smoking 'no'         2405
'outdoor'    1817
'yes'         331
None           14
Name: Smoking, dtype: int64
BYOBCorkage 'no'             747
'yes_free'       590
'yes_corkage'    102
None               5
Name: BYOBCorkage, dtype: int64


### Cleaning the JSON in Columns with JSON Values

Some of the keys inside these JSON-like strings carry a leading `u` (for example `u'dj'`), which we remove first. We will then normalize the JSON in these columns and concat the result with `att_df` in order to add the information to the DataFrame.

In [13]:
# Row labels of the 'Music' entries whose second character is 'u',
# i.e. strings that begin like "{u'dj': ...".
# Non-string entries (such as NaN) and strings shorter than two characters are skipped;
# the [1:2] slice never raises, unlike indexing with [1].
check_for_u = [
  label
  for label, value in att_df['Music'].items()
  if isinstance(value, str) and value[1:2] == 'u'
]

In [14]:
# Show the first flagged 'Music' entry to see what the u-prefixed keys look like.
att_df.loc[check_for_u[0], 'Music']

"{u'dj': None, u'live': False, u'jukebox': None, u'video': False, u'background_music': False, u'karaoke': None, u'no_music': False}"

In [15]:
# Number of 'Music' entries that were flagged as having a 'u' in second position.
len(check_for_u)

771

In [16]:
# Trial run of the key cleanup on one flagged 'Music' string.
# Columns that have JSON values: BusinessParking, Ambience, GoodForMeal, Music, BestNights
# The same two replacements work for all of them.

flagged_music_string = att_df.loc[check_for_u[0], 'Music']

# First drop the u that follows a ", " separator, then any remaining u that precedes a quote.
practice = flagged_music_string.replace( ', u', ', ').replace("u'", "'")
print (practice)

{'dj': None, 'live': False, 'jukebox': None, 'video': False, 'background_music': False, 'karaoke': None, 'no_music': False}


In [17]:
# Get the row labels of the entries whose JSON-like string has u-prefixed keys

def _u_prefixed_indices(column):
  """Return the row labels of `att_df[column]` whose second character is 'u'.

  An entry such as "{u'dj': None, ...}" qualifies. Non-string entries
  (for example NaN) and strings shorter than two characters are skipped
  rather than raising.
  """
  return [
    label
    for label, value in att_df[column].items()
    if isinstance(value, str) and value[1:2] == 'u'
  ]


music_u_indices = _u_prefixed_indices('Music')

best_nights_u_indices = _u_prefixed_indices('BestNights')

ambience_u_indices = _u_prefixed_indices('Ambience')

good_for_meal_u_indices = _u_prefixed_indices('GoodForMeal')

business_parking_u_indices = _u_prefixed_indices('BusinessParking')

In [18]:
# Remove the u prefix from the quoted keys of every flagged entry.
# The pattern only matches a `u` that starts a token and is directly followed by a quote,
# so a quoted word that merely ends in the letter u keeps its spelling
# (a plain .replace("u'", "'") would cut that final u off).
u_prefix_pattern = r"""\bu(?=['"])"""

# Flagged row labels for each JSON-valued column.
u_indices_by_column = {
  'Music': music_u_indices,
  'BestNights': best_nights_u_indices,
  'Ambience': ambience_u_indices,
  'GoodForMeal': good_for_meal_u_indices,
  'BusinessParking': business_parking_u_indices,
}

for column, flagged_rows in u_indices_by_column.items():
  cleaned = att_df.loc[flagged_rows, column].str.replace(u_prefix_pattern, '', regex = True)
  att_df.loc[flagged_rows, column] = cleaned

In [19]:
# Re-run value_counts() on 'Ambience' after the cleanup and compare it with the
# earlier listing to check that no keys with a leading u remain.

att_df['Ambience'].value_counts()

{'touristy': False, 'hipster': False, 'romantic': False, 'divey': False, 'intimate': False, 'trendy': False, 'upscale': False, 'classy': False, 'casual': False}    6717
{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': False}    5181
{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': True}     4269
{'romantic': False, 'intimate': False, 'touristy': False, 'hipster': False, 'divey': False, 'classy': False, 'trendy': False, 'upscale': False, 'casual': True}     4199
{'romantic': False, 'intimate': False, 'touristy': False, 'hipster': False, 'divey': False, 'classy': False, 'trendy': False, 'upscale': False, 'casual': False}    3297
                                                                                                                                                           

### Normalizing the JSON in the Columns with JSON Values

Columns that have JSON values: BusinessParking, Ambience, GoodForMeal, Music, BestNights

In [20]:
# Show the 'Music' entries that are present, skipping rows where the value is null.

has_music = att_df['Music'].notnull()
att_df.loc[has_music, 'Music']

28        {'dj': False, 'background_music': False, 'no_m...
46        {'dj': False, 'background_music': False, 'no_m...
47        {'dj': False, 'background_music': False, 'no_m...
61                                            {'dj': False}
62                                        {'jukebox': True}
                                ...                        
150274    {'dj': None, 'background_music': False, 'no_mu...
150275    {'dj': False, 'background_music': False, 'no_m...
150291    {'dj': False, 'background_music': False, 'no_m...
150292    {'dj': None, 'live': False, 'jukebox': None, '...
150323    {'dj': False, 'background_music': False, 'no_m...
Name: Music, Length: 7521, dtype: object

In [21]:
# Columns that have JSON values: Music, BusinessParking, Ambience, GoodForMeal, BestNights
# Get the non-null row labels for each of these columns
# Recall that the null values are given as np.nan, which is of type float

def _non_null_indices(column):
  """Return the row labels of `att_df[column]` whose value is not a float.

  Missing entries are np.nan (a float), so everything else is kept. That
  includes None and the string 'None', which the parsing loop further
  down turns into empty dictionaries.
  """
  return [
    label
    for label, value in att_df[column].items()
    if type(value) is not float
  ]


music_not_null = _non_null_indices('Music')

business_parking_not_null = _non_null_indices('BusinessParking')

ambience_not_null = _non_null_indices('Ambience')

good_for_meal_not_null = _non_null_indices('GoodForMeal')

best_nights_not_null = _non_null_indices('BestNights')

In [22]:
# Peek at a slice (positions 1 through 14) of the non-null row labels for 'GoodForMeal'.
print (good_for_meal_not_null[1:15])

[12, 14, 15, 19, 23, 27, 28, 31, 33, 35, 41, 47, 53, 59]


In [23]:
# Spot check: row 12 appears in the non-null list printed above, row 16 does not,
# so the first print shows a JSON-like string and the second shows nan.
print (att_df.loc[12, 'GoodForMeal'])
print (att_df.loc[16, 'GoodForMeal'])

{'dessert': False, 'latenight': False, 'lunch': False, 'dinner': False, 'brunch': False, 'breakfast': False}
nan


In [24]:
# Keys of json_dicts, one per JSON-valued column.
# Kept as a list so the three lists below can be walked in step by position.
json_dicts_entries = ['music', 'business_parking', 'ambience', 'good_for_meal', 'best_nights']

# One empty list per entry. Each list will receive one dictionary per non-null row:
# an empty dictionary for None values, otherwise the parsed JSON of that row.
json_dicts = {entry: [] for entry in json_dicts_entries}

# att_df column names, in the same order as json_dicts_entries
att_df_json_columns = ['Music', 'BusinessParking', 'Ambience', 'GoodForMeal', 'BestNights']

# Non-null row labels per column, again in the same order as json_dicts_entries
columns_to_clean = [
  music_not_null,
  business_parking_not_null,
  ambience_not_null,
  good_for_meal_not_null,
  best_nights_not_null,
]

In [25]:
# Walk the entry names, the att_df columns and the non-null row labels in step.
for entry, column, row_labels in zip(json_dicts_entries, att_df_json_columns, columns_to_clean):

  parsed_rows = json_dicts[entry]

  for idx in row_labels:
    raw_value = att_df.loc[idx, column]

    if raw_value is None or raw_value == 'None':
      # None or the string 'None': record an empty dictionary for this row
      parsed_rows.append({})
    else:
      # Otherwise parse the JSON-like string into a dictionary
      parsed_rows.append(ast.literal_eval(raw_value))

In [26]:
# Replace each list of dictionaries with a DataFrame indexed by the original att_df row labels.
for entry, row_labels in zip(json_dicts_entries, columns_to_clean):

  # One row per parsed dictionary, one column per key
  entry_frame = pd.DataFrame(json_dicts[entry])

  # Attach the non-null row labels, then use them as the index (named 'new_index')
  entry_frame['new_index'] = row_labels
  json_dicts[entry] = entry_frame.set_index('new_index')

In [27]:
# Checking an example of the DataFrames created:
# one column per 'Ambience' key, indexed by 'new_index' (the att_df row labels).

json_dicts['ambience']

Unnamed: 0_level_0,romantic,intimate,touristy,hipster,divey,classy,trendy,upscale,casual
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5,,,,,,,,,
8,False,False,False,False,False,False,False,False,False
11,False,False,False,False,False,False,False,False,False
12,False,,,,,,,,True
14,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...
150322,True,,False,False,False,,,,
150323,False,False,False,,False,False,True,False,True
150327,False,False,False,False,False,False,False,False,True
150336,,,,False,,,,,True


In [28]:
# TODO: concat every DataFrame in json_dicts onto att_df, along these lines:

# att_df = pd.concat([att_df, json_dicts['music']], axis = 1)