In [1]:
import numpy as np
import pandas as pd
import re
import ast
import sqlalchemy as sa
import os

from dotenv import load_dotenv

In [2]:
# b_data is short for business dataset
# lines = True helps with the Trailing data ValueError

b_data = pd.read_json('../../data_projects/data/yelp/yelp_dataset/yelp_academic_dataset_business.json', lines = True)

In [3]:
b_data.drop(b_data.columns.difference(['business_id', 'attributes']), axis = 1, inplace = True)

In [4]:
b_data

Unnamed: 0,business_id,attributes
0,Pns2l4eNsfO8kk83dixA6A,{'ByAppointmentOnly': 'True'}
1,mpf3x-BjTdTEA3yCZrAYPw,{'BusinessAcceptsCreditCards': 'True'}
2,tUFrWirKiKi_TAnsVWINQQ,"{'BikeParking': 'True', 'BusinessAcceptsCredit..."
3,MTSW4McQd7CbVtyjqoe9mw,"{'RestaurantsDelivery': 'False', 'OutdoorSeati..."
4,mWMc6_wTdE0EUBKIGXDVfA,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc..."
...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,"{'ByAppointmentOnly': 'False', 'RestaurantsPri..."
150342,c8GjPIOTGVmIemT7j5_SyQ,"{'BusinessAcceptsCreditCards': 'True', 'Restau..."
150343,_QAMST-NrQobXduilWEqSw,"{'RestaurantsPriceRange2': '1', 'BusinessAccep..."
150344,mtGm22y5c2UHNXDFAjaPNw,"{'BusinessParking': '{'garage': False, 'street..."


In [5]:
att_df = pd.json_normalize(b_data['attributes'].to_list())
att_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 39 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   ByAppointmentOnly           42339 non-null   object
 1   BusinessAcceptsCreditCards  119765 non-null  object
 2   BikeParking                 72638 non-null   object
 3   RestaurantsPriceRange2      85314 non-null   object
 4   CoatCheck                   5584 non-null    object
 5   RestaurantsTakeOut          59857 non-null   object
 6   RestaurantsDelivery         56282 non-null   object
 7   Caters                      40127 non-null   object
 8   WiFi                        56914 non-null   object
 9   BusinessParking             91085 non-null   object
 10  WheelchairAccessible        28953 non-null   object
 11  HappyHour                   15171 non-null   object
 12  OutdoorSeating              48802 non-null   object
 13  HasTV                       4

### Deciding Which Columns to Remove

We will be dropping some columns because the amount of information they convey relative to the amount of businesses they pertain to is small.

In [6]:
att_df.drop(
  columns = ['HairSpecializesIn', 'DietaryRestrictions', 'Open24Hours', 'AgesAllowed', 'RestaurantsCounterService', 'CoatCheck'],
  axis = 1,
  inplace = True
)

In [7]:
# These are the columns that may have values that are not booleans.

for col in att_df.columns.to_list():
    if len(att_df[att_df[col].notnull()][col].value_counts().to_list()) > 3:
        print (col)

RestaurantsPriceRange2
WiFi
BusinessParking
Alcohol
RestaurantsAttire
Ambience
NoiseLevel
GoodForMeal
Smoking
Music
BestNights
BYOBCorkage


In [8]:
# Lets us explore the values of the columns.
# Will be helpful for knowing if other columns consist of JSON data
# Columns that have leading 'u': Alcohol, NoiseLevel, WiFi, RestaurantsAttire, Smoking, BYOBCorkage
# Columns that have JSON values: BusinessParking, Ambience, GoodForMeal, Music, BestNights

column_string = 'Ambience'

print (att_df[att_df[column_string].notnull()][column_string].value_counts())

{'touristy': False, 'hipster': False, 'romantic': False, 'divey': False, 'intimate': False, 'trendy': False, 'upscale': False, 'classy': False, 'casual': False}      6717
{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': False}      5181
{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': True}       4269
{'romantic': False, 'intimate': False, 'touristy': False, 'hipster': False, 'divey': False, 'classy': False, 'trendy': False, 'upscale': False, 'casual': True}       4199
{'romantic': False, 'intimate': False, 'touristy': False, 'hipster': False, 'divey': False, 'classy': False, 'trendy': False, 'upscale': False, 'casual': False}      3297
                                                                                                                                                 

In [9]:
# If a column has a 'u' as the first character, we will need to remove the 'u'

att_df.loc[3, 'Alcohol'][0]

'u'

In [10]:
# We will remove the leading 'u' with this reassignment

att_df.loc[3, 'Alcohol'][1:]

"'none'"

In [11]:
# Cleans the leading u's from the following columns

leading_u_columns = ['Alcohol', 'NoiseLevel', 'WiFi', 'RestaurantsAttire', 'Smoking', 'BYOBCorkage']

for i in range(len(att_df)):
  for col in leading_u_columns:
    if type(att_df.loc[i, col]) is not float and att_df.loc[i, col][0] == 'u':
      att_df.loc[i, col] = att_df.loc[i, col][1:]

In [12]:
# Check the above transformation

for col in leading_u_columns:
  print (col, att_df[att_df[col].notnull()][col].value_counts())

Alcohol 'none'             20910
'full_bar'         15992
'beer_and_wine'     6249
None                  38
Name: Alcohol, dtype: int64
NoiseLevel 'average'      26188
'quiet'         7634
'loud'          2932
'very_loud'     1200
None              39
Name: NoiseLevel, dtype: int64
WiFi 'free'    34414
'no'      21831
'paid'      619
None         50
Name: WiFi, dtype: int64
RestaurantsAttire 'casual'    38344
'dressy'      803
'formal'       70
None           38
Name: RestaurantsAttire, dtype: int64
Smoking 'no'         2405
'outdoor'    1817
'yes'         331
None           14
Name: Smoking, dtype: int64
BYOBCorkage 'no'             747
'yes_free'       590
'yes_corkage'    102
None               5
Name: BYOBCorkage, dtype: int64


### Cleaning the JSON in Columns with JSON Values

Some of the JSON keys have a leading 'u' which needs to be removed. This is because we will normalize the JSON in these columns, then concat the normalized JSON with `att_df` in order to add the information to the DataFrame.

In [13]:
check_for_u = []

for i in range(len(att_df)):
  if type(att_df.loc[i, 'Music']) is not float and att_df.loc[i, 'Music'][1] == 'u':
    check_for_u.append(i)

In [14]:
att_df.loc[check_for_u[0], 'Music']

"{u'dj': None, u'live': False, u'jukebox': None, u'video': False, u'background_music': False, u'karaoke': None, u'no_music': False}"

In [15]:
len(check_for_u)

771

In [16]:
# This is how we will clean the 'Music' column JSON string
# Columns that have JSON values: BusinessParking, Ambience, GoodForMeal, Music, BestNights,
# Replacement working for the following: Music, BestNights, Ambience, GoodForMeal, BusinessParking 

practice = att_df.loc[check_for_u[0], 'Music']
practice = practice.replace( ', u', ', ').replace("u'", "'")
print (practice)

{'dj': None, 'live': False, 'jukebox': None, 'video': False, 'background_music': False, 'karaoke': None, 'no_music': False}


In [17]:
# Get the indices of columns where the JSON information has u's

music_u_indices = [ 
  i for i in range(len(att_df)) if type(att_df.loc[i, 'Music']) is not float and att_df.loc[i, 'Music'][1] == 'u' 
]

best_nights_u_indices = [ 
  i for i in range(len(att_df)) if type(att_df.loc[i, 'BestNights']) is not float and att_df.loc[i, 'BestNights'][1] == 'u' 
]

ambience_u_indices = [ 
  i for i in range(len(att_df)) if type(att_df.loc[i, 'Ambience']) is not float and att_df.loc[i, 'Ambience'][1] == 'u' 
]

good_for_meal_u_indices = [ 
  i for i in range(len(att_df)) if type(att_df.loc[i, 'GoodForMeal']) is not float and att_df.loc[i, 'GoodForMeal'][1] == 'u' 
]

business_parking_u_indices = [ 
  i for i in range(len(att_df)) if type(att_df.loc[i, 'BusinessParking']) is not float and att_df.loc[i, 'BusinessParking'][1] == 'u' 
]

In [18]:
for idx in music_u_indices:
  att_df.loc[idx, 'Music'] = att_df.loc[idx, 'Music'].replace( ', u', ', ').replace("u'", "'")

for idx in best_nights_u_indices:
  att_df.loc[idx, 'BestNights'] = att_df.loc[idx, 'BestNights'].replace( ', u', ', ').replace("u'", "'")

for idx in ambience_u_indices:
  att_df.loc[idx, 'Ambience'] = att_df.loc[idx, 'Ambience'].replace( ', u', ', ').replace("u'", "'")

for idx in good_for_meal_u_indices:
  att_df.loc[idx, 'GoodForMeal'] = att_df.loc[idx, 'GoodForMeal'].replace( ', u', ', ').replace("u'", "'")

for idx in business_parking_u_indices:
  att_df.loc[idx, 'BusinessParking'] = att_df.loc[idx, 'BusinessParking'].replace( ', u', ', ').replace("u'", "'")

In [19]:
# Comparing with the att_df['Ambience'].value_counts() to check for leading u's

att_df['Ambience'].value_counts()

{'touristy': False, 'hipster': False, 'romantic': False, 'divey': False, 'intimate': False, 'trendy': False, 'upscale': False, 'classy': False, 'casual': False}    6717
{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': False}    5181
{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': True}     4269
{'romantic': False, 'intimate': False, 'touristy': False, 'hipster': False, 'divey': False, 'classy': False, 'trendy': False, 'upscale': False, 'casual': True}     4199
{'romantic': False, 'intimate': False, 'touristy': False, 'hipster': False, 'divey': False, 'classy': False, 'trendy': False, 'upscale': False, 'casual': False}    3297
                                                                                                                                                           

### Normalizing the JSON in the Columns with JSON Values

Columns that have JSON values: BusinessParking, Ambience, GoodForMeal, Music, BestNights

In [20]:
# Selects all rows where 'Music' is not null.
# ~ is a negate operator on a boolean series

att_df[~att_df['Music'].isnull()]['Music']

28        {'dj': False, 'background_music': False, 'no_m...
46        {'dj': False, 'background_music': False, 'no_m...
47        {'dj': False, 'background_music': False, 'no_m...
61                                            {'dj': False}
62                                        {'jukebox': True}
                                ...                        
150274    {'dj': None, 'background_music': False, 'no_mu...
150275    {'dj': False, 'background_music': False, 'no_m...
150291    {'dj': False, 'background_music': False, 'no_m...
150292    {'dj': None, 'live': False, 'jukebox': None, '...
150323    {'dj': False, 'background_music': False, 'no_m...
Name: Music, Length: 7521, dtype: object

In [21]:
# Columns that have JSON values: Music, BusinessParking, Ambience, GoodForMeal, Music, BestNights
# Get the non_null indices for each of these columns
# Recall that the null values are given as np.nan, which is of type float

music_not_null = [
  i for i in range(len(att_df)) if type(att_df.loc[i, 'Music']) is not float
]

business_parking_not_null = [
  i for i in range(len(att_df)) if type(att_df.loc[i, 'BusinessParking']) is not float
]

ambience_not_null = [
  i for i in range(len(att_df)) if type(att_df.loc[i, 'Ambience']) is not float
]

good_for_meal_not_null = [
  i for i in range(len(att_df)) if type(att_df.loc[i, 'GoodForMeal']) is not float
]

best_nights_not_null = [
  i for i in range(len(att_df)) if type(att_df.loc[i, 'BestNights']) is not float
]

In [22]:
print (good_for_meal_not_null[1:15])

[12, 14, 15, 19, 23, 27, 28, 31, 33, 35, 41, 47, 53, 59]


In [23]:
print (att_df.loc[12, 'GoodForMeal'])
print (att_df.loc[16, 'GoodForMeal'])

{'dessert': False, 'latenight': False, 'lunch': False, 'dinner': False, 'brunch': False, 'breakfast': False}
nan


In [24]:
# Create an empty list for future dictionaries.
# We will append either an empty dictionary for None values, or a dictionary of the JSON that is present for each non-None entry
json_dicts = {
  'music': [],
  'business_parking': [],
  'ambience': [],
  'good_for_meal': [],
  'best_nights': []
}

# Have to hard-code this because can not iterate over dictionary keys by index, i.e. can not do json_dicts.keys()[0]
json_dicts_entries = ['music', 'business_parking', 'ambience', 'good_for_meal', 'best_nights']

# att_df column names must match the order of json_dicts_entries because of 'i' indexing in loop below
att_df_json_columns = ['Music', 'BusinessParking', 'Ambience', 'GoodForMeal', 'BestNights']

columns_to_clean = [
  music_not_null, 
  business_parking_not_null, 
  ambience_not_null, 
  good_for_meal_not_null, 
  best_nights_not_null
]

In [25]:
# For every entry in json_dicts_entries...
for i in range(len(json_dicts_entries)):

  # For every index in the columns to clean...
  for idx in columns_to_clean[i]:

    # If the value in att_df is None or 'None' then append {} to the list of dictionaries in json_dicts
    if att_df.loc[idx, att_df_json_columns[i]] == 'None' or att_df.loc[idx, att_df_json_columns[i]] is None:

      json_dicts[json_dicts_entries[i]].append({})

    # Otherwise, append the json_string as a dictionary to the list of dictionaries in json_dicts
    elif att_df.loc[idx, att_df_json_columns[i]] is not None:

      json_dicts[json_dicts_entries[i]].append(ast.literal_eval(att_df.loc[idx, att_df_json_columns[i]]))

In [26]:
for i in range(len(json_dicts_entries)):

  # Make the dictionary a DataFrame
  json_dicts[json_dicts_entries[i]] = pd.DataFrame(json_dicts[json_dicts_entries[i]])

  # Assign a column 'new_index' to the non-null indices
  json_dicts[json_dicts_entries[i]]['new_index'] = columns_to_clean[i]

  # Set 'new_index' as the new index of the DataFrame
  json_dicts[json_dicts_entries[i]].set_index('new_index', inplace = True)

In [27]:
# Checking an example of the DataFrames created

json_dicts['ambience']

Unnamed: 0_level_0,romantic,intimate,touristy,hipster,divey,classy,trendy,upscale,casual
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5,,,,,,,,,
8,False,False,False,False,False,False,False,False,False
11,False,False,False,False,False,False,False,False,False
12,False,,,,,,,,True
14,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...
150322,True,,False,False,False,,,,
150323,False,False,False,,False,False,True,False,True
150327,False,False,False,False,False,False,False,False,True
150336,,,,False,,,,,True


### Concatenating the DataFrames in `json_dicts` to `att_df`

In [28]:
for key in json_dicts_entries:

  # For every DataFrame in the json_dicts, concat the DataFrame to att_df
  att_df = pd.concat([att_df, json_dicts[key]], axis = 1)

att_df.info()  

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150346 entries, 0 to 150345
Data columns (total 67 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   ByAppointmentOnly           42339 non-null   object
 1   BusinessAcceptsCreditCards  119765 non-null  object
 2   BikeParking                 72638 non-null   object
 3   RestaurantsPriceRange2      85314 non-null   object
 4   RestaurantsTakeOut          59857 non-null   object
 5   RestaurantsDelivery         56282 non-null   object
 6   Caters                      40127 non-null   object
 7   WiFi                        56914 non-null   object
 8   BusinessParking             91085 non-null   object
 9   WheelchairAccessible        28953 non-null   object
 10  HappyHour                   15171 non-null   object
 11  OutdoorSeating              48802 non-null   object
 12  HasTV                       45084 non-null   object
 13  RestaurantsReservations     4

In [29]:
att_df.columns

Index(['ByAppointmentOnly', 'BusinessAcceptsCreditCards', 'BikeParking',
       'RestaurantsPriceRange2', 'RestaurantsTakeOut', 'RestaurantsDelivery',
       'Caters', 'WiFi', 'BusinessParking', 'WheelchairAccessible',
       'HappyHour', 'OutdoorSeating', 'HasTV', 'RestaurantsReservations',
       'DogsAllowed', 'Alcohol', 'GoodForKids', 'RestaurantsAttire',
       'Ambience', 'RestaurantsTableService', 'RestaurantsGoodForGroups',
       'DriveThru', 'NoiseLevel', 'GoodForMeal', 'BusinessAcceptsBitcoin',
       'Smoking', 'Music', 'GoodForDancing', 'AcceptsInsurance', 'BestNights',
       'BYOB', 'Corkage', 'BYOBCorkage', 'dj', 'background_music', 'no_music',
       'jukebox', 'live', 'video', 'karaoke', 'garage', 'street', 'validated',
       'lot', 'valet', 'romantic', 'intimate', 'touristy', 'hipster', 'divey',
       'classy', 'trendy', 'upscale', 'casual', 'dessert', 'latenight',
       'lunch', 'dinner', 'brunch', 'breakfast', 'monday', 'tuesday', 'friday',
       'wednesday', '

### Renaming and Reordering `att_df` Column Names

In [30]:
# Converts the CamelCase formatted column names to snake_case
def cap_to_snake(column_name): 
  return column_name[0].lower() + re.sub(r'(?!^)[A-Z]', lambda x: '_' + x.group(0).lower(), column_name[1:])

# Create a list for the new column names
new_column_names = []

# Append each reformatted column name to new_column_names
for column in att_df.columns:
  new_column_names.append(cap_to_snake(column))

# Rename the columns
att_df.columns = new_column_names

# Check the transformation
att_df.columns

Index(['by_appointment_only', 'business_accepts_credit_cards', 'bike_parking',
       'restaurants_price_range2', 'restaurants_take_out',
       'restaurants_delivery', 'caters', 'wi_fi', 'business_parking',
       'wheelchair_accessible', 'happy_hour', 'outdoor_seating', 'has_t_v',
       'restaurants_reservations', 'dogs_allowed', 'alcohol', 'good_for_kids',
       'restaurants_attire', 'ambience', 'restaurants_table_service',
       'restaurants_good_for_groups', 'drive_thru', 'noise_level',
       'good_for_meal', 'business_accepts_bitcoin', 'smoking', 'music',
       'good_for_dancing', 'accepts_insurance', 'best_nights', 'bY_o_b',
       'corkage', 'bY_o_b_corkage', 'dj', 'background_music', 'no_music',
       'jukebox', 'live', 'video', 'karaoke', 'garage', 'street', 'validated',
       'lot', 'valet', 'romantic', 'intimate', 'touristy', 'hipster', 'divey',
       'classy', 'trendy', 'upscale', 'casual', 'dessert', 'latenight',
       'lunch', 'dinner', 'brunch', 'breakfast', 'm

In [31]:
# Still need to rename some of the columns. Will target these columns with a dictionary

clean_columns_dict = {
  'restaurants_price_range2': 'restaurants_price_range',
  'wi_fi': 'wifi',
  'has_t_v': 'has_tv',
  'bY_o_b': 'byob',
  'bY_o_b_corkage': 'byob_corkage',
  'garage': 'parking_garage',
  'street': 'street_parking',
  'validated': 'validated_parking',
  'lot': 'parking_lot',
  'valet': 'valet_parking',
  'dessert': 'best_for_desert',
  'latenight': 'best_for_latenight',
  'lunch': 'best_for_lunch',
  'dinner': 'best_for_dinner',
  'brunch': 'best_for_brunch',
  'breakfast': 'best_for_breakfast',
  'monday': 'best_night_monday', 
  'tuesday': 'best_night_tuesday', 
  'wednesday': 'best_night_wednesday',
  'thursday': 'best_night_thursday',
  'friday': 'best_night_friday',
  'saturday': 'best_night_saturday',
  'sunday': 'best_night_sunday'
}

att_df.rename(columns = clean_columns_dict, inplace = True)

# Checking the renames
att_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150346 entries, 0 to 150345
Data columns (total 67 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   by_appointment_only            42339 non-null   object
 1   business_accepts_credit_cards  119765 non-null  object
 2   bike_parking                   72638 non-null   object
 3   restaurants_price_range        85314 non-null   object
 4   restaurants_take_out           59857 non-null   object
 5   restaurants_delivery           56282 non-null   object
 6   caters                         40127 non-null   object
 7   wifi                           56914 non-null   object
 8   business_parking               91085 non-null   object
 9   wheelchair_accessible          28953 non-null   object
 10  happy_hour                     15171 non-null   object
 11  outdoor_seating                48802 non-null   object
 12  has_tv                         45084 non-nul

In [32]:
# Need to reorder the best_of_{day of week} columns to reflect the normal ordering of the days
# Need to wrap the single items in [] in order to make the single string a list

reorder = att_df.columns[0:62].to_list() + att_df.columns[63:65].to_list() + [att_df.columns[62]] + [att_df.columns[66]] + [att_df.columns[65]]
att_df = att_df[reorder]

att_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150346 entries, 0 to 150345
Data columns (total 67 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   by_appointment_only            42339 non-null   object
 1   business_accepts_credit_cards  119765 non-null  object
 2   bike_parking                   72638 non-null   object
 3   restaurants_price_range        85314 non-null   object
 4   restaurants_take_out           59857 non-null   object
 5   restaurants_delivery           56282 non-null   object
 6   caters                         40127 non-null   object
 7   wifi                           56914 non-null   object
 8   business_parking               91085 non-null   object
 9   wheelchair_accessible          28953 non-null   object
 10  happy_hour                     15171 non-null   object
 11  outdoor_seating                48802 non-null   object
 12  has_tv                         45084 non-nul

### Preparing Data for SQL

In [33]:
# All entries in att_df are currently strings, so we need to make them booleans where applicable.
# First, need a list of columns that contain the values True, False, or None

bool_cols = []

for col in (att_df.columns.to_list()):
  if len(att_df[col].value_counts()) <= 3:
    bool_cols.append(col)

print (bool_cols)

['by_appointment_only', 'business_accepts_credit_cards', 'bike_parking', 'restaurants_take_out', 'restaurants_delivery', 'caters', 'wheelchair_accessible', 'happy_hour', 'outdoor_seating', 'has_tv', 'restaurants_reservations', 'dogs_allowed', 'good_for_kids', 'restaurants_table_service', 'restaurants_good_for_groups', 'drive_thru', 'business_accepts_bitcoin', 'good_for_dancing', 'accepts_insurance', 'byob', 'corkage', 'dj', 'background_music', 'no_music', 'jukebox', 'live', 'video', 'karaoke', 'parking_garage', 'street_parking', 'validated_parking', 'parking_lot', 'valet_parking', 'romantic', 'intimate', 'touristy', 'hipster', 'divey', 'classy', 'trendy', 'upscale', 'casual', 'best_for_desert', 'best_for_latenight', 'best_for_lunch', 'best_for_dinner', 'best_for_brunch', 'best_for_breakfast', 'best_night_monday', 'best_night_tuesday', 'best_night_wednesday', 'best_night_thursday', 'best_night_friday', 'best_night_saturday', 'best_night_sunday']


In [34]:
# Verifying that the values for True and False are not 'True' or 'False' i.e. string representations of the booleans

check_column = 'business_accepts_credit_cards'

att_df[att_df[check_column] == 'True'][check_column]

1         True
2         True
4         True
5         True
6         True
          ... 
150339    True
150342    True
150343    True
150344    True
150345    True
Name: business_accepts_credit_cards, Length: 113667, dtype: object

In [35]:
att_df[att_df[check_column] == 'False'][check_column]

3         False
24        False
50        False
62        False
70        False
          ...  
150177    False
150238    False
150287    False
150301    False
150310    False
Name: business_accepts_credit_cards, Length: 6025, dtype: object

In [36]:
# Still 'True' and 'False'
# We will convert 'True' to 1, 'False' to 0, and 'None' or None to np.nan so the columns can be properly formatted for SQLAlchemy
# Warning: This cell took 4m 55sec to run

for col in bool_cols:

  for i in range(len(att_df)):

    if att_df.loc[i, col] == 'True':
      att_df.loc[i, col] = 1

    elif att_df.loc[i, col] == 'False':
      att_df.loc[i, col] = 0
      
    elif att_df.loc[i, col] == None or att_df.loc[i, col] == 'None':
      att_df.loc[i, col] = np.nan    

In [37]:
# Give all the boolean columns the sa.types.Boolean() datatype for future Postgres table
bool_dict = { col: sa.types.Boolean() for col in bool_cols }

In [38]:
# Checking how many more columns need an sa.types value
print (len(list(bool_dict.keys())))
print (len(list(att_df.columns)))

55
67


In [39]:
# Still need to confirm datatypes for the following columns
for col in att_df.columns:
  if col not in bool_cols:
    print (col)

restaurants_price_range
wifi
business_parking
alcohol
restaurants_attire
ambience
noise_level
good_for_meal
smoking
music
best_nights
byob_corkage


In [40]:
# Recall that json_dicts_entries = ['music', 'business_parking', 'ambience', 'good_for_meal', 'best_nights']
# We can now drop these columns because the JSON in them is now represented as columns in att_df

att_df.drop(json_dicts_entries, axis = 1, inplace = True)
att_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150346 entries, 0 to 150345
Data columns (total 62 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   by_appointment_only            42299 non-null   object
 1   business_accepts_credit_cards  119692 non-null  object
 2   bike_parking                   72558 non-null   object
 3   restaurants_price_range        85314 non-null   object
 4   restaurants_take_out           57752 non-null   object
 5   restaurants_delivery           52334 non-null   object
 6   caters                         40067 non-null   object
 7   wifi                           56914 non-null   object
 8   wheelchair_accessible          28926 non-null   object
 9   happy_hour                     15169 non-null   object
 10  outdoor_seating                46920 non-null   object
 11  has_tv                         45065 non-null   object
 12  restaurants_reservations       44959 non-nul

In [41]:
for col in att_df.columns:
  if col not in bool_cols:
    print (col)

restaurants_price_range
wifi
alcohol
restaurants_attire
noise_level
smoking
byob_corkage


In [42]:
# Replacing 'None' with None where applicable
string_cols = [ 'wifi', 'alcohol', 'restaurants_attire', 'noise_level', 'smoking', 'byob_corkage' ]

for col in string_cols:
  for i in range(len(att_df)):
    if att_df.loc[i, col] == 'None':
      att_df.loc[i, col] = None

In [43]:
# Checking the transformation
att_df['wifi']

0            NaN
1            NaN
2           'no'
3         'free'
4            NaN
           ...  
150341       NaN
150342      'no'
150343       NaN
150344       NaN
150345    'free'
Name: wifi, Length: 150346, dtype: object

In [44]:
# Checking the transformation
# The numbers come from the values from the value_counts() statement
print (att_df['wifi'].value_counts())
print (34414 + 21831 + 619)

'free'    34414
'no'      21831
'paid'      619
Name: wifi, dtype: int64
56864


In [45]:
att_df[att_df['wifi'].isnull()]['wifi']

0         NaN
1         NaN
4         NaN
6         NaN
7         NaN
         ... 
150339    NaN
150340    NaN
150341    NaN
150343    NaN
150344    NaN
Name: wifi, Length: 93482, dtype: object

In [46]:
# This still equals the total number of rows
93482 + 56864

150346

In [47]:
string_dict = { col: sa.types.Text() for col in string_cols }
string_dict

{'wifi': Text(),
 'alcohol': Text(),
 'restaurants_attire': Text(),
 'noise_level': Text(),
 'smoking': Text(),
 'byob_corkage': Text()}

In [48]:
# restaurants_price_range still has 'None' values
# Replace these with np.nan, so we can still assign sa.types.Integer() in SQLAlchemy

rest_price_range_nones = [ i for i in range(len(att_df)) if att_df.loc[i, 'restaurants_price_range'] == 'None' ]

for idx in rest_price_range_nones:
  att_df.loc[idx, 'restaurants_price_range'] = np.nan

In [49]:
# Combines string_dict and bool_dict to one dictionary

dtypes_dict = { **string_dict, **bool_dict }

In [50]:
dtypes_dict['restaurants_price_range'] = sa.types.Integer()

In [51]:
print (len(dtypes_dict.keys()))
print (len(att_df.columns))

62
62


### Appending `business_id` to `att_df`

Recall that because the goal is a create a Postgres table with attribute data, we will need to add the `business_id` information to `att_df` so we have a foreign key in Postgres.

In [52]:
for_sql = pd.concat([b_data['business_id'], att_df], axis = 1)

In [53]:
for_sql.head()

Unnamed: 0,business_id,by_appointment_only,business_accepts_credit_cards,bike_parking,restaurants_price_range,restaurants_take_out,restaurants_delivery,caters,wifi,wheelchair_accessible,...,best_for_dinner,best_for_brunch,best_for_breakfast,best_night_monday,best_night_tuesday,best_night_wednesday,best_night_thursday,best_night_friday,best_night_saturday,best_night_sunday
0,Pns2l4eNsfO8kk83dixA6A,1.0,,,,,,,,,...,,,,,,,,,,
1,mpf3x-BjTdTEA3yCZrAYPw,,1.0,,,,,,,,...,,,,,,,,,,
2,tUFrWirKiKi_TAnsVWINQQ,0.0,1.0,1.0,2.0,0.0,0.0,0.0,'no',1.0,...,,,,,,,,,,
3,MTSW4McQd7CbVtyjqoe9mw,0.0,0.0,1.0,1.0,1.0,0.0,1.0,'free',,...,,,,,,,,,,
4,mWMc6_wTdE0EUBKIGXDVfA,,1.0,1.0,,1.0,,0.0,,1.0,...,,,,,,,,,,


In [54]:
# Adding business_id to dtypes_dict
dtypes_dict['business_id'] = sa.types.Text()

### Checking Values Before Creating Postgres Table

Given the data transformations, we will have to visually inspect the data in order to check for validity. The two cells below can be used to check values of the table to see if they match.

In [55]:
check_index = 11034

for_sql.loc[check_index][for_sql.loc[check_index].notnull()]

business_id                      lXCFcmhoRsyW-mnzzl6fkA
business_accepts_credit_cards                         1
bike_parking                                          1
restaurants_price_range                               2
restaurants_take_out                                  1
restaurants_delivery                                  0
caters                                                1
wifi                                             'free'
outdoor_seating                                       0
has_tv                                                1
restaurants_reservations                              0
alcohol                                 'beer_and_wine'
good_for_kids                                         1
restaurants_attire                             'casual'
restaurants_good_for_groups                           1
noise_level                                   'average'
parking_garage                                    False
street_parking                                  

In [56]:
print (b_data.loc[check_index, 'business_id'])
b_data.loc[check_index, 'attributes']

lXCFcmhoRsyW-mnzzl6fkA


{'RestaurantsAttire': "u'casual'",
 'Caters': 'True',
 'BusinessAcceptsCreditCards': 'True',
 'RestaurantsReservations': 'False',
 'Alcohol': "u'beer_and_wine'",
 'RestaurantsGoodForGroups': 'True',
 'HasTV': 'True',
 'BikeParking': 'True',
 'RestaurantsPriceRange2': '2',
 'Ambience': "{'romantic': False, 'intimate': False, 'touristy': False, 'hipster': False, 'divey': False, 'classy': False, 'trendy': False, 'upscale': False, 'casual': True}",
 'GoodForKids': 'True',
 'GoodForMeal': "{'dessert': False, 'latenight': False, 'lunch': True, 'dinner': True, 'brunch': False, 'breakfast': False}",
 'RestaurantsTakeOut': 'True',
 'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}",
 'NoiseLevel': "u'average'",
 'RestaurantsDelivery': 'False',
 'OutdoorSeating': 'False',
 'WiFi': "u'free'"}

### Pushing the Data to a Postgres Table

In [57]:
load_dotenv()

engine = sa.create_engine(os.getenv('ENGINE'))

for_sql.to_sql(
  'business_attributes',
  engine,
  if_exists = 'replace', # This will drop the table if it already exists. Delete this line if necessary.
  index = False,
  dtype = dtypes_dict
)

In [60]:
# Makes business_id the foreign key

engine.execute('''
  ALTER TABLE business_attributes
    ADD CONSTRAINT business_id FOREIGN KEY (business_id) REFERENCES businesses (business_id)
''')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fe562401eb0>