In [1]:
import re

import numpy as np
import pandas as pd

In [2]:
# Load the Yelp business records; b_data is short for "business dataset".
# lines=True makes pandas read one JSON object per line, which avoids the
# "Trailing data" ValueError raised otherwise.

business_json_path = '../../data_projects/data/yelp/yelp_dataset/yelp_academic_dataset_business.json'
b_data = pd.read_json(business_json_path, lines=True)

In [3]:
# Keep only the business identifier and its attributes; every other column is dropped.
columns_to_keep = ['business_id', 'attributes']
b_data = b_data.drop(columns=b_data.columns.difference(columns_to_keep))

In [4]:
# Show the frame after the column drop to confirm only business_id and attributes remain.
b_data

Unnamed: 0,business_id,attributes
0,Pns2l4eNsfO8kk83dixA6A,{'ByAppointmentOnly': 'True'}
1,mpf3x-BjTdTEA3yCZrAYPw,{'BusinessAcceptsCreditCards': 'True'}
2,tUFrWirKiKi_TAnsVWINQQ,"{'BikeParking': 'True', 'BusinessAcceptsCredit..."
3,MTSW4McQd7CbVtyjqoe9mw,"{'RestaurantsDelivery': 'False', 'OutdoorSeati..."
4,mWMc6_wTdE0EUBKIGXDVfA,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc..."
...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,"{'ByAppointmentOnly': 'False', 'RestaurantsPri..."
150342,c8GjPIOTGVmIemT7j5_SyQ,"{'BusinessAcceptsCreditCards': 'True', 'Restau..."
150343,_QAMST-NrQobXduilWEqSw,"{'RestaurantsPriceRange2': '1', 'BusinessAccep..."
150344,mtGm22y5c2UHNXDFAjaPNw,"{'BusinessParking': '{'garage': False, 'street..."


In [5]:
# Expand the per-business attributes dicts into a flat frame
# (one row per business, one column per attribute key), then summarise it.
attribute_records = b_data['attributes'].tolist()
att_df = pd.json_normalize(attribute_records)
att_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 39 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   ByAppointmentOnly           42339 non-null   object
 1   BusinessAcceptsCreditCards  119765 non-null  object
 2   BikeParking                 72638 non-null   object
 3   RestaurantsPriceRange2      85314 non-null   object
 4   CoatCheck                   5584 non-null    object
 5   RestaurantsTakeOut          59857 non-null   object
 6   RestaurantsDelivery         56282 non-null   object
 7   Caters                      40127 non-null   object
 8   WiFi                        56914 non-null   object
 9   BusinessParking             91085 non-null   object
 10  WheelchairAccessible        28953 non-null   object
 11  HappyHour                   15171 non-null   object
 12  OutdoorSeating              48802 non-null   object
 13  HasTV                       4

### Deciding Which Columns to Remove

We will drop some columns because they convey little information relative to the number of businesses they apply to.

In [6]:
# Columns that apply to too few businesses to be worth keeping.
# att_df is re-bound (no inplace=True) and errors='ignore' is passed so that
# re-running this cell after the columns are already gone does not raise KeyError.
# `columns=` already selects the column axis, so no separate `axis` argument is needed.
sparse_columns = [
    'HairSpecializesIn',
    'DietaryRestrictions',
    'Open24Hours',
    'AgesAllowed',
    'RestaurantsCounterService',
    'CoatCheck',
]
att_df = att_df.drop(columns=sparse_columns, errors='ignore')

In [7]:
# Print every column with more than three distinct non-null values;
# those are the columns that may hold something other than booleans.

for column_name in att_df.columns:
    distinct_count = att_df[column_name].nunique()
    if distinct_count > 3:
        print(column_name)

RestaurantsPriceRange2
WiFi
BusinessParking
Alcohol
RestaurantsAttire
Ambience
NoiseLevel
GoodForMeal
Smoking
Music
BestNights
BYOBCorkage


In [8]:
# Explore the distinct values of one column at a time (change column_string to look at another).
# This helps to spot which columns consist of JSON data.
# Columns that have leading 'u': Alcohol, NoiseLevel, WiFi, RestaurantsAttire, Smoking, BYOBCorkage
# Columns that have JSON values: BusinessParking, Ambience, GoodForMeal, Music, BestNights
# (HairSpecializesIn and DietaryRestrictions were noted as JSON too, but were dropped from att_df earlier)

column_string = 'Ambience'

# value_counts() ignores missing values by default, so no explicit notnull() filter is needed.
value_frequencies = att_df[column_string].value_counts()
print(value_frequencies)

{'touristy': False, 'hipster': False, 'romantic': False, 'divey': False, 'intimate': False, 'trendy': False, 'upscale': False, 'classy': False, 'casual': False}      6717
{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': False}      5181
{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': True}       4269
{'romantic': False, 'intimate': False, 'touristy': False, 'hipster': False, 'divey': False, 'classy': False, 'trendy': False, 'upscale': False, 'casual': True}       4199
{'romantic': False, 'intimate': False, 'touristy': False, 'hipster': False, 'divey': False, 'classy': False, 'trendy': False, 'upscale': False, 'casual': False}      3297
                                                                                                                                                 

In [9]:
# If the first character is '{' then the column contains JSON information.
# Such a column will have to be normalized.

att_df.at[2, 'BusinessParking'][0]

'{'

In [10]:
# A value whose first character is 'u' carries a prefix that we will need to remove.

att_df.at[3, 'Alcohol'][0]

'u'

In [11]:
# Slicing from index 1 drops the leading 'u'; the quote characters stay part of the value.
# This only previews the result -- nothing is written back to att_df here.

att_df.loc[3, 'Alcohol'][1:]

"'none'"

In [12]:
leading_u_columns = ['Alcohol', 'NoiseLevel', 'WiFi', 'RestaurantsAttire', 'Smoking', 'BYOBCorkage']


def strip_unicode_prefix(values: pd.Series) -> pd.Series:
    """Return a copy of ``values`` with a leading ``u`` string prefix removed.

    Only strings that start with ``u`` immediately followed by a quote character
    (``u'...'`` or ``u"..."``) are changed; the quote characters themselves are kept.
    Missing values and all other entries are returned untouched, so a plain word
    that merely begins with the letter ``u`` is not truncated.
    """
    cleaned = values.copy()
    # na=False counts NaN / non-string entries as "no prefix" so they are skipped.
    has_prefix = cleaned.str.match(r"u['\"]", na=False)
    cleaned[has_prefix] = cleaned[has_prefix].str[1:]
    return cleaned


# One vectorized pass per column instead of a scalar .loc read/write for every cell.
for col in leading_u_columns:
    att_df[col] = strip_unicode_prefix(att_df[col])

In [13]:
# Verify the transformation above by listing the remaining distinct values per column.

for column_name in leading_u_columns:
    remaining_values = att_df[column_name].value_counts()
    print(column_name, remaining_values)

Alcohol 'none'             20910
'full_bar'         15992
'beer_and_wine'     6249
None                  38
Name: Alcohol, dtype: int64
NoiseLevel 'average'      26188
'quiet'         7634
'loud'          2932
'very_loud'     1200
None              39
Name: NoiseLevel, dtype: int64
WiFi 'free'    34414
'no'      21831
'paid'      619
None         50
Name: WiFi, dtype: int64
RestaurantsAttire 'casual'    38344
'dressy'      803
'formal'       70
None           38
Name: RestaurantsAttire, dtype: int64
Smoking 'no'         2405
'outdoor'    1817
'yes'         331
None           14
Name: Smoking, dtype: int64
BYOBCorkage 'no'             747
'yes_free'       590
'yes_corkage'    102
None               5
Name: BYOBCorkage, dtype: int64


### Testing Ways to Clean the JSON for Columns with JSON Values

In [14]:
# Collect the row labels whose 'Music' value has 'u' as its second character.
# The .str accessor is vectorized: missing values stay NaN (and compare unequal
# to 'u'), and values shorter than two characters give NaN instead of raising
# IndexError as a scalar [1] lookup would.
music_second_char = att_df['Music'].str[1]
music_has_u = music_second_char == 'u'
check_for_u_music = music_has_u[music_has_u].index.to_list()

In [15]:
# Inspect the first flagged 'Music' value to see what the u-prefixed text looks like.
att_df.loc[check_for_u_music[0], 'Music']

"{u'dj': None, u'live': False, u'jukebox': None, u'video': False, u'background_music': False, u'karaoke': None, u'no_music': False}"

In [16]:
# Number of rows whose 'Music' value has 'u' as its second character.
len(check_for_u_music)

771

In [17]:
# This is how we will clean the 'Music' column JSON string.
# Only a 'u' acting as a string prefix is removed: it must sit directly in front
# of a quote character and must not follow a letter, digit, underscore or quote.
# A plain str.replace("u'", "'") would also eat the last letter of a key or value
# that ends in 'u' (for example 'menu' would become 'men').

practice = att_df.loc[check_for_u_music[0], 'Music']
practice = re.sub(r"(?<![\w'\"])u(?=['\"])", '', practice)
print(practice)

{'dj': None, 'live': False, 'jukebox': None, 'video': False, 'background_music': False, 'karaoke': None, 'no_music': False}
