In [103]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from uszipcode import SearchEngine

In [107]:
# The Yelp dataset is expected in a `yelp_dataset` folder under the current
# working directory. Build the path with os.path.join instead of concatenating
# a hard-coded '/' so it is assembled the same way as the file path below.
data_dir = os.path.join(os.getcwd(), 'yelp_dataset')
print('Loading business data ...')
# business.json is read as line-delimited JSON (lines=True): one record per line.
df_business = pd.read_json(os.path.join(data_dir, 'business.json'), lines=True)

Loading business data ...


In [108]:
# number of rows in the business table as loaded
df_business.shape[0]

192609

In [105]:
def enrich_business_features(df):
    """
    Enrich US businesses with ZIP-code-level features.

    Keeps only rows whose ``state`` is a US state abbreviation (or DC) and
    whose ``postal_code`` is present and non-empty, looks each distinct
    postal code up once with ``uszipcode.SearchEngine``, and left-merges the
    following columns onto the businesses:

    - ``population``
    - ``population_density``
    - ``land_area_in_sqmi``
    - ``median_household_income``

    Parameters
    ----------
    df : pandas.DataFrame
        Business table; must contain ``state`` and ``postal_code`` columns.

    Returns
    -------
    pandas.DataFrame
        US businesses with the four feature columns appended. Rows that have
        a missing value in *any* column are removed (see the note at the
        final ``dropna``), so businesses whose postal code yields no data are
        not returned.
    """
    # list of US state abbreviations (50 states plus DC)
    states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
              "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
              "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
              "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
              "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]
    feature_names = ['population', 'population_density',
                     'land_area_in_sqmi', 'median_household_income']

    # keep only businesses in the US
    df_business_us = df[df['state'].isin(states)]
    # get rid of items whose zipcode is missing (NaN/None) or an empty string,
    # so no meaningless key is ever sent to the search engine
    postal_code = df_business_us['postal_code']
    df_with_zipcode = df_business_us[postal_code.notna() & (postal_code != '')]
    zipcodes = df_with_zipcode['postal_code'].unique()

    # One lookup per distinct zipcode (all four features come from the same
    # result). The context manager closes the engine's database connection
    # even if a lookup raises.
    records = []
    with SearchEngine(simple_zipcode=True) as search:
        for code in tqdm(zipcodes):
            result = search.by_zipcode(code)
            # Depending on the uszipcode version an unknown zipcode yields
            # either an empty result object or None; treat both as "no data".
            info = result.to_dict() if result is not None else {}
            record = {'postal_code': code}
            for feature in feature_names:
                record[feature] = info.get(feature)
            records.append(record)
    # Explicit columns keep the frame well-formed when there are no zipcodes.
    df_zipcode = pd.DataFrame(records, columns=['postal_code'] + feature_names)

    # merge features to business dataframe
    # NOTE(review): dropna() without `subset` drops a business when ANY column
    # is missing (not only the new feature columns). Kept as in the original
    # behaviour; pass subset=feature_names if only unmatched zipcodes should go.
    return df_with_zipcode.merge(df_zipcode, how='left', on='postal_code').dropna()

# NOTE: this rebinds `df_business` from the raw table to the enriched result,
# so the cell is not idempotent: re-running it without reloading the data feeds
# a frame that already has the feature columns back into the merge, and pandas
# will suffix the clashing columns (e.g. population_x / population_y).
df_business = enrich_business_features(df_business)

100%|██████████| 891/891 [00:00<00:00, 916.89it/s]
100%|██████████| 891/891 [00:00<00:00, 1365.59it/s]
100%|██████████| 891/891 [00:00<00:00, 1355.74it/s]
100%|██████████| 891/891 [00:00<00:00, 1341.04it/s]


In [106]:
# Preview only the first rows; a bare `df_business` asks Jupyter to render the
# whole enriched table.
df_business.head(10)

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state,population,population_density,land_area_in_sqmi,median_household_income
1,"10110 Johnston Rd, Ste 15","{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...",gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",Charlotte,"{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",1,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC,42263.0,3327.0,12.70,54915.0
3,"4209 Stuart Andrew Blvd, Ste F","{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...",HhyxOkGAM07SRYtlQ4wMFQ,"Plumbing, Shopping, Local Services, Home Servi...",Charlotte,"{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",1,35.190012,-80.887223,Queen City Plumbing,28217,4,4.0,NC,24204.0,1634.0,14.81,38832.0
4,"4545 E Tropicana Rd Ste 8, Tropicana","{'RestaurantsPriceRange2': '3', 'GoodForKids':...",gbQN7vr_caG_A1ugSmGhWg,"Hair Salons, Hair Stylists, Barbers, Men's Hai...",Las Vegas,"{'Monday': '10:0-19:0', 'Tuesday': '10:0-19:0'...",1,36.099872,-115.074574,Supercuts,89121,3,3.5,NV,64096.0,6969.0,9.20,40836.0
5,5940 W Union Hills Dr,"{'RestaurantsPriceRange2': '2', 'ByAppointment...",Y6iyemLX_oylRpnr38vgMA,"Nail Salons, Beauty & Spas, Day Spas",Glendale,"{'Tuesday': '12:0-18:0', 'Wednesday': '10:0-18...",0,33.654815,-112.188568,Vita Bella Fine Day Spa,85308,8,5.0,AZ,63876.0,3682.0,17.35,68796.0
6,21689 Lorain Rd,"{'ByAppointmentOnly': 'False', 'BusinessAccept...",4GBVPIYRvzGh4K4TkRQ_rw,"Beauty & Spas, Nail Salons, Day Spas, Massage",Fairview Park,"{'Tuesday': '9:0-21:0', 'Wednesday': '9:0-21:0...",1,41.440825,-81.854097,Options Salon & Spa,44126,8,4.5,OH,16771.0,3615.0,4.64,52988.0
7,2450 E Indian School Rd,"{'RestaurantsTakeOut': 'True', 'BusinessParkin...",1Dfx3zM-rW4n-31KeC8sJg,"Restaurants, Breakfast & Brunch, Mexican, Taco...",Phoenix,"{'Monday': '7:0-0:0', 'Tuesday': '7:0-0:0', 'W...",1,33.495194,-112.028588,Taco Bell,85016,18,3.0,AZ,33896.0,3752.0,9.03,46233.0
8,"119 Landings Dr, Ste 101","{'BusinessParking': '{'garage': False, 'street...",5t3KVdMnFgAYmSl1wYLhmA,"Bars, Nightlife, Pubs, Barbers, Beauty & Spas,...",Mooresville,"{'Monday': '10:0-1:0', 'Tuesday': '10:0-1:0', ...",1,35.527410,-80.868003,The Kilted Buffalo Langtree,28117,9,3.5,NC,35454.0,888.0,39.91,78026.0
9,5981 Andrews Rd,"{'RestaurantsPriceRange2': '2', 'BusinessAccep...",fweCYi8FmbJXHCqLnwuk8w,"Italian, Restaurants, Pizza, Chicken Wings",Mentor-on-the-Lake,"{'Monday': '10:0-0:0', 'Tuesday': '10:0-0:0', ...",1,41.708520,-81.359556,Marco's Pizza,44060,16,4.0,OH,60211.0,1563.0,38.52,64446.0
10,4145 Erie St,"{'RestaurantsTakeOut': 'True', 'BusinessParkin...",-K4gAv8_vjx8-2BxkVeRkA,"Bakeries, Food",Willoughby,"{'Tuesday': '11:0-17:0', 'Wednesday': '11:0-17...",1,41.639860,-81.406396,Baby Cakes,44094,7,3.0,OH,35234.0,899.0,39.21,56278.0
11,"4848 E Cactus Rd, Ste 100","{'BusinessAcceptsCreditCards': 'True', 'Busine...",giC3pVVFxCRR89rApqklyw,"Hair Stylists, Beauty & Spas, Hair Salons, Men...",Scottsdale,"{'Monday': '0:0-0:0', 'Tuesday': '9:0-19:0', '...",1,33.600071,-111.977371,Knot Salon,85254,5,5.0,AZ,45801.0,3360.0,13.63,90718.0


In [37]:
# Reference: the uszipcode package used for the ZIP code lookups above — https://pypi.org/project/uszipcode/

<function Pattern.findall(string, pos=0, endpos=9223372036854775807)>

In [109]:
# presumably the share of businesses kept after enrichment
# (rows remaining / rows loaded) — TODO confirm where 96632 comes from.
# NOTE(review): the denominator 192606 does not match the 192609 rows reported
# by len(df_business) earlier in the notebook — verify which is correct.
96632/192606

0.5017081503172279