In [3]:
import pandas as pd
import numpy as np
import re

# READING DATA INTO PYTHON

The business.json file contains nested JavaScript objects, so it was necessary to flatten them and convert the JSON file into a CSV file in order to process it in Python.

In [1]:
'''
Convert Yelp Academic Dataset from JSON to CSV

Requires Pandas (https://pypi.python.org/pypi/pandas)

By Paul Butler, No Rights Reserved
'''
 
import json
import pandas as pd
from glob import glob
 
def convert(x):
    ''' Convert a json string to a flat python dictionary
    which can be passed into Pandas.

    x is a JSON-encoded object (one line of the input file). In the
    decoded dictionary:

    * list values are collapsed into one comma-separated string
      (the list elements must therefore already be strings);
    * dict values are flattened one level deep: every inner key ``kk``
      of top-level key ``k`` becomes a new top-level key ``k_kk`` and
      the original key ``k`` is removed. Inner values are copied
      unchanged, so dicts nested more deeply stay dicts.

    All other values are left as they are. Returns the flattened dict.
    '''
    ob = json.loads(x)
    # Iterate over a snapshot of the items: the loop body adds and deletes
    # keys, and mutating a dict while iterating over its live view raises
    # RuntimeError (or silently skips/revisits keys) on Python 3.
    for k, v in list(ob.items()):
        if isinstance(v, list):
            ob[k] = ','.join(v)
        elif isinstance(v, dict):
            for kk, vv in v.items():
                ob['%s_%s' % (k, kk)] = vv
            del ob[k]
    return ob
 
# Convert every *.json file in the current working directory into a CSV
# file of the same base name (e.g. business.json -> business.csv).
for json_filename in glob('*.json'):
    csv_filename = '%s.csv' % json_filename[:-5]  # strip the '.json' suffix
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('Converting %s to %s' % (json_filename, csv_filename))
    # Each line of the input is parsed as its own JSON object. Open the file
    # in a `with` block so the handle is closed once it has been read (the
    # Python 2-only `file()` builtin left it open).
    with open(json_filename) as json_file:
        records = [convert(line) for line in json_file]
    df = pd.DataFrame(records)
    df.to_csv(csv_filename, encoding='utf-8', index=False)

Converting business.json to business.csv


In [13]:
# business.json contains nested objects that may be empty, so read_csv
# was interpreting them as mixed types and raising a warning.
# source: http://stackoverflow.com/questions/28682562/pandas-read-csv-converting-mixed-types-columns-as-string

import warnings

target_type = str  # The desired output type

# Record every warning raised while reading (instead of letting it print) so
# the mixed-type column numbers can be parsed out of the message text.
with warnings.catch_warnings(record=True) as ws:
    warnings.simplefilter("always")

    business = pd.read_csv('business.csv')
    print("Warnings raised:", ws)
    # We have a warning on specific columns, try and load them as string
    for w in ws:
        s = str(w.message)
        print("Warning message:", s)
        match = re.search(r"Columns \(([0-9,]+)\) have mixed types\.", s)
        if match:
            columns = match.group(1).split(',') # Get columns as a list
            columns = [int(c) for c in columns]
            print("Applying %s dtype to columns:" % target_type, columns)
            # Convert the frame that was just loaded. The snippet this was
            # adapted from used the name `mydata`, which is never defined in
            # this notebook and raised NameError as soon as a warning matched.
            # NOTE(review): astype(str) may turn missing values into the
            # literal string 'nan' -- confirm that later dropna()/na= handling
            # on these columns still behaves as intended.
            business.iloc[:,columns] = business.iloc[:,columns].astype(target_type)



# EXAMINING THE DATA
Yelp provided a dataset with 61,184 business entries. First, let's take a look at the data provided for each business.

In [15]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
business.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61184 entries, 0 to 61183
Data columns (total 58 columns):
attributes_Accepts Credit Cards         45072 non-null object
attributes_Accepts Insurance            458 non-null object
attributes_Ages Allowed                 235 non-null object
attributes_Alcohol                      20457 non-null object
attributes_Ambience                     18547 non-null object
attributes_Attire                       20390 non-null object
attributes_BYOB                         866 non-null object
attributes_BYOB/Corkage                 1315 non-null object
attributes_By Appointment Only          7922 non-null object
attributes_Caters                       13116 non-null object
attributes_Coat Check                   3699 non-null object
attributes_Corkage                      655 non-null object
attributes_Delivery                     19924 non-null object
attributes_Dietary Restrictions         175 non-null object
attributes_Dogs Allowed             

First, we want to learn about the different types of businesses available by looking at their categories.

In [66]:
# Each entry is one business's full comma-separated category string;
# businesses with no category string are dropped.
categories = business['categories'].dropna().values
# Distinct category *combinations* (not yet split into individual categories).
unique_biz, counts = np.unique(categories, return_counts=True)
# Report the length of the array computed above. The previous code used
# `unique`, a name that is only assigned by a later cell, so this cell failed
# (or reported a stale value) when the notebook was run top to bottom.
print("Unique business categories: " + str(len(unique_biz)))
print(unique_biz)

Unique business categories: 8046
['Accessories,Fashion,Shopping,Cosmetics & Beauty Supply,Beauty & Spas'
 'Accessories,Fashion,Shopping,Shoe Stores,Jewelry'
 'Accountants,Professional Services' ...,
 'Yoga,Physical Therapy,Health & Medical,Trainers,Fitness & Instruction,Active Life'
 'Yoga,Physical Therapy,Massage Therapy,Health & Medical,Fitness & Instruction,Active Life'
 'Yoga,Tai Chi,Adult Education,Fitness & Instruction,Active Life,Education']


As seen above, a business can claim multiple categories, so we must first determine which individual categories are available to examine by splitting the combined strings and filtering out the duplicates. In addition, we sort the categories by descending count to see which popular categories we can analyze.

In [81]:
# Flatten every business's comma-separated category list into one long
# comma-separated string.
all_categories = ','.join(categories)

# Tally how often each individual category occurs across all businesses.
unique_cat, counts = np.unique(all_categories.split(','), return_counts=True)

# Report the categories that occur more than 50 times, most frequent first.
ranked = sorted(zip(unique_cat, counts), key=lambda pair: pair[1], reverse=True)
for cat, count in ranked:
    if count <= 50:
        continue
    print(cat + ' ' + str(count))

Restaurants 21892
Shopping 8919
Food 7862
Beauty & Spas 4738
Nightlife 4340
Bars 3628
Health & Medical 3213
Automotive 2965
Home Services 2853
Fashion 2566
Active Life 2470
Event Planning & Services 2467
Fast Food 2383
Pizza 2223
Mexican 2208
Local Services 2144
Hotels & Travel 2131
American (Traditional) 2113
Sandwiches 1981
Arts & Entertainment 1946
Coffee & Tea 1890
Italian 1633
Chinese 1496
American (New) 1494
Burgers 1481
Hair Salons 1388
Hotels 1307
Nail Salons 1256
Grocery 1233
Auto Repair 1220
Home & Garden 1173
Pets 1153
Breakfast & Brunch 1116
Doctors 1077
Fitness & Instruction 1068
Specialty Food 1001
Bakeries 941
Women's Clothing 916
Ice Cream & Frozen Yogurt 867
Real Estate 850
Pubs 784
Cafes 776
Dentists 752
Japanese 746
Sports Bars 713
Sushi Bars 671
Delis 649
Professional Services 640
Pet Services 634
Sporting Goods 585
Convenience Stores 578
Desserts 576
Department Stores 573
Cosmetics & Beauty Supply 563
Drugstores 561
Gyms 560
Seafood 554
Steakhouses 554
Financial Se

Similarly, we can do the same for cities to determine which has the largest number of businesses listed on Yelp.

In [74]:
# Count how many businesses are listed in each city.
cities = business['city'].values
unique, counts = np.unique(cities, return_counts=True)
print('Unique cities: ' + str(len(unique)))

# List the cities with more than 50 businesses, largest first.
by_count = sorted(zip(unique, counts), key=lambda pair: pair[1], reverse=True)
for city, count in by_count:
    if count > 50:
        print(str(city) + ' ' + str(count))

Unique cities: 378
Las Vegas 13601
Phoenix 8410
Charlotte 4224
Scottsdale 4039
Edinburgh 3031
Pittsburgh 2724
Mesa 2347
Tempe 2258
Henderson 2130
Montréal 1870
Chandler 1867
Madison 1758
Montreal 1384
Glendale 1377
Gilbert 1263
Karlsruhe 806
Peoria 688
North Las Vegas 616
Surprise 448
Champaign 398
Goodyear 354
Avondale 299
Matthews 272
Queen Creek 236
Urbana 213
Waterloo 200
Cave Creek 180
Fort Mill 166
Middleton 166
Pineville 137
Kitchener 134
Fountain Hills 129
Concord 120
Laval 120
Apache Junction 116
Casa Grande 114
Maricopa 111
Fitchburg 109
Paradise Valley 93
Buckeye 92
Litchfield Park 86
Sun Prairie 85
Sun City 84
Anthem 83
Verdun 83
Homestead 71
Monona 71
Laveen 67
Tolleson 61
Wickenburg 58
Belmont 54
Ettlingen 51


In [76]:
# Boolean mask of the businesses whose category string contains "pizza" or
# "Pizza"; rows with a missing category string count as non-matches (na=False).
idx = business['categories'].str.contains(r".*[Pp]izza.*", regex=True, na=False)
biz_pizza = business.loc[idx]

In [79]:
# Repeat the per-city tally, restricted to the pizza businesses.
cities = biz_pizza['city'].values
print(len(cities))
unique, counts = np.unique(cities, return_counts=True)
print('Unique cities: ' + str(len(unique)))

# Every city is listed here (no minimum-count cutoff), largest first.
by_count = sorted(zip(unique, counts), key=lambda pair: pair[1], reverse=True)
for city, count in by_count:
    print(str(city) + ' ' + str(count))

2223
Unique cities: 128
Las Vegas 394
Phoenix 293
Pittsburgh 164
Charlotte 156
Scottsdale 120
Mesa 85
Tempe 72
Glendale 64
Madison 63
Henderson 62
Chandler 57
Montreal 55
Gilbert 51
Edinburgh 40
Montréal 40
Peoria 40
Karlsruhe 39
North Las Vegas 30
Surprise 23
Goodyear 19
Champaign 18
Avondale 15
Matthews 15
Queen Creek 15
Urbana 13
Sun Prairie 11
Fountain Hills 10
Verdun 10
Buckeye 9
Cave Creek 9
Laval 9
Fort Mill 8
Waterloo 8
Anthem 7
Kitchener 7
Maricopa 7
Casa Grande 6
Ettlingen 5
Fitchburg 5
Harrisburg 5
Lasalle 5
Litchfield Park 5
Middleton 5
Monona 5
Apache Junction 4
Belmont 4
Brossard 4
El Mirage 4
Laveen 4
Saint-Laurent 4
Waunakee 4
West Mifflin 4
Bellevue 3
Carnegie 3
Gold Canyon 3
Homestead 3
Kirkland 3
Mint Hill 3
Sun City 3
Tolleson 3
Verona 3
Wickenburg 3
Anjou 2
Carefree 2
Dollard-des-Ormeaux 2
Dorval 2
Florence 2
Gila Bend 2
Indian Trail 2
Mc Farland 2
McKees Rocks 2
Outremont 2
Paradise 2
Sainte-Anne-De-Bellevue 2
San Tan Valley 2
Sun City West 2
Swissvale 2
Weingarte

In [75]:
# Code for setting the style of the notebook
from IPython.display import HTML
def css_styling():
    """Load the notebook's custom stylesheet and wrap it for display.

    Reads theme/custom.css (relative to the working directory) and returns
    its contents wrapped in an IPython HTML object.
    """
    # Use a context manager so the file handle is closed after reading.
    with open("theme/custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)
css_styling()