In [1]:
import os
import pandas as pd

dataset_path = 'yelp_dataset'

In [2]:
businesses_df = pd.read_json(f'{dataset_path}/yelp_academic_dataset_business.json', lines=True)
businesses_df.shape

(150346, 14)

In [3]:
# Open vs closed business
businesses_df.is_open.value_counts()

is_open
1    119698
0     30648
Name: count, dtype: int64

In [4]:
# Filter open businesses for our analysis
open_businesses_df = businesses_df[businesses_df.is_open == 1]
open_businesses_df.shape

(119698, 14)

### Text Formating
#### Business names that contain location info

In [5]:
open_businesses_df[open_businesses_df.name.str.contains(' - ')].head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
30,fvWn8oXXwbj2l79cochZyw,Altitude Trampoline Park - Boise,1301 N Milwaukee St,Boise,ID,83704,43.616763,-116.285382,5.0,30,1,"{'BusinessParking': '{'garage': False, 'street...","Trampoline Parks, Active Life","{'Monday': '10:0-20:0', 'Tuesday': '10:0-20:0'..."
142,GWGXTKR0Fhdvzf_isDqJug,"David Gower, Jr. - Coldwell Banker Preferred","325 Chestnut St, Ste 1300",Philadelphia,PA,19106,39.949027,-75.14719,5.0,17,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Real Estate, Home Services, Real Estate Agents","{'Monday': '9:0-19:0', 'Tuesday': '9:0-19:0', ..."
149,h-y5azB-VlQAT3m7Ff2g2Q,P's & Q's - Premium Quality,820 S St,Philadelphia,PA,19147,39.942515,-75.156468,5.0,16,1,"{'BusinessParking': '{'garage': False, 'street...","Fashion, Shopping, Men's Clothing",
202,pmuuoDqNZp7518AUd-YmPA,Bagelicious - King Of Prussia,216 W Beidler Rd,King of Prussia,PA,19406,40.112481,-75.379975,3.5,60,1,"{'RestaurantsDelivery': 'False', 'BusinessAcce...","Restaurants, Bakeries, Caterers, Bagels, Food,...","{'Tuesday': '7:0-13:0', 'Wednesday': '7:0-13:0..."
211,Nh1rc9aSeO-Y5lYmXVS6CA,Quality Inn Nashville Downtown - Stadium,303 Interstate Dr,Nashville,TN,37213,36.168834,-86.766911,2.0,53,1,"{'WiFi': ''free'', 'BusinessAcceptsCreditCards...","Hotels & Travel, Event Planning & Services, Ho...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."


#### Duplicates
The cell below shows businesses with the same name and location details that we need to treat. We decide on which businesses we need to delete once we have more information (review details) or other details on why we have duplicate businesses.

In [38]:
# Are there any duplicate businesses? same name, same location
duplicates = open_businesses_df.duplicated(subset=['name', 'address', 'city', 'latitude', 'longitude', 'state', 'is_open'], keep=False)

print(is_open_businesses[duplicates].shape)
is_open_businesses[duplicates].sort_values('name').head(20)

(18, 14)


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
29874,H55R8ODu9aa51gWnTHLRTw,Independent Automotive,865 Bergin Way,Sparks,NV,89431,39.526564,-119.739034,4.0,5,1,{'BusinessAcceptsCreditCards': 'True'},"Automotive, Auto Repair, Smog Check Stations",
102015,rlwLZhLI7Q8bN3DlSrzW7A,Independent Automotive,865 Bergin Way,Sparks,NV,89431,39.526564,-119.739034,4.5,43,1,{'BusinessAcceptsCreditCards': 'True'},"Oil Change Stations, Smog Check Stations, Auto...","{'Monday': '8:0-17:30', 'Tuesday': '8:0-17:30'..."
25025,YT1ZJrFHgwbeZvzOqSe1LA,Kroger,3410 Gallatin Pike,Nashville,TN,37216,36.210166,-86.732522,1.5,7,1,,"Shopping, Drugstores",
126374,BMcyBSKVkaCfKqz-STbvrw,Kroger,3410 Gallatin Pike,Nashville,TN,37216,36.210166,-86.732522,2.5,41,1,"{'BusinessAcceptsCreditCards': 'True', 'Busine...","Grocery, Food","{'Monday': '6:0-23:0', 'Tuesday': '6:0-23:0', ..."
74227,MOay1hDY4AUnnj_6rXHcng,Lake Lawn Metairie Funeral Home & Cemeteries,5100 Pontchartrain Blvd,New Orleans,LA,70124,29.982484,-90.114553,4.5,39,1,{'BusinessAcceptsCreditCards': 'True'},"Local Flavor, Local Services, Funeral Services...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."
63945,1rC8U1kWIlgkJ33h_MrRpg,Lake Lawn Metairie Funeral Home & Cemeteries,5100 Pontchartrain Blvd,New Orleans,LA,70124,29.982484,-90.114553,5.0,17,1,{'BusinessAcceptsCreditCards': 'True'},"Funeral Services & Cemeteries, Public Services...",
113827,aS3iZvJaD8DFnp5Cl7qMWg,Lakeview Grocery,801 Harrison Ave,New Orleans,LA,70124,30.005173,-90.105275,3.5,48,1,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...","Grocery, Food","{'Monday': '7:0-21:0', 'Tuesday': '7:0-21:0', ..."
92261,oIozEyiTSYh0T_YQRfpm5A,Lakeview Grocery,801 Harrison Ave,New Orleans,LA,70124,30.005173,-90.105275,3.5,13,1,"{'RestaurantsReservations': 'False', 'Restaura...","Restaurants, Seafood",
11148,SvymuQGYaUSqXmvh5HiPLg,Logan Inn,10 W Ferry St,New Hope,PA,18938,40.363467,-74.95145,3.0,111,1,"{'OutdoorSeating': 'True', 'WheelchairAccessib...","American (New), Restaurants, Seafood, Event Pl...","{'Wednesday': '16:0-0:0', 'Thursday': '16:0-0:..."
57209,8Ck78FmeRO7W9iy1-5eCeg,Logan Inn,10 W Ferry St,New Hope,PA,18938,40.363467,-74.95145,3.0,153,1,"{'RestaurantsDelivery': 'False', 'WiFi': 'u'fr...","Caterers, Hotels & Travel, Mediterranean, Hote...","{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W..."


In [42]:
# Fix dtypes
is_open_businesses = is_open_businesses.astype({
    'stars': 'int64', 
    'business_id': 'string', 
    'name': 'string',
    'address': 'string',
    'city': 'string',
    'state': 'string',
    'postal_code': 'string',
    'is_open': 'category'
})

is_open_businesses.dtypes

business_id     string[python]
name            string[python]
address         string[python]
city            string[python]
state           string[python]
postal_code     string[python]
latitude               float64
longitude              float64
stars                    int64
review_count             int64
is_open               category
attributes              object
categories              object
hours                   object
dtype: object

In [40]:
# Dataframe info
is_open_businesses.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119698 entries, 1 to 150345
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   119698 non-null  string 
 1   name          119698 non-null  string 
 2   address       119698 non-null  string 
 3   city          119698 non-null  string 
 4   state         119698 non-null  string 
 5   postal_code   119698 non-null  string 
 6   latitude      119698 non-null  float64
 7   longitude     119698 non-null  float64
 8   stars         119698 non-null  int64  
 9   review_count  119698 non-null  int64  
 10  is_open       119698 non-null  int64  
 11  attributes    107350 non-null  object 
 12  categories    119603 non-null  object 
 13  hours         103603 non-null  object 
dtypes: float64(2), int64(3), object(3), string(6)
memory usage: 13.7+ MB


In [None]:
# Some secondaru info is missing in the data, but we won't bother just yet.

#### Future data processing items
Our analysis is on textual sentiment extraction, so we'll leave the remaining collapsed columns attributes, categories, hours unless we need them during EDA

In [33]:
# Print column dtypes
businesses_df.dtypes

# stars to numeric/int64
# how many closed businesses
# business id to text
# name to text and normalize 'correct misspellings'; if doing location-based analysis
# Need to round/normalize to get more dupicates gone?
# is_open to category?

business_id      object
name             object
address          object
city             object
state            object
postal_code      object
latitude        float64
longitude       float64
stars           float64
review_count      int64
is_open           int64
attributes       object
categories       object
hours            object
dtype: object