In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import os
import seaborn as sns

## Data directory

In [3]:
# data folder path
data_directory = os.path.join('..','data','yelp_datasets')
data_directory_saves = os.path.join( '..','data','clean_data')

# data file location
businesses_dataset = os.path.join(data_directory,'yelp_academic_dataset_business.json')

In [4]:
data_directory

'../data/yelp_datasets'

In [5]:
%%time
df = pd.read_json(businesses_dataset, lines=True)

CPU times: user 6.2 s, sys: 3.44 s, total: 9.64 s
Wall time: 9.66 s


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188593 entries, 0 to 188592
Data columns (total 15 columns):
address         188593 non-null object
attributes      162807 non-null object
business_id     188593 non-null object
categories      188052 non-null object
city            188593 non-null object
hours           143791 non-null object
is_open         188593 non-null int64
latitude        188587 non-null float64
longitude       188587 non-null float64
name            188593 non-null object
neighborhood    188593 non-null object
postal_code     188593 non-null object
review_count    188593 non-null int64
stars           188593 non-null float64
state           188593 non-null object
dtypes: float64(3), int64(2), object(10)
memory usage: 21.6+ MB


In [8]:
df.head(2)

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,1314 44 Avenue NE,"{'BikeParking': 'False', 'BusinessAcceptsCredi...",Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",Calgary,"{'Monday': '8:30-17:0', 'Tuesday': '11:0-21:0'...",1,51.091813,-114.031675,Minhas Micro Brewery,,T2E 6L6,24,4.0,AB
1,,"{'Alcohol': 'none', 'BikeParking': 'False', 'B...",AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",Henderson,"{'Friday': '17:0-23:0', 'Saturday': '17:0-23:0...",0,35.960734,-114.939821,CK'S BBQ & Catering,,89002,3,4.5,NV


# Change column name
- stars to star_avg

In [9]:
df = df.rename(columns={'stars': 'star_avg'})

## Drop null categories

In [10]:
len(df[df.categories.isnull()])

541

## create null categorie filter 

In [11]:
null_cat = df.categories.isnull()

In [12]:
len(df[null_cat])

541

In [13]:
len(df[~null_cat])

188052

In [14]:
df = df[~null_cat] # display rows NOT in null_cat filter

In [15]:
df = df.reset_index(drop=True)  # reset index

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188052 entries, 0 to 188051
Data columns (total 15 columns):
address         188052 non-null object
attributes      162750 non-null object
business_id     188052 non-null object
categories      188052 non-null object
city            188052 non-null object
hours           143747 non-null object
is_open         188052 non-null int64
latitude        188046 non-null float64
longitude       188046 non-null float64
name            188052 non-null object
neighborhood    188052 non-null object
postal_code     188052 non-null object
review_count    188052 non-null int64
star_avg        188052 non-null float64
state           188052 non-null object
dtypes: float64(3), int64(2), object(10)
memory usage: 21.5+ MB


## State
- select only us locations
    - sorry Canada

In [17]:
df.state.value_counts(dropna=False)[:10]

AZ    56296
NV    35586
ON    32315
NC    14320
OH    13628
PA    10940
QC     8727
AB     7656
WI     5034
IL     1932
Name: state, dtype: int64

In [18]:
df.state.unique()

array(['AB', 'NV', 'QC', 'AZ', 'ON', 'PA', 'OH', 'IL', 'WI', 'NC', 'BY',
       'NYK', 'SC', 'C', 'XGM', 'ST', 'IN', 'RP', 'CMA', 'NI', 'NLK',
       'VS', '6', 'CO', 'HE', 'VA', 'RCC', '01', 'SG', 'NY', 'OR', 'NW',
       '4', '10', 'CC', 'CA', '45', 'LU', 'MT', 'G', 'PO', 'B', 'VT',
       'AL', 'WAR', 'MO', 'HU', 'M', 'AR', 'O', 'FL', 'WA', 'CRF', 'TAM',
       'NE', 'XMS', 'GA', 'AG', 'WHT', 'MA', 'V', 'BC', 'SP', 'DE', 'HH',
       '11', 'CS', 'MN'], dtype=object)

In [19]:
on = df.state == 'ON'
qc = df.state == 'QC'
ab = df.state == 'AB'
ni = df.state == 'NI'
nyk = df.state == 'NYK'

In [20]:
df = df[~(on|qc|ab|ni|nyk)]
df = df.reset_index(drop=True)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139057 entries, 0 to 139056
Data columns (total 15 columns):
address         139057 non-null object
attributes      119919 non-null object
business_id     139057 non-null object
categories      139057 non-null object
city            139057 non-null object
hours           108394 non-null object
is_open         139057 non-null int64
latitude        139053 non-null float64
longitude       139053 non-null float64
name            139057 non-null object
neighborhood    139057 non-null object
postal_code     139057 non-null object
review_count    139057 non-null int64
star_avg        139057 non-null float64
state           139057 non-null object
dtypes: float64(3), int64(2), object(10)
memory usage: 15.9+ MB


## drop columns
- drop unused columns

In [22]:
list(df.columns)

['address',
 'attributes',
 'business_id',
 'categories',
 'city',
 'hours',
 'is_open',
 'latitude',
 'longitude',
 'name',
 'neighborhood',
 'postal_code',
 'review_count',
 'star_avg',
 'state']

In [23]:
df.drop(['hours','is_open','latitude','longitude','neighborhood','postal_code','address' ,'attributes'],axis=1,inplace=True)

In [24]:
df.head()

Unnamed: 0,business_id,categories,city,name,review_count,star_avg,state
0,AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",Henderson,CK'S BBQ & Catering,3,4.5,NV
1,bFzdJJ3wp3PZssNEsyU23g,"Insurance, Financial Services",Phoenix,Geico Insurance,8,1.5,AZ
2,45bWSZtniwPRiqlivpS8Og,"Coffee & Tea, Food",Phoenix,The Coffee Bean & Tea Leaf,63,4.0,AZ
3,8-NRKkPY1UiFXW20WXKiXg,"Mexican, Restaurants",Avondale,Filiberto's Mexican Food,40,2.5,AZ
4,UTm5QZThPQlT35mkAcGOjg,"Flowers & Gifts, Gift Shops, Shopping",Pittsburgh,Maggie & Stella's Gifts,3,3.5,PA


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139057 entries, 0 to 139056
Data columns (total 7 columns):
business_id     139057 non-null object
categories      139057 non-null object
city            139057 non-null object
name            139057 non-null object
review_count    139057 non-null int64
star_avg        139057 non-null float64
state           139057 non-null object
dtypes: float64(1), int64(1), object(5)
memory usage: 7.4+ MB


In [26]:
df.head()

Unnamed: 0,business_id,categories,city,name,review_count,star_avg,state
0,AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",Henderson,CK'S BBQ & Catering,3,4.5,NV
1,bFzdJJ3wp3PZssNEsyU23g,"Insurance, Financial Services",Phoenix,Geico Insurance,8,1.5,AZ
2,45bWSZtniwPRiqlivpS8Og,"Coffee & Tea, Food",Phoenix,The Coffee Bean & Tea Leaf,63,4.0,AZ
3,8-NRKkPY1UiFXW20WXKiXg,"Mexican, Restaurants",Avondale,Filiberto's Mexican Food,40,2.5,AZ
4,UTm5QZThPQlT35mkAcGOjg,"Flowers & Gifts, Gift Shops, Shopping",Pittsburgh,Maggie & Stella's Gifts,3,3.5,PA


In [27]:
df.city.value_counts()[:10]

Las Vegas     28785
Phoenix       18552
Charlotte      9179
Scottsdale     8803
Pittsburgh     6790
Mesa           6219
Henderson      4802
Tempe          4477
Chandler       4259
Madison        3504
Name: city, dtype: int64

## Categories

In [28]:
df.categories = df.categories.str.lower()
df['categories_list'] = df.categories.str.lower()
df['categories_list']  = df['categories_list'].str.split(',')

In [29]:
df.head()

Unnamed: 0,business_id,categories,city,name,review_count,star_avg,state,categories_list
0,AjEbIBw6ZFfln7ePHha9PA,"chicken wings, burgers, caterers, street vendo...",Henderson,CK'S BBQ & Catering,3,4.5,NV,"[chicken wings, burgers, caterers, street v..."
1,bFzdJJ3wp3PZssNEsyU23g,"insurance, financial services",Phoenix,Geico Insurance,8,1.5,AZ,"[insurance, financial services]"
2,45bWSZtniwPRiqlivpS8Og,"coffee & tea, food",Phoenix,The Coffee Bean & Tea Leaf,63,4.0,AZ,"[coffee & tea, food]"
3,8-NRKkPY1UiFXW20WXKiXg,"mexican, restaurants",Avondale,Filiberto's Mexican Food,40,2.5,AZ,"[mexican, restaurants]"
4,UTm5QZThPQlT35mkAcGOjg,"flowers & gifts, gift shops, shopping",Pittsburgh,Maggie & Stella's Gifts,3,3.5,PA,"[flowers & gifts, gift shops, shopping]"


In [30]:
%%time
# make sure that categories_list shows up as a list not as a string of lists
counts = dict()
for i in range(len(df.categories_list)):
    for j in df.categories_list[i]:
        j= j.lstrip()
        counts[j] = counts.get(j, 0) + 1
        
cat_reviews = pd.DataFrame.from_dict(counts,columns=['total_reviews'],orient='index')
cat_reviews.sort_values(by=['total_reviews'], ascending=False, inplace=True)

CPU times: user 5.94 s, sys: 15.6 ms, total: 5.95 s
Wall time: 5.93 s


In [31]:
cat_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1275 entries, restaurants to makerspaces
Data columns (total 1 columns):
total_reviews    1275 non-null int64
dtypes: int64(1)
memory usage: 19.9+ KB


In [32]:
cat_reviews[:10]

Unnamed: 0,total_reviews
restaurants,34385
shopping,23189
food,17326
home services,17282
beauty & spas,14621
health & medical,14061
local services,11088
automotive,10866
nightlife,8487
event planning & services,7546


## Healthcare related
https://www.yelp.com/developers/documentation/v3/category_list
- select only healthcare related business

## out of 140, remove unrelated
- primary care
- specialist
- hospital
- clinic
- mental health


no dentist
no eyecare

In [33]:
ll = '''Behavior Analysts
Chiropractors
Counseling & Mental Health
Psychologists
Sex Therapists
Cryotherapy
Diagnostic Services
Diagnostic Imaging
Laboratory Testing
Dialysis Clinics
Doctors
Addiction Medicine
Allergists
Anesthesiologists
Audiologist
Cardiologists
Cosmetic Surgeons
Dermatologists
Ear Nose & Throat
Emergency Medicine
Endocrinologists
Family Practice
Fertility
Gastroenterologist
Geneticists
Gerontologists
Hepatologists
Hospitalists
Immunodermatologists
Infectious Disease Specialists
Internal Medicine
Neurologist
Neuropathologists
Neurotologists
Obstetricians & Gynecologists
Oncologist
Ophthalmologists
Orthopedists
Osteopathic Physicians
Otologists
Pain Management
Pathologists
Pediatricians
Phlebologists
Plastic Surgeons
Podiatrists
Preventive Medicine
Proctologists
Psychiatrists
Pulmonologist
Radiologists
Rheumatologists
Spine Surgeons
Surgeons
Toxicologists
Urologists
Vascular Medicine
Emergency Rooms
Habilitative Services
Hearing Aid Providers
Hospitals
Hypnosis/Hypnotherapy
Lactation Services
Medical Centers
Walk-in Clinics
Memory Care
Nurse Practitioner
Nutritionists
Optometrists
Organ & Tissue Donor Services
Orthotics
Physical Therapy
Placenta Encapsulations
Prenatal/Perinatal Care
Prosthetics
Prosthodontists
Reflexology
Rehabilitation Center
Skilled Nursing
Speech Therapists
Sperm Clinic
Ultrasound Imaging Centers
Urgent Care'''

In [34]:
ll = ll.lower()
health_list  = ll.split('\n')

In [35]:
len(health_list)

83

In [36]:
health_list[:10]

['behavior analysts',
 'chiropractors',
 'counseling & mental health',
 'psychologists',
 'sex therapists',
 'cryotherapy',
 'diagnostic services',
 'diagnostic imaging',
 'laboratory testing',
 'dialysis clinics']

## Create new dataframe with only healthcare categories

In [37]:
health = df[df['categories'].apply(lambda x: pd.Series(x.split(', ')).isin(health_list).any())]

In [38]:
health= health.reset_index(drop=True) # reset index

In [39]:
health.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8850 entries, 0 to 8849
Data columns (total 8 columns):
business_id        8850 non-null object
categories         8850 non-null object
city               8850 non-null object
name               8850 non-null object
review_count       8850 non-null int64
star_avg           8850 non-null float64
state              8850 non-null object
categories_list    8850 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 553.2+ KB


In [41]:
health.head(2)

Unnamed: 0,business_id,categories,city,name,review_count,star_avg,state,categories_list
0,pbluh-ZZi_i8XlprfWB1nQ,"health & medical, hair salons, skin care, hair...",Cave Creek,Pamper Me Perfect Beauty & Wellness,12,4.5,AZ,"[health & medical, hair salons, skin care, ..."
1,iTZDPTc36guXhNhoK-jwmw,"health & medical, physical therapy, retirement...",Gilbert,Wellsprings of Gilbert,4,5.0,AZ,"[health & medical, physical therapy, retirem..."


## Count categories again

In [42]:
%%time
# make sure that categories_list shows up as a list not as a string of lists
counts = dict()
for i in range(len(health.categories_list)):
    for j in health.categories_list[i]:
        j= j.lstrip()
        counts[j] = counts.get(j, 0) + 1
        
cat_reviews2 = pd.DataFrame.from_dict(counts,columns=['total_reviews'],orient='index')
cat_reviews2.sort_values(by=['total_reviews'], ascending=False, inplace=True)

CPU times: user 391 ms, sys: 0 ns, total: 391 ms
Wall time: 400 ms


In [43]:
len(cat_reviews2)

466

In [44]:
cat_reviews2[:10]

Unnamed: 0,total_reviews
health & medical,8850
doctors,5011
medical centers,1284
beauty & spas,1114
chiropractors,1105
optometrists,1004
shopping,909
physical therapy,757
massage therapy,704
eyewear & opticians,675


## Create list

In [45]:
h_list = list(cat_reviews2.iloc[:].index)

In [46]:
len(h_list)

466

In [47]:
h_list[:10]

['health & medical',
 'doctors',
 'medical centers',
 'beauty & spas',
 'chiropractors',
 'optometrists',
 'shopping',
 'physical therapy',
 'massage therapy',
 'eyewear & opticians']

## What is in both list

In [48]:
## what is in both lists
len(set(health_list) & set(h_list))

79

In [49]:
len(list(set(h_list) - set(health_list)))

387

In [50]:
list(set(h_list) - set(health_list))

['hospice',
 'wigs',
 'local services',
 'sports bars',
 'auto parts & supplies',
 'aerial fitness',
 'session photography',
 'furniture stores',
 'eyewear & opticians',
 'boxing',
 'wedding planning',
 'furniture rental',
 'transmission repair',
 'oral surgeons',
 'spray tanning',
 'veterinarians',
 'massage therapy',
 'translation services',
 'midwives',
 'orthodontists',
 'transportation',
 'air duct cleaning',
 'personal care services',
 'acupuncture',
 'osteopaths',
 'health coach',
 'police departments',
 'gift shops',
 'live/raw food',
 'performing arts',
 'baby gear & furniture',
 'educational services',
 'personal shopping',
 'electronics',
 'golf lessons',
 'party supplies',
 'venues & event spaces',
 'uniforms',
 'damage restoration',
 'community service/non-profit',
 'art galleries',
 'personal injury law',
 'laser hair removal',
 'mediators',
 'valet services',
 'kids activities',
 'qi gong',
 'mobility equipment sales & services',
 'vegetarian',
 'cooking schools',
 'loca

In [51]:
remove =  list(set(h_list) - set(health_list))
remove.remove('health & medical')  # we need it

## create new dataframe with unwanted items

In [52]:
%%time
remove_health = health[health['categories'].apply(lambda x: pd.Series(x.split(', ')).isin(remove).any())]

CPU times: user 4.02 s, sys: 15.6 ms, total: 4.03 s
Wall time: 4 s


In [53]:
remove_health.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4065 entries, 0 to 8848
Data columns (total 8 columns):
business_id        4065 non-null object
categories         4065 non-null object
city               4065 non-null object
name               4065 non-null object
review_count       4065 non-null int64
star_avg           4065 non-null float64
state              4065 non-null object
categories_list    4065 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 285.8+ KB


In [54]:
remove_health.head(2)

Unnamed: 0,business_id,categories,city,name,review_count,star_avg,state,categories_list
0,pbluh-ZZi_i8XlprfWB1nQ,"health & medical, hair salons, skin care, hair...",Cave Creek,Pamper Me Perfect Beauty & Wellness,12,4.5,AZ,"[health & medical, hair salons, skin care, ..."
1,iTZDPTc36guXhNhoK-jwmw,"health & medical, physical therapy, retirement...",Gilbert,Wellsprings of Gilbert,4,5.0,AZ,"[health & medical, physical therapy, retirem..."


## remove rows bases on new dataframe
- remove base on idx

In [55]:
health.drop(remove_health.index, inplace=True)  # we are removing rows from health that are in remove_health by its index

In [56]:
health = health.reset_index(drop=True)  # reset index
health.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4785 entries, 0 to 4784
Data columns (total 8 columns):
business_id        4785 non-null object
categories         4785 non-null object
city               4785 non-null object
name               4785 non-null object
review_count       4785 non-null int64
star_avg           4785 non-null float64
state              4785 non-null object
categories_list    4785 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 299.1+ KB


In [57]:
health.head(2)

Unnamed: 0,business_id,categories,city,name,review_count,star_avg,state,categories_list
0,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical",Gilbert,"Lauren Byrne, MD",9,2.5,AZ,"[urologists, doctors, health & medical]"
1,STcSSjPOZ0FWtjvjqw99Dg,"doctors, orthopedists, health & medical",Henderson,Black Mountain Orthopaedics,5,3.5,NV,"[doctors, orthopedists, health & medical]"


## Count categories one last time

In [58]:
%%time
# make sure that categories_list shows up as a list not as a string of lists
counts = dict()
for i in range(len(health.categories_list)):
    for j in health.categories_list[i]:
        j= j.lstrip()
        counts[j] = counts.get(j, 0) + 1
        
cat_reviews3 = pd.DataFrame.from_dict(counts,columns=['total_reviews'],orient='index')
cat_reviews3.sort_values(by=['total_reviews'], ascending=False, inplace=True)

CPU times: user 219 ms, sys: 0 ns, total: 219 ms
Wall time: 215 ms


In [59]:
cat_reviews3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 75 entries, health & medical to otologists
Data columns (total 1 columns):
total_reviews    75 non-null int64
dtypes: int64(1)
memory usage: 1.2+ KB


In [60]:
cat_reviews3[:12]

Unnamed: 0,total_reviews
health & medical,4785
doctors,3133
medical centers,795
chiropractors,557
family practice,492
urgent care,347
obstetricians & gynecologists,345
diagnostic services,344
internal medicine,299
physical therapy,279


In [61]:
cat_reviews3[2:12]

Unnamed: 0,total_reviews
medical centers,795
chiropractors,557
family practice,492
urgent care,347
obstetricians & gynecologists,345
diagnostic services,344
internal medicine,299
physical therapy,279
hospitals,279
pediatricians,276


In [62]:
cat_reviews3[2:12].values.sum()

4013

## Categorise business

In [63]:
# locations
mc = health.categories.str.contains('medical centers') 
ho = health.categories.str.contains('hospitals')
uc = health.categories.str.contains('urgent care')

# specialist
ch = health.categories.str.contains('chiropractors')
fp = health.categories.str.contains('family practice')
ob = health.categories.str.contains('obstetricians & gynecologists')
ds = health.categories.str.contains('diagnostic services')
im = health.categories.str.contains('internal medicine')
pt = health.categories.str.contains('physical therapy')
pd = health.categories.str.contains('pediatricians')
mh = health.categories.str.contains('counseling & mental health')# counseling & mental health	

# Create dataframes
- specialist :
- locations

In [64]:
specialist = health[ch|fp|ob|ds|im|pt|pd|mh]
locations = health[mc|ho|uc]

In [65]:
specialist.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2473 entries, 2 to 4784
Data columns (total 8 columns):
business_id        2473 non-null object
categories         2473 non-null object
city               2473 non-null object
name               2473 non-null object
review_count       2473 non-null int64
star_avg           2473 non-null float64
state              2473 non-null object
categories_list    2473 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 173.9+ KB


In [66]:
locations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1162 entries, 6 to 4779
Data columns (total 8 columns):
business_id        1162 non-null object
categories         1162 non-null object
city               1162 non-null object
name               1162 non-null object
review_count       1162 non-null int64
star_avg           1162 non-null float64
state              1162 non-null object
categories_list    1162 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 81.7+ KB


In [67]:
clean_locations = locations[~locations.business_id.isin(specialist.business_id.unique())]  # remove business that are in the specialist dataframe

In [None]:
# clean_locations = clean_locations.reset_index(drop=True)  # reset index
# clean_locations.info()


In [None]:
# specialist = specialist.reset_index(drop=True)  # reset index
# specialist.info()

In [68]:
# locations
lmc = clean_locations.categories.str.contains('medical centers') 
lho = clean_locations.categories.str.contains('hospitals')
luc = clean_locations.categories.str.contains('urgent care')

# specialist
sch = specialist.categories.str.contains('chiropractors')
sfp = specialist.categories.str.contains('family practice')
sob = specialist.categories.str.contains('obstetricians & gynecologists')
sds = specialist.categories.str.contains('diagnostic services')
sim = specialist.categories.str.contains('internal medicine')
spt = specialist.categories.str.contains('physical therapy')
spd = specialist.categories.str.contains('pediatricians')
smh = specialist.categories.str.contains('counseling & mental health')# counseling & mental health	

In [69]:
medical_center = clean_locations[lmc]
hospital = clean_locations[lho]
urgent_care = clean_locations[luc]

In [70]:
urgent_care.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 241 entries, 26 to 4767
Data columns (total 8 columns):
business_id        241 non-null object
categories         241 non-null object
city               241 non-null object
name               241 non-null object
review_count       241 non-null int64
star_avg           241 non-null float64
state              241 non-null object
categories_list    241 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 16.9+ KB


In [71]:
medical_center.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 503 entries, 20 to 4779
Data columns (total 8 columns):
business_id        503 non-null object
categories         503 non-null object
city               503 non-null object
name               503 non-null object
review_count       503 non-null int64
star_avg           503 non-null float64
state              503 non-null object
categories_list    503 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 35.4+ KB


In [72]:
hospital.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 226 entries, 50 to 4774
Data columns (total 8 columns):
business_id        226 non-null object
categories         226 non-null object
city               226 non-null object
name               226 non-null object
review_count       226 non-null int64
star_avg           226 non-null float64
state              226 non-null object
categories_list    226 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 15.9+ KB


In [73]:
medical_center = medical_center[~medical_center.business_id.isin(urgent_care.business_id.unique())]
medical_center = medical_center[~medical_center.business_id.isin(hospital.business_id.unique())]
hospital = hospital[~hospital.business_id.isin(urgent_care.business_id.unique())]

In [74]:
medical_center.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 346 entries, 20 to 4779
Data columns (total 8 columns):
business_id        346 non-null object
categories         346 non-null object
city               346 non-null object
name               346 non-null object
review_count       346 non-null int64
star_avg           346 non-null float64
state              346 non-null object
categories_list    346 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 24.3+ KB


In [75]:
hospital.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199 entries, 50 to 4774
Data columns (total 8 columns):
business_id        199 non-null object
categories         199 non-null object
city               199 non-null object
name               199 non-null object
review_count       199 non-null int64
star_avg           199 non-null float64
state              199 non-null object
categories_list    199 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 14.0+ KB


In [76]:
chiropractors = specialist[sch]
family_practice =  specialist[sfp]
obstetrician = specialist[sob]
diagnostic_service = specialist[sds]
internal_medicine = specialist[spd]
physical_therapy = specialist[spt]
pediatricians = specialist[spd]
mental_health = specialist[smh]

## Mental Health

In [77]:
mental_health.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 173 entries, 43 to 4775
Data columns (total 8 columns):
business_id        173 non-null object
categories         173 non-null object
city               173 non-null object
name               173 non-null object
review_count       173 non-null int64
star_avg           173 non-null float64
state              173 non-null object
categories_list    173 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 12.2+ KB


In [78]:
mental_health = mental_health[~mental_health.business_id.isin(pediatricians.business_id.unique())]

mental_health = mental_health[~mental_health.business_id.isin(physical_therapy.business_id.unique())]

mental_health = mental_health[~mental_health.business_id.isin(internal_medicine.business_id.unique())]

mental_health = mental_health[~mental_health.business_id.isin(diagnostic_service.business_id.unique())]

mental_health = mental_health[~mental_health.business_id.isin(obstetrician.business_id.unique())]

mental_health = mental_health[~mental_health.business_id.isin(family_practice.business_id.unique())]

mental_health = mental_health[~mental_health.business_id.isin(chiropractors.business_id.unique())]

In [79]:
mental_health.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158 entries, 43 to 4775
Data columns (total 8 columns):
business_id        158 non-null object
categories         158 non-null object
city               158 non-null object
name               158 non-null object
review_count       158 non-null int64
star_avg           158 non-null float64
state              158 non-null object
categories_list    158 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 11.1+ KB


## pediatricians

In [80]:
pediatricians.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 276 entries, 12 to 4778
Data columns (total 8 columns):
business_id        276 non-null object
categories         276 non-null object
city               276 non-null object
name               276 non-null object
review_count       276 non-null int64
star_avg           276 non-null float64
state              276 non-null object
categories_list    276 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 19.4+ KB


In [81]:
pediatricians[pediatricians.business_id.isin(physical_therapy.business_id.unique())]

pediatricians = pediatricians[~pediatricians.business_id.isin(internal_medicine.business_id.unique())]

pediatricians = pediatricians[~pediatricians.business_id.isin(diagnostic_service.business_id.unique())]

pediatricians = pediatricians[~pediatricians.business_id.isin(obstetrician.business_id.unique())]

pediatricians = pediatricians[~pediatricians.business_id.isin(family_practice.business_id.unique())]

pediatricians = pediatricians[~pediatricians.business_id.isin(chiropractors.business_id.unique())]

In [82]:
pediatricians.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 8 columns):
business_id        0 non-null object
categories         0 non-null object
city               0 non-null object
name               0 non-null object
review_count       0 non-null int64
star_avg           0 non-null float64
state              0 non-null object
categories_list    0 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 0.0+ bytes


## physical_therapy

In [83]:
physical_therapy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 279 entries, 3 to 4776
Data columns (total 8 columns):
business_id        279 non-null object
categories         279 non-null object
city               279 non-null object
name               279 non-null object
review_count       279 non-null int64
star_avg           279 non-null float64
state              279 non-null object
categories_list    279 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 19.6+ KB


In [84]:
physical_therapy[physical_therapy.business_id.isin(internal_medicine.business_id.unique())]

physical_therapy = physical_therapy[~physical_therapy.business_id.isin(diagnostic_service.business_id.unique())]

physical_therapy = physical_therapy[~physical_therapy.business_id.isin(obstetrician.business_id.unique())]

physical_therapy = physical_therapy[~physical_therapy.business_id.isin(family_practice.business_id.unique())]

physical_therapy = physical_therapy[~physical_therapy.business_id.isin(chiropractors.business_id.unique())]

In [85]:
physical_therapy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 216 entries, 3 to 4776
Data columns (total 8 columns):
business_id        216 non-null object
categories         216 non-null object
city               216 non-null object
name               216 non-null object
review_count       216 non-null int64
star_avg           216 non-null float64
state              216 non-null object
categories_list    216 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 15.2+ KB


## internal_medicine

In [86]:
internal_medicine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 276 entries, 12 to 4778
Data columns (total 8 columns):
business_id        276 non-null object
categories         276 non-null object
city               276 non-null object
name               276 non-null object
review_count       276 non-null int64
star_avg           276 non-null float64
state              276 non-null object
categories_list    276 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 19.4+ KB


In [87]:
internal_medicine = internal_medicine[~internal_medicine.business_id.isin(diagnostic_service.business_id.unique())]

internal_medicine = internal_medicine[~internal_medicine.business_id.isin(obstetrician.business_id.unique())]

internal_medicine = internal_medicine[~internal_medicine.business_id.isin(family_practice.business_id.unique())]

internal_medicine = internal_medicine[~internal_medicine.business_id.isin(chiropractors.business_id.unique())]

In [88]:
internal_medicine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 222 entries, 12 to 4778
Data columns (total 8 columns):
business_id        222 non-null object
categories         222 non-null object
city               222 non-null object
name               222 non-null object
review_count       222 non-null int64
star_avg           222 non-null float64
state              222 non-null object
categories_list    222 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 15.6+ KB


## diagnostic_service

In [89]:
diagnostic_service.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 344 entries, 4 to 4765
Data columns (total 8 columns):
business_id        344 non-null object
categories         344 non-null object
city               344 non-null object
name               344 non-null object
review_count       344 non-null int64
star_avg           344 non-null float64
state              344 non-null object
categories_list    344 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 24.2+ KB


In [90]:
diagnostic_service = diagnostic_service[~diagnostic_service.business_id.isin(obstetrician.business_id.unique())]

diagnostic_service = diagnostic_service[~diagnostic_service.business_id.isin(family_practice.business_id.unique())]

diagnostic_service = diagnostic_service[~diagnostic_service.business_id.isin(chiropractors.business_id.unique())]

In [91]:
diagnostic_service.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312 entries, 4 to 4765
Data columns (total 8 columns):
business_id        312 non-null object
categories         312 non-null object
city               312 non-null object
name               312 non-null object
review_count       312 non-null int64
star_avg           312 non-null float64
state              312 non-null object
categories_list    312 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 21.9+ KB


## obstetrician


In [92]:
obstetrician.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 345 entries, 2 to 4782
Data columns (total 8 columns):
business_id        345 non-null object
categories         345 non-null object
city               345 non-null object
name               345 non-null object
review_count       345 non-null int64
star_avg           345 non-null float64
state              345 non-null object
categories_list    345 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 24.3+ KB


In [94]:
#obstetrician[obstetrician.business_id.isin(diagnostic_service.business_id.unique())]

obstetrician = obstetrician[~obstetrician.business_id.isin(family_practice.business_id.unique())]

#obstetrician[obstetrician.business_id.isin(chiropractors.business_id.unique())]

In [95]:
obstetrician.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 332 entries, 2 to 4782
Data columns (total 8 columns):
business_id        332 non-null object
categories         332 non-null object
city               332 non-null object
name               332 non-null object
review_count       332 non-null int64
star_avg           332 non-null float64
state              332 non-null object
categories_list    332 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 23.3+ KB


## family_practice

In [96]:
family_practice.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 492 entries, 13 to 4784
Data columns (total 8 columns):
business_id        492 non-null object
categories         492 non-null object
city               492 non-null object
name               492 non-null object
review_count       492 non-null int64
star_avg           492 non-null float64
state              492 non-null object
categories_list    492 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 34.6+ KB


In [97]:
family_practice = family_practice[~family_practice.business_id.isin(chiropractors.business_id.unique())]

In [98]:
family_practice.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 479 entries, 13 to 4784
Data columns (total 8 columns):
business_id        479 non-null object
categories         479 non-null object
city               479 non-null object
name               479 non-null object
review_count       479 non-null int64
star_avg           479 non-null float64
state              479 non-null object
categories_list    479 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 33.7+ KB


## chiropractors

In [99]:
chiropractors.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 557 entries, 11 to 4766
Data columns (total 8 columns):
business_id        557 non-null object
categories         557 non-null object
city               557 non-null object
name               557 non-null object
review_count       557 non-null int64
star_avg           557 non-null float64
state              557 non-null object
categories_list    557 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 39.2+ KB


In [None]:
#chiropractors[chiropractors.business_id.isin(medical_center.business_id.unique())]

In [100]:
chiropractors['health_business'] = 'chiropractors'
family_practice['health_business'] = 'family practice'
obstetrician['health_business'] = 'obstetrician'
diagnostic_service['health_business'] = 'diagnostic service'
internal_medicine['health_business'] = 'internal medicine'
physical_therapy['health_business'] = 'physical therapy'
#pediatricians['health_business'] = 'pediatricians'
mental_health['health_business'] = 'mental health'
medical_center['health_business'] = 'hospital'
hospital['health_business'] = 'hospital'
urgent_care['health_business'] = 'urgent care'

In [101]:
frames = [urgent_care,
          hospital,
          medical_center,
          mental_health,
          #pediatricians,
          physical_therapy,
          internal_medicine,
          diagnostic_service,
          obstetrician,
         family_practice,
          chiropractors]

In [103]:
import pandas as pd

In [104]:
final = pd.concat(frames, ignore_index=True)

In [105]:
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3062 entries, 0 to 3061
Data columns (total 9 columns):
business_id        3062 non-null object
categories         3062 non-null object
city               3062 non-null object
name               3062 non-null object
review_count       3062 non-null int64
star_avg           3062 non-null float64
state              3062 non-null object
categories_list    3062 non-null object
health_business    3062 non-null object
dtypes: float64(1), int64(1), object(7)
memory usage: 215.4+ KB


In [106]:
final.head()

Unnamed: 0,business_id,categories,city,name,review_count,star_avg,state,categories_list,health_business
0,2hpi6pXIFf0taDIYCoNIuw,"health & medical, urgent care",Las Vegas,Healthcare Partner,80,2.5,NV,"[health & medical, urgent care]",urgent care
1,EXS2vZ60ad1LGfJKBdwiWg,"doctors, health & medical, urgent care, medica...",Scottsdale,FastMed Urgent Care,122,2.5,AZ,"[doctors, health & medical, urgent care, me...",urgent care
2,JdzoXkjb4uHLTEkiaSZRuQ,"doctors, urgent care, health & medical",Goodyear,Banner Urgent Care,27,3.0,AZ,"[doctors, urgent care, health & medical]",urgent care
3,3_dwAO5gWSX0zNtJ7xAB9w,"hospitals, health & medical, urgent care, doctors",Anthem,HonorHealth Immediate Care - Gavilan Peak,20,2.0,AZ,"[hospitals, health & medical, urgent care, ...",urgent care
4,SPnKdy0k9npg8qonOQpaSw,"urgent care, health & medical",Indian Land,Doctors Care - Indian Land,4,1.0,SC,"[urgent care, health & medical]",urgent care


In [107]:
final.health_business.value_counts()

chiropractors         557
hospital              545
family practice       479
obstetrician          332
diagnostic service    312
urgent care           241
internal medicine     222
physical therapy      216
mental health         158
Name: health_business, dtype: int64

## Save dataframe

In [109]:
final.to_csv(data_directory_saves+'/final_health_business.csv')