In [1]:
import warnings
warnings.filterwarnings('ignore')

In [73]:
import pandas as pd
import numpy as np
import os
import seaborn as sns

## Data directory

```
├── data_wrangling
│   ├── data_clean_v2
│   │   └── Untitled.ipynb
├── data
│   └── yelp_datasets
│       ├── yelp_academic_dataset_business.json
│       └── yelp_academic_dataset_review.json
```

In [3]:
# data folder path
data_directory = os.path.join('..', '..','data','yelp_datasets')
data_directory_saves = os.path.join('..', '..','data','clean_data')

# data file location
businesses_dataset = os.path.join(data_directory,'yelp_academic_dataset_business.json')

In [4]:
%%time
df = pd.read_json(businesses_dataset, lines=True)

CPU times: user 6.38 s, sys: 3.66 s, total: 10 s
Wall time: 10 s


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188593 entries, 0 to 188592
Data columns (total 15 columns):
address         188593 non-null object
attributes      162807 non-null object
business_id     188593 non-null object
categories      188052 non-null object
city            188593 non-null object
hours           143791 non-null object
is_open         188593 non-null int64
latitude        188587 non-null float64
longitude       188587 non-null float64
name            188593 non-null object
neighborhood    188593 non-null object
postal_code     188593 non-null object
review_count    188593 non-null int64
stars           188593 non-null float64
state           188593 non-null object
dtypes: float64(3), int64(2), object(10)
memory usage: 21.6+ MB


In [6]:
df.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,1314 44 Avenue NE,"{'BikeParking': 'False', 'BusinessAcceptsCredi...",Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",Calgary,"{'Monday': '8:30-17:0', 'Tuesday': '11:0-21:0'...",1,51.091813,-114.031675,Minhas Micro Brewery,,T2E 6L6,24,4.0,AB
1,,"{'Alcohol': 'none', 'BikeParking': 'False', 'B...",AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",Henderson,"{'Friday': '17:0-23:0', 'Saturday': '17:0-23:0...",0,35.960734,-114.939821,CK'S BBQ & Catering,,89002,3,4.5,NV
2,1335 rue Beaubien E,"{'Alcohol': 'beer_and_wine', 'Ambience': '{'ro...",O8S5hYJ1SMc8fA4QBtVujA,"Breakfast & Brunch, Restaurants, French, Sandw...",Montréal,"{'Monday': '10:0-22:0', 'Tuesday': '10:0-22:0'...",0,45.540503,-73.5993,La Bastringue,Rosemont-La Petite-Patrie,H2G 1K7,5,4.0,QC
3,211 W Monroe St,,bFzdJJ3wp3PZssNEsyU23g,"Insurance, Financial Services",Phoenix,,1,33.449999,-112.076979,Geico Insurance,,85003,8,1.5,AZ
4,2005 Alyth Place SE,{'BusinessAcceptsCreditCards': 'True'},8USyCYqpScwiNEb58Bt6CA,"Home & Garden, Nurseries & Gardening, Shopping...",Calgary,"{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",1,51.035591,-114.027366,Action Engine,,T2H 0N5,4,2.0,AB


# Change column name
- stars to star_avg

In [7]:
df = df.rename(columns={'stars': 'star_avg'})

## Drop null categories

In [8]:
len(df[df.categories.isnull()])

541

## create null categorie filter 

In [9]:
null_cat = df.categories.isnull()

In [10]:
len(df[null_cat])

541

In [11]:
len(df[~null_cat])

188052

In [12]:
df = df[~null_cat] # display rows NOT in null_cat filter

In [13]:
df = df.reset_index(drop=True)  # reset index

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188052 entries, 0 to 188051
Data columns (total 15 columns):
address         188052 non-null object
attributes      162750 non-null object
business_id     188052 non-null object
categories      188052 non-null object
city            188052 non-null object
hours           143747 non-null object
is_open         188052 non-null int64
latitude        188046 non-null float64
longitude       188046 non-null float64
name            188052 non-null object
neighborhood    188052 non-null object
postal_code     188052 non-null object
review_count    188052 non-null int64
star_avg        188052 non-null float64
state           188052 non-null object
dtypes: float64(3), int64(2), object(10)
memory usage: 21.5+ MB


## State
- select only us locations
    - sorry Canada

In [15]:
df.state.value_counts(dropna=False)[:10]

AZ    56296
NV    35586
ON    32315
NC    14320
OH    13628
PA    10940
QC     8727
AB     7656
WI     5034
IL     1932
Name: state, dtype: int64

In [16]:
df.state.unique()

array(['AB', 'NV', 'QC', 'AZ', 'ON', 'PA', 'OH', 'IL', 'WI', 'NC', 'BY',
       'NYK', 'SC', 'C', 'XGM', 'ST', 'IN', 'RP', 'CMA', 'NI', 'NLK',
       'VS', '6', 'CO', 'HE', 'VA', 'RCC', '01', 'SG', 'NY', 'OR', 'NW',
       '4', '10', 'CC', 'CA', '45', 'LU', 'MT', 'G', 'PO', 'B', 'VT',
       'AL', 'WAR', 'MO', 'HU', 'M', 'AR', 'O', 'FL', 'WA', 'CRF', 'TAM',
       'NE', 'XMS', 'GA', 'AG', 'WHT', 'MA', 'V', 'BC', 'SP', 'DE', 'HH',
       '11', 'CS', 'MN'], dtype=object)

In [17]:
on = df.state == 'ON'
qc = df.state == 'QC'
ab = df.state == 'AB'
ni = df.state == 'NI'
nyk = df.state == 'NYK'

In [18]:
df = df[~(on|qc|ab|ni|nyk)]
df = df.reset_index(drop=True)

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139057 entries, 0 to 139056
Data columns (total 15 columns):
address         139057 non-null object
attributes      119919 non-null object
business_id     139057 non-null object
categories      139057 non-null object
city            139057 non-null object
hours           108394 non-null object
is_open         139057 non-null int64
latitude        139053 non-null float64
longitude       139053 non-null float64
name            139057 non-null object
neighborhood    139057 non-null object
postal_code     139057 non-null object
review_count    139057 non-null int64
star_avg        139057 non-null float64
state           139057 non-null object
dtypes: float64(3), int64(2), object(10)
memory usage: 15.9+ MB


## drop columns
- drop unused columns

In [20]:
list(df.columns)

['address',
 'attributes',
 'business_id',
 'categories',
 'city',
 'hours',
 'is_open',
 'latitude',
 'longitude',
 'name',
 'neighborhood',
 'postal_code',
 'review_count',
 'star_avg',
 'state']

In [21]:
df.drop(['hours','is_open','latitude','longitude','neighborhood','postal_code','address' ],axis=1,inplace=True)

In [22]:
df.head()

Unnamed: 0,attributes,business_id,categories,city,name,review_count,star_avg,state
0,"{'Alcohol': 'none', 'BikeParking': 'False', 'B...",AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",Henderson,CK'S BBQ & Catering,3,4.5,NV
1,,bFzdJJ3wp3PZssNEsyU23g,"Insurance, Financial Services",Phoenix,Geico Insurance,8,1.5,AZ
2,"{'BikeParking': 'True', 'BusinessAcceptsCredit...",45bWSZtniwPRiqlivpS8Og,"Coffee & Tea, Food",Phoenix,The Coffee Bean & Tea Leaf,63,4.0,AZ
3,"{'Alcohol': 'none', 'Ambience': '{'romantic': ...",8-NRKkPY1UiFXW20WXKiXg,"Mexican, Restaurants",Avondale,Filiberto's Mexican Food,40,2.5,AZ
4,"{'BikeParking': 'True', 'BusinessAcceptsCredit...",UTm5QZThPQlT35mkAcGOjg,"Flowers & Gifts, Gift Shops, Shopping",Pittsburgh,Maggie & Stella's Gifts,3,3.5,PA


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139057 entries, 0 to 139056
Data columns (total 8 columns):
attributes      119919 non-null object
business_id     139057 non-null object
categories      139057 non-null object
city            139057 non-null object
name            139057 non-null object
review_count    139057 non-null int64
star_avg        139057 non-null float64
state           139057 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 8.5+ MB


In [24]:
df.head()

Unnamed: 0,attributes,business_id,categories,city,name,review_count,star_avg,state
0,"{'Alcohol': 'none', 'BikeParking': 'False', 'B...",AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",Henderson,CK'S BBQ & Catering,3,4.5,NV
1,,bFzdJJ3wp3PZssNEsyU23g,"Insurance, Financial Services",Phoenix,Geico Insurance,8,1.5,AZ
2,"{'BikeParking': 'True', 'BusinessAcceptsCredit...",45bWSZtniwPRiqlivpS8Og,"Coffee & Tea, Food",Phoenix,The Coffee Bean & Tea Leaf,63,4.0,AZ
3,"{'Alcohol': 'none', 'Ambience': '{'romantic': ...",8-NRKkPY1UiFXW20WXKiXg,"Mexican, Restaurants",Avondale,Filiberto's Mexican Food,40,2.5,AZ
4,"{'BikeParking': 'True', 'BusinessAcceptsCredit...",UTm5QZThPQlT35mkAcGOjg,"Flowers & Gifts, Gift Shops, Shopping",Pittsburgh,Maggie & Stella's Gifts,3,3.5,PA


In [25]:
df.city.value_counts()[:10]

Las Vegas     28785
Phoenix       18552
Charlotte      9179
Scottsdale     8803
Pittsburgh     6790
Mesa           6219
Henderson      4802
Tempe          4477
Chandler       4259
Madison        3504
Name: city, dtype: int64

## Categories

In [26]:
df.categories = df.categories.str.lower()
df['categories_list'] = df.categories.str.lower()
df['categories_list']  = df['categories_list'].str.split(',')

In [27]:
df.head()

Unnamed: 0,attributes,business_id,categories,city,name,review_count,star_avg,state,categories_list
0,"{'Alcohol': 'none', 'BikeParking': 'False', 'B...",AjEbIBw6ZFfln7ePHha9PA,"chicken wings, burgers, caterers, street vendo...",Henderson,CK'S BBQ & Catering,3,4.5,NV,"[chicken wings, burgers, caterers, street v..."
1,,bFzdJJ3wp3PZssNEsyU23g,"insurance, financial services",Phoenix,Geico Insurance,8,1.5,AZ,"[insurance, financial services]"
2,"{'BikeParking': 'True', 'BusinessAcceptsCredit...",45bWSZtniwPRiqlivpS8Og,"coffee & tea, food",Phoenix,The Coffee Bean & Tea Leaf,63,4.0,AZ,"[coffee & tea, food]"
3,"{'Alcohol': 'none', 'Ambience': '{'romantic': ...",8-NRKkPY1UiFXW20WXKiXg,"mexican, restaurants",Avondale,Filiberto's Mexican Food,40,2.5,AZ,"[mexican, restaurants]"
4,"{'BikeParking': 'True', 'BusinessAcceptsCredit...",UTm5QZThPQlT35mkAcGOjg,"flowers & gifts, gift shops, shopping",Pittsburgh,Maggie & Stella's Gifts,3,3.5,PA,"[flowers & gifts, gift shops, shopping]"


In [28]:
%%time
# make sure that categories_list shows up as a list not as a string of lists
counts = dict()
for i in range(len(df.categories_list)):
    for j in df.categories_list[i]:
        j= j.lstrip()
        counts[j] = counts.get(j, 0) + 1
        
cat_reviews = pd.DataFrame.from_dict(counts,columns=['total_reviews'],orient='index')
cat_reviews.sort_values(by=['total_reviews'], ascending=False, inplace=True)

CPU times: user 6.55 s, sys: 0 ns, total: 6.55 s
Wall time: 6.54 s


In [29]:
cat_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1275 entries, restaurants to makerspaces
Data columns (total 1 columns):
total_reviews    1275 non-null int64
dtypes: int64(1)
memory usage: 19.9+ KB


In [30]:
cat_reviews

Unnamed: 0,total_reviews
restaurants,34385
shopping,23189
food,17326
home services,17282
beauty & spas,14621
health & medical,14061
local services,11088
automotive,10866
nightlife,8487
event planning & services,7546


## Healthcare related
https://www.yelp.com/developers/documentation/v3/category_list
- select only healthcare related business

## out of 140, remove unrelated
- primary care
- specialist
- hospital
- clinic
- mental health


no dentist
no eyecare

In [31]:
ll = '''Behavior Analysts
Chiropractors
Counseling & Mental Health
Psychologists
Sex Therapists
Cryotherapy
Diagnostic Services
Diagnostic Imaging
Laboratory Testing
Dialysis Clinics
Doctors
Addiction Medicine
Allergists
Anesthesiologists
Audiologist
Cardiologists
Cosmetic Surgeons
Dermatologists
Ear Nose & Throat
Emergency Medicine
Endocrinologists
Family Practice
Fertility
Gastroenterologist
Geneticists
Gerontologists
Hepatologists
Hospitalists
Immunodermatologists
Infectious Disease Specialists
Internal Medicine
Neurologist
Neuropathologists
Neurotologists
Obstetricians & Gynecologists
Oncologist
Ophthalmologists
Orthopedists
Osteopathic Physicians
Otologists
Pain Management
Pathologists
Pediatricians
Phlebologists
Plastic Surgeons
Podiatrists
Preventive Medicine
Proctologists
Psychiatrists
Pulmonologist
Radiologists
Rheumatologists
Spine Surgeons
Surgeons
Toxicologists
Urologists
Vascular Medicine
Emergency Rooms
Habilitative Services
Hearing Aid Providers
Hospitals
Hypnosis/Hypnotherapy
Lactation Services
Medical Centers
Walk-in Clinics
Memory Care
Nurse Practitioner
Nutritionists
Optometrists
Organ & Tissue Donor Services
Orthotics
Physical Therapy
Placenta Encapsulations
Prenatal/Perinatal Care
Prosthetics
Prosthodontists
Reflexology
Rehabilitation Center
Skilled Nursing
Speech Therapists
Sperm Clinic
Ultrasound Imaging Centers
Urgent Care'''

In [32]:
ll = ll.lower()
health_list  = ll.split('\n')

In [33]:
len(health_list)

83

In [34]:
health_list[:10]

['behavior analysts',
 'chiropractors',
 'counseling & mental health',
 'psychologists',
 'sex therapists',
 'cryotherapy',
 'diagnostic services',
 'diagnostic imaging',
 'laboratory testing',
 'dialysis clinics']

## Create new dataframe with only healthcare categories

In [35]:
health = df[df['categories'].apply(lambda x: pd.Series(x.split(', ')).isin(health_list).any())]

In [36]:
health= health.reset_index(drop=True) # reset index

In [37]:
health.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8850 entries, 0 to 8849
Data columns (total 9 columns):
attributes         8184 non-null object
business_id        8850 non-null object
categories         8850 non-null object
city               8850 non-null object
name               8850 non-null object
review_count       8850 non-null int64
star_avg           8850 non-null float64
state              8850 non-null object
categories_list    8850 non-null object
dtypes: float64(1), int64(1), object(7)
memory usage: 622.3+ KB


In [38]:
health.head()

Unnamed: 0,attributes,business_id,categories,city,name,review_count,star_avg,state,categories_list
0,"{'AcceptsInsurance': 'False', 'BikeParking': '...",pbluh-ZZi_i8XlprfWB1nQ,"health & medical, hair salons, skin care, hair...",Cave Creek,Pamper Me Perfect Beauty & Wellness,12,4.5,AZ,"[health & medical, hair salons, skin care, ..."
1,"{'AcceptsInsurance': 'True', 'ByAppointmentOnl...",iTZDPTc36guXhNhoK-jwmw,"health & medical, physical therapy, retirement...",Gilbert,Wellsprings of Gilbert,4,5.0,AZ,"[health & medical, physical therapy, retirem..."
2,"{'AcceptsInsurance': 'False', 'BusinessAccepts...",rmGQGPpk1XZFTkz-4IGHCw,"health & medical, diagnostic imaging, diagnost...",Charlotte,Baby Bundle 3d/4d Ultrasound,3,4.0,NC,"[health & medical, diagnostic imaging, diagn..."
3,"{'AcceptsInsurance': 'True', 'BusinessAcceptsC...",y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical",Gilbert,"Lauren Byrne, MD",9,2.5,AZ,"[urologists, doctors, health & medical]"
4,"{'AcceptsInsurance': 'True', 'BusinessAcceptsC...",6sBCncAPTvC_xN64M2Qarw,"massage therapy, physical therapy, health & me...",Henderson,Tru Physical Therapy,5,5.0,NV,"[massage therapy, physical therapy, health &..."


## Count categories again

In [39]:
%%time
# make sure that categories_list shows up as a list not as a string of lists
counts = dict()
for i in range(len(health.categories_list)):
    for j in health.categories_list[i]:
        j= j.lstrip()
        counts[j] = counts.get(j, 0) + 1
        
cat_reviews2 = pd.DataFrame.from_dict(counts,columns=['total_reviews'],orient='index')
cat_reviews2.sort_values(by=['total_reviews'], ascending=False, inplace=True)

CPU times: user 438 ms, sys: 15.6 ms, total: 453 ms
Wall time: 445 ms


In [40]:
len(cat_reviews2)

466

In [41]:
cat_reviews2[:10]

Unnamed: 0,total_reviews
health & medical,8850
doctors,5011
medical centers,1284
beauty & spas,1114
chiropractors,1105
optometrists,1004
shopping,909
physical therapy,757
massage therapy,704
eyewear & opticians,675


## Create list

In [42]:
h_list = list(cat_reviews2.iloc[:].index)

In [43]:
len(h_list)

466

In [44]:
h_list[:10]

['health & medical',
 'doctors',
 'medical centers',
 'beauty & spas',
 'chiropractors',
 'optometrists',
 'shopping',
 'physical therapy',
 'massage therapy',
 'eyewear & opticians']

## What is in both list

In [45]:
## what is in both lists
len(set(health_list) & set(h_list))

79

In [46]:
len(list(set(h_list) - set(health_list)))

387

In [47]:
list(set(h_list) - set(health_list))

['veterans organizations',
 'laser hair removal',
 'business consulting',
 'office equipment',
 'day spas',
 'tai chi',
 'elder care planning',
 'tattoo removal',
 'insurance',
 'auto parts & supplies',
 'sunglasses',
 'self storage',
 'party & event planning',
 'childbirth education',
 'animal physical therapy',
 'health retreats',
 'cardio classes',
 'sports medicine',
 'grocery',
 'valet services',
 'fitness/exercise equipment',
 'active life',
 'tattoo',
 'telecommunications',
 'party supplies',
 'life coach',
 'veterinarians',
 'cannabis collective',
 'water purification services',
 'swimming pools',
 'wedding planning',
 'cpr classes',
 'thrift stores',
 'pets',
 'dui law',
 'tours',
 'pet insurance',
 'language schools',
 'retirement homes',
 "men's hair salons",
 'farmers market',
 "children's clothing",
 'preschools',
 'vacation rentals',
 'pharmacy',
 'body contouring',
 'hospice',
 'special education',
 'home decor',
 'car stereo installation',
 'social clubs',
 'traditional

In [48]:
remove =  list(set(h_list) - set(health_list))
remove.remove('health & medical')  # we need it

## create new dataframe with unwanted items

In [49]:
%%time
remove_health = health[health['categories'].apply(lambda x: pd.Series(x.split(', ')).isin(remove).any())]

CPU times: user 4.16 s, sys: 0 ns, total: 4.16 s
Wall time: 4.15 s


In [50]:
remove_health.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4065 entries, 0 to 8848
Data columns (total 9 columns):
attributes         3871 non-null object
business_id        4065 non-null object
categories         4065 non-null object
city               4065 non-null object
name               4065 non-null object
review_count       4065 non-null int64
star_avg           4065 non-null float64
state              4065 non-null object
categories_list    4065 non-null object
dtypes: float64(1), int64(1), object(7)
memory usage: 317.6+ KB


In [51]:
remove_health.head()

Unnamed: 0,attributes,business_id,categories,city,name,review_count,star_avg,state,categories_list
0,"{'AcceptsInsurance': 'False', 'BikeParking': '...",pbluh-ZZi_i8XlprfWB1nQ,"health & medical, hair salons, skin care, hair...",Cave Creek,Pamper Me Perfect Beauty & Wellness,12,4.5,AZ,"[health & medical, hair salons, skin care, ..."
1,"{'AcceptsInsurance': 'True', 'ByAppointmentOnl...",iTZDPTc36guXhNhoK-jwmw,"health & medical, physical therapy, retirement...",Gilbert,Wellsprings of Gilbert,4,5.0,AZ,"[health & medical, physical therapy, retirem..."
2,"{'AcceptsInsurance': 'False', 'BusinessAccepts...",rmGQGPpk1XZFTkz-4IGHCw,"health & medical, diagnostic imaging, diagnost...",Charlotte,Baby Bundle 3d/4d Ultrasound,3,4.0,NC,"[health & medical, diagnostic imaging, diagn..."
4,"{'AcceptsInsurance': 'True', 'BusinessAcceptsC...",6sBCncAPTvC_xN64M2Qarw,"massage therapy, physical therapy, health & me...",Henderson,Tru Physical Therapy,5,5.0,NV,"[massage therapy, physical therapy, health &..."
6,"{'BusinessAcceptsCreditCards': 'True', 'Busine...",WuiRF916fJl6aG5wC8qRAg,"doctors, eyebrow services, tattoo removal, bea...",North Las Vegas,Advanced Brows,4,5.0,NV,"[doctors, eyebrow services, tattoo removal, ..."


## remove rows bases on new dataframe
- remove base on idx

In [52]:
health.drop(remove_health.index, inplace=True)  # we are removing rows from health that are in remove_health by its index

In [53]:
health = health.reset_index(drop=True)  # reset index
health.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4785 entries, 0 to 4784
Data columns (total 9 columns):
attributes         4313 non-null object
business_id        4785 non-null object
categories         4785 non-null object
city               4785 non-null object
name               4785 non-null object
review_count       4785 non-null int64
star_avg           4785 non-null float64
state              4785 non-null object
categories_list    4785 non-null object
dtypes: float64(1), int64(1), object(7)
memory usage: 336.5+ KB


In [54]:
health.head()

Unnamed: 0,attributes,business_id,categories,city,name,review_count,star_avg,state,categories_list
0,"{'AcceptsInsurance': 'True', 'BusinessAcceptsC...",y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical",Gilbert,"Lauren Byrne, MD",9,2.5,AZ,"[urologists, doctors, health & medical]"
1,{'ByAppointmentOnly': 'True'},STcSSjPOZ0FWtjvjqw99Dg,"doctors, orthopedists, health & medical",Henderson,Black Mountain Orthopaedics,5,3.5,NV,"[doctors, orthopedists, health & medical]"
2,"{'AcceptsInsurance': 'True', 'BusinessAcceptsC...",WnRtn80rdsWOa2nWZ6G1iA,"obstetricians & gynecologists, health & medica...",Phoenix,Central Phoenix Women's Health Care,34,3.5,AZ,"[obstetricians & gynecologists, health & medi..."
3,,l3DXx5PGPac-LdS0an4wQA,"health & medical, physical therapy",Phoenix,Apex Physical Therapy,8,3.5,AZ,"[health & medical, physical therapy]"
4,,aFJYPxKQbAt-0iBtOPNpuQ,"diagnostic services, health & medical",Litchfield Park,Labcorp,6,1.5,AZ,"[diagnostic services, health & medical]"


## Count categories one last time

In [55]:
%%time
# make sure that categories_list shows up as a list not as a string of lists
counts = dict()
for i in range(len(health.categories_list)):
    for j in health.categories_list[i]:
        j= j.lstrip()
        counts[j] = counts.get(j, 0) + 1
        
cat_reviews3 = pd.DataFrame.from_dict(counts,columns=['total_reviews'],orient='index')
cat_reviews3.sort_values(by=['total_reviews'], ascending=False, inplace=True)

CPU times: user 219 ms, sys: 0 ns, total: 219 ms
Wall time: 229 ms


In [56]:
cat_reviews3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 75 entries, health & medical to otologists
Data columns (total 1 columns):
total_reviews    75 non-null int64
dtypes: int64(1)
memory usage: 1.2+ KB


In [57]:
cat_reviews3[:12]

Unnamed: 0,total_reviews
health & medical,4785
doctors,3133
medical centers,795
chiropractors,557
family practice,492
urgent care,347
obstetricians & gynecologists,345
diagnostic services,344
internal medicine,299
physical therapy,279


In [58]:
cat_reviews3[2:12]

Unnamed: 0,total_reviews
medical centers,795
chiropractors,557
family practice,492
urgent care,347
obstetricians & gynecologists,345
diagnostic services,344
internal medicine,299
physical therapy,279
hospitals,279
pediatricians,276


In [59]:
cat_reviews3[2:12].values.sum()

4013

## Categorise business

In [60]:
# locations
mc = health.categories.str.contains('medical centers') 
ho = health.categories.str.contains('hospitals')
uc = health.categories.str.contains('urgent care')

# specialist
ch = health.categories.str.contains('chiropractors')
fp = health.categories.str.contains('family practice')
ob = health.categories.str.contains('obstetricians & gynecologists')
ds = health.categories.str.contains('diagnostic services')
im = health.categories.str.contains('internal medicine')
pt = health.categories.str.contains('physical therapy')
pd = health.categories.str.contains('pediatricians')
mh = health.categories.str.contains('counseling & mental health')# counseling & mental health	

# Create dataframes
- specialist :
- locations

In [61]:
specialist = health[ch|fp|ob|ds|im|pt|pd|mh]
locations = health[mc|ho|uc]

In [62]:
specialist.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2473 entries, 2 to 4784
Data columns (total 9 columns):
attributes         2262 non-null object
business_id        2473 non-null object
categories         2473 non-null object
city               2473 non-null object
name               2473 non-null object
review_count       2473 non-null int64
star_avg           2473 non-null float64
state              2473 non-null object
categories_list    2473 non-null object
dtypes: float64(1), int64(1), object(7)
memory usage: 193.2+ KB


In [63]:
locations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1162 entries, 6 to 4779
Data columns (total 9 columns):
attributes         1068 non-null object
business_id        1162 non-null object
categories         1162 non-null object
city               1162 non-null object
name               1162 non-null object
review_count       1162 non-null int64
star_avg           1162 non-null float64
state              1162 non-null object
categories_list    1162 non-null object
dtypes: float64(1), int64(1), object(7)
memory usage: 90.8+ KB


In [64]:
clean_locations = locations[~locations.business_id.isin(specialist.business_id.unique())]  # remove business that are in the specialist dataframe

In [65]:
# clean_locations = clean_locations.reset_index(drop=True)  # reset index
# clean_locations.info()


In [66]:
# specialist = specialist.reset_index(drop=True)  # reset index
# specialist.info()

In [67]:
# locations
lmc = clean_locations.categories.str.contains('medical centers') 
lho = clean_locations.categories.str.contains('hospitals')
luc = clean_locations.categories.str.contains('urgent care')

# specialist
sch = specialist.categories.str.contains('chiropractors')
sfp = specialist.categories.str.contains('family practice')
sob = specialist.categories.str.contains('obstetricians & gynecologists')
sds = specialist.categories.str.contains('diagnostic services')
sim = specialist.categories.str.contains('internal medicine')
spt = specialist.categories.str.contains('physical therapy')
spd = specialist.categories.str.contains('pediatricians')
smh = specialist.categories.str.contains('counseling & mental health')# counseling & mental health	

In [68]:
medical_center = clean_locations[lmc]
hospital = clean_locations[lho]
urgent_care = clean_locations[luc]

In [69]:
chiropractors = specialist[sch]
family_practice =  specialist[sfp]
obstetrician = specialist[sob]
diagnostic_service = specialist[sds]
internal_medicine = specialist[spd]
physical_therapy = specialist[spt]
pediatricians = specialist[spd]
mental_health = specialist[smh]

In [70]:
chiropractors['health_business'] = 'chiropractors'
family_practice['health_business'] = 'family practice'
obstetrician['health_business'] = 'obstetrician'
diagnostic_service['health_business'] = 'diagnostic service'
internal_medicine['health_business'] = 'internal medicine'
physical_therapy['health_business'] = 'physical therapy'
pediatricians['health_business'] = 'pediatricians'
mental_health['health_business'] = 'mental health'
medical_center['health_business'] = 'hospital'
hospital['health_business'] = 'hospital'
urgent_care['health_business'] = 'urgent care'

In [71]:
frames = [urgent_care,
          hospital,
          medical_center,
          mental_health,
          pediatricians,
          physical_therapy,
          internal_medicine,
          diagnostic_service,
          obstetrician,
         family_practice,
          chiropractors]

In [74]:
final = pd.concat(frames, ignore_index=True)

In [75]:
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3712 entries, 0 to 3711
Data columns (total 10 columns):
attributes         3416 non-null object
business_id        3712 non-null object
categories         3712 non-null object
city               3712 non-null object
name               3712 non-null object
review_count       3712 non-null int64
star_avg           3712 non-null float64
state              3712 non-null object
categories_list    3712 non-null object
health_business    3712 non-null object
dtypes: float64(1), int64(1), object(8)
memory usage: 290.1+ KB


In [76]:
final.head()

Unnamed: 0,attributes,business_id,categories,city,name,review_count,star_avg,state,categories_list,health_business
0,"{'AcceptsInsurance': 'True', 'BusinessAcceptsC...",2hpi6pXIFf0taDIYCoNIuw,"health & medical, urgent care",Las Vegas,Healthcare Partner,80,2.5,NV,"[health & medical, urgent care]",urgent care
1,"{'AcceptsInsurance': 'True', 'BusinessAcceptsC...",EXS2vZ60ad1LGfJKBdwiWg,"doctors, health & medical, urgent care, medica...",Scottsdale,FastMed Urgent Care,122,2.5,AZ,"[doctors, health & medical, urgent care, me...",urgent care
2,"{'AcceptsInsurance': 'True', 'BusinessAcceptsC...",JdzoXkjb4uHLTEkiaSZRuQ,"doctors, urgent care, health & medical",Goodyear,Banner Urgent Care,27,3.0,AZ,"[doctors, urgent care, health & medical]",urgent care
3,"{'AcceptsInsurance': 'True', 'BusinessAcceptsC...",3_dwAO5gWSX0zNtJ7xAB9w,"hospitals, health & medical, urgent care, doctors",Anthem,HonorHealth Immediate Care - Gavilan Peak,20,2.0,AZ,"[hospitals, health & medical, urgent care, ...",urgent care
4,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...",SPnKdy0k9npg8qonOQpaSw,"urgent care, health & medical",Indian Land,Doctors Care - Indian Land,4,1.0,SC,"[urgent care, health & medical]",urgent care


In [77]:
final.health_business.value_counts()

hospital              729
chiropractors         557
family practice       492
obstetrician          345
diagnostic service    344
physical therapy      279
internal medicine     276
pediatricians         276
urgent care           241
mental health         173
Name: health_business, dtype: int64

In [78]:
final.head()

Unnamed: 0,attributes,business_id,categories,city,name,review_count,star_avg,state,categories_list,health_business
0,"{'AcceptsInsurance': 'True', 'BusinessAcceptsC...",2hpi6pXIFf0taDIYCoNIuw,"health & medical, urgent care",Las Vegas,Healthcare Partner,80,2.5,NV,"[health & medical, urgent care]",urgent care
1,"{'AcceptsInsurance': 'True', 'BusinessAcceptsC...",EXS2vZ60ad1LGfJKBdwiWg,"doctors, health & medical, urgent care, medica...",Scottsdale,FastMed Urgent Care,122,2.5,AZ,"[doctors, health & medical, urgent care, me...",urgent care
2,"{'AcceptsInsurance': 'True', 'BusinessAcceptsC...",JdzoXkjb4uHLTEkiaSZRuQ,"doctors, urgent care, health & medical",Goodyear,Banner Urgent Care,27,3.0,AZ,"[doctors, urgent care, health & medical]",urgent care
3,"{'AcceptsInsurance': 'True', 'BusinessAcceptsC...",3_dwAO5gWSX0zNtJ7xAB9w,"hospitals, health & medical, urgent care, doctors",Anthem,HonorHealth Immediate Care - Gavilan Peak,20,2.0,AZ,"[hospitals, health & medical, urgent care, ...",urgent care
4,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...",SPnKdy0k9npg8qonOQpaSw,"urgent care, health & medical",Indian Land,Doctors Care - Indian Land,4,1.0,SC,"[urgent care, health & medical]",urgent care


In [79]:
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3712 entries, 0 to 3711
Data columns (total 10 columns):
attributes         3416 non-null object
business_id        3712 non-null object
categories         3712 non-null object
city               3712 non-null object
name               3712 non-null object
review_count       3712 non-null int64
star_avg           3712 non-null float64
state              3712 non-null object
categories_list    3712 non-null object
health_business    3712 non-null object
dtypes: float64(1), int64(1), object(8)
memory usage: 290.1+ KB


In [80]:
## Save dataframe

In [81]:
final.to_csv(data_directory_saves+'/final_health_business.csv')