In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import re

In [10]:
path = 'yelp_data/health_business00.csv'

In [11]:
df = pd.read_csv(path)
df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5656 entries, 0 to 5655
Data columns (total 5 columns):
business_id        5656 non-null object
categories         5656 non-null object
name               5656 non-null object
state              5656 non-null object
categories_list    5656 non-null object
dtypes: object(5)
memory usage: 221.0+ KB


In [13]:
df.head()

Unnamed: 0,business_id,categories,name,state,categories_list
0,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"['urologists', ' doctors', ' health & medical']"
1,STcSSjPOZ0FWtjvjqw99Dg,"doctors, orthopedists, health & medical",Black Mountain Orthopaedics,NV,"['doctors', ' orthopedists', ' health & medical']"
2,WuiRF916fJl6aG5wC8qRAg,"doctors, eyebrow services, tattoo removal, bea...",Advanced Brows,NV,"['doctors', ' eyebrow services', ' tattoo remo..."
3,WnRtn80rdsWOa2nWZ6G1iA,"obstetricians & gynecologists, health & medica...",Central Phoenix Women's Health Care,AZ,"['obstetricians & gynecologists', ' health & m..."
4,SaR_j1ev_d55IBzORph_lQ,"laser eye surgery/lasik, optometrists, ophthal...",LASIK MD,QC,"['laser eye surgery/lasik', ' optometrists', '..."


In [14]:
df = df[['business_id','categories','name','state']]
df.head()

Unnamed: 0,business_id,categories,name,state
0,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ
1,STcSSjPOZ0FWtjvjqw99Dg,"doctors, orthopedists, health & medical",Black Mountain Orthopaedics,NV
2,WuiRF916fJl6aG5wC8qRAg,"doctors, eyebrow services, tattoo removal, bea...",Advanced Brows,NV
3,WnRtn80rdsWOa2nWZ6G1iA,"obstetricians & gynecologists, health & medica...",Central Phoenix Women's Health Care,AZ
4,SaR_j1ev_d55IBzORph_lQ,"laser eye surgery/lasik, optometrists, ophthal...",LASIK MD,QC


## Investigate df.state

- 

In [15]:
df.state.value_counts(dropna=False)

AZ     2818
NV     1581
NC      342
ON      332
PA      184
OH      166
AB       83
WI       73
QC       30
IL       21
SC       20
NI        3
CA        1
NYK       1
OR        1
Name: state, dtype: int64

# Filter only US location
### `sorry Canada`

Remove
- ON
- QC
- AB
- NI
- NYK

In [16]:
df.state.unique()

array(['AZ', 'NV', 'QC', 'ON', 'PA', 'NC', 'OH', 'IL', 'SC', 'WI', 'AB',
       'NI', 'CA', 'OR', 'NYK'], dtype=object)

In [17]:
on = df.state == 'ON'
qc = df.state == 'QC'
ab = df.state == 'AB'
ni = df.state == 'NI'
nyk = df.state == 'NYK'

In [18]:
df = df[~(on|qc|ab|ni|nyk)]
df = df.reset_index(drop=True)

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5207 entries, 0 to 5206
Data columns (total 4 columns):
business_id    5207 non-null object
categories     5207 non-null object
name           5207 non-null object
state          5207 non-null object
dtypes: object(4)
memory usage: 162.8+ KB


In [20]:
list(df.columns)

['business_id', 'categories', 'name', 'state']

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5207 entries, 0 to 5206
Data columns (total 4 columns):
business_id    5207 non-null object
categories     5207 non-null object
name           5207 non-null object
state          5207 non-null object
dtypes: object(4)
memory usage: 162.8+ KB


## Search our categories_list
- create df

```python
df.categories_list[0]
```

`"['urologists', ' doctors', ' health & medical']"`

In [22]:
df['categories_list'] = df.categories
df['categories_list']  = df['categories_list'].str.split(',')

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5207 entries, 0 to 5206
Data columns (total 5 columns):
business_id        5207 non-null object
categories         5207 non-null object
name               5207 non-null object
state              5207 non-null object
categories_list    5207 non-null object
dtypes: object(5)
memory usage: 203.5+ KB


In [25]:
df.head()

Unnamed: 0,business_id,categories,name,state,categories_list
0,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"[urologists, doctors, health & medical]"
1,STcSSjPOZ0FWtjvjqw99Dg,"doctors, orthopedists, health & medical",Black Mountain Orthopaedics,NV,"[doctors, orthopedists, health & medical]"
2,WuiRF916fJl6aG5wC8qRAg,"doctors, eyebrow services, tattoo removal, bea...",Advanced Brows,NV,"[doctors, eyebrow services, tattoo removal, ..."
3,WnRtn80rdsWOa2nWZ6G1iA,"obstetricians & gynecologists, health & medica...",Central Phoenix Women's Health Care,AZ,"[obstetricians & gynecologists, health & medi..."
4,Q9v3zXGR2TduAKzn5B1BhQ,"medical centers, skin care, health & medical, ...",Summerlin Dermatology,NV,"[medical centers, skin care, health & medica..."


In [34]:
%%time
# make sure that categories_list shows up as a list not as a string of lists
counts = dict()
for i in range(len(df.categories_list)):
    for j in df.categories_list[i]:
        j= j.lstrip()
        counts[j] = counts.get(j, 0) + 1
        
cat_reviews = pd.DataFrame.from_dict(counts,columns=['total_reviews'],orient='index')
cat_reviews.sort_values(by=['total_reviews'], ascending=False, inplace=True)

CPU times: user 234 ms, sys: 15.6 ms, total: 250 ms
Wall time: 258 ms


In [35]:
cat_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 322 entries, health & medical to discount store
Data columns (total 1 columns):
total_reviews    322 non-null int64
dtypes: int64(1)
memory usage: 5.0+ KB


In [36]:
cat_reviews

Unnamed: 0,total_reviews
health & medical,5207
doctors,5011
medical centers,817
family practice,632
beauty & spas,594
urgent care,413
obstetricians & gynecologists,406
naturopathic/holistic,382
medical spas,362
internal medicine,350


## Inspect cat_review dataframe
- with over 322 individual categories lets remove the unanted

In [37]:
list(cat_reviews.iloc[:].index)
remove_list = [
 'hair removal',
 'laser hair removal',

 'tattoo removal',
 'active life',
 'massage',


 'reiki',
 'cannabis clinics',
 'day spas',

 'traditional chinese medicine',
 'tattoo',

 'life coach',

 'permanent makeup',

 'trainers',
 'eyelash service',
 'local services',

 'yoga',

 'cosmetics & beauty supply',

 'pets',
 'cannabis dispensaries',


 'hair salons',
 'makeup artists',
 'food',
 'waxing',
 'veterinarians',

 'pet services',
 'alternative medicine',

 'gyms',

 'fashion',

 'eyebrow services',
 'piercing',
 'meditation centers',
 'medical cannabis referrals',

 'lawyers',
 'saunas',
 'supernatural readings',
 'health coach',
 'pilates',
 'shoe stores',
 'event planning & services',
 'sporting goods',
 'restaurants',
 'outdoor gear',
 'ayurveda',
 'contractors',
 'specialty food',
 'psychics',
 'jewelry',
 'pet stores',
 'advertising',
 'hotels & travel',
 'sports wear',
 'nail salons',
 'doulas',
 'herbal shops',
 'session photography',
 'photographers',
 'tai chi',
 'automotive',
 'emergency pet hospital',
 'sports clubs',
 'art galleries',
 'oxygen bars',
 'personal injury law',
 'spiritual shop',
 'health markets',
 'golf lessons',
 'float spa',
 'sunglasses',
 'martial arts',
 'transportation',
 'qi gong',
 'lice services',
 'lingerie',
 'rolfing',
 'photography stores & services',
 'grocery',
 'gift shops',
 'batting cages',
 'baby gear & furniture',
 'social clubs',
 'cannabis collective',
 'threading services',
 'tui na',
 'undersea/hyperbaric medicine',
 'juice bars & smoothies',
 'flowers & gifts',
 'business consulting',
 'divorce & family law',
 'home & garden',
 'preschools',
 'real estate services',
 'real estate agents',
 'self storage',
 'skilled nursing',
 'movers',
 'opera & ballet',
 'language schools',
 'windshield installation & repair',
 'career counseling',
 'tapas/small plates',
 'real estate',
 'team building activities',
 'bankruptcy law',
 'holistic animal care',
 'department stores',
 'mediterranean',
 'aerial fitness',
 'local flavor',
 'toxicologists',
 'employment agencies',
 'pet groomers',
 'pet boarding',
 'pet sitting',
 'climbing',
 'first aid classes',
 'fingerprinting',
 'child care & day care',
 'vocational & technical school',
 'sugaring',
 'shopping centers',
 'cosmetology schools',
 'officiants',
 'vegetarian',
 'nightlife',
 'bars',
 'diners',
 'shoe repair',
 'home decor',
 'farmers market',
 'driving schools',
 'taxis',
 'pizza',
 'furniture assembly',
 'security services',
 'towing',
 'furniture repair',
 'airport shuttles',
 'barbers',
 'convenience stores',
 'veterans organizations',
 'hair stylists',
 'general litigation',
 'feng shui',
 'personal chefs',
 'painters',
 'it services & computer repair',
 'telecommunications',
 'public transportation',
 'food delivery services',
 'travel services',
 'auto parts & supplies',
 'car stereo installation',
 'mediators',
 'horseback riding',
 'wigs',
 'kids activities',
 'water purification services',
 'building supplies',
 'heating & air conditioning/hvac',
 'water heater installation/repair',
 'plumbing',
 'art classes',
 'arts & crafts',
 'video/film production',
 'taekwondo',
 'american (traditional)',
 'hair extensions',
 'hotels',
 'chimney sweeps',
 'air duct cleaning',
 'swimming pools',
 'massage schools',
 'pet hospice',
 'landscaping',
 'leisure centers',
 'soccer',
 'web design',
 "men's hair salons",
 'discount store']

In [38]:
len(remove_list)

179

In [39]:
remove_health = df[df['categories'].apply(lambda x: pd.Series(x.split(', ')).isin(remove_list).any())]

In [40]:
remove_health.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 635 entries, 2 to 5185
Data columns (total 5 columns):
business_id        635 non-null object
categories         635 non-null object
name               635 non-null object
state              635 non-null object
categories_list    635 non-null object
dtypes: object(5)
memory usage: 29.8+ KB


In [41]:
remove_health

Unnamed: 0,business_id,categories,name,state,categories_list
2,WuiRF916fJl6aG5wC8qRAg,"doctors, eyebrow services, tattoo removal, bea...",Advanced Brows,NV,"[doctors, eyebrow services, tattoo removal, ..."
4,Q9v3zXGR2TduAKzn5B1BhQ,"medical centers, skin care, health & medical, ...",Summerlin Dermatology,NV,"[medical centers, skin care, health & medica..."
21,2lAvN0x40Sgg5-rG8WEj3w,"dermatologists, shopping, doctors, skin care, ...",Linda Woodson Dermatology,NV,"[dermatologists, shopping, doctors, skin ca..."
23,si6nnecyffD85-m_oDolbw,"colonics, doctors, saunas, health & medical, n...",Ancient Therapies Studio,AZ,"[colonics, doctors, saunas, health & medica..."
32,JqBjbYoWT_Q14qThOiKL4A,"health & medical, acupuncture, yoga, active li...",AZ Health Path,AZ,"[health & medical, acupuncture, yoga, activ..."
38,1UCuElYKIgONqbe7FR9JVQ,"reiki, health & medical, physical therapy, doc...",Marks Massage Therapy and Pain Management,PA,"[reiki, health & medical, physical therapy, ..."
39,Gy5EvWbuRejXIg8B_4_S2w,"fitness & instruction, active life, profession...",Holistic Life Services,AZ,"[fitness & instruction, active life, profess..."
62,Icxhh649AppgoCWwIhmyKg,"chiropractors, beauty & spas, massage, health ...",InMotion Health & Wellness,AZ,"[chiropractors, beauty & spas, massage, hea..."
67,Brk4gEtKuzGrkHYasRoyuw,"doctors, active life, sports medicine, gyms, h...",Foothills Sports Medicine Physical Therapy,AZ,"[doctors, active life, sports medicine, gym..."
70,EoCKHx8RgrVrf_kieXi01g,"eyelash service, beauty & spas, skin care, hai...",Beverly Hills Rejuvenation Center,AZ,"[eyelash service, beauty & spas, skin care, ..."


In [42]:
remove_health.head()

Unnamed: 0,business_id,categories,name,state,categories_list
2,WuiRF916fJl6aG5wC8qRAg,"doctors, eyebrow services, tattoo removal, bea...",Advanced Brows,NV,"[doctors, eyebrow services, tattoo removal, ..."
4,Q9v3zXGR2TduAKzn5B1BhQ,"medical centers, skin care, health & medical, ...",Summerlin Dermatology,NV,"[medical centers, skin care, health & medica..."
21,2lAvN0x40Sgg5-rG8WEj3w,"dermatologists, shopping, doctors, skin care, ...",Linda Woodson Dermatology,NV,"[dermatologists, shopping, doctors, skin ca..."
23,si6nnecyffD85-m_oDolbw,"colonics, doctors, saunas, health & medical, n...",Ancient Therapies Studio,AZ,"[colonics, doctors, saunas, health & medica..."
32,JqBjbYoWT_Q14qThOiKL4A,"health & medical, acupuncture, yoga, active li...",AZ Health Path,AZ,"[health & medical, acupuncture, yoga, activ..."


In [43]:
remove_health.index

Int64Index([   2,    4,   21,   23,   32,   38,   39,   62,   67,   70,
            ...
            5133, 5136, 5143, 5151, 5159, 5162, 5176, 5179, 5184, 5185],
           dtype='int64', length=635)

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5207 entries, 0 to 5206
Data columns (total 5 columns):
business_id        5207 non-null object
categories         5207 non-null object
name               5207 non-null object
state              5207 non-null object
categories_list    5207 non-null object
dtypes: object(5)
memory usage: 203.5+ KB


In [45]:
df.head()

Unnamed: 0,business_id,categories,name,state,categories_list
0,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"[urologists, doctors, health & medical]"
1,STcSSjPOZ0FWtjvjqw99Dg,"doctors, orthopedists, health & medical",Black Mountain Orthopaedics,NV,"[doctors, orthopedists, health & medical]"
2,WuiRF916fJl6aG5wC8qRAg,"doctors, eyebrow services, tattoo removal, bea...",Advanced Brows,NV,"[doctors, eyebrow services, tattoo removal, ..."
3,WnRtn80rdsWOa2nWZ6G1iA,"obstetricians & gynecologists, health & medica...",Central Phoenix Women's Health Care,AZ,"[obstetricians & gynecologists, health & medi..."
4,Q9v3zXGR2TduAKzn5B1BhQ,"medical centers, skin care, health & medical, ...",Summerlin Dermatology,NV,"[medical centers, skin care, health & medica..."


## Remove unwanted rows from df by using the index of remove_health

In [46]:
df.drop(remove_health.index, inplace=True)

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4572 entries, 0 to 5206
Data columns (total 5 columns):
business_id        4572 non-null object
categories         4572 non-null object
name               4572 non-null object
state              4572 non-null object
categories_list    4572 non-null object
dtypes: object(5)
memory usage: 214.3+ KB


In [48]:
df.head()

Unnamed: 0,business_id,categories,name,state,categories_list
0,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"[urologists, doctors, health & medical]"
1,STcSSjPOZ0FWtjvjqw99Dg,"doctors, orthopedists, health & medical",Black Mountain Orthopaedics,NV,"[doctors, orthopedists, health & medical]"
3,WnRtn80rdsWOa2nWZ6G1iA,"obstetricians & gynecologists, health & medica...",Central Phoenix Women's Health Care,AZ,"[obstetricians & gynecologists, health & medi..."
5,EjypAFLWk7k5gk4ebPJANQ,"psychologists, counseling & mental health, nat...",West Valley Naturopathic Center,AZ,"[psychologists, counseling & mental health, ..."
6,yOJrDZU5zkxJoZOJTZXLvQ,"neurologist, spine surgeons, cosmetic surgeons...","Michael Seiff, MD",NV,"[neurologist, spine surgeons, cosmetic surge..."


In [49]:
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,business_id,categories,name,state,categories_list
0,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"[urologists, doctors, health & medical]"
1,STcSSjPOZ0FWtjvjqw99Dg,"doctors, orthopedists, health & medical",Black Mountain Orthopaedics,NV,"[doctors, orthopedists, health & medical]"
2,WnRtn80rdsWOa2nWZ6G1iA,"obstetricians & gynecologists, health & medica...",Central Phoenix Women's Health Care,AZ,"[obstetricians & gynecologists, health & medi..."
3,EjypAFLWk7k5gk4ebPJANQ,"psychologists, counseling & mental health, nat...",West Valley Naturopathic Center,AZ,"[psychologists, counseling & mental health, ..."
4,yOJrDZU5zkxJoZOJTZXLvQ,"neurologist, spine surgeons, cosmetic surgeons...","Michael Seiff, MD",NV,"[neurologist, spine surgeons, cosmetic surge..."


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4572 entries, 0 to 4571
Data columns (total 5 columns):
business_id        4572 non-null object
categories         4572 non-null object
name               4572 non-null object
state              4572 non-null object
categories_list    4572 non-null object
dtypes: object(5)
memory usage: 178.7+ KB


In [51]:
df.categories_list[0]

['urologists', ' doctors', ' health & medical']

## Again

In [52]:
counts = dict()
for i in range(len(df.categories_list)):
    for j in df.categories_list[i]:
        j= j.lstrip()
        counts[j] = counts.get(j, 0) + 1

In [53]:
cat_reviews2 = pd.DataFrame.from_dict(counts,columns=['total_reviews'],orient='index')
cat_reviews2.sort_values(by=['total_reviews'], ascending=False, inplace=True)

In [55]:
cat_reviews2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 136 entries, health & medical to sports psychologists
Data columns (total 1 columns):
total_reviews    136 non-null int64
dtypes: int64(1)
memory usage: 2.1+ KB


In [56]:
cat_reviews2

Unnamed: 0,total_reviews
health & medical,4572
doctors,4383
medical centers,741
family practice,600
urgent care,406
obstetricians & gynecologists,390
internal medicine,338
pediatricians,318
ophthalmologists,312
optometrists,306


In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4572 entries, 0 to 4571
Data columns (total 5 columns):
business_id        4572 non-null object
categories         4572 non-null object
name               4572 non-null object
state              4572 non-null object
categories_list    4572 non-null object
dtypes: object(5)
memory usage: 178.7+ KB


# Review
- removed unwanted locations
- some unwanted locations still show up like naturopathic/holistic
- could be fix in future

## Final healthcare business
- 4570

## Save

In [60]:
df.to_csv('yelp_data/health_business01.csv')