### <font color = blue> Pre-processing: NYC Health + Hospitals facilities data set

### <font color= '#eb3483'> Imports and mounting to drive

In [None]:
#imports 
import pandas as pd
import numpy as np
import seaborn as sns
from google.colab import drive

In [None]:
#Mount google drive
# `prefix` is the directory the Drive is mounted under; force_remount=True is
# passed so the mount is redone even if a previous run already mounted it.
# The import below repeats the one in the imports cell.
prefix = '/content/drive'
from google.colab import drive
drive.mount(prefix, force_remount=True)

Mounted at /content/drive


In [None]:
# Full path of the raw hospitals CSV on the mounted Google Drive.
path = '/content/drive/My Drive/CIS 450/CIS 450 Project/NYC_Health_Hospitals.csv'

In [None]:
#load data into a dataframe
# header=[0]: the first row of the file supplies the column names.
df = pd.read_csv(path, header=[0])
# Preview the first five rows.
df.head(5)

Unnamed: 0,Facility Type,Borough,Facility Name,Cross Streets,Phone,Location 1,Postcode,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
0,Child Health Center,Manhattan,La Clinica Del Barrio,,212-410-7940,"413 120th Street\nNew York, NY 10035\n(40.7982...",,,,,,,,,
1,Acute Care Hospital,Queens,Elmhurst Hospital Center,,718-334-4000,"79 01\nElmhurst, NY 11373\n(40.738710402563, -...",,,,,,,,,
2,Child Health Center,Brooklyn,Ida G. Israel Community Health Center,W. 22nd St. & W. 23rd St.,718-946-3400,"2201 Neptune Avenue\nBrooklyn, NY 11224\n(40.5...",11224.0,40.578468,-73.989614,13.0,47.0,348.0,3188417.0,3069900000.0,Seagate-Coney Island ...
3,Child Health Center,Queens,South Queens Community Health Center,,718-883-6699,"114 02 Guy R Brewer Blvd\nJamaica, NY 11434\n(...",11434.0,40.688615,-73.785593,12.0,28.0,276.0,4264631.0,4122000000.0,Baisley Park ...
4,Child Health Center,Bronx,Melrose Houses Child Health Clinic,between Morris Ave. & Courtlandt Ave.,718-292-2820,"348 156th Street\nBronx, NY 10451\n(40.8213011...",,,,,,,,,


### <font color= '#eb3483'> Explore data types and values

In [None]:
# dtype pandas inferred for each column of the raw file.
df.dtypes

Facility Type        object
Borough              object
Facility Name        object
Cross Streets        object
Phone                object
Location 1           object
Postcode            float64
Latitude            float64
Longitude           float64
Community Board     float64
Council District    float64
Census Tract        float64
BIN                 float64
BBL                 float64
NTA                  object
dtype: object

In [None]:
# (rows, columns) of the raw frame.
df.shape

(78, 15)

In [None]:
# Count the distinct (non-null) values in every column, smallest count first.
df.nunique().sort_values()

Facility Type        4
Borough              5
Community Board     14
Cross Streets       27
Council District    30
Postcode            38
NTA                 39
Census Tract        44
BBL                 46
Latitude            47
Longitude           47
BIN                 47
Phone               61
Facility Name       62
Location 1          63
dtype: int64

In [None]:
# Percentage of missing values per column: null count * 100 / number of rows.
df.isna().sum().mul(100).div(len(df))

Facility Type        0.000000
Borough              0.000000
Facility Name        0.000000
Cross Streets       65.384615
Phone                0.000000
Location 1           0.000000
Postcode            23.076923
Latitude            23.076923
Longitude           23.076923
Community Board     23.076923
Council District    23.076923
Census Tract        23.076923
BIN                 23.076923
BBL                 23.076923
NTA                 23.076923
dtype: float64

In [None]:
# Number of rows that exactly repeat an earlier row across all columns.
df.duplicated().sum()

0

In [None]:
# Restrict the frame to the columns used in the rest of the notebook.
keep = [
    'Facility Type',
    'Borough',
    'Facility Name',
    'Phone',
    'Location 1',
    'Latitude',
    'Longitude',
]
df = df.loc[:, keep]

#### <font color= pink> Column: Facility Name


In [None]:
# Shorten the 'Facility Name' column label to 'name'.
df = df.rename(columns={'Facility Name': 'name'})

In [None]:
# Number of distinct facility names.
df['name'].nunique()

62

In [None]:
# Rows whose name already appeared in an earlier row (the first occurrence of
# each name is not included in this view).
df[df.name.duplicated()]

Unnamed: 0,Facility Type,Borough,name,Phone,Location 1,Latitude,Longitude
27,Child Health Center,Bronx,Segundo Ruiz Belvis Diagnostic & Treatment Center,718-579-4000,"454 142nd Street\nBronx, NY 10454\n(40.8109013...",,
31,Child Health Center,Bronx,Jacobi Medical Center,718-918-5000,"1400 Pelham Parkway\nBronx, NY 10461\n(40.8573...",40.857427,-73.847079
32,Diagnostic & Treatment Center,Brooklyn,Cumberland Diagnostic & Treatment Center,718-260-7500,"100 Portland Avenue\nBrooklyn, NY 11205\n(40.6...",,
33,Diagnostic & Treatment Center,Manhattan,Renaissance Health Care Network Diagnostic & T...,212-932-6500,"215 125 Street\nNew York, NY 10027\n(40.809338...",,
36,Child Health Center,Brooklyn,Kings County Hospital Center,718-245-3131,"451 Clarkson Avenue\nBrooklyn, NY 11203\n(40.6...",40.655762,-73.94458
42,Child Health Center,Brooklyn,East New York Diagnostic & Treatment Center,718-240-0400,"2094 Pitkin Avenue\nBrooklyn, NY 11225\n(40.67...",40.671977,-73.895248
47,Child Health Center,Queens,Elmhurst Hospital Center,718-334-4000,"79 01\nElmhurst, NY 11373\n(40.738710402563, -...",,
51,Child Health Center,Manhattan,Metropolitan Hospital Center,212-423-6262,"1901 First Avenue\nNew York, NY 10029\n(40.784...",40.784557,-73.94378
54,Acute Care Hospital,Queens,Queens Hospital Center,718-883-3000,"82 70 164th Street\nJamaica, NY 11432\n(40.713...",40.717209,-73.803419
57,Nursing Home,Manhattan,Gouverneur Healthcare Services,212-238-7000,"227 Madison Street\nNew York, NY 10002\n(40.71...",40.712784,-73.988417


#### <font color= pink> Column: Location 1

In [None]:
# Rename 'Location 1' to 'location': easier to read, and usable as an
# attribute (df.location) in the cells that follow.
df = df.rename(columns={'Location 1': 'location'})

In [None]:
# Look at the raw location strings (address lines separated by newlines).
df[['location']]

Unnamed: 0,location
0,"413 120th Street\nNew York, NY 10035\n(40.7982..."
1,"79 01\nElmhurst, NY 11373\n(40.738710402563, -..."
2,"2201 Neptune Avenue\nBrooklyn, NY 11224\n(40.5..."
3,"114 02 Guy R Brewer Blvd\nJamaica, NY 11434\n(..."
4,"348 156th Street\nBronx, NY 10451\n(40.8213011..."
...,...
73,"1420 Bushwick Avenue\nBrooklyn, NY 11207\n(40...."
74,"462 First Avenue\nNew York, NY 10016\n(40.7396..."
75,"769 Onderdonk Avenue\nRidgewood, NY 11385\n(40..."
76,"60 Madison Street\nNew York, NY 10038\n(40.711..."


In [None]:
# Count missing location values before applying string operations to them.
df.location.isna().sum()

0

In [None]:
def fix_loc(s):
  """Return `s` with every newline character replaced by a single space."""
  return ' '.join(s.split('\n'))

In [None]:
# Flatten every multi-line location string onto a single line.
df['location'] = df['location'].apply(fix_loc)

In [None]:
# Check the result: newlines in location are now spaces.
df['location']

0     413 120th Street New York, NY 10035 (40.798205...
1     79 01 Elmhurst, NY 11373 (40.738710402563, -73...
2     2201 Neptune Avenue Brooklyn, NY 11224 (40.578...
3     114 02 Guy R Brewer Blvd Jamaica, NY 11434 (40...
4     348 156th Street Bronx, NY 10451 (40.821301194...
                            ...                        
73    1420 Bushwick Avenue Brooklyn, NY 11207 (40.68...
74    462 First Avenue New York, NY 10016 (40.739623...
75    769 Onderdonk Avenue Ridgewood, NY 11385 (40.7...
76    60 Madison Street New York, NY 10038 (40.71196...
77    1225 Gerard Avenue Bronx, NY 10452 (40.8359063...
Name: location, Length: 78, dtype: object

#### <font color= '#eb3483'> Columns: Latitude and Longitude

In [None]:
# Number of rows with no Latitude value.
df.Latitude.isna().sum() 

18

In [None]:
# Number of rows with no Longitude value.
df.Longitude.isna().sum() 

18

In [None]:
# Lower-case the coordinate column labels.
df = df.rename(columns={'Latitude': 'latitude', 'Longitude': 'longitude'})

In [None]:
def get_zip(s):
  """Return the text between the first '(' in `s` and the next ')' after it.

  Despite the name, this is used to pull the "(lat, long)" pair out of a
  location string, not a zip code.

  Returns the sentinel string "NO" when `s` has no '(' or no ')' following it.
  """
  l = s.find('(')
  if l == -1:
    return "NO"
  # Look for the closing parenthesis only AFTER the opening one; a ')' that
  # occurs earlier in the address would otherwise produce an empty slice.
  r = s.find(')', l + 1)
  if r == -1:
    return "NO"
  return s[l+1:r]

In [None]:
# Pull the parenthesised "lat, long" text out of each location string.
df['lat_long_extracted'] = df['location'].apply(get_zip)

In [None]:
#df[['lat_long_extracted']].value_counts()

In [None]:
def split_zip(z):
  """Split `z` at every comma and return the pieces as a list (whitespace is kept)."""
  pieces = z.split(',')
  return pieces

In [None]:
# Turn each "lat, long" string into a list of its comma-separated pieces.
df['lat_long_extracted'] = df['lat_long_extracted'].apply(split_zip)

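Each `location` string ends with a `(latitude, longitude)` pair. Below we take those two values from the extracted text so they can fill in the missing `latitude`/`longitude` entries. Still to decide: what else, if anything, we want to extract from the location.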

In [None]:
# Split the extracted pieces into separate latitude / longitude columns.
# A row where no "(lat, long)" pair was found holds the placeholder ['NO'];
# indexing x[1] on it would raise IndexError and 'NO' would later break the
# float conversion, so anything that is not exactly two pieces becomes NaN.
# Pieces are stripped because the split keeps the space after the comma.
df['lat_from_loc'] = df.lat_long_extracted.apply(
    lambda x: x[0].strip() if len(x) == 2 else np.nan
)
df['long_from_loc'] = df.lat_long_extracted.apply(
    lambda x: x[1].strip() if len(x) == 2 else np.nan
)

In [None]:
# Where latitude is missing, use the value parsed from the location string.
df['latitude'] = df['latitude'].fillna(df['lat_from_loc'])

In [None]:
# Where longitude is missing, use the value parsed from the location string.
df['longitude'] = df['longitude'].fillna(df['long_from_loc'])

In [None]:
# Check the coordinate columns after filling the gaps.
df[['latitude', 'longitude']].head()

Unnamed: 0,latitude,longitude
0,40.798205044469,-73.932667945174
1,40.738710402563,-73.878351155182
2,40.5785,-73.9896
3,40.6886,-73.7856
4,40.821301194646,-73.917709787212


In [None]:
# The helper columns have served their purpose; remove them.
helper_columns = ['lat_from_loc', 'long_from_loc', 'lat_long_extracted']
df = df.drop(columns=helper_columns)

In [None]:
# Preview the frame after dropping the helper columns.
df.head()

Unnamed: 0,Facility Type,Borough,name,Phone,location,latitude,longitude
0,Child Health Center,Manhattan,La Clinica Del Barrio,212-410-7940,"413 120th Street New York, NY 10035 (40.798205...",40.798205044469,-73.932667945174
1,Acute Care Hospital,Queens,Elmhurst Hospital Center,718-334-4000,"79 01 Elmhurst, NY 11373 (40.738710402563, -73...",40.738710402563,-73.878351155182
2,Child Health Center,Brooklyn,Ida G. Israel Community Health Center,718-946-3400,"2201 Neptune Avenue Brooklyn, NY 11224 (40.578...",40.5785,-73.9896
3,Child Health Center,Queens,South Queens Community Health Center,718-883-6699,"114 02 Guy R Brewer Blvd Jamaica, NY 11434 (40...",40.6886,-73.7856
4,Child Health Center,Bronx,Melrose Houses Child Health Clinic,718-292-2820,"348 156th Street Bronx, NY 10451 (40.821301194...",40.821301194646,-73.917709787212


In [None]:
# Re-check dtypes; the coordinate columns are converted to float in the next cell.
df.dtypes

Facility Type    object
Borough          object
name             object
Phone            object
location         object
latitude         object
longitude        object
dtype: object

In [None]:
# Convert both coordinate columns to float in one call.
df = df.astype({'latitude': float, 'longitude': float})

Let's go back and clean up the location now.

In [None]:
def clean_loc(s):
  """Return the part of `s` before the first '(' with trailing whitespace removed.

  If `s` contains no '(' the whole string is used. Trailing whitespace is
  stripped because the text in front of the parenthesis ends with a space,
  which would otherwise be stored in the cleaned location.
  """
  l = s.find('(')
  head = s if l == -1 else s[:l]
  return head.rstrip()

In [None]:
# Drop the "(lat, long)" tail from every location string.
df['location'] = df['location'].apply(clean_loc)

In [None]:
# Check the result: the coordinate pair is gone from location.
df.location

0            413 120th Street New York, NY 10035 
1                       79 01 Elmhurst, NY 11373 
2         2201 Neptune Avenue Brooklyn, NY 11224 
3     114 02 Guy R Brewer Blvd Jamaica, NY 11434 
4               348 156th Street Bronx, NY 10451 
                         ...                     
73       1420 Bushwick Avenue Brooklyn, NY 11207 
74           462 First Avenue New York, NY 10016 
75      769 Onderdonk Avenue Ridgewood, NY 11385 
76          60 Madison Street New York, NY 10038 
77            1225 Gerard Avenue Bronx, NY 10452 
Name: location, Length: 78, dtype: object

In [None]:
def get_zipcode(s):
  """Return the last five characters of `s`, ignoring trailing whitespace.

  The cleaned location strings end with the five-digit zip code, optionally
  followed by a space. Stripping first and taking five characters yields just
  the digits, whether or not that trailing space is present.
  """
  return s.rstrip()[-5:]

In [None]:
# Derive a zipcode column from the tail of each cleaned location string.
df['zipcode'] = df['location'].apply(get_zipcode)

In [None]:
# Store the zip code as an integer.
df['zipcode'] = df['zipcode'].astype(int)

#### <font color= pink> Column: Facility Type

In [None]:
# Number of rows with no Facility Type.
df['Facility Type'].isna().sum()

0

In [None]:
# How many rows fall under each facility type.
df['Facility Type'].value_counts()

Child Health Center              56
Acute Care Hospital              11
Diagnostic & Treatment Center     6
Nursing Home                      5
Name: Facility Type, dtype: int64

In [None]:
# Shorten the 'Facility Type' column label to 'type'.
df = df.rename(columns={'Facility Type': 'type'})

In [None]:
# Share of rows per facility type, as a percentage of all rows.
df['type'].value_counts().mul(100).div(len(df))

Child Health Center              71.794872
Acute Care Hospital              14.102564
Diagnostic & Treatment Center     7.692308
Nursing Home                      6.410256
Name: type, dtype: float64

#### <font color= pink> Column: Borough


In [None]:
# How many rows fall in each borough.
df.Borough.value_counts()

Brooklyn         26
Manhattan        24
Bronx            14
Queens           11
Staten Island     3
Name: Borough, dtype: int64

In [None]:
# Lower-case the 'Borough' column label.
df = df.rename(columns={'Borough': 'borough'})

#### <font color= pink> Column: Zipcode


In [None]:
# Preview the frame, now including the derived zipcode column.
df.head()

Unnamed: 0,type,borough,name,Phone,location,latitude,longitude,zipcode
0,Child Health Center,Manhattan,La Clinica Del Barrio,212-410-7940,"413 120th Street New York, NY 10035",40.798205,-73.932668,10035
1,Acute Care Hospital,Queens,Elmhurst Hospital Center,718-334-4000,"79 01 Elmhurst, NY 11373",40.73871,-73.878351,11373
2,Child Health Center,Brooklyn,Ida G. Israel Community Health Center,718-946-3400,"2201 Neptune Avenue Brooklyn, NY 11224",40.578468,-73.989614,11224
3,Child Health Center,Queens,South Queens Community Health Center,718-883-6699,"114 02 Guy R Brewer Blvd Jamaica, NY 11434",40.688615,-73.785593,11434
4,Child Health Center,Bronx,Melrose Houses Child Health Clinic,718-292-2820,"348 156th Street Bronx, NY 10451",40.821301,-73.91771,10451


In [None]:
# Number of rows per zip code.
df.zipcode.value_counts()

11206    5
10027    5
10002    4
11203    3
10451    3
11205    3
11432    3
10467    2
11211    2
10044    2
10029    2
11373    2
11207    2
11235    2
10461    2
10037    2
10452    2
10454    2
10016    2
11434    1
10035    1
10009    1
11222    1
10456    1
10457    1
10030    1
10032    1
10012    1
10034    1
11413    1
11385    1
10038    1
10303    1
10304    1
11201    1
11377    1
10314    1
11212    1
11213    1
11221    1
11224    1
11225    1
10469    1
11368    1
11372    1
11229    1
11208    1
Name: zipcode, dtype: int64

#### <font color= pink> Column: Phone

In [None]:
# Distribution of phone-string lengths; anything other than one length signals
# inconsistently formatted numbers.
df['Phone'].map(len).value_counts()

12    75
13     2
22     1
Name: Phone, dtype: int64

In [None]:
# Look over the phone values in sorted order.
df.Phone.sort_values()

57    212-238-7000
55    212-238-7000
56    212-238-7200
59    212-318-8000
76    212-346-0500
          ...     
28    718-960-2777
77    718-960-2777
45    718-963-7820
72    718-963-8000
11    718-963-8000
Name: Phone, Length: 78, dtype: object

In [None]:
# Show the rows ordered from longest to shortest phone string so the oddly
# formatted numbers come first. The sorted result carries index LABELS, so the
# rows are looked up with .loc; .iloc would treat those labels as positions,
# which only happens to work while the index is the default 0..n-1 range.
phone_lengths = df.Phone.str.len()
df.loc[phone_lengths.sort_values(ascending=False).index]

Unnamed: 0,type,borough,name,Phone,location,latitude,longitude,zipcode
58,Child Health Center,Brooklyn,Fort Greene Child Health Clinic,718-260-7500 ext. 7859,"295 Flatbush Avenue Extension Brooklyn, NY 11201",40.691986,-73.982496,11201
35,Nursing Home,Manhattan,Coler-Goldwater Specialty Hospital and Nursing...,212-848-6000,"900 Main Street Roosevelt Island New York, NY ...",40.769800,-73.941218,10044
33,Diagnostic & Treatment Center,Manhattan,Renaissance Health Care Network Diagnostic & T...,212-932-6500,"215 125 Street New York, NY 10027",40.809339,-73.949197,10027
49,Child Health Center,Staten Island,Mariner's Harbor Houses Child Health Clinic,718-761-2060,2040 Forest Avenue Staten Island NY 10303,40.626017,-74.156541,10303
56,Child Health Center,Manhattan,Gouverneur Diagnostic & Treatment Center,212-238-7200,"227 Madison Street New York, NY 10002",40.712784,-73.988417,10002
...,...,...,...,...,...,...,...,...
24,Child Health Center,Queens,Corona Child Health Clinic,718-334-6100,"104 04 Corona Avenue Corona, NY 11368",40.744001,-73.858404,11368
23,Child Health Center,Bronx,Lincoln Medical & Mental Health Center,718-579-5000,"234 149th Street Bronx, NY 10451",40.817688,-73.924200,10451
22,Nursing Home,Brooklyn,Dr. Susan Smith McKinney Nursing and Rehabilit...,718-245-7000,"594 Albany Avenue Brooklyn, NY 11203",40.659968,-73.939733,11203
21,Child Health Center,Manhattan,Sydenham Health Center,212-932-6500,"215 125th Street New York, NY 10027",40.809339,-73.949197,10027


In [None]:
# Normalise every phone to the 12-character "ddd-ddd-dddd" form.
# Cutting at 13 characters kept one character too many: for
# "718-260-7500 ext. 7859" that is the space before "ext.", and the two
# 13-character values kept their extra character as well.
# Surrounding whitespace is stripped first, presumably what the extra character
# on the 13-character values is (TODO confirm), then the extension is cut off.
df.Phone = df.Phone.str.strip().str[:12]

In [None]:
# Lower-case the 'Phone' column label.
df = df.rename(columns={'Phone': 'phone'})

#### <font color = pink> Finish name column: handle duplicates

In [None]:
# List every row whose name occurs more than once (keep=False marks all
# occurrences, not just the repeats), sorted by name so copies sit together.
df[df.duplicated('name', keep=False)].sort_values(by='name')

Unnamed: 0,type,borough,name,phone,location,latitude,longitude,zipcode
74,Child Health Center,Manhattan,Bellevue Hospital Center,212-562-4141,"462 First Avenue New York, NY 10016",40.739173,-73.976862,10016
37,Acute Care Hospital,Manhattan,Bellevue Hospital Center,212-562-4141,"462 First Avenue New York, NY 10016",40.739173,-73.976862,10016
65,Child Health Center,Brooklyn,Coney Island Hospital,718-616-3000,"2601 Ocean Parkway Brooklyn, NY 11235",40.586552,-73.966168,11235
44,Acute Care Hospital,Brooklyn,Coney Island Hospital,718-616-3000,"2601 Ocean Parkway Brooklyn, NY 11235",40.586552,-73.966168,11235
9,Child Health Center,Brooklyn,Cumberland Diagnostic & Treatment Center,718-260-7500,"100 Portland Avenue Brooklyn, NY 11205",40.695144,-73.976236,11205
32,Diagnostic & Treatment Center,Brooklyn,Cumberland Diagnostic & Treatment Center,718-260-7500,"100 Portland Avenue Brooklyn, NY 11205",40.695144,-73.976236,11205
42,Child Health Center,Brooklyn,East New York Diagnostic & Treatment Center,718-240-0400,"2094 Pitkin Avenue Brooklyn, NY 11225",40.671977,-73.895248,11225
17,Diagnostic & Treatment Center,Brooklyn,East New York Diagnostic & Treatment Center,718-240-0400,"2094 Pitkin Avenue Brooklyn, NY 11207",40.671977,-73.895248,11207
47,Child Health Center,Queens,Elmhurst Hospital Center,718-334-4000,"79 01 Elmhurst, NY 11373",40.73871,-73.878351,11373
1,Acute Care Hospital,Queens,Elmhurst Hospital Center,718-334-4000,"79 01 Elmhurst, NY 11373",40.73871,-73.878351,11373


In [None]:
# Every column except 'type'; these are the grouping keys used below.
cols = [column for column in df.columns if column != 'type']

In [None]:
# Collapse the data to one row per facility name, gathering that facility's
# types into a list.
#
# Grouping on every non-type column splits a facility into several groups as
# soon as any other field differs between its rows (e.g. East New York
# Diagnostic & Treatment Center is listed with zip 11225 and 11207), and the
# later de-duplication on name then discards the extra group together with
# its type. Grouping on name alone keeps every type.
#
# Rows are sorted by `cols` first so that 'first' picks the same representative
# row (and the result has the same row order) as grouping on `cols` and keeping
# the first row per name would.
agg_spec = {col: 'first' for col in cols if col != 'name'}
# Keep each type once, in order of appearance.
agg_spec['type'] = lambda types: list(dict.fromkeys(types))
df_temp = (
    df.sort_values(cols)
      .groupby('name', sort=False)
      .agg(agg_spec)
      .reset_index()
)
# Restore the original column layout: grouping keys first, then 'type'.
df_temp = df_temp[cols + ['type']]

In [None]:
# Keep only the first row for each name in df_temp; any later row with the same
# name is dropped, along with the values it carried.
df = df_temp.drop_duplicates('name', keep='first')

In [None]:
# Inspect the de-duplicated frame; 'type' now holds a list per facility.
df

Unnamed: 0,borough,name,phone,location,latitude,longitude,zipcode,type
0,Bronx,Daniel Webster Houses Child Health Clinic,718-538-2147,"401 168th Street Bronx, NY 10456",40.832280,-73.909535,10456,[Child Health Center]
1,Bronx,Gunhill Health Center,718-918-8850,"1012 Gunhill Road Bronx, NY 10469",40.874636,-73.857840,10469,[Child Health Center]
2,Bronx,Health Center at Tremont,718-918-8700,"1826 Arthur Avenue Bronx, NY 10457",40.844083,-73.894482,10457,[Child Health Center]
3,Bronx,Jacobi Medical Center,718-918-5000,"1400 Pelham Parkway Bronx, NY 10461",40.857427,-73.847079,10461,"[Acute Care Hospital, Child Health Center]"
4,Bronx,Lincoln Medical & Mental Health Center,718-579-5000,"234 149th Street Bronx, NY 10451",40.817688,-73.924200,10451,[Child Health Center]
...,...,...,...,...,...,...,...,...
61,Queens,Springfield Gardens Medical Center,718-883-6800,134 64 Springfield Blvd Springfield Gardens NY...,40.678997,-73.754077,11413,[Child Health Center]
62,Queens,Woodside Houses Child Health Clinic,718-334-6140,"50 53 Newtown Road Woodside, NY 11377",40.753164,-73.910752,11377,[Child Health Center]
63,Staten Island,Mariner's Harbor Houses Child Health Clinic,718-761-2060,2040 Forest Avenue Staten Island NY 10303,40.626017,-74.156541,10303,[Child Health Center]
64,Staten Island,Sea View Hospital Rehabilitation Center & Home,718-317-3000,460 Brielle Avenue Staten Island NY 10314,40.593798,-74.135437,10314,[Nursing Home]


### <font color = blue> Save and export data

In [None]:
# Destination of the cleaned CSV on the mounted Google Drive.
save_path = '/content/drive/My Drive/CIS 450/CIS 450 Project/hospitals_cleaned.csv'

In [None]:
# Write the cleaned frame to save_path; index=False leaves the row index out.
# NOTE(review): 'type' holds Python lists at this point, so it will presumably
# be written as their string form (e.g. "['Nursing Home']") — confirm that
# downstream readers expect that.
df.to_csv(save_path, index = False)

### <font color=blue> Summary Statistics

In [None]:
# (rows, columns) of the cleaned frame.
df.shape

(62, 8)

In [None]:
# Final dtype of each column in the cleaned frame.
df.dtypes

borough       object
name          object
phone         object
location      object
latitude     float64
longitude    float64
zipcode        int64
type          object
dtype: object

In [None]:
# Share of facilities per borough, as a percentage of all rows.
df['borough'].value_counts().mul(100).div(len(df))

Brooklyn         33.870968
Manhattan        30.645161
Bronx            16.129032
Queens           14.516129
Staten Island     4.838710
Name: borough, dtype: float64