<h1 align='center'> Foursquare Data Collection</h1>

In [1115]:
import collections.abc
import os
import re

import folium
import geopandas as gp
import numpy as np
import pandas as pd
import requests
from geopy import distance
from shapely.geometry import Point

### Local Cleaned Data

From [Previous File of Data Collection](https://github.com/gokulmuthiah/Coursera_Capstone/blob/master/Capstone-Data-Collection.ipynb)

In [1507]:
# Load the cleaned neighbourhood shapefile written by the previous notebook.
df=gp.read_file("Data/Cleaned/data.shp")
# Declare the coordinates as EPSG:4326 (lat/long degrees); this only sets the
# CRS attribute, it does not reproject the geometries.
df.crs={'init': 'epsg:4326'}
df.head()

Unnamed: 0,code,name,lat,long,area,borough,population,medAge,popDensity,dayPop,...,medRent,medIncome,meanIncome,veryLow,low,middle,high,veryHigh,AADT,geometry
0,BK09,Brooklyn Heights-Cobble Hill,40.695469,-73.994871,1.615327,Brooklyn,24212,37.1,14988,21307.0,...,2278.0,125817.0,205275.0,1279,1201,2008,3355,3272,11051.190476,"POLYGON ((-73.99236367043254 40.689690123777, ..."
1,BK17,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,40.5883,-73.941511,10.214922,Brooklyn,67681,43.9,6625,59559.0,...,1180.0,57150.0,79613.0,6637,5298,6797,5785,1633,12013.8,POLYGON ((-73.91809256480843 40.58657033500475...
2,BK19,Brighton Beach,40.580922,-73.961217,2.770061,Brooklyn,35811,44.3,12927,31514.0,...,1194.0,36802.0,63703.0,5762,2697,3301,2006,791,12252.733333,POLYGON ((-73.96034953585246 40.58730628557137...
3,BK21,Seagate-Coney Island,40.57648,-73.991231,6.242851,Brooklyn,31132,39.0,4986,27396.0,...,676.0,27345.0,49358.0,5381,2285,2158,1115,297,15428.3,POLYGON ((-73.97459000582634 40.58313882075885...
4,BK23,West Brighton,40.579088,-73.973391,1.409979,Brooklyn,16436,58.0,11656,14464.0,...,905.0,40316.0,58752.0,3169,1790,2212,955,275,8256.6,"POLYGON ((-73.9688899587795 40.57526123899416,..."


### Foursquare Options

In [51]:
# Foursquare credentials are read from the environment so that secrets are not
# stored in the notebook. Export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel; when they are missing the
# values are empty strings and every API call will be rejected by Foursquare.
# NOTE: the key pair that used to be hard-coded here is exposed in the
# repository history and should be rotated.
CLIENT_ID=os.environ.get("FOURSQUARE_CLIENT_ID","")
CLIENT_SECRET=os.environ.get("FOURSQUARE_CLIENT_SECRET","")
# API version date (YYYYMMDD) sent as the `v` parameter of every request.
VERSION="20190726"

### Collect Foursquare Categories for Aggregation

In [143]:
# Download the complete Foursquare category tree (nested JSON).
catUrl="https://api.foursquare.com/v2/venues/categories?&client_id={}&client_secret={}&v={}"\
        .format(CLIENT_ID,CLIENT_SECRET,VERSION)
# A timeout keeps the cell from hanging forever on a dead connection, and
# raise_for_status() reports bad credentials / quota errors here instead of as
# a KeyError when the response is unpacked further down.
catResponse = requests.get(catUrl,timeout=30)
catResponse.raise_for_status()
catJson = catResponse.json()

#### Get the List of all categories within the nested JSON file

In [832]:
import collections
def getSubCategories(cats,keywords,depth,parent="None"):
    """Recursively collect a category and its descendants as encoded records.

    Each matching category is encoded as the string ``"id;name;depth;parent"``
    where ``depth`` is the remaining recursion budget at that node (not the
    level in the tree) and ``parent`` is the name of the enclosing category.

    Parameters
    ----------
    cats : dict
        Category node with ``'id'``, ``'name'`` and (optionally) a
        ``'categories'`` list of child nodes.
    keywords : str
        A single word that must appear in the whitespace-split category name;
        the empty string matches every category.
    depth : int
        How many more levels of children to descend into.
    parent : str
        Name recorded as the parent of ``cats``.

    Returns
    -------
    list or None
        A list whose items are the node's own record (if it matched) followed
        by one nested list per child subtree that produced a match, or ``None``
        when neither the node nor any descendant matched.
    """
    subCategories=[]
    if(keywords=="" or keywords in cats['name'].split()):
        subCategories.append(";".join((cats['id'],cats['name'],str(depth),parent)))
    if(depth>0):
        # `or []` tolerates nodes whose 'categories' key is missing or None.
        for child in (cats.get('categories') or []):
            nested=getSubCategories(child,keywords,depth-1,cats['name'])
            # A subtree without any match returns None; appending it would
            # leave None entries that break the later ';' split of each record.
            if nested is not None:
                subCategories.append(nested)
    if (subCategories!=[]):
        return subCategories
    return None
    
# Encode every top-level category together with up to four levels of children.
categoryList=list(map(lambda topCategory: getSubCategories(topCategory,'',4),
                      catJson['response']['categories']))

#### Flatten them and make them into a dataframe with parent and depth characteristics

In [863]:
def flatten(l):
    """Yield the leaf items of an arbitrarily nested iterable, depth first.

    Strings and bytes are treated as leaves (they are yielded whole rather than
    being iterated character by character).
    """
    for el in l:
        # collections.Iterable was removed in Python 3.10; the ABC lives in
        # collections.abc.
        if isinstance(el, collections.abc.Iterable) and not isinstance(el, (str, bytes)):
            yield from flatten(el)
        else:
            yield el

def fun(x):
    """Split a ';'-separated record into its fields.

    A list argument is unwrapped to its first element before splitting.
    """
    record = x[0] if isinstance(x, list) else x
    return record.split(';')

# One row per category: flatten the nested lists, then split each
# "id;name;depth;parent" record into its fields.
tmp=pd.DataFrame(list(flatten(categoryList)),columns=['main'])
tmp['main']=tmp['main'].apply(fun)
# Build the frame in a single constructor call instead of writing it cell by
# cell with .loc inside an iterrows() loop (slow, and it relied on the
# deprecated positional lookup row[0]). Only the first four fields are used,
# exactly as before.
categoryDf=pd.DataFrame([fields[:4] for fields in tmp['main']],
                        columns=['id','category','depth','parent'],
                        index=tmp.index)
# The records store the remaining recursion budget, which started at 4 in the
# getSubCategories call; 4 - budget turns it into the level in the tree
# (0 = top-level category).
categoryDf['depth']=4-(categoryDf['depth'].astype(int))
categoryDf.head()

Unnamed: 0,id,category,depth,parent
0,4d4b7104d754a06370d81259,Arts & Entertainment,0,
1,56aa371be4b08b9a8d5734db,Amphitheater,1,Arts & Entertainment
2,4fceea171983d5d06c3e9823,Aquarium,1,Arts & Entertainment
3,4bf58dd8d48988d1e1931735,Arcade,1,Arts & Entertainment
4,4bf58dd8d48988d1e2931735,Art Gallery,1,Arts & Entertainment


#### Some Functions to access category Data

In [1084]:
def getCategory(string,categories=None):
    """Return the ids of all categories whose name matches any given word.

    Parameters
    ----------
    string : str
        Comma-separated search words. Each word is used as a regular
        expression fragment and matches anywhere inside the category name.
    categories : DataFrame, optional
        Table with ``'id'`` and ``'category'`` columns. Defaults to the
        notebook's ``categoryDf``. The original code read an undefined global
        named ``Categories``; ``categoryDf`` has the columns it used and is
        presumably the table that was meant — confirm.

    Returns
    -------
    list
        Matching ids in table order. A category that matches several words is
        listed once per matching word.
    """
    if categories is None:
        categories=categoryDf
    tmp=[]
    for i,x in categories.iterrows():
        for words in string.split(','):
            if(bool(re.match('.*'+words+'.*',x['category']))):
                tmp.append(x['id'])
    return tmp


def getParent(category,parentDepth):
    """Climb from ``category`` through its parents until depth ``parentDepth``.

    Looks names up in ``categoryDf`` (first matching row) and returns the name
    of the first category on the way up whose depth equals ``parentDepth``.
    An IndexError is raised when a name is not present in ``categoryDf``.
    """
    current=category
    while True:
        isCurrent=categoryDf['category']==current
        if categoryDf.loc[isCurrent,'depth'].values[0]==parentDepth:
            return current
        # Not at the requested level yet: continue from the recorded parent.
        current=categoryDf.loc[isCurrent,'parent'].values[0]
    
    
def getDepth(category):
    """Return the depth stored in ``categoryDf`` for the first row named ``category``."""
    isMatch=categoryDf['category']==category
    depths=categoryDf.loc[isMatch,'depth'].values
    return depths[0]

### Radius of Neighborhoods in API calls

In [462]:
# Search radius (metres) per neighbourhood: one third of the geodesic length of
# the diagonal of its bounding box.
bbs=df.geometry.bounds
radii=[]
for i,pt in bbs.iterrows():
    # geopy expects (latitude, longitude) pairs, while the bounds hold
    # x = longitude and y = latitude. Passing (minx, miny) swapped the two and
    # measured a distance at a completely different place on the globe.
    southWest=(pt['miny'],pt['minx'])
    northEast=(pt['maxy'],pt['maxx'])
    radii.append(distance.geodesic(southWest,northEast,ellipsoid='WGS-84').m/3)

### API Call Function

In [741]:
def makeAPICall(queryType,place,radius,cat,section="",timeout=30):
    """Call a Foursquare v2 ``venues`` endpoint around one neighbourhood.

    Parameters
    ----------
    queryType : str
        Endpoint name appended to ``/v2/venues/`` (the notebook uses
        ``'search'`` and ``'explore'``).
    place : mapping
        Row providing ``'lat'`` and ``'long'`` for the ``ll`` parameter.
    radius : float
        Search radius sent as ``radius``.
    cat : str
        Value sent as the ``query`` parameter.
        NOTE(review): some callers pass comma-joined category ids here; whether
        the API treats ids in ``query`` as a category filter should be
        confirmed against the Foursquare documentation.
    section : str
        Value sent as the ``section`` parameter.
    timeout : float
        Seconds to wait for the server before ``requests`` raises.

    Returns
    -------
    dict
        The decoded JSON body. The HTTP status is not checked here; callers
        inspect ``result['meta']['code']``.
    """
    url="https://api.foursquare.com/v2/venues/{}".format(queryType)
    # Passing the values through `params` lets requests URL-encode them, so
    # queries containing spaces, '&' or '/' no longer corrupt the query string.
    params={
        "client_id":CLIENT_ID,
        "client_secret":CLIENT_SECRET,
        "v":VERSION,
        "ll":"{},{}".format(place['lat'],place['long']),
        "radius":radius,
        "limit":200,
        "query":cat,
        "section":section,
    }
    result = requests.get(url,params=params,timeout=timeout).json()
    return result

### get Nearby Cultural Zones & Tourism Landmarks Data

#### Do not run this cell: it makes several API calls, and the data has already been persisted

In [601]:
from IPython.display import clear_output
# Category ids whose names contain one of the listed words; a slice of the
# matches is kept and the Monument / Landmark id is added by hand.
siteCategories=getCategory("Site,Beach,Bridge,Museum,Art")
siteCategories=siteCategories[1:9]+["4bf58dd8d48988d12d941735"]#Monuments added
siteCategories=','.join(siteCategories)
venues=[]
missedRows=[]
for i,row in df.iterrows():
    landJson=makeAPICall('search',row,radii[i],siteCategories)
    # Same status handling as the food and shop loops: a failed call is
    # recorded and skipped instead of raising a KeyError on the missing payload.
    if(landJson['meta']['code']!=200):
        missedRows.append(row['code'])
        print('Failed Read:',landJson['meta']['code'],row['name'])
        continue
    clear_output()
    print("Number of Neighborhood Data Obtained:",i+1)
    items=landJson['response']['venues']
    for item in items:
        # Venues without any category cannot be classified; skip them rather
        # than fail on categories[0].
        if not item['categories']:
            continue
        venues.append([row['code'],
                       item['name'],
                       item['categories'][0]['name'],
                       row['name'],
                       item['location']["lat"],
                       item['location']["lng"],
                        ])

Number of Neighborhood Data Obtained: 188


In [760]:
# Tabulate the collected venues and keep only categories seen at least 10 times.
landmarks=(
    pd.DataFrame(data=venues,columns=["code","venue","category","name",'vLat','vLong'])
    .groupby('category')
    .filter(lambda grp: len(grp) >= 10)
)
# Categories removed by hand before the table is persisted.
remove=['Dance Studio','School','High School','Building','Tattoo Parlor']
isRemoved=landmarks['category'].isin(remove)
landmarks=landmarks.loc[~isRemoved].reset_index(drop=True)

In [761]:
# Persist the landmark venues so the API cell above does not have to be re-run;
# the index is written as the first (unnamed) CSV column.
landmarks.to_csv("Data/Cleaned/LandmarkData.csv")
landmarks.head()

Unnamed: 0,code,venue,category,name,vLat,vLong
0,BK09,Pier 4 Beach,Beach,Brooklyn Heights-Cobble Hill,40.696595,-73.999084
1,BK09,Brooklyn War Memorial,Monument / Landmark,Brooklyn Heights-Cobble Hill,40.698657,-73.990776
2,BK09,Brooklyn Historical Society,History Museum,Brooklyn Heights-Cobble Hill,40.694942,-73.992333
3,BK09,The Heights Players Theatre,Indie Theater,Brooklyn Heights-Cobble Hill,40.693005,-73.998293
4,BK09,Truman Capote House,Historic Site,Brooklyn Heights-Cobble Hill,40.698906,-73.99495


### Get Food Outlet Data from Foursquare

#### Do not run this cell: it makes several API calls, and the data has already been persisted

In [657]:
# First category id whose name contains "Food"; sent as the query text.
foodCategories=getCategory("Food")[0]
foodVenues=[]
missedRows=[]
for i,row in df.iterrows():
    foodJson=makeAPICall('explore',row,radii[i],foodCategories,'food')
    # Guard clause: remember neighbourhoods whose request failed and move on.
    if(foodJson['meta']['code']!=200):
        missedRows.append(row['code'])
        print('Failed Read',row['name'])
        continue
    clear_output()
    print("Number of Neighborhood Data Obtained:",i+1)
    items=foodJson['response']['groups'][0]['items']
    for item in items:
        venue=item['venue']
        record=[row['code'],
                venue['name'],
                venue['categories'][0]['name'],
                row['name'],
                venue['location']["lat"],
                venue['location']["lng"]]
        foodVenues.append(record)

Number of Neighborhood Data Obtained: 188


In [762]:
# Tabulate the food venues and keep categories with at least 15 venues.
foods=(
    pd.DataFrame(data=foodVenues,columns=["code","venue","category","name",'vLat','vLong'])
    .groupby('category')
    .filter(lambda grp: len(grp) >= 15)
    .reset_index(drop=True)
)

In [763]:
# Persist the food venues so the API cell above does not have to be re-run;
# the index is written as the first (unnamed) CSV column.
foods.to_csv("Data/Cleaned/foodData.csv")
foods.head()

Unnamed: 0,code,venue,category,name,vLat,vLong
0,BK09,Dellarocco's,Pizza Place,Brooklyn Heights-Cobble Hill,40.694992,-73.995924
1,BK09,Lassen & Hennigs,Deli / Bodega,Brooklyn Heights-Cobble Hill,40.69497,-73.994857
2,BK09,Sushi Gallery,Sushi Restaurant,Brooklyn Heights-Cobble Hill,40.697595,-73.993236
3,BK09,Saketumi Asian Bistro,Asian Restaurant,Brooklyn Heights-Cobble Hill,40.69491,-73.994578
4,BK09,Iron Chef House,Japanese Restaurant,Brooklyn Heights-Cobble Hill,40.697406,-73.99256


### Get Shops Data from Foursquare

#### Do not run this cell: it makes several API calls, and the data has already been persisted

In [823]:
# Two queries are collected for every neighbourhood: the shop/gym category ids
# and the free-text query 'gym'. Previously the joined ids were overwritten by
# 'gym' before the loop ran and gymVenues was never filled, so the cell had to
# be edited and re-run by hand to obtain both result sets.
# NOTE(review): this doubles the number of API calls made by the cell.
shopCategories=["4d4b7105d754a06378d81259","4bf58dd8d48988d175941735"]
shopCategories=','.join(shopCategories)
shopVenues=[]
gymVenues=[]
missedRows=[]
for query,collected in ((shopCategories,shopVenues),('gym',gymVenues)):
    for i,row in df.iterrows():
        shopJson=makeAPICall('explore',row,radii[i],query,'shops')
        if(shopJson['meta']['code']==200):
            clear_output()
            print("Number of Neighborhood Data Obtained:",i+1)
            items=shopJson['response']['groups'][0]['items']
            for item in items:
                collected.append([row['code'],
                                  item['venue']['name'],
                                  item['venue']['categories'][0]['name'],
                                  row['name'],
                                  item['venue']['location']["lat"],
                                  item['venue']['location']["lng"],
                                ])
        else:
            # Remember neighbourhoods whose request was rejected.
            missedRows.append(row['code'])
            print('Failed Read:',shopJson['meta']['code'],row['name'])

Number of Neighborhood Data Obtained: 188


In [820]:
# Combine the shop and gym results into one table ordered by neighbourhood
# code. `gyms` used to be an undefined leftover name and DataFrame.append was
# removed in pandas 2.0, so both frames are built here and joined with
# pd.concat.
shops=pd.DataFrame(data=shopVenues,columns=["code","venue","category","name",'vLat','vLong'])
gyms=pd.DataFrame(data=gymVenues,columns=["code","venue","category","name",'vLat','vLong'])
shops=pd.concat([shops,gyms]).sort_values('code').reset_index(drop=True)#['category'].value_counts()#.head(120)

In [821]:
# Persist the shop venues; the index is written as the first (unnamed) CSV column.
shops.to_csv("Data/Cleaned/shopData.csv")

### Intermediate Backup
### Get Data from saved Csv files

In [1100]:
# Reload the persisted shop venues and drop repeats of the same venue at the
# same coordinates.
shops=(
    pd.read_csv("Data/Cleaned/shopData.csv",index_col=0)
    .drop_duplicates(['venue','vLat','vLong'])
    .reset_index(drop=True)
)

In [1359]:
# Reload the persisted food venues and drop repeats of the same venue at the
# same coordinates.
foods=(
    pd.read_csv("Data/Cleaned/foodData.csv",index_col=0)
    .drop_duplicates(['venue','vLat','vLong'])
    .reset_index(drop=True)
)

In [1105]:
# Reload the persisted landmark venues and drop repeats of the same venue at
# the same coordinates.
landmarks=(
    pd.read_csv("Data/Cleaned/LandmarkData.csv",index_col=0)
    .drop_duplicates(['venue','vLat','vLong'])
    .reset_index(drop=True)
)

## Aggregation of Categories

### Landmarks Data Aggregation

In [1360]:
def aggregate(x,depth=1):
    """Replace a category that lies deeper than ``depth`` by its direct parent.

    Categories at ``depth`` or above are returned unchanged.
    NOTE(review): only one level is climbed per call (the target passed to
    getParent is the category's own depth minus one), so a category more than
    one level below ``depth`` does not end up at ``depth`` — confirm this is
    intended.
    """
    if(getDepth(x)<=depth):
        return x
    return getParent(x,getDepth(x)-1)

In [1224]:
# NOTE(review): the roll-up below is commented out, yet the displayed output
# contains rolled-up names such as "Museum" — presumably it was applied in the
# run that produced the persisted data; confirm before re-running from scratch.
# landmarks['category']=landmarks['category'].apply(lambda x:aggregate(x,1))
landmarks.replace("Government Building","Monument / Landmark",inplace=True)
landmarks.drop(landmarks[landmarks['category'].isin(['Art Gallery','Bridge','Public Art','Park'])].index,inplace=True)
# Assign the replaced column back instead of calling replace(inplace=True) on
# landmarks['category']: mutating a selected column in place is chained
# assignment, which warns on pandas 2.x and does nothing under Copy-on-Write.
landmarks['category']=landmarks['category'].replace(['Historic Site','Memorial Site'],"Monument / Landmark")
landmarks['category']=landmarks['category'].replace(['Music Venue','Concert Hall','Event Space'],"Music Venue")
landmarks.reset_index(inplace=True,drop=True)
print(landmarks.shape)
landmarks.head()

(1268, 7)


Unnamed: 0,code,venue,category,name,vLat,vLong,geometry
0,BK09,Pier 4 Beach,Beach,Brooklyn Heights-Cobble Hill,40.696595,-73.999084,POINT (-73.99908363819122 40.69659539412925)
1,BK09,Brooklyn War Memorial,Monument / Landmark,Brooklyn Heights-Cobble Hill,40.698657,-73.990776,POINT (-73.99077645034042 40.69865700433354)
2,BK09,Brooklyn Historical Society,Museum,Brooklyn Heights-Cobble Hill,40.694942,-73.992333,POINT (-73.99233277851241 40.69494199303208)
3,BK09,The Heights Players Theatre,Performing Arts Venue,Brooklyn Heights-Cobble Hill,40.693005,-73.998293,POINT (-73.99829256821472 40.69300478161309)
4,BK09,Truman Capote House,Monument / Landmark,Brooklyn Heights-Cobble Hill,40.698906,-73.99495,POINT (-73.99495 40.698906)


### Aggregate and Process Shops Data

In [1138]:
# shops['category']=shops['category'].apply(aggregate) #long processing time
# Keep reasonably common categories, merge related ones by hand, drop the ones
# that are not retail, then keep only the large merged groups.
shops=shops.groupby('category').filter(lambda x: len(x) >= 148)
# Every replacement is assigned back to the column: replace(inplace=True) on
# shops['category'] is chained assignment, which warns on pandas 2.x and does
# nothing under Copy-on-Write.
shops['category']=shops['category'].replace("Athletics & Sports","Gym / Fitness Center")
shops.drop(shops[shops['category'].isin(['Business Service','Lawyer','Home Service','Construction & Landscaping'])].index,inplace=True)
# NOTE(review): the target here is "Health & Beauty Shop" while the displayed
# output only contains "Health & Beauty Service" — confirm the intended name.
shops['category']=shops['category'].replace(['Cosmetics Shop','Jewelry Store'],"Health & Beauty Shop")
shops['category']=shops['category'].replace(['Convenience Store','Discount Store','Department Store'],"General Store")
shops['category']=shops['category'].replace(['Mobile Phone Shop'],"Electronics Store")
shops['category']=shops['category'].replace(['Flower Shop'],"Gift Shop")
shops.drop(shops[shops['category'].isin(['Business Service','Furniture / Home Store','Pet Store'])].index,inplace=True)
shops=shops.groupby('category').filter(lambda x: len(x) >= 300)
shops.reset_index(inplace=True,drop=True)
print(shops.shape)
shops.head()

(11065, 7)


Unnamed: 0,code,venue,category,name,vLat,vLong,geometry
0,BK09,The Heights Salon,Health & Beauty Service,Brooklyn Heights-Cobble Hill,40.694667,-73.993931,POINT (40.69466746466404 -73.9939305705351)
1,BK09,Waterfront Wines & Spirits,Food & Drink Shop,Brooklyn Heights-Cobble Hill,40.69389,-73.999239,POINT (40.69388997935272 -73.99923862706902)
2,BK09,European Wax Center,Health & Beauty Service,Brooklyn Heights-Cobble Hill,40.694699,-73.994151,POINT (40.69469946447693 -73.99415068928433)
3,BK09,Michael-Towne Wines And Spirits,Food & Drink Shop,Brooklyn Heights-Cobble Hill,40.697625,-73.992883,POINT (40.69762480919437 -73.99288255899214)
4,BK09,Borough Hall Greenmarket,Food & Drink Shop,Brooklyn Heights-Cobble Hill,40.693707,-73.990321,POINT (40.69370658297092 -73.99032073998625)


### Aggregate and Process Food Data
Note that the aggregation is done manually, as accurately as possible, in order to ease the further analysis.

In [1139]:
# foods['category']=foods['category'].apply(lambda x:aggregate(x,1)) #long processing time
# Keep categories with at least 150 venues and merge related cuisines by hand.
foods=foods.groupby('category').filter(lambda x: len(x) >= 150)
# Every replacement is assigned back to the column: replace(inplace=True) on
# foods['category'] is chained assignment, which warns on pandas 2.x and does
# nothing under Copy-on-Write.
foods['category']=foods['category'].replace(['Bakery','Donut Shop','Bagel Shop','Café'],"Snacks / Coffee")
foods['category']=foods['category'].replace(['Sandwich Place','Burger Joint','Food Truck','Fried Chicken Joint',"Fast Food Restaurant"],"Fast Foods")
foods['category']=foods['category'].replace(['Mexican Restaurant','Spanish Restaurant','Caribbean Restaurant'],"Latin American Restaurant")
foods['category']=foods['category'].replace(['Indian Restaurant'],"Asian Restaurant")
foods['category']=foods['category'].replace(['Restaurant','Seafood Restaurant','Diner'],"American Restaurant")
foods.reset_index(drop=True,inplace=True)
print(foods.shape)
foods.head()

(9682, 7)


Unnamed: 0,code,venue,category,name,vLat,vLong,geometry
0,BK09,Dellarocco's,Pizza Place,Brooklyn Heights-Cobble Hill,40.694992,-73.995924,POINT (40.69499209990746 -73.99592429396137)
1,BK09,Lassen & Hennigs,Deli / Bodega,Brooklyn Heights-Cobble Hill,40.69497,-73.994857,POINT (40.69497028956407 -73.99485733614806)
2,BK09,Sushi Gallery,Asian Restaurant,Brooklyn Heights-Cobble Hill,40.697595,-73.993236,POINT (40.69759512293347 -73.99323607647035)
3,BK09,Saketumi Asian Bistro,Asian Restaurant,Brooklyn Heights-Cobble Hill,40.69491,-73.994578,POINT (40.69490951339117 -73.99457785175119)
4,BK09,Iron Chef House,Asian Restaurant,Brooklyn Heights-Cobble Hill,40.697406,-73.99256,POINT (40.69740573876739 -73.99256031665584)


### Persist Aggregated Data

In [1225]:
# Turn each venue table into a GeoDataFrame of points. shapely's Point takes
# (x, y), i.e. (longitude, latitude).
gLandmarks=gp.GeoDataFrame(
    landmarks,
    geometry=[Point(lng,lat) for lat,lng in zip(landmarks.vLat,landmarks.vLong)],
)
gFoods=gp.GeoDataFrame(
    foods,
    geometry=[Point(lng,lat) for lat,lng in zip(foods.vLat,foods.vLong)],
)
gShops=gp.GeoDataFrame(
    shops,
    geometry=[Point(lng,lat) for lat,lng in zip(shops.vLat,shops.vLong)],
)
# Write each layer out as a shapefile.
gLandmarks.to_file("Data/Cleaned/Landmarks.shp")
gFoods.to_file("Data/Cleaned/Foods.shp")
gShops.to_file("Data/Cleaned/Shops.shp")

### Read Aggregated Data

In [1508]:
# Reload the three persisted venue layers.
gLandmarks=gp.read_file("Data/Cleaned/Landmarks.shp")
gFoods=gp.read_file("Data/Cleaned/Foods.shp")
gShops=gp.read_file("Data/Cleaned/Shops.shp")
# geoFile keeps pointing at the neighbourhood polygons even after `df` is
# rebound by the merges further down.
geoFile=df
# Tag every venue layer with the same CRS as the polygons.
for venueLayer in (gLandmarks,gFoods,gShops):
    venueLayer.crs={'init': 'epsg:4326'}

### Consolidate Point Data of Venues into Areas

In [1509]:
# Attach every landmark to the neighbourhood polygon that contains it, then
# drop the venue-side duplicates of code/name and restore the plain names.
cultural=(
    gp.sjoin(geoFile,gLandmarks,op='contains')
    .drop(['index_right','code_right','name_right'],axis=1)
    .rename({'code_left':'code','name_left':'name'},axis=1)
    .reset_index(drop=True)
)
cultural.head() #['category'].value_counts()

Unnamed: 0,code,name,lat,long,area,borough,population,medAge,popDensity,dayPop,...,low,middle,high,veryHigh,AADT,geometry,venue,category,vLat,vLong
0,BK09,Brooklyn Heights-Cobble Hill,40.695469,-73.994871,1.615327,Brooklyn,24212,37.1,14988,21307.0,...,1201,2008,3355,3272,11051.190476,"POLYGON ((-73.99236367043254 40.689690123777, ...",American Numismatic Society,Museum,40.689738,-73.992395
1,BK09,Brooklyn Heights-Cobble Hill,40.695469,-73.994871,1.615327,Brooklyn,24212,37.1,14988,21307.0,...,1201,2008,3355,3272,11051.190476,"POLYGON ((-73.99236367043254 40.689690123777, ...",Theater 219 (VPR),Performing Arts Venue,40.690359,-73.992481
2,BK09,Brooklyn Heights-Cobble Hill,40.695469,-73.994871,1.615327,Brooklyn,24212,37.1,14988,21307.0,...,1201,2008,3355,3272,11051.190476,"POLYGON ((-73.99236367043254 40.689690123777, ...",New York Transit Museum,Museum,40.690469,-73.989963
3,BK09,Brooklyn Heights-Cobble Hill,40.695469,-73.994871,1.615327,Brooklyn,24212,37.1,14988,21307.0,...,1201,2008,3355,3272,11051.190476,"POLYGON ((-73.99236367043254 40.689690123777, ...",ISSUE Project Room,Music Venue,40.69074,-73.989614
4,BK09,Brooklyn Heights-Cobble Hill,40.695469,-73.994871,1.615327,Brooklyn,24212,37.1,14988,21307.0,...,1201,2008,3355,3272,11051.190476,"POLYGON ((-73.99236367043254 40.689690123777, ...",Actors And Directors Workshop,Performing Arts Venue,40.691551,-73.990219


### Cultural Factor of Neighborhoods

This is a completely arbitrary factor that is meant to indicate the local attractions and cultural venues in an area. It is not a direct measure, but it provides a comparative index for comparing locations. A different multiplication factor is applied to the count of each venue category, and the factors are chosen according to the apparent importance of the category. The result is useful only as a comparative indicator, which is sufficient for our purpose.

In [1510]:
def cultFactor(x):
    """Weighted count of cultural venue categories for one neighbourhood.

    Parameters
    ----------
    x : Series
        Category names of the venues inside one neighbourhood.

    Returns
    -------
    number
        0.4 per 'Monument / Landmark' and 'Museum', 0.25 per 'Beach' and
        'Music Venue', 0.1 per 'Performing Arts Venue'. Categories that do not
        occur in ``x`` count as zero; other categories are ignored.
    """
    weights={
        'Monument / Landmark':0.4,
        'Museum':0.4,
        'Beach':0.25,
        'Music Venue':0.25,
        'Performing Arts Venue':0.1,
    }
    counts=x.value_counts()
    # .get(..., 0) replaces the alignment against the global `cultural` table,
    # so the function no longer depends on notebook state and no longer raises
    # KeyError when a weighted category is missing from the data altogether.
    return sum(counts.get(category,0)*weight for category,weight in weights.items())
# One cultural factor per neighbourhood code.
cultFactors=(
    cultural.groupby('code')['category']
    .agg(cultFactor)
    .reset_index()
    .rename({'category':'cultFactor'},axis=1)
)
cultFactors.tail()

Unnamed: 0,code,cultFactor
158,SI35,1.3
159,SI36,1.9
160,SI37,2.0
161,SI45,2.35
162,SI54,1.15


In [1511]:
# Left merge keeps neighbourhoods without any landmark; they get a factor of 0.
tm=df.merge(cultFactors,how='left')
# Assign the filled column back: fillna(inplace=True) on tm['cultFactor'] is
# chained assignment, which warns on pandas 2.x and does nothing under
# Copy-on-Write.
tm['cultFactor']=tm['cultFactor'].fillna(0)
df=tm

### Consolidate Restaurant Venues into Number of Restaurants in Area by category

In [1512]:
# Attach every food venue to the neighbourhood polygon that contains it, then
# drop the venue-side duplicates of code/name and restore the plain names.
foodDF=gp.sjoin(geoFile,gFoods,op='contains').drop(['index_right','code_right','name_right'],axis=1)
foodDF.rename({'code_left':'code','name_left':'name'},axis=1,inplace=True)
foodDF.reset_index(drop=True,inplace=True)
# Assign the replaced column back: replace(inplace=True) on foodDF['category']
# is chained assignment, which warns on pandas 2.x and does nothing under
# Copy-on-Write.
foodDF['category']=foodDF['category'].replace('Diner','American Restaurant')
foodDF.head()

Unnamed: 0,code,name,lat,long,area,borough,population,medAge,popDensity,dayPop,...,low,middle,high,veryHigh,AADT,geometry,venue,category,vLat,vLong
0,BK09,Brooklyn Heights-Cobble Hill,40.695469,-73.994871,1.615327,Brooklyn,24212,37.1,14988,21307.0,...,1201,2008,3355,3272,11051.190476,"POLYGON ((-73.99236367043254 40.689690123777, ...",Saketumi Asian Bistro,Asian Restaurant,40.69491,-73.994578
1,BK09,Brooklyn Heights-Cobble Hill,40.695469,-73.994871,1.615327,Brooklyn,24212,37.1,14988,21307.0,...,1201,2008,3355,3272,11051.190476,"POLYGON ((-73.99236367043254 40.689690123777, ...",Aketumi,Asian Restaurant,40.694914,-73.994673
2,BK09,Brooklyn Heights-Cobble Hill,40.695469,-73.994871,1.615327,Brooklyn,24212,37.1,14988,21307.0,...,1201,2008,3355,3272,11051.190476,"POLYGON ((-73.99236367043254 40.689690123777, ...",Lassen & Hennigs,Deli / Bodega,40.69497,-73.994857
3,BK09,Brooklyn Heights-Cobble Hill,40.695469,-73.994871,1.615327,Brooklyn,24212,37.1,14988,21307.0,...,1201,2008,3355,3272,11051.190476,"POLYGON ((-73.99236367043254 40.689690123777, ...",Dellarocco's,Pizza Place,40.694992,-73.995924
4,BK09,Brooklyn Heights-Cobble Hill,40.695469,-73.994871,1.615327,Brooklyn,24212,37.1,14988,21307.0,...,1201,2008,3355,3272,11051.190476,"POLYGON ((-73.99236367043254 40.689690123777, ...",Montague Street Bagels,Snacks / Coffee,40.695007,-73.995076


In [1513]:
# Count venues per (neighbourhood, category) ...
tmp=(
    foodDF.groupby('code')['category']
    .value_counts()
    .to_frame()
    .rename({'category':'count'},axis=1)
    .reset_index()
)
# ... and spread the categories into columns, with 0 where a neighbourhood has
# no venue of that category.
piv=tmp.pivot(index='code',columns='category',values='count')
piv=piv.fillna(0).reset_index()
piv.index.name=None
piv.columns.name=None
piv.head()

Unnamed: 0,code,American Restaurant,Asian Restaurant,Deli / Bodega,Fast Foods,Italian Restaurant,Latin American Restaurant,Pizza Place,Snacks / Coffee
0,BK09,5.0,14.0,8.0,9.0,4.0,3.0,5.0,12.0
1,BK17,16.0,12.0,4.0,13.0,6.0,1.0,10.0,13.0
2,BK19,4.0,7.0,4.0,5.0,0.0,1.0,2.0,7.0
3,BK21,8.0,2.0,7.0,10.0,1.0,8.0,9.0,6.0
4,BK23,4.0,2.0,4.0,6.0,0.0,0.0,4.0,2.0


In [1514]:
# Add the per-category restaurant counts to the neighbourhood table.
# NOTE(review): merge() without `how`/`on` is an inner join on the shared
# columns, so neighbourhoods that have no row in `piv` are dropped here (the
# cultural factor above used a left merge instead) — confirm this is intended.
df=df.merge(piv)

### Consolidate Shops into presence of shops in Area by category

In [1515]:
# Attach every shop to the neighbourhood polygon that contains it, then drop
# the venue-side duplicates of code/name and restore the plain names.
shopDF=(
    gp.sjoin(geoFile,gShops,op='contains')
    .drop(['index_right','code_right','name_right'],axis=1)
    .rename({'code_left':'code','name_left':'name'},axis=1)
    .reset_index(drop=True)
)
shopDF.head()

Unnamed: 0,code,name,lat,long,area,borough,population,medAge,popDensity,dayPop,...,low,middle,high,veryHigh,AADT,geometry,venue,category,vLat,vLong
0,BK09,Brooklyn Heights-Cobble Hill,40.695469,-73.994871,1.615327,Brooklyn,24212,37.1,14988,21307.0,...,1201,2008,3355,3272,11051.190476,"POLYGON ((-73.99236367043254 40.689690123777, ...",Michael-Towne Wines And Spirits,Food & Drink Shop,40.697625,-73.992883
1,BK09,Brooklyn Heights-Cobble Hill,40.695469,-73.994871,1.615327,Brooklyn,24212,37.1,14988,21307.0,...,1201,2008,3355,3272,11051.190476,"POLYGON ((-73.99236367043254 40.689690123777, ...",Careland Pharmacy,Pharmacy,40.697779,-73.992403
2,BK09,Brooklyn Heights-Cobble Hill,40.695469,-73.994871,1.615327,Brooklyn,24212,37.1,14988,21307.0,...,1201,2008,3355,3272,11051.190476,"POLYGON ((-73.99236367043254 40.689690123777, ...",Eastern Athletic Club,Gym / Fitness Center,40.698337,-73.993948
3,BK09,Brooklyn Heights-Cobble Hill,40.695469,-73.994871,1.615327,Brooklyn,24212,37.1,14988,21307.0,...,1201,2008,3355,3272,11051.190476,"POLYGON ((-73.99236367043254 40.689690123777, ...",Heights Nails,Health & Beauty Service,40.699238,-73.99211
4,BK09,Brooklyn Heights-Cobble Hill,40.695469,-73.994871,1.615327,Brooklyn,24212,37.1,14988,21307.0,...,1201,2008,3355,3272,11051.190476,"POLYGON ((-73.99236367043254 40.689690123777, ...",YogaWorks Brooklyn,Gym / Fitness Center,40.69221,-73.990985


In [1516]:
# Count shops per (neighbourhood, category) ...
tmp=(
    shopDF.groupby('code')['category']
    .value_counts()
    .to_frame()
    .rename({'category':'count'},axis=1)
    .reset_index()
)
# ... and spread the categories into columns, with 0 where a neighbourhood has
# no shop of that category.
piv=tmp.pivot(index='code',columns='category',values='count')
piv=piv.fillna(0).reset_index()
piv.index.name=None
piv.columns.name=None
piv.head()

Unnamed: 0,code,Clothing Store,Electronics Store,Food & Drink Shop,General Store,Gift Shop,Gym / Fitness Center,Health & Beauty Service,Pharmacy
0,BK09,7.0,6.0,9.0,0.0,2.0,22.0,12.0,7.0
1,BK17,5.0,9.0,20.0,8.0,1.0,24.0,3.0,10.0
2,BK19,2.0,10.0,22.0,3.0,2.0,7.0,9.0,10.0
3,BK21,7.0,6.0,12.0,3.0,2.0,4.0,5.0,7.0
4,BK23,0.0,1.0,4.0,2.0,4.0,6.0,1.0,4.0


### Merge all data into a single Dataframe

In [1522]:
# Add the per-category shop counts to the neighbourhood table.
# NOTE(review): as with the restaurant counts, this default merge is an inner
# join, so neighbourhoods without a row in `piv` are dropped — confirm.
df=df.merge(piv)
df.head()

Unnamed: 0,code,name,lat,long,area,borough,population,medAge,popDensity,dayPop,...,Pizza Place,Snacks / Coffee,Clothing Store,Electronics Store,Food & Drink Shop,General Store,Gift Shop,Gym / Fitness Center,Health & Beauty Service,Pharmacy
0,BK09,Brooklyn Heights-Cobble Hill,40.695469,-73.994871,1.615327,Brooklyn,24212,37.1,14988,21307.0,...,5.0,12.0,7.0,6.0,9.0,0.0,2.0,22.0,12.0,7.0
1,BK17,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,40.5883,-73.941511,10.214922,Brooklyn,67681,43.9,6625,59559.0,...,10.0,13.0,5.0,9.0,20.0,8.0,1.0,24.0,3.0,10.0
2,BK19,Brighton Beach,40.580922,-73.961217,2.770061,Brooklyn,35811,44.3,12927,31514.0,...,2.0,7.0,2.0,10.0,22.0,3.0,2.0,7.0,9.0,10.0
3,BK21,Seagate-Coney Island,40.57648,-73.991231,6.242851,Brooklyn,31132,39.0,4986,27396.0,...,9.0,6.0,7.0,6.0,12.0,3.0,2.0,4.0,5.0,7.0
4,BK23,West Brighton,40.579088,-73.973391,1.409979,Brooklyn,16436,58.0,11656,14464.0,...,4.0,2.0,0.0,1.0,4.0,2.0,4.0,6.0,1.0,4.0


### Persist Data locally

In [1519]:
# Persist the final table twice: as a shapefile and as a CSV.
# NOTE(review): shapefile attribute names are limited to 10 characters, so the
# long category column names are presumably truncated in the .shp copy —
# confirm before reading it back by column name.
df.to_file("Data/finalData.shp")
df.to_csv("Data/finalData.csv")

The Data Analysis is continued in the [File](https://github.com/gokulmuthiah/Coursera_Capstone/blob/master/Capstone-Data-Analysis.ipynb)