<h1 align='center'> Foursquare Data Collection</h1>

### Import Necessary Modules

In [1]:
import pandas as pd
import numpy as np
import geopandas as gp
import folium
import requests
from geopy import distance
import re
from shapely.geometry import Point
import collections
import json as JSON
from IPython.display import clear_output
import seaborn as sns

### Read the Pre-processed Data:
Continued from: [Previous File - Data Collection](https://github.com/gokulmuthiah/Coursera_Capstone/blob/master/Capstone-Data-Collection.ipynb)

In [2]:
df=gp.read_file("Data/Cleaned/data.shp")
df.crs={'init': 'epsg:4326'}
df.head()

Unnamed: 0,code,name,lat,long,area,borough,population,medAge,popDensity,dayPop,...,medRent,medIncome,meanIncome,veryLow,low,middle,high,veryHigh,AADT,geometry
0,BK09,Brooklyn Heights-Cobble Hill,40.695469,-73.994871,1.615327,Brooklyn,24212,37.1,14988,21307.0,...,2278.0,125817.0,205275.0,1279,1201,2008,3355,3272,11051.190476,"POLYGON ((-73.99236367043254 40.689690123777, ..."
1,BK17,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,40.5883,-73.941511,10.214922,Brooklyn,67681,43.9,6625,59559.0,...,1180.0,57150.0,79613.0,6637,5298,6797,5785,1633,12013.8,POLYGON ((-73.91809256480843 40.58657033500475...
2,BK19,Brighton Beach,40.580922,-73.961217,2.770061,Brooklyn,35811,44.3,12927,31514.0,...,1194.0,36802.0,63703.0,5762,2697,3301,2006,791,12252.733333,POLYGON ((-73.96034953585246 40.58730628557137...
3,BK21,Seagate-Coney Island,40.57648,-73.991231,6.242851,Brooklyn,31132,39.0,4986,27396.0,...,676.0,27345.0,49358.0,5381,2285,2158,1115,297,15428.3,POLYGON ((-73.97459000582634 40.58313882075885...
4,BK23,West Brighton,40.579088,-73.973391,1.409979,Brooklyn,16436,58.0,11656,14464.0,...,905.0,40316.0,58752.0,3169,1790,2212,955,275,8256.6,"POLYGON ((-73.9688899587795 40.57526123899416,..."


## Preparation for Data Collection from Foursquare


#### Foursquare Credentials

In [12]:
CLIENT_ID="SQQXQW23MRDH3TM4FCYAGGBG4KI5TNOMNMJYZTPSX0QFMRW4"
CLIENT_SECRET="RLT452NQ14X0VBLLKQOB4ZIDN3RQACAUN4B2Z1NRHOE21OOG"
VERSION="20190815"

### Get Categories of Data from Foursquare

The list of all possible categories that can be obtained from Foursquare is initally collected and processed for esay working later on in the project 

In [13]:
catUrl="https://api.foursquare.com/v2/venues/categories"
catParams=dict(client_id=CLIENT_ID,
               client_secret=CLIENT_SECRET,
               v=VERSION)
catJson = requests.get(catUrl,params=catParams).json()

In [7]:
def getSubCategories(cats,keywords,depth,parent="None"):
    subCategories=[]
    if(keywords in cats['name'].split() or keywords==""):
        subCategories.append((cats['id']+";"+cats['name']+";"+str(depth)+";"+parent))
    if(depth>0):
        if(cats['categories']!=[]):
            for i in cats['categories']:
                subCategories.append(getSubCategories(i,keywords,depth-1,cats['name']))
    if (subCategories!=[]):
        return subCategories
    
def getCategory(string):
    tmp=[]
    for i,x in categoryDf.iterrows():
        for words in string.split(','):
            if(bool(re.match('.*'+words+'.*',x['category']))):
                print(x['category'])
                tmp.append(x['id'])
    return tmp

def getParent(category,parentDepth):
    if (categoryDf.loc[categoryDf['category']==category,'depth'].values[0]<=parentDepth):
        return category
    else:
        return getParent(categoryDf.loc[categoryDf['category']==category,'parent'].values[0],parentDepth)
    
def getDepth(category):
        return categoryDf.loc[categoryDf['category']==category,'depth'].values[0]

##### Functions to flatten category hierarchy and make them into a dataframe with parent and depth characteristics

In [8]:
def flatten(l):
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, (str, bytes)):
            yield from flatten(el)
        else:
            yield el

def fun(x):
    if isinstance(x, list):
        return x[0].split(';')
    else:
        return x.split(';')

#### Categories Dataframe

In [14]:
categoryList=[getSubCategories(i,'',4) for i in catJson['response']['categories']]

tmp=pd.DataFrame(list(flatten(categoryList)),columns=['main'])
tmp['main']=tmp['main'].apply(fun)
categoryDf=pd.DataFrame(columns=['id','category','depth','parent'])
for i,row in tmp.iterrows():
    categoryDf.loc[i,'id']=row[0][0]
    categoryDf.loc[i,'category']=row[0][1]
    categoryDf.loc[i,'depth']=row[0][2]
    categoryDf.loc[i,'parent']=row[0][3]
categoryDf['depth']=4-(categoryDf['depth'].astype(int))
categoryDf.head()

Unnamed: 0,id,category,depth,parent
0,4d4b7104d754a06370d81259,Arts & Entertainment,0,
1,56aa371be4b08b9a8d5734db,Amphitheater,1,Arts & Entertainment
2,4fceea171983d5d06c3e9823,Aquarium,1,Arts & Entertainment
3,4bf58dd8d48988d1e1931735,Arcade,1,Arts & Entertainment
4,4bf58dd8d48988d1e2931735,Art Gallery,1,Arts & Entertainment


## API Data Collection

### Function to make API calls to Foursquare
Different parameters based on queries

In [330]:
def makeAPICall(query,place=None,rad=500,section="",offset=0,price="",q=""):
    if (query=='explore'):
        url="https://api.foursquare.com/v2/venues/explore"
        params=dict(client_id=CLIENT_ID,
                    client_secret=CLIENT_SECRET,
                    v=VERSION,
                    ll=str(place['lat'])+','+str(place['long']),
                    radius=rad,
                    section=section,
                    limit=500,
                    offset=offset,
                    price=price,
                    query=q
                   )
        
    elif (query=='search'):
        url="https://api.foursquare.com/v2/venues/search"
        params=dict(client_id=CLIENT_ID,
                    client_secret=CLIENT_SECRET,
                    v=VERSION,
                    ll=str(place['lat'])+','+str(place['long']),
                    radius=rad,
                    intent="browse",
                    categoryId=q,
                    limit=500,
                   ) 
    else:
        url="https://api.foursquare.com/v2/venues/{}".format(query)
        params=dict(client_id=CLIENT_ID,
                    client_secret=CLIENT_SECRET,
                    v=VERSION,)
        
    result = requests.get(url,params=params)
    if result.status_code==200:
        return result.json()
    else:
        print("Failed")
        return {'meta':{'code':400}}

#### Radius of Neighborhood
The radius parameter to be used in the API Calls. The radius varies based upon the Area of the concerned Neighborhood

In [323]:
bbs=df.geometry.bounds
radii=[]
for i,pt in bbs.iterrows():
    radii.append(round((distance.geodesic((pt['minx'],pt['miny']),(pt['maxx'],pt['maxy']),ellipsoid='WGS-84').m/3)))

### Function to make API Calls and parse the results
The function makes API calls for every element in the initial dataframe and can make a variety of queries and can also iterate through various pages and price ranges. It also parses the response and returns a dictionary ready to be processed into a Dataframe.

In [333]:
def getData(df,section="",queryType="explore",query=""):
    Data=[]
    missedRows=[]
    items=[None]*100
    for index,place in df.iterrows():
        for price in range(1,5):
            offset=0
            while True:
                json=makeAPICall(queryType,place,radii[index],section,offset,price,query)
                if(json['meta']['code']==200 and queryType=='explore'):
                    items=json['response']['groups'][0]['items']
                    offset+=1
                    for item in items:
                        Data.append({
                            'code':place['code'],
                            'id':item['venue']['id'],
                            'name':item['venue']['name'],
                            'category':item['venue']['categories'][0]['name'],
                            'price':price,
                            'lat':item['venue']['location']["lat"],        
                            'long':item['venue']['location']["lng"],
                            'radii':radii[index],
                        })
                    if(len(items)<100):
                        break
                        
                elif(json['meta']['code']==200 and queryType=='search'):
                    items=json['response']['venues']
                    offset+=1
                    for item in items:
                        Data.append({
                            'code':place['code'],
                            'id':item['id'],
                            'name':item['name'],
                            'category':item['categories'][0]['name'],
                            'price':price,
                            'lat':item['location']["lat"],        
                            'long':item['location']["lng"],
                            'radii':radii[index],
                        })
                    if(len(items)<100):
                        break
                        
                else:
                    missedRows.append([place['code'],price,offset])
                    print('Failed Read',place['name'])
                    break  
        clear_output()
        print("Current Location:{}.{}\n Total Entries:{} ".format(index,place['name'],len(Data)))
    return Data,missedRows

### Make API Calls and get Data
We make API Calls to get varieties of Data:
<ul>
    <li>Restaurants</li>
    <li>Coffee Shops</li>
    <li>Drinks</li>
    <li>Landmarks and Areas of Interest</li>
</ul>
Landmarks are used to find attractive places in a neighborhood and analyze its efffect on the placement of Restaurants in the place

The obtained Data is persisted in the local system as JSON files for further Processing

In [334]:
# restaurants,missed=getData(df,'food')
# with open("Data.json",'w') as file:
#     JSON.dump(restaurants,file)
    
# coffee,missed=getData(df,'coffee')
# with open("coffee.json",'w') as file:
#     JSON.dump(coffee,file)

# drinks,missed=getData(df,'drinks')
# with open("drinks.json",'w') as file:
#     JSON.dump(drinks,file)
    
# arts,missed=getData(df,'arts')
# with open("arts.json",'w') as file:
#     JSON.dump(arts,file)

# siteCategories=getCategory("Site,Landmark")
# siteCategories=','.join(siteCategories)
# landmarks,missed=getData(df,queryType='search',query=siteCategories)
# with open("landmarks.json",'w') as file:
#     JSON.dump(landmarks,file)

Current Location:187.Great Kills
 Total Entries:3928 


#### A Function to aggregate the various categories of restaurants to their parent Categories automatically

In [9]:
def aggregate(series,depth=1):
    aggSeries=[]
    for i,category in series.iteritems():
        aggSeries.append(getParent(category,depth))
    series=[]
    for category in aggSeries:
        if category=="Chinese Restaurant" or category=="Japanese Restaurant":
            series.append(category)
        else:
            series.append(getParent(category,1))
    return pd.Series(series)

### Read locally persisted Data into Dataframes
The various Categories of Data available are seen in the Outputs of the following cells

In [15]:
with open("Data/Data.json",'r') as file:    
    restaurantJson=JSON.load(file)

restaurants=pd.DataFrame(restaurantJson)
restaurants.drop_duplicates(['lat','long','name'],inplace=True)
restaurants.reset_index(drop=True,inplace=True)
restaurants['category']=aggregate(restaurants['category'],2)
restaurants['category'].value_counts()[:10]

Pizza Place            1288
Deli / Bodega          1148
Chinese Restaurant      937
Asian Restaurant        705
Italian Restaurant      698
Mexican Restaurant      627
Bakery                  582
Japanese Restaurant     555
American Restaurant     500
Sandwich Place          496
Name: category, dtype: int64

In [16]:
with open("Data/coffee.json",'r') as file:    
    coffee=JSON.load(file)

coffee=pd.DataFrame(coffee)
coffee.drop_duplicates(['lat','long','name'],inplace=True)
coffee.reset_index(drop=True,inplace=True)
coffee['category']=aggregate(coffee['category'],1)
coffee['category'].value_counts()[:10]

Coffee Shop            1103
Café                    821
Donut Shop              533
Bakery                  131
Tea Room                100
Dessert Shop             49
Sandwich Place           40
American Restaurant      33
Bar                      31
Diner                    31
Name: category, dtype: int64

In [17]:
with open("Data/drinks.json",'r') as file:    
    drinks=JSON.load(file)

drinks=pd.DataFrame(drinks)
drinks.drop_duplicates(['lat','long','name'],inplace=True)
drinks.reset_index(drop=True,inplace=True)
drinks['category']=aggregate(drinks['category'],1)
drinks['category'].value_counts()[:10]

Bar                    2330
Lounge                  333
Nightclub               180
American Restaurant     129
Italian Restaurant       72
Asian Restaurant         69
Gastropub                60
Brewery                  53
Mexican Restaurant       45
Restaurant               36
Name: category, dtype: int64

In [341]:
with open("Data/arts.json",'r') as file:    
    arts=JSON.load(file)

arts=pd.DataFrame(arts)
arts.drop_duplicates(['lat','long','name'],inplace=True)
arts.reset_index(drop=True,inplace=True)
arts['category']=aggregate(arts['category'],1)
arts['category'].value_counts()[:10]

Performing Arts Venue    913
Art Gallery              664
Music Venue              369
Museum                   180
Movie Theater            161
Public Art                84
Exhibit                   79
Concert Hall              71
Comedy Club               33
Tour Provider             27
Name: category, dtype: int64

In [343]:
with open("Data/landmarks.json",'r') as file:    
    landmarks=JSON.load(file)

landmarks=pd.DataFrame(landmarks)
landmarks.drop_duplicates(['lat','long','name'],inplace=True)
landmarks.reset_index(drop=True,inplace=True)
landmarks['category']=aggregate(landmarks['category'])
landmarks['category'].replace("Government Building","Landmarks",inplace=True)
landmarks['category'].value_counts()[:3]

Historic Site    483
Landmarks        175
Memorial Site     27
Name: category, dtype: int64

## Process Target Data

### Manual Aggregation
The aggregation of various types of Restaurants into similar and related cuisines is done manually to reduce the dimensions.

The Data from the Restaurants,Coffee Shops and Drinks outlets are merged into a single Dataframe and duplicates are eliminated 

In [18]:
Data=restaurants.append(coffee).append(drinks)#.append(arts)
Data.drop_duplicates(['lat','long','name'])
Data.reset_index(drop=True,inplace=True)
Data['category'].value_counts()[Data['category'].value_counts()>100][:10]

Bar                   2361
Pizza Place           1317
Café                  1290
Deli / Bodega         1171
Coffee Shop           1119
Donut Shop             957
Chinese Restaurant     937
Italian Restaurant     794
Asian Restaurant       786
Bakery                 714
Name: category, dtype: int64

The list of aggregation of Categories

In [19]:
european=[
    "Eastern European Restaurant",
    "German Restaurant",
    "Australian Restaurant",
    "Russian Restaurant",
    "Austrian Restaurant",    
    "Creperie",
    "Scandinavian Restaurant",
    "English Restaurant",
    "Swiss Restaurant",
    "Polish Restaurant",
    "Irish Pub",
    "Ukrainian Restaurant",
    "Modern European Restaurant",
    "Czech Restaurant",
    "Greek Restaurant",
    "French Restaurant",
]

middleEastern=[
    "Falafel Restaurant",
    "Turkish Restaurant",
    "Afghan Restaurant",
    "African Restaurant",
    "Kebab Restaurant",
    "Middle Eastern Restaurant",
    "Mediterranean Restaurant",
]

latinAmerican=[
    "Caribbean Restaurant",
    "Portuguese Restaurant",
    "Spanish Restaurant",
    "Mexican Restaurant",
    "Hawaiian Restaurant"
]

fastFoods=[
    "Food Truck",
    "Sandwich Place",
    "Fried Chicken Joint",
    "Burger Joint",
    "BBQ Joint",
    "Wings Joint",
    "Hot Dog Joint",
    "Snack Place",
    "Mac & Cheese Joint",
]

diners=[
    "Breakfast Spot",
    "Salad Place",
    "Soup Place",
    "Restaurant",
    "Vegetarian / Vegan Restaurant",
    "Southern / Soul Food Restaurant",
    "Salad Place",
]

snacks=[
    "Bakery",
    "Donut Shop",
    "Bagel Shop",
    "Dessert Shop"
]

Lounge=[
    "Gastropub",
    "Lounge",
    "Nightclub",
]

In [20]:
aggCategories=[]
for i,row in Data.iterrows():    
    x=row['category']
    if(x in european):
        aggCategories.append("European Restaurant")
    elif(x in middleEastern):
        aggCategories.append("Middle Eastern Restaurant")
    elif(x in latinAmerican):
        aggCategories.append("Latin American Restaurant")
    elif(x in fastFoods):
        aggCategories.append("Fast Food Restaurant")
    elif(x in snacks):
        aggCategories.append("Bakery/Dessert")
    elif((x=="Diner"or x in diners) and (row['price']==3 or row['price']==4)):
        aggCategories.append("American Restaurant")
    elif(x in diners):
        aggCategories.append("Diner")   
    elif(x=="American Restaurant" and row['price']==1):
        aggCategories.append("Diner")
    elif(x=="Dumpling Restaurant"):
        aggCategories.append("Chinese Restaurant")
    elif(x=="Brewery"):
        aggCategories.append("Bar")
    elif(x in Lounge):
        aggCategories.append("Nightclub/Lounge")
    elif(x=="Café" or x=="Tea Room"):
        aggCategories.append("Coffee Shop")
    else:
        aggCategories.append(x)

Data['category']=pd.Series(aggCategories)

### Condense Data into Neighborhoods

The aggregated Data is filtered to have only instances of above 100 and the data is made into a GeoDataframe

In [21]:
Data=Data.groupby('category').filter(lambda x:len(x)>100)
gData=gp.GeoDataFrame(Data.iloc[:,0],geometry=[Point(y,x) for x, y in zip(Data.lat,Data.long)]).reset_index(drop=True)
gData.crs={'init': 'epsg:4326'}
gData.shape

(19722, 2)

##### The indiviual Restaurants are aggregated into neighborhoods  with their counts being pivoted into Columns

In [22]:
newDf=gp.sjoin(df,gData).drop('index_right',axis=1).reset_index(drop=True)

In [23]:
tmp=pd.DataFrame(newDf.groupby('code')['category'].value_counts())
tmp=tmp.rename({'category':'count'},axis=1).reset_index()
piv=tmp.pivot(index='code',columns='category',values='count').fillna(0).reset_index()
piv.index.name=None
piv.columns.name=None
piv.head()

Unnamed: 0,code,American Restaurant,Asian Restaurant,Bakery/Dessert,Bar,Chinese Restaurant,Coffee Shop,Deli / Bodega,Diner,European Restaurant,Fast Food Restaurant,Indian Restaurant,Italian Restaurant,Japanese Restaurant,Latin American Restaurant,Middle Eastern Restaurant,Nightclub/Lounge,Pizza Place,Seafood Restaurant,Steakhouse
0,BK09,2.0,4.0,12.0,7.0,3.0,12.0,8.0,5.0,1.0,9.0,1.0,5.0,6.0,4.0,2.0,2.0,6.0,0.0,1.0
1,BK17,5.0,1.0,16.0,13.0,3.0,15.0,7.0,10.0,4.0,17.0,0.0,7.0,9.0,1.0,8.0,3.0,10.0,6.0,0.0
2,BK19,1.0,2.0,10.0,3.0,1.0,12.0,2.0,4.0,6.0,5.0,2.0,0.0,4.0,1.0,2.0,2.0,2.0,1.0,0.0
3,BK21,4.0,2.0,6.0,6.0,1.0,5.0,6.0,5.0,0.0,16.0,0.0,1.0,0.0,10.0,1.0,3.0,10.0,2.0,0.0
4,BK23,1.0,0.0,2.0,3.0,1.0,2.0,4.0,5.0,0.0,6.0,0.0,1.0,1.0,0.0,1.0,1.0,4.0,0.0,0.0


##### The processed Data is Backed-Up

In [28]:
tmp=df.merge(piv)
tmp.to_file("Data/updatedData.shp")
with open("Data/Headers.json",'w') as file:
    JSON.dump(list(tmp.columns.values),file)

## Process Arts and landmarks Data 

In [356]:
Arts=arts.append(landmarks)
Arts.drop_duplicates(['lat','long','name'])
Arts=Arts.groupby('category').filter(lambda x:len(x)>10)
Arts.drop(Arts[Arts['category']=='Arts & Entertainment'].index,inplace=True)
Arts.drop(Arts[Arts['category']=='Tour Provider'].index,inplace=True)
Arts.replace("Comedy Club","Performing Arts Venue",inplace=True)
Arts.reset_index(drop=True,inplace=True)

The landmarks are seperated into major and minor landmarks based on their category and the footfall will be obtained only for major Landmarks because collecting footfall Data requires **Premium Foursquare Calls** and it is limited to 500 for a Free Account

In [635]:
smallLandmarks=Arts['category'].value_counts()[:4].index.values
Landmarks=Arts['category'].value_counts()[4:].index.values
smallLandmarks=Arts[Arts['category'].isin(smallLandmarks)].reset_index(drop=True)
Landmarks=Arts[Arts['category'].isin(Landmarks)].reset_index(drop=True)

### Collect major landmarks footfall

In [595]:
# with open("footfall.json",'r') as file:    
#     footfall=JSON.load(file)
# footfall=pd.DataFrame(footfall)
# ids=list(footfall['id'])
# newFootfall=[]
# missed=[]
# successful=0
# for i,row in Landmarks.iterrows():
#     if(row['id'] in ids):
#         continue
#     else:
#         json=makeAPICall(row['id'])
#         print(json['meta']['code'])
#         if (json['meta']['code']==200):
#             item=json['response']['venue']
#             if('rating' in item.keys()):
#                 newFootfall.append({
#                     'id':item['id'],
#                     'name':item['name'],
#                     'lat':item['location']["lat"],        
#                     'long':item['location']["lng"],
#                     'tips':item['stats']['tipCount'],
#                     'likes':item['likes']['count'],
#                     'rating':item['rating'],
#                     'photos':item['photos']['count']
#                     })
#             else:
#                 newFootfall.append({
#                     'id':item['id'],
#                     'name':item['name'],
#                     'lat':item['location']["lat"],        
#                     'long':item['location']["lng"],
#                     'tips':item['stats']['tipCount'],
#                     'likes':item['likes']['count'],
#                     'rating':np.NaN,
#                     'photos':item['photos']['count']
#                     })
#             successful+=1
#             clear_output()
#             print("Success: {}\nCount:{}".format(row['name'],successful))
            
#         elif(json['meta']['code']==400 or json['meta']['code']==429):
#             print("Daily Limit Reached")
#             break
            
#         else:
#             print("Failed Read")
#             missed.append([row['id'],row['name']])
#             print(row['id'],row['name'])

# with open("footfall.json",'a') as file:    
#     JSON.dump(newFootfall,file)

Success: Dan Marino's Geriatric Clubhouse
Count:446


Since Foursquare disabled viewing Total chekins at a place,the popularity of a place is analyzed using a **combination of the Likes, Photos and the Tips** a particular place has.

In [636]:
with open("footfall.json",'r') as file:    
    footfall=JSON.load(file)
footfall=pd.DataFrame(footfall)
footfall.drop_duplicates(['name'],inplace=True)
Landmarks.drop_duplicates(['name'],inplace=True)
footfall.sort_values(['tips','photos'],ascending=False)
# footfall.reset_index(drop=True,inplace=True)
Landmarks=Landmarks.merge(footfall,on='id',suffixes=('', '_y')).drop(['lat_y','long_y','name_y'],axis=1)
Landmarks['popularity']=Landmarks[['tips','photos','likes']].sum(axis=1)

Used the popularity Value and the actual number of visitors to place and constructed a regression line externally. Every Place is considered to have atleast 15000-30000 visitors annualy and every small landmark is assumed to have a popularity of 0. Though the method used is arbitrary, it can give a fair indicator of the landmarks and their popularity in an area.

Every 30,000 visitors to a place is assumed to increase the popularity factor of a place by 1.

The regression line used is: **y=167.48*x+30000**

In [686]:
Landmarks['culturalFactor']=Landmarks['popularity'].apply(lambda x:(167.5*x+30000)/30000)
Landmarks.sort_values('culturalFactor',ascending=False)
culturalFactors=Landmarks.groupby('code').sum()[['culturalFactor']].reset_index()
culturalFactors.head()

Unnamed: 0,code,culturalFactor
0,BK09,7.875417
1,BK17,9.10875
2,BK21,19.700583
3,BK25,1.011167
4,BK26,1.0335


The presence of Landmarks and other cultural loactions is condesnsed into a single factor referred here as **cultural factor**. 
Though it is a completely arbitrary factor, it gives a comparative index for comparing locations.It will be good only as a comparative indicator which is optimal for our purpose 

In [725]:
tmp=pd.DataFrame(smallLandmarks.groupby('code')['category'].value_counts())
tmp=tmp.rename({'category':'count'},axis=1).reset_index()
piv=tmp.pivot(index='code',columns='category',values='count').fillna(0).reset_index()
piv.index.name=None
piv.columns.name=None
piv['factor']=piv['Art Gallery']*1.25+piv['Historic Site']*1.5+piv['Music Venue']*1.25+piv['Performing Arts Venue']
piv=piv.merge(culturalFactors,on='code',how='left')
piv['culturalFactor'].replace(np.NaN,0.0,inplace=True)
piv['culturalFactor']=piv['culturalFactor']+piv['factor']
result=piv.loc[:,['code','culturalFactor']]
result.head()

Unnamed: 0,code,culturalFactor
0,BK09,24.125417
1,BK17,16.60875
2,BK19,3.0
3,BK21,38.950583
4,BK25,3.261167


The cultural Factor Data is merged with the orginal Data and it concludes the Data collection and Preparation Stage

In [4]:
DF=gp.read_file("Data/updatedData.shp")
# DF=DF.merge(result,on='code',how='left')
# DF['culturalFactor'].replace(np.NaN,0.0,inplace=True)
# DF.to_file("Data/preparedData.shp")