# Foursquare POIs pre-processing


This notebook creates the mapping between Foursquare POIs and the grid for each city. It also counts the number of POIs in each cell.

### Setup

In [16]:
%matplotlib inline

import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point, Polygon
import os
import re
import zipfile,fnmatch
import matplotlib.pyplot as plt
import pathlib
import sys

In [17]:
# Put the local "GeoL/" folder (resolved against the current working directory)
# on the import path so that the `geol` package can be imported below.
sys.path.append(os.path.abspath("GeoL/"))
from geol.utils import utils

In [18]:
# Cities processed by this notebook; CITY_NAME is the one used to build
# the single-city path constants.
CITIES = [
    "barcelona",
    "london",
    "paris",
    "milan",
    "amsterdam",
    "lisbon",
]
CITY_NAME = CITIES[0]

In [13]:
# Grid cell sizes for which a tessellation is generated
tessellation_sizes = [50, 100, 200, 250]
# Cell size used to build the single-city path constants
SIZE = 200

In [19]:
# Project root: the current working directory
BASE_DIR = os.path.abspath(".")
# Root folder of all data files
BASE_DIR_DATA = os.path.join(BASE_DIR, "data")
# Data folder of the selected city
BASE_DIR_CITY = os.path.join(BASE_DIR_DATA, CITY_NAME)
# Folder holding the city's tessellations/grids
BASE_DIR_TESSELLATION = os.path.join(BASE_DIR_CITY, "tessellation")
# Empty tessellation/grid for the selected cell size
TESSELLATION = os.path.join(
    BASE_DIR_TESSELLATION,
    "tessellation_" + CITY_NAME + "-Square-" + str(SIZE) + ".geojson",
)
# Urban Atlas land-use folder
BASE_DIR_LANDUSE = os.path.join(BASE_DIR_CITY, 'landuse')
# Raw Foursquare data folder
BASE_DIR_FOURSQUARE = os.path.join(BASE_DIR_CITY, 'foursquare_raw')
# City shape from the Nominatim API
OUTPUT_CITY_SHAPE = os.path.join(BASE_DIR_CITY, 'osm_raw', CITY_NAME + ".geojson")
# POIs inside the city polygon (output of the clipping operation)
OUTPUT_POIS = os.path.join(BASE_DIR_CITY, CITY_NAME + "_POIs.csv")
# Foursquare raw data (formerly known as FOURSQUARE_GRID)
FOURSQUARE_RAW_DATA = os.path.join(BASE_DIR_FOURSQUARE, CITY_NAME + "_poi.csv")
# Foursquare POIs mapped to the grid/tessellation
FOURSQUARE_TESSELLATION = os.path.join(
    BASE_DIR_CITY,
    'mapped',
    CITY_NAME + "_fs_tessellation_" + str(SIZE) + ".csv",
)
# Number of Foursquare features in each cell of the tessellation
FOURSQUARE_COUNT = os.path.join(
    BASE_DIR_CITY,
    'count',
    CITY_NAME + "_fs_count_" + str(SIZE) + ".csv",
)
# Number of Urban Atlas features in each cell of the tessellation
UA_COUNT = os.path.join(
    BASE_DIR_CITY,
    'count',
    CITY_NAME + "_ua_count_" + str(SIZE) + ".csv",
)


In [20]:
def readShapefile(d):
    """Locate a land-use shapefile below directory ``d``.

    The search covers ``<d>/<sub-directory>/Shapefiles/*.shp`` and skips files
    whose name starts with ``Boundary``.

    Parameters
    ----------
    d : str or os.PathLike
        Directory to search.

    Returns
    -------
    str or None
        Absolute path of the first matching ``.shp`` file. Entries are visited
        in sorted order so the choice is reproducible. ``None`` is returned
        when ``d`` is not an existing directory or holds no matching file.
    """
    root = pathlib.Path(d)
    # This function is called from the configuration cell, before the data
    # folders are created; a missing folder must not abort the notebook.
    if not root.is_dir():
        return None
    for sub_dir in sorted(root.iterdir()):
        if not sub_dir.is_dir():
            continue
        for shp_dir in sorted(sub_dir.iterdir()):
            if not (shp_dir.is_dir() and shp_dir.name == "Shapefiles"):
                continue
            for candidate in sorted(shp_dir.iterdir()):
                if candidate.name.endswith(".shp") and not candidate.name.startswith('Boundary'):
                    return str(candidate.absolute())
    return None


# Urban Atlas land-use shapefile of the selected city, searched under BASE_DIR_LANDUSE
# (readShapefile returns None when it finds no matching .shp file)
UA_LANDUSE = readShapefile(BASE_DIR_LANDUSE)

### Prepare directories

In [6]:
# Sub-folders created inside every city's data folder
directoriesToBuild = [
    'landuse', 'clipped', 'count', 'foursquare_raw', 'mapped',
    'osm_raw', 'tessellation', 'test', 'train',
]


def makeDirStruct(city):
    """Create the data sub-folders of ``city`` and unpack its zip archives.

    Each name in ``directoriesToBuild`` becomes ``BASE_DIR_DATA/<city>/<name>``
    (folders that already exist are left alone). Afterwards
    ``unzipCityShapeFiles(city)`` is called to extract the raw archives.
    """
    city_root = os.path.join(BASE_DIR_DATA, city)
    for sub_folder in directoriesToBuild:
        os.makedirs(os.path.join(city_root, sub_folder), exist_ok=True)
    unzipCityShapeFiles(city)

In [5]:
# unzip the city's archives into the appropriate city folder
def unzipCityShapeFiles(city, base_dir=None):
    """Extract every ``<city>_*.zip`` archive of the data folder into ``<city>/osm_raw``.

    Parameters
    ----------
    city : str
        City name; an archive belongs to the city when the part of its file
        name before the first ``_`` equals ``city``.
    base_dir : str, optional
        Folder that holds the archives and the per-city folders. Defaults to
        the notebook-level ``BASE_DIR_DATA``.

    Notes
    -----
    Only regular files ending in ``.zip`` are opened; a name that merely
    contains "zip" (e.g. ``barcelona_zipcodes.csv``) is ignored.

    NOTE(review): archives are extracted into ``osm_raw`` while
    ``readShapefile`` searches the ``landuse`` folder -- confirm that this is
    the intended destination.
    """
    if base_dir is None:
        base_dir = BASE_DIR_DATA
    target_dir = os.path.join(base_dir, city, 'osm_raw')
    for fileName in sorted(os.listdir(base_dir)):
        archive_path = os.path.join(base_dir, fileName)
        belongs_to_city = fileName.split("_")[0] == city
        is_zip_file = fileName.lower().endswith('.zip') and os.path.isfile(archive_path)
        if not (belongs_to_city and is_zip_file):
            continue
        # the context manager closes the archive even if extraction fails
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall(target_dir)


In [40]:
# create directories and unzip files
# (a lower-case loop variable keeps the CITY_NAME setting from being overwritten)
for city in CITIES:
    makeDirStruct(city)
            

### Get City Shapes

In [11]:
# Utility function:
# recast the columns of boolean type over to integer
# so Fiona can save the GeoDataFrame...
def gdf_bool_to_int(gdf):
    """Return a copy of ``gdf`` in which every ``bool`` column is recast as ``int``.

    Column dtypes are inspected directly instead of going through the
    non-public ``geopandas.io.file.infer_schema`` helper, so the function
    also accepts a plain ``DataFrame``.

    Parameters
    ----------
    gdf : GeoDataFrame or DataFrame
        Input frame; it is not modified.

    Returns
    -------
    GeoDataFrame or DataFrame
        Copy of the input with its ``bool`` columns cast with ``astype('int')``.
    """
    df = gdf.copy()
    for colname, dtype in df.dtypes.items():
        # only plain (NumPy) bool columns compare equal to ``bool``
        if dtype == bool:
            df[colname] = df[colname].astype('int')
    return df

In [12]:
# Fetch the boundary of every city and store it as <city>/osm_raw/<city>.geojson.
# Lower-case loop names keep CITY_NAME / BASE_DIR_CITY / OUTPUT_CITY_SHAPE intact.
for city in CITIES:
    city_dir = os.path.join(BASE_DIR_DATA, city)
    city_shape_path = os.path.join(city_dir, 'osm_raw', city + ".geojson")
    # NOTE(review): presumably queries the Nominatim API; the meaning of the
    # second argument (2) is defined by GeoL -- confirm.
    city_shape = utils.get_area_boundary(city, 2)
    # Drop the output of a previous run; only "file does not exist" is ignored,
    # so a real problem (e.g. missing permissions) is reported here.
    try:
        os.remove(city_shape_path)
    except FileNotFoundError:
        pass
    # recast the columns of boolean type over to integer before saving
    gdf_bool_to_int(city_shape).to_file(city_shape_path, driver="GeoJSON", encoding='utf-8')

### Create tessellations

In [3]:
for CITY_NAME in CITIES:
    BASE_DIR_CITY =  os.path.join(BASE_DIR_DATA, CITY_NAME)    
    OUTPUT_CITY_SHAPE = os.path.join(BASE_DIR_CITY,'osm_raw', CITY_NAME + ".geojson")    
    BASE_DIR_TESSELLATION = os.path.join(BASE_DIR_CITY, "tessellation")
    for SIZE in tessellation_sizes:
        print (CITY_NAME, OUTPUT_CITY_SHAPE, BASE_DIR_TESSELLATION, SIZE)
        %run GeoL/create_grid.py -a $CITY_NAME -b $OUTPUT_CITY_SHAPE -o $BASE_DIR_TESSELLATION  -v 2 -s $SIZE -m -t square

### Remove duplicate records

In [None]:
# REMOVE DUPLICATES AND OVERWRITE THE DATA
# Rows sharing name, latitude and longitude are duplicates; the first one is kept.
for city in CITIES:
    # raw Foursquare data of this city (read, then rewritten in place)
    raw_poi_path = os.path.join(BASE_DIR_DATA, city, 'foursquare_raw', city + "_poi.csv")
    pois_raw = pd.read_csv(raw_poi_path)
    pois_unique = pois_raw.drop_duplicates(subset=['name', 'latitude', 'longitude'])
    pois_unique.to_csv(raw_poi_path, index=False, sep=',')

### Map tessellation and POIs

In [22]:
# FOR MULTIPLE CITIES AT ONCE
for CITY_NAME in CITIES:
    BASE_DIR_CITY =  os.path.join(BASE_DIR_DATA, CITY_NAME)    
    OUTPUT_CITY_SHAPE = os.path.join(BASE_DIR_CITY,'osm_raw', CITY_NAME + ".geojson")    
    BASE_DIR_TESSELLATION = os.path.join(BASE_DIR_CITY, "tessellation")
    # input raw Foursquare data
    FOURSQUARE_RAW_DATA = os.path.join(BASE_DIR_CITY, 'foursquare_raw', CITY_NAME +"_poi.csv")
    for INPUT_TESSELLATION in os.listdir(BASE_DIR_TESSELLATION):
        inputDir = os.path.join(BASE_DIR_TESSELLATION,INPUT_TESSELLATION)
        outputSize = inputDir.split('-')[-1].split('.')[0]
        # output Foursquare data mapped to the grid
        FOURSQUARE_TESSELLATION = os.path.join(BASE_DIR_CITY, 'mapped', CITY_NAME +"_fs_tessellation_"+ outputSize +".csv")
        print(inputDir )
        print(FOURSQUARE_RAW_DATA)
        print(FOURSQUARE_TESSELLATION)
        print("***********************")
        %run GeoL/mapping.py -g $inputDir -d $FOURSQUARE_RAW_DATA -o $FOURSQUARE_TESSELLATION -n "latitude longitude"

### Count POIs

In [2]:
# FOR MULTIPLE CITIES AT ONCE
for CITY_NAME in CITIES:
    BASE_DIR_CITY =  os.path.join(BASE_DIR_DATA, CITY_NAME)    
    BASE_DIR_TESSELLATION = os.path.join(BASE_DIR_CITY, "tessellation")
    for INPUT_TESSELLATION in os.listdir(BASE_DIR_TESSELLATION):
        inputDir = os.path.join(BASE_DIR_TESSELLATION,INPUT_TESSELLATION)
        outputSize = inputDir.split('-')[-1].split('.')[0]
        # output Foursquare data mapped to the grid
        FOURSQUARE_TESSELLATION = os.path.join(BASE_DIR_CITY, 'mapped', CITY_NAME +"_fs_tessellation_"+ outputSize +".csv")
        FOURSQUARE_COUNT = os.path.join(BASE_DIR_CITY, 'count', CITY_NAME +"_fs_count_"+ outputSize +".csv")

        print(FOURSQUARE_TESSELLATION)
        print(FOURSQUARE_COUNT)
        print("***********************")
        %run -i GeoL/poi_number.py -m $FOURSQUARE_TESSELLATION -l 0 -o $FOURSQUARE_COUNT