In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
% matplotlib inline

In [2]:
#cargo ficheros de ejecuciones anteriores
df = pd.read_csv(os.path.join('./data/Serie_Total2016.csv'))
coord = pd.read_csv(os.path.join('./data/Coord_EK.csv'))

In [3]:
#Eva: descarga de ficheros
import os
import urllib.request

def dl_data(url, output):
    try:        
        f = urllib.request.urlopen(url)
        print ("Downloading " + url)

        os.makedirs(os.path.dirname(output), exist_ok=True)
        with open(output, "wb") as local_file:
            local_file.write(f.read())

    except URLError:
        print ("Error", url)

## Puntos culturales

In [4]:
#descarga de lugares culturales de Seattle
url = "https://data.seattle.gov/api/views/vsxr-aydq/rows.csv?accessType=DOWNLOAD"
output1 = './data/Seattle_Cultural_Space_Inventory.csv'
dl_data(url,output1)

Downloading https://data.seattle.gov/api/views/vsxr-aydq/rows.csv?accessType=DOWNLOAD


In [5]:
# cargo puntos culturales de Seattle
dfpois = pd.read_csv(os.path.join('./data/Seattle_Cultural_Space_Inventory.csv'))
# me quedo con la localización
dfpois = dfpois[['Location']]
dfpois.head()

Unnamed: 0,Location
0,"(47.658562, -122.313115)"
1,"(47.601458, -122.330209)"
2,"(47.556461, -122.268508)"
3,"(47.551722, -122.278061)"
4,"(47.556793, -122.283927)"


In [6]:
# separo localización en coordenadas
dfpois = dfpois['Location'].str.strip('()')   \
                   .str.split(', ', expand=True)   \
                   .rename(columns={0:'latitude', 1:'longitude'}) 
dfpois.head()

Unnamed: 0,latitude,longitude
0,47.658562,-122.313115
1,47.601458,-122.330209
2,47.556461,-122.268508
3,47.551722,-122.278061
4,47.556793,-122.283927


In [7]:
# esta función es para calcular la distancia entre dis puntos pero no la EUCLIDEA sino teniendo en cuenta la curvatura de la
# tierra.
from math import radians, cos, sin, asin, sqrt, acos

def haversine(lon1, lat1, lon2, lat2):
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6373 # Radius of earth kilometers
    return c * r

In [8]:
# voy a crear una lista de element_key con algun punto cercano (<=300 metros)
# una vez que un element_key tiene un punto cercano ya no miro si tiene otros (break)
listad = []
for index2, row2 in coord.iterrows():
    ek = row2['element_key']
    lat2 = pd.to_numeric(row2['latitude'])
    lon2 = pd.to_numeric(row2['longitude'])
    for index, row in dfpois.iterrows():
        lat1 = pd.to_numeric(row['latitude'])
        lon1 = pd.to_numeric(row['longitude'])
        dist = haversine(lon1, lat1, lon2, lat2)
        if dist <= 0.3:
            listad.append([ek,1])
            break

In [9]:
# creo in dataframe temporal con la lista anterior
pdx = pd.DataFrame(listad, columns = ['element_key','cult_poi'])
pdx.element_key = pdx.element_key.astype(int)
pdx.cult_poi = pdx.cult_poi.astype(int)
pdx.head()


Unnamed: 0,element_key,cult_poi
0,1001,1
1,1002,1
2,1005,1
3,1006,1
4,1009,1


In [10]:
pdx.head()

Unnamed: 0,element_key,cult_poi
0,1001,1
1,1002,1
2,1005,1
3,1006,1
4,1009,1


In [11]:
# al final hago un merge con el dataset de transacciones
df_final = pd.merge(df, pdx, how='left', on='element_key')

In [12]:
# los que no tienen poi los marco a 0
df_final.fillna(0, inplace=True)
df_final.cult_poi = df_final.cult_poi.astype(int)
df_final.head()

Unnamed: 0,element_key,latitude,longitude,timestamp,occupation_perc,prcp,tmax,tmin,cult_poi
0,35693,47.619158,-122.346457,2016-01-02 00:00:00,28.57,0.0,42,25,1
1,53549,47.628175,-122.341132,2016-01-02 00:00:00,3.12,0.0,42,25,0
2,11881,47.619156,-122.333107,2016-01-02 02:00:00,10.0,0.0,42,25,1
3,9393,47.621441,-122.33597,2016-01-02 03:00:00,20.0,0.0,42,25,1
4,11133,47.619815,-122.348131,2016-01-02 04:00:00,20.0,0.0,42,25,1


In [13]:
df_final.head()

Unnamed: 0,element_key,latitude,longitude,timestamp,occupation_perc,prcp,tmax,tmin,cult_poi
0,35693,47.619158,-122.346457,2016-01-02 00:00:00,28.57,0.0,42,25,1
1,53549,47.628175,-122.341132,2016-01-02 00:00:00,3.12,0.0,42,25,0
2,11881,47.619156,-122.333107,2016-01-02 02:00:00,10.0,0.0,42,25,1
3,9393,47.621441,-122.33597,2016-01-02 03:00:00,20.0,0.0,42,25,1
4,11133,47.619815,-122.348131,2016-01-02 04:00:00,20.0,0.0,42,25,1


In [14]:
df_final.to_csv('./data/Serie_Total2016_cult_pois.csv')

## Baseball

In [17]:
# instalar este paquete
# conda install -c scitools/label/archive shapely
from shapely.geometry import Point
from shapely.geometry.multipolygon import MultiPolygon
from shapely import wkt
from shapely.wkt import loads

In [18]:
url = "https://data.seattle.gov/api/views/6v75-vrvs/rows.csv?accessType=DOWNLOAD"
output1 = './data/Baseball_Field.csv'
dl_data(url,output1)

Downloading https://data.seattle.gov/api/views/6v75-vrvs/rows.csv?accessType=DOWNLOAD


In [19]:
dfpois = pd.read_csv(os.path.join('./data/Baseball_Field.csv'))

In [20]:
dfpois = dfpois[['the_geom']]
dfpois.head()

Unnamed: 0,the_geom
0,MULTIPOLYGON (((-122.27259129399673 47.5260192...
1,MULTIPOLYGON (((-122.3019285062559 47.66868012...
2,MULTIPOLYGON (((-122.31492060232524 47.5861955...
3,MULTIPOLYGON (((-122.34217620405622 47.6687435...
4,MULTIPOLYGON (((-122.32555601354544 47.7200755...


In [21]:
# la estructura MULTIPOLYGON consta de varios poligonos y estos de puntos
# por cada element_key voy a buscar si al menos tiene un punto de un poligono cercano
# para que no dure mucho la ejecución tomo el primer POINT del POLYGON

listad = []
for index2, row2 in coord.iterrows():
    ek = row2['element_key']
    lat2 = pd.to_numeric(row2['latitude'])
    lon2 = pd.to_numeric(row2['longitude'])
    for index, row in dfpois.iterrows():
        multi = loads(row['the_geom'])
        polygons = list(multi)
        for p in polygons:
            puntos = p.exterior.coords
            p = puntos[0]
            lat1 = pd.to_numeric(p[1])
            lon1 = pd.to_numeric(p[0])
            dist = haversine(lon1, lat1, lon2, lat2)
            if dist <= 0.3:
                listad.append([ek,1])
                break

In [22]:
# creo in dataframe temporal con la lista anterior
pdx = pd.DataFrame(listad, columns = ['element_key','baseball_poi'])
pdx.element_key = pdx.element_key.astype(int)
pdx.baseball_poi = pdx.baseball_poi.astype(int)
pdx.head()

Unnamed: 0,element_key,baseball_poi
0,1233,1
1,1233,1
2,1234,1
3,1234,1
4,1433,1


In [23]:
pdx.drop_duplicates(subset=['element_key'], keep='last', inplace=True)

In [24]:
df.shape, pdx.shape

((4305622, 8), (86, 2))

In [25]:
df_final = pd.merge(df_final, pdx, on='element_key', how='left')

In [26]:
# los que no tienen poi los marco a 0
df_final.fillna(0, inplace=True)
df_final.baseball_poi = df_final.baseball_poi.astype(int)
df_final.head()

Unnamed: 0,element_key,latitude,longitude,timestamp,occupation_perc,prcp,tmax,tmin,cult_poi,baseball_poi
0,35693,47.619158,-122.346457,2016-01-02 00:00:00,28.57,0.0,42,25,1,0
1,53549,47.628175,-122.341132,2016-01-02 00:00:00,3.12,0.0,42,25,0,0
2,11881,47.619156,-122.333107,2016-01-02 02:00:00,10.0,0.0,42,25,1,0
3,9393,47.621441,-122.33597,2016-01-02 03:00:00,20.0,0.0,42,25,1,0
4,11133,47.619815,-122.348131,2016-01-02 04:00:00,20.0,0.0,42,25,1,0


In [27]:
df_final.to_csv('./data/Serie_Total2016_cult_sport_1.csv')

## Tenis

In [28]:
# Repito lo mismo para tenis

In [29]:
url = "https://data.seattle.gov/api/views/agb7-rh9h/rows.csv?accessType=DOWNLOAD"
output1 = './data/Tennis_Court_Outline.csv'
dl_data(url,output1)

Downloading https://data.seattle.gov/api/views/agb7-rh9h/rows.csv?accessType=DOWNLOAD


In [30]:
dfpois = pd.read_csv(os.path.join('./data/Tennis_Court_Outline.csv'))

In [31]:
dfpois = dfpois[['the_geom']]
dfpois.head()

Unnamed: 0,the_geom
0,MULTIPOLYGON (((-122.35547328361099 47.6311029...
1,MULTIPOLYGON (((-122.35529321983168 47.6315188...
2,MULTIPOLYGON (((-122.30451313039816 47.6764982...
3,MULTIPOLYGON (((-122.34329665395495 47.6695318...
4,MULTIPOLYGON (((-122.34330219360075 47.6688704...


In [32]:
listad = []
for index2, row2 in coord.iterrows():
    ek = row2['element_key']
    lat2 = pd.to_numeric(row2['latitude'])
    lon2 = pd.to_numeric(row2['longitude'])
    for index, row in dfpois.iterrows():
        multi = loads(row['the_geom'])
        polygons = list(multi)
        for p in polygons:
            puntos = p.exterior.coords
            p = puntos[0]
            lat1 = pd.to_numeric(p[1])
            lon1 = pd.to_numeric(p[0])
            dist = haversine(lon1, lat1, lon2, lat2)
            if dist <= 0.3:
                listad.append([ek,1])
                break

In [33]:
# creo in dataframe temporal con la lista anterior
pdx = pd.DataFrame(listad, columns = ['element_key','tennis_poi'])
pdx.element_key = pdx.element_key.astype(int)
pdx.tennis_poi = pdx.tennis_poi.astype(int)
pdx.head()

Unnamed: 0,element_key,tennis_poi
0,1233,1
1,1234,1
2,1433,1
3,1433,1
4,1589,1


In [34]:
df_final = pd.merge(df_final, pdx, on='element_key', how='left')

In [35]:
# los que no tienen poi los marco a 0
df_final.fillna(0, inplace=True)
df_final.tennis_poi = df_final.tennis_poi.astype(int)
df_final.head()

Unnamed: 0,element_key,latitude,longitude,timestamp,occupation_perc,prcp,tmax,tmin,cult_poi,baseball_poi,tennis_poi
0,35693,47.619158,-122.346457,2016-01-02 00:00:00,28.57,0.0,42,25,1,0,0
1,53549,47.628175,-122.341132,2016-01-02 00:00:00,3.12,0.0,42,25,0,0,0
2,11881,47.619156,-122.333107,2016-01-02 02:00:00,10.0,0.0,42,25,1,0,0
3,9393,47.621441,-122.33597,2016-01-02 03:00:00,20.0,0.0,42,25,1,0,0
4,11133,47.619815,-122.348131,2016-01-02 04:00:00,20.0,0.0,42,25,1,0,0


In [36]:
df_final.to_csv('./data/Serie_Total2016_cult_sport_2.csv')