# Table of Contents
<p>
<div class="lev1 toc-item">
    <a href="#Cargamos-transacciones-de-2016" data-toc-modified-id="Cargamos-transacciones-de-2016">
        <span class="toc-item-num">1&nbsp;&nbsp;</span>
        Cargamos transacciones de 2016</a></div>

<div class="lev2 toc-item">
    <a href="#Limpieza-de-transacciones" data-toc-modified-id="Limpieza-de-transacciones">
    <span class="toc-item-num">1.1&nbsp;&nbsp;</span>
    Limpieza de transacciones</a></div>

<div class="lev1 toc-item">
    <a href="#Cargamos-datos-de-ubicación-geográfica" data-toc-modified-id="Cargamos-datos-de-ubicación-geográfica">
    <span class="toc-item-num">2&nbsp;&nbsp;</span>
    Cargamos datos de ubicación geográfica</a></div>
    
<div class="lev1 toc-item">
    <a href="#Cargamos-datos-de-capacidad-de-plazas" data-toc-modified-id="Cargamos-datos-de-capacidad-de-plazas">
    <span class="toc-item-num">3&nbsp;&nbsp;</span>
    Cargamos datos de capacidad de plazas</a></div>

<div class="lev1 toc-item">
    <a href="#Relacionamos-transacciones,-ubicaciones-y-capacidad" data-toc-modified-id="Relacionamos-transacciones,-ubicaciones-y-capacidad">
    <span class="toc-item-num">4&nbsp;&nbsp;</span>
    Relacionamos transacciones, ubicaciones y capacidad</a></div>
          
<div class="lev1 toc-item">
    <a href="#Construimos-la-serie-de-ocupación-por-timestamp" data-toc-modified-id="Construimos-la-serie-de-ocupación-por-timestamp">
    <span class="toc-item-num">5&nbsp;&nbsp;</span>
    Construimos la serie de ocupación por timestamp</a></div>
    
<div class="lev1 toc-item">
    <a href="#Completamos-la-serie-con-datos-meteorológicos" data-toc-modified-id="Completamos-la-serie-con-datos-meteorológicos">
    <span class="toc-item-num">6&nbsp;&nbsp;</span>
    Completamos la serie con datos meteorológicos</a></div>

In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
% matplotlib inline

## Cargamos transacciones de 2016

In [2]:
# Load the 2016 parking transactions, parsing 'TransactionDateTime' as a datetime column.
transactions_csv = os.path.join('./data/ParkingTransaction_2016_cleaned.csv')
df = pd.read_csv(transactions_csv, parse_dates=['TransactionDateTime'])

In [3]:
df['TransactionDateTime'].describe()

count                10935395
unique                5505469
top       2016-04-08 12:03:59
freq                       63
first     2016-01-01 11:49:15
last      2016-12-31 20:00:51
Name: TransactionDateTime, dtype: object

In [3]:
import re

# Pass 1: matches any character followed by a capitalised word
# (one upper-case letter and at least one lower-case letter).
_underscorer1 = re.compile(r'(.)([A-Z][a-z]+)')
# Pass 2: matches a lower-case letter or digit immediately followed by an upper-case letter.
_underscorer2 = re.compile('([a-z0-9])([A-Z])')


def camelToSnake(s):
    """Convert a CamelCase / mixedCase name to lower-case snake_case.

    An underscore is inserted at every boundary found by the two module-level
    patterns and the result is lower-cased. Runs of capitals are kept together,
    e.g. 'HTTPResponse' -> 'http_response'; text without such boundaries is
    only lower-cased.

    Parameters
    ----------
    s : str
        Name to convert.

    Returns
    -------
    str
        The snake_case version of ``s``.
    """
    with_word_breaks = _underscorer1.sub(r'\1_\2', s)
    with_all_breaks = _underscorer2.sub(r'\1_\2', with_word_breaks)
    return with_all_breaks.lower()

In [4]:
# Normalise the CamelCase column names coming from the CSV to snake_case.
df.columns = df.columns.map(camelToSnake)

# Parse the date-only column, then order the rows by transaction timestamp.
df['transaction_date'] = pd.to_datetime(df['transaction_date'], format="%Y-%m-%d")
df.sort_values(by='transaction_date_time', inplace=True)

# New column 'final_date_time': transaction timestamp plus the duration in minutes.
df['final_date_time'] = df['transaction_date_time'] + pd.to_timedelta(df['duration_mins'], unit="m")

df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10935395 entries, 8 to 10912795
Data columns (total 12 columns):
unnamed: 0               int64
transaction_id           int64
transaction_date_time    datetime64[ns]
transaction_date         datetime64[ns]
time_start               object
time_expired             object
duration_mins            int64
amount                   float64
payment_mean             object
meter_code               int64
element_key              int64
final_date_time          datetime64[ns]
dtypes: datetime64[ns](3), float64(1), int64(5), object(3)
memory usage: 2.8 GB


In [5]:
# Discard the columns that are not referenced again in this notebook.
unused_cols = ['unnamed: 0', 'amount', 'payment_mean', 'meter_code']
df = df.drop(columns=unused_cols)

In [6]:
df.head()

Unnamed: 0,transaction_id,transaction_date_time,transaction_date,time_start,time_expired,duration_mins,element_key,final_date_time
8,183506786,2016-01-01 11:49:15,2016-01-01,11:49,11:49,0,54730,2016-01-01 11:49:15
4,183511748,2016-01-01 12:36:04,2016-01-01,12:36,12:36,0,88773,2016-01-01 12:36:04
17,183511753,2016-01-01 12:36:08,2016-01-01,12:36,12:36,0,88773,2016-01-01 12:36:08
15,183529739,2016-01-01 12:40:30,2016-01-01,12:40,12:40,0,57354,2016-01-01 12:40:30
12,183527118,2016-01-01 14:52:45,2016-01-01,14:52,14:52,0,76102,2016-01-01 14:52:45


### Limpieza de transacciones

In [7]:
df.transaction_id.duplicated().sum()  # no hay transacciones duplicadas

0

In [8]:
df.isnull().sum()  # no hay valores nulos

transaction_id           0
transaction_date_time    0
transaction_date         0
time_start               0
time_expired             0
duration_mins            0
element_key              0
final_date_time          0
dtype: int64

#### Duración incorrecta

In [9]:
# Observamos algunas transacciones con duración = 0
df.duration_mins.loc[df.duration_mins == 0].count()

9086

In [10]:
# Incluso hay algunas transacciones con duración negativa
df.duration_mins.loc[df.duration_mins < 0].count()

1118

In [13]:
# Drop both cases (zero and negative durations); together they are under 0.1% of all rows.
# NOTE(review): the shape recorded after the cleaning section (10828022 rows) equals the
# initial 10935395 rows minus only the Sunday (2762), holiday (34108) and out-of-hours
# (70503) removals, and this cell's execution count is out of sequence. It looks like this
# filter was not applied in the recorded run - confirm with Restart Kernel -> Run All.
df = df.loc[df['duration_mins'].gt(0)]

#### Domingos o festivos

In [11]:
# Observamos algunas transacciones realizadas por error en domingo
df.transaction_date_time.loc[df.transaction_date_time.dt.weekday == 6].count()

2762

In [12]:
# Las eliminamos también (suponen menos de un 0.03% del total)
df = df.loc[df.transaction_date_time.dt.weekday != 6]

In [13]:
# We also look at transactions made on public holidays.
import holidays

# (date, name) pairs returned by holidays.US for state 'WA' and year 2016.
hol = holidays.US(state='WA', years=[2016]).items()
# Keep only the dates, in ascending order.
hol_dates = [holiday_date for holiday_date, holiday_name in sorted(hol)]

# Number of transactions whose transaction_date is one of those holidays.
on_holiday = df.transaction_date.isin(hol_dates)
df.transaction_date.loc[on_holiday].count()

34108

In [14]:
# Las eliminamos también (suponen un 0.3% del total)
df = df.loc[~df.transaction_date.isin(hol_dates)]

#### Horario de funcionamiento de los parquímetros

In [15]:
# Encontramos también transacciones que están fuera del rango horario de uso de los parquímetros (de 8 a 20h)
df.element_key.loc[((df.time_start > '20:00') | (df.time_start < '08:00')) & 
       ((df.time_expired > '20:00') | (df.time_expired < '08:00'))].count()

70503

In [17]:
# We remove these as well, shrinking the dataset by less than 0.7%.
# A row is discarded only when BOTH its start and its expiry time fall outside the
# 08:00-20:00 window; the 'HH:MM' values are compared as strings.
# The mask is built once and applied directly. The previous version collected the labels
# with Index.get_values() (removed in pandas 1.0) and passed them to drop() together with
# a contradictory axis=1.
starts_outside = (df.time_start > '20:00') | (df.time_start < '08:00')
expires_outside = (df.time_expired > '20:00') | (df.time_expired < '08:00')
outside_hours = starts_outside & expires_outside

# Labels of the removed rows, kept under the original name in case later cells use them.
indexes = df.index[outside_hours].values
# Equivalent to dropping those labels as long as the index is unique
# (assumed here: it comes from read_csv's default index - TODO confirm).
df = df.loc[~outside_hours]

In [18]:
df.shape  # con la limpieza hemos reducido el tamaño del dataset inicial en casi un 1%.

(10828022, 8)

## Cargamos datos de ubicación geográfica

In [21]:
coord = pd.read_csv(os.path.join('./data/Coord_EK.csv'))

In [22]:
coord.head()

Unnamed: 0,element_key,latitude,longitude
0,1001,47.602862,-122.334703
1,1002,47.602997,-122.334538
2,1005,47.603602,-122.335382
3,1006,47.603725,-122.335171
4,1009,47.60501,-122.336669


In [23]:
coord.element_key.duplicated().sum()

0

## Cargamos datos de capacidad de plazas

In [24]:
blocks = pd.read_csv(os.path.join('./data/Blockface_cleaned.csv'))

In [25]:
blocks.columns = blocks.columns.map(lambda x: camelToSnake(x))

In [26]:
blocks[blocks.element_key == 1001]

Unnamed: 0,pay_station_blockface_id,element_key,parking_spaces,paid_parking_area,parking_time_limit_category,peak_hour_start1,peak_hour_end1,peak_hour_start2,peak_hour_end2,paid_area_start_time,...,saturday_start1,saturday_end1,saturday_rate2,saturday_start2,saturday_end2,saturday_rate3,saturday_start3,saturday_end3,start_time_saturday,end_time_saturday
469,7576,1001,5.0,Pioneer Square,120.0,06:00:00,09:00:00,15:00:00,18:00:00,08:00:00,...,08:00:00,11:00:00,4.0,11:00:00,18:00:00,,,,08:00:00,18:00:00
2783,10071,1001,5.0,Pioneer Square,120.0,06:00:00,09:00:00,15:00:00,18:00:00,08:00:00,...,08:00:00,11:00:00,4.5,11:00:00,18:00:00,,,,08:00:00,18:00:00
3017,10262,1001,4.0,Pioneer Square,120.0,06:00:00,09:00:00,15:00:00,18:00:00,08:00:00,...,08:00:00,11:00:00,4.5,11:00:00,18:00:00,,,,08:00:00,18:00:00
4648,11976,1001,4.0,,120.0,06:00:00,09:00:00,15:00:00,18:00:00,,...,00:00:00,00:00:00,0.0,00:00:00,00:00:00,0.0,00:00:00,00:00:00,,
5989,13670,1001,4.0,Pioneer Square,120.0,06:00:00,09:00:00,15:00:00,18:00:00,08:00:00,...,08:00:00,11:00:00,4.5,11:00:00,18:00:00,,,,08:00:00,18:00:00
6301,14348,1001,5.0,Pioneer Square,120.0,06:00:00,09:00:00,15:00:00,18:00:00,08:00:00,...,08:00:00,11:00:00,5.0,11:00:00,18:00:00,,,,08:00:00,18:00:00
7077,13939,1001,5.0,Pioneer Square,120.0,06:00:00,09:00:00,15:00:00,18:00:00,08:00:00,...,08:00:00,11:00:00,4.5,11:00:00,18:00:00,,,,08:00:00,18:00:00
8059,3405,1001,5.0,Pioneer Square,120.0,06:00:00,09:00:00,15:00:00,18:00:00,08:00:00,...,,,,,,,,,,
10576,1352,1001,7.0,Pioneer Square,120.0,06:00:00,09:00:00,15:00:00,18:00:00,08:00:00,...,,,,,,,,,,
11332,6046,1001,5.0,Pioneer Square,120.0,06:00:00,09:00:00,15:00:00,18:00:00,08:00:00,...,08:00:00,11:00:00,3.5,11:00:00,18:00:00,,,,08:00:00,18:00:00


En algunos casos, como el anterior, la información de plazas asociada a un mismo `element_key` es divergente. Por ello calculamos la media de los distintos valores existentes para cada `element_key` y la redondeamos al entero más cercano.

In [27]:
park_spaces = blocks.groupby('element_key')['parking_spaces'].mean()

In [28]:
park_spaces = park_spaces.reset_index(level=['element_key'])

In [29]:
park_spaces.parking_spaces.loc[park_spaces.parking_spaces.isnull()]

141   NaN
Name: parking_spaces, dtype: float64

In [30]:
park_spaces = park_spaces.dropna()  # eliminamos un valor nulo

In [31]:
park_spaces.parking_spaces = np.rint(park_spaces.parking_spaces).astype(int)

In [32]:
# park_spaces.to_csv('./data/Parking_Spaces.csv', index=False)

In [32]:
park_spaces.element_key.duplicated().sum()

0

In [33]:
park_spaces.head()

Unnamed: 0,element_key,parking_spaces
0,1001,5
1,1002,9
2,1005,5
3,1006,5
4,1009,5


## Relacionamos transacciones, ubicaciones y capacidad

In [34]:
df.element_key.unique().size

1514

In [35]:
coord.element_key.unique().size

1517

In [36]:
park_spaces.element_key.unique().size

1707

In [37]:
len(set(coord.element_key).intersection(set(df.element_key)))

1445

In [38]:
len(set(park_spaces.element_key).intersection(set(df.element_key)))

1513

**Mezclamos los datasets:**

In [39]:
# Transacciones con Coordenadas
df_c = pd.merge(df, coord, on='element_key', how='inner', validate='many_to_one')

In [40]:
# Transacciones con Coordenadas y con Capacidad de Plazas disponibles
df_cp = pd.merge(df_c, park_spaces, on='element_key', how='inner', validate='many_to_one')

In [41]:
df.shape, df_c.shape, df_cp.shape

((10828022, 8), (10664529, 10), (10664529, 11))

In [42]:
df_cp.transaction_id.duplicated().sum()  # comprobamos que no se han generado duplicados

0

In [43]:
df_cp.set_index('transaction_id', inplace=True)
df_cp.head()

Unnamed: 0_level_0,transaction_date_time,transaction_date,time_start,time_expired,duration_mins,element_key,final_date_time,latitude,longitude,parking_spaces
transaction_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
64059668,2016-01-02 00:19:07,2016-01-02,00:19,10:17,598,35693,2016-01-02 10:17:07,47.619158,-122.346457,7
64059669,2016-01-02 00:21:55,2016-01-02,00:21,10:19,598,35693,2016-01-02 10:19:55,47.619158,-122.346457,7
64084672,2016-01-02 07:52:55,2016-01-02,07:52,14:02,370,35693,2016-01-02 14:02:55,47.619158,-122.346457,7
183563751,2016-01-02 08:10:02,2016-01-02,08:10,09:10,60,35693,2016-01-02 09:10:02,47.619158,-122.346457,7
64084673,2016-01-02 08:22:59,2016-01-02,08:22,10:22,120,35693,2016-01-02 10:22:59,47.619158,-122.346457,7


## Construimos la serie de ocupación por timestamp

Construimos primero la tabla corta que usaremos para montar las series temporales:

In [51]:
in_cols = ['element_key', 'latitude', 'longitude', 'transaction_date_time', 'parking_spaces']
out_cols = ['element_key', 'latitude', 'longitude', 'final_date_time', 'parking_spaces']


def _signed_events(source, cols, time_col, suffix, sign):
    """Build one half of the event table from the transaction table.

    Parameters
    ----------
    source : pandas.DataFrame
        Transactions indexed by ``transaction_id``.
    cols : list of str
        Columns to keep; must include ``time_col``.
    time_col : str
        Column that becomes the event ``timestamp``.
    suffix : str
        Text appended to the transaction id (``'_in'`` / ``'_out'``) so that the two
        events of one transaction get distinct ids.
    sign : float
        Value stored in ``timestamp_sign`` (+1.0 for the start of a transaction,
        -1.0 for its end).

    Returns
    -------
    pandas.DataFrame
        Columns: ``transaction_id``, the kept columns with ``time_col`` renamed to
        ``timestamp``, and ``timestamp_sign``.
    """
    events = source[cols].reset_index()
    # Whole-column string concatenation instead of a per-row Python lambda.
    events['transaction_id'] = events['transaction_id'].astype(str) + suffix
    events['timestamp_sign'] = sign
    return events.rename(columns={time_col: 'timestamp'})


# One +1 event at the start of every transaction and one -1 event at its end.
df_in = _signed_events(df_cp, in_cols, 'transaction_date_time', '_in', 1.0)
df_out = _signed_events(df_cp, out_cols, 'final_date_time', '_out', -1.0)

In [52]:
df_in.head()

Unnamed: 0,transaction_id,element_key,latitude,longitude,timestamp,parking_spaces,timestamp_sign
0,64059668_in,35693,47.619158,-122.346457,2016-01-02 00:19:07,7,1.0
1,64059669_in,35693,47.619158,-122.346457,2016-01-02 00:21:55,7,1.0
2,64084672_in,35693,47.619158,-122.346457,2016-01-02 07:52:55,7,1.0
3,183563751_in,35693,47.619158,-122.346457,2016-01-02 08:10:02,7,1.0
4,64084673_in,35693,47.619158,-122.346457,2016-01-02 08:22:59,7,1.0


In [53]:
df_out.head()

Unnamed: 0,transaction_id,element_key,latitude,longitude,timestamp,parking_spaces,timestamp_sign
0,64059668_out,35693,47.619158,-122.346457,2016-01-02 10:17:07,7,-1.0
1,64059669_out,35693,47.619158,-122.346457,2016-01-02 10:19:55,7,-1.0
2,64084672_out,35693,47.619158,-122.346457,2016-01-02 14:02:55,7,-1.0
3,183563751_out,35693,47.619158,-122.346457,2016-01-02 09:10:02,7,-1.0
4,64084673_out,35693,47.619158,-122.346457,2016-01-02 10:22:59,7,-1.0


In [54]:
full_transactions = pd.concat([df_in, df_out])
full_transactions.set_index('transaction_id', inplace=True)
full_transactions.sort_values('timestamp', ascending=True, inplace=True)

In [55]:
full_transactions.head()

Unnamed: 0_level_0,element_key,latitude,longitude,timestamp,parking_spaces,timestamp_sign
transaction_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
64059668_in,35693,47.619158,-122.346457,2016-01-02 00:19:07,7,1.0
64059669_in,35693,47.619158,-122.346457,2016-01-02 00:21:55,7,1.0
64059649_in,53549,47.628175,-122.341132,2016-01-02 00:46:40,32,1.0
64059636_in,11881,47.619156,-122.333107,2016-01-02 02:56:06,10,1.0
64059641_in,9393,47.621441,-122.33597,2016-01-02 03:14:33,5,1.0


In [57]:
df_in.shape, df_out.shape, full_transactions.shape

((10664529, 7), (10664529, 7), (21329058, 6))

#### Añadimos porcentaje de ocupación

In [58]:
# Truncate every event timestamp to the start of its hour.
# The vectorised .dt.floor replaces a per-row lambda calling Timestamp.replace(minute=0,
# second=0, microsecond=0), which had been flagged as slow. For these timezone-naive
# timestamps the result is the same (floor additionally clears any nanosecond component).
full_transactions['timestamp'] = full_transactions['timestamp'].dt.floor('h')

In [59]:
# Running sum of the +1 / -1 event signs within each (element_key, timestamp) group.
# GroupBy.cumsum is called directly; it returns the same values, aligned to the original
# rows, as the previous transform(lambda series: series.cumsum()), which had been flagged
# as slow.
full_transactions['occupation'] = (
    full_transactions.groupby(['element_key', 'timestamp'])['timestamp_sign'].cumsum()
)

In [60]:
full_transactions = full_transactions.reset_index(drop=True)

In [61]:
full_transactions.drop_duplicates(subset=['element_key','timestamp'], keep='last', inplace=True)

In [62]:
full_transactions['day_year'] = full_transactions.timestamp.dt.dayofyear

In [63]:
# Running total of 'occupation' within each (element_key, day_year) group.
full_transactions['occu_cum'] = (
    full_transactions
    .groupby(['element_key', 'day_year'])['occupation']
    .cumsum()
)

In [64]:
# Cumulative occupation as a percentage of the block's parking spaces, rounded to two
# decimals by formatting to text and parsing the text back to float.
full_transactions['occupation_perc'] = (
    full_transactions['occu_cum']
    .div(full_transactions['parking_spaces'])
    .mul(100.0)
    .map("{0:.2f}".format)
    .astype(float)
)

In [65]:
full_transactions.head(20)

Unnamed: 0,element_key,latitude,longitude,timestamp,parking_spaces,timestamp_sign,occupation,day_year,occu_cum,occupation_perc
1,35693,47.619158,-122.346457,2016-01-02 00:00:00,7,1.0,2.0,2,2.0,28.57
2,53549,47.628175,-122.341132,2016-01-02 00:00:00,32,1.0,1.0,2,1.0,3.12
3,11881,47.619156,-122.333107,2016-01-02 02:00:00,10,1.0,1.0,2,1.0,10.0
4,9393,47.621441,-122.33597,2016-01-02 03:00:00,5,1.0,1.0,2,1.0,20.0
5,11133,47.619815,-122.348131,2016-01-02 04:00:00,5,1.0,1.0,2,1.0,20.0
6,31310,47.619256,-122.339661,2016-01-02 04:00:00,11,1.0,1.0,2,1.0,9.09
8,13130,47.620816,-122.345711,2016-01-02 04:00:00,9,1.0,2.0,2,2.0,22.22
9,53126,47.616374,-122.341452,2016-01-02 04:00:00,11,1.0,1.0,2,1.0,9.09
11,36142,47.617287,-122.338056,2016-01-02 05:00:00,6,1.0,1.0,2,1.0,16.67
12,76433,47.622804,-122.33986,2016-01-02 05:00:00,11,1.0,1.0,2,1.0,9.09


## Completamos la serie con datos meteorológicos

In [68]:
# Load the Seattle weather table and normalise its column names to snake_case.
meteo = pd.read_csv(os.path.join('./data/seattleWeather_1948-2017.csv'))
meteo.columns = meteo.columns.map(camelToSnake)
meteo['date'] = pd.to_datetime(meteo['date'], format="%Y-%m-%d")

# Keep only the 2016 rows. The explicit .copy() makes `meteo` an independent frame, so
# adding a column below does not write into a slice of the full table
# (which raises SettingWithCopyWarning).
meteo = meteo.loc[meteo['date'].dt.year == 2016].copy()
# Day of the year; the occupancy table is merged with the weather on this column.
meteo['day_year'] = meteo['date'].dt.dayofyear

In [69]:
meteo.head()

Unnamed: 0,date,prcp,tmax,tmin,rain,day_year
24837,2016-01-01,0.0,46,28,False,1
24838,2016-01-02,0.0,42,25,False,2
24839,2016-01-03,0.02,40,31,True,3
24840,2016-01-04,0.15,38,35,True,4
24841,2016-01-05,0.11,46,36,True,5


In [71]:
full_transactions_meteo = pd.merge(full_transactions, 
                                   meteo, on='day_year', how='inner', validate='many_to_one')

In [77]:
# Persist the final series: block id, coordinates, timestamp, occupancy percentage and
# the weather columns.
# NOTE(review): index=False is not passed, so the DataFrame index is written as an extra
# unnamed first column - confirm that downstream readers expect it.
export_cols = ['element_key', 'latitude', 'longitude', 'timestamp', 'occupation_perc',
               'prcp', 'tmax', 'tmin']
full_transactions_meteo[export_cols].to_csv('./data/Serie_Total2016.csv')