# Table of Contents
<p>
<div class="lev1 toc-item">
    <a href="#Extracción-de-transacciones-de-2016" data-toc-modified-id="Extracción-de-transacciones-de-2016">
        <span class="toc-item-num">1&nbsp;&nbsp;</span>
        Extracción de transacciones de 2016</a></div>

<div class="lev1 toc-item">
    <a href="#Limpieza-y-transformación-de-transacciones" data-toc-modified-id="Limpieza-y-transformación-de-transacciones">
    <span class="toc-item-num">2&nbsp;&nbsp;</span>
    Limpieza y transformación de transacciones</a></div>
    
<div class="lev1 toc-item">
    <a href="#Extracción-de-datos-de-capacidad-de-plazas" data-toc-modified-id="Extracción-de-datos-de-capacidad-de-plazas">
    <span class="toc-item-num">3&nbsp;&nbsp;</span>
    Extracción de datos de capacidad de plazas</a></div>
    
<div class="lev1 toc-item">
    <a href="#Extracción-de-datos-de-ubicación-geográfica" data-toc-modified-id="Extracción-de-datos-de-ubicación-geográfica">
    <span class="toc-item-num">4&nbsp;&nbsp;</span>
    Extracción de datos de ubicación geográfica</a></div>
    
<div class="lev1 toc-item">
    <a href="#Relación-de-transacciones,-ubicaciones-y-capacidad" data-toc-modified-id="Relación-de-transacciones,-ubicaciones-y-capacidad">
    <span class="toc-item-num">5&nbsp;&nbsp;</span>
    Relación de transacciones, ubicaciones y capacidad</a></div>
          
<div class="lev1 toc-item">
    <a href="#Construcción-de-la-serie-de-ocupación-por-timestamp" data-toc-modified-id="Construcción-de-la-serie-de-ocupación-por-timestamp">
    <span class="toc-item-num">6&nbsp;&nbsp;</span>
    Construcción de la serie de ocupación por timestamp</a></div>

<div class="lev1 toc-item">
    <a href="#Corrección-de-la-capacidad-de-plazas-disponibles" data-toc-modified-id="Corrección-de-la-capacidad-de-plazas-disponibles">
    <span class="toc-item-num">7&nbsp;&nbsp;</span>
    Corrección de la capacidad de plazas disponibles</a></div>

<div class="lev1 toc-item">
    <a href="#Completamos-la-serie-con-datos-meteorológicos" data-toc-modified-id="Completamos-la-serie-con-datos-meteorológicos">
    <span class="toc-item-num">8&nbsp;&nbsp;</span>
    Completamos la serie con datos meteorológicos</a></div>

In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
% matplotlib inline

## Extracción de transacciones de 2016

In [2]:
# Load the cleaned 2016 parking transactions; TransactionDateTime is parsed as datetime on read.
df = pd.read_csv(
    './data/ParkingTransaction_2016_cleaned.csv',
    parse_dates=['TransactionDateTime'],
)

In [3]:
# Summary of the transaction timestamps: row count, number of distinct values and first/last date.
df['TransactionDateTime'].describe()

count                10935395
unique                5505469
top       2016-04-08 12:03:59
freq                       63
first     2016-01-01 11:49:15
last      2016-12-31 20:00:51
Name: TransactionDateTime, dtype: object

In [4]:
import re

# Pass 1: put an underscore before a capitalised word that follows any character ("xWord" -> "x_Word").
_underscorer1 = re.compile(r'(.)([A-Z][a-z]+)')
# Pass 2: put an underscore between a lowercase letter or digit and a following capital ("aB" -> "a_B").
_underscorer2 = re.compile('([a-z0-9])([A-Z])')


def camelToSnake(s):
    """Convert a CamelCase / mixedCase identifier to lower-case snake_case.

    Parameters
    ----------
    s : str
        Name to convert (e.g. a column label such as 'TransactionDateTime').

    Returns
    -------
    str
        The name with underscores inserted at word boundaries, lower-cased.
        Strings without case transitions (e.g. 'PRCP') are simply lower-cased.
    """
    for boundary in (_underscorer1, _underscorer2):
        s = boundary.sub(r'\1_\2', s)
    return s.lower()

In [5]:
# Normalise every column label to snake_case.
df.columns = df.columns.map(camelToSnake)

# Order the transactions chronologically by their start time.
df.sort_values('transaction_date_time', inplace=True)

# New column 'final_date_time': start time plus the paid duration (minutes).
paid_duration = pd.to_timedelta(df['duration_mins'], unit="m")
df['final_date_time'] = df['transaction_date_time'] + paid_duration

# Column dtypes and real (deep) memory footprint of the frame.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10935395 entries, 8 to 10912795
Data columns (total 12 columns):
unnamed: 0               int64
transaction_id           int64
transaction_date_time    datetime64[ns]
transaction_date         object
time_start               object
time_expired             object
duration_mins            int64
amount                   float64
payment_mean             object
meter_code               int64
element_key              int64
final_date_time          datetime64[ns]
dtypes: datetime64[ns](2), float64(1), int64(5), object(4)
memory usage: 3.4 GB


In [6]:
# Columns not used further on in this notebook: the saved CSV index, the split date/time
# strings, and the payment / meter details.
unused_columns = [
    'unnamed: 0', 'transaction_date', 'amount', 'payment_mean',
    'meter_code', 'time_start', 'time_expired',
]
df = df.drop(columns=unused_columns)

In [7]:
df.head()

Unnamed: 0,transaction_id,transaction_date_time,duration_mins,element_key,final_date_time
8,183506786,2016-01-01 11:49:15,0,54730,2016-01-01 11:49:15
4,183511748,2016-01-01 12:36:04,0,88773,2016-01-01 12:36:04
17,183511753,2016-01-01 12:36:08,0,88773,2016-01-01 12:36:08
15,183529739,2016-01-01 12:40:30,0,57354,2016-01-01 12:40:30
12,183527118,2016-01-01 14:52:45,0,76102,2016-01-01 14:52:45


In [8]:
df.tail()

Unnamed: 0,transaction_id,transaction_date_time,duration_mins,element_key,final_date_time
10909957,313159887,2016-12-31 19:58:49,2,81194,2016-12-31 20:00:49
10918741,313159702,2016-12-31 19:58:53,2,79570,2016-12-31 20:00:53
10910192,313159980,2016-12-31 19:59:24,1,1234,2016-12-31 20:00:24
10907062,313159989,2016-12-31 19:59:45,0,43914,2016-12-31 19:59:45
10912795,313160195,2016-12-31 20:00:51,0,8306,2016-12-31 20:00:51


## Limpieza y transformación de transacciones

In [9]:
df.transaction_id.duplicated().sum()  # no duplicated transaction ids

0

In [10]:
df.isnull().sum()  # no null values in any column

transaction_id           0
transaction_date_time    0
duration_mins            0
element_key              0
final_date_time          0
dtype: int64

#### Duración incorrecta

In [11]:
# Some transactions have a duration of exactly 0 minutes
df.duration_mins.loc[df.duration_mins == 0].count()

9086

In [12]:
# There are even some transactions with a negative duration
df.duration_mins.loc[df.duration_mins < 0].count()

1118

In [13]:
# Drop both cases (zero and negative duration); together they are under 0.1% of the rows.
has_positive_duration = df['duration_mins'] > 0
df = df.loc[has_positive_duration]

#### Transacciones de larga duración

In [14]:
# Transactions whose start and end fall on different calendar days are moved out of `df`
# into `long_trans`; the following cells rebuild them as per-day segments.
# The mask is computed once and applied directly (no Index.get_values(), which was removed
# from pandas, and no drop(..., axis=1), where axis was ignored because index= was given).
spans_days = df.transaction_date_time.dt.normalize() != df.final_date_time.dt.normalize()

# .copy() makes long_trans an independent frame so it can be modified safely afterwards.
long_trans = df.loc[spans_days].copy()
print(long_trans.shape)

df = df.loc[~spans_days]

(13403, 5)


In [15]:
# Each multi-day transaction is represented by up to two same-day segments.
# NOTE(review): days strictly between the start and end day are not represented; presumably
# no paid session spans more than two calendar days — confirm against the data.

# End-day segment: only kept when the transaction ends at 08:00 or later; it starts at
# 08:00 on the end day. .copy() avoids assigning into a slice (SettingWithCopyWarning).
long_trans_dup = long_trans.loc[long_trans.final_date_time.dt.hour >= 8].copy()
long_trans_dup['transaction_date_time'] = (
    long_trans_dup.final_date_time.dt.normalize() + pd.Timedelta(hours=8))

# Start-day segment: the original start, cut off at 20:00 on the start day.
# Vectorised equivalent of replace(hour=20, minute=0, second=0, microsecond=0) per row.
# NOTE: this cell reassigns long_trans, so it must be run exactly once after the split.
long_trans = long_trans.assign(
    final_date_time=long_trans.transaction_date_time.dt.normalize() + pd.Timedelta(hours=20))

In [16]:
# Stack the two per-day segments, then append them to the single-day transactions.
# ignore_index=True gives the result a fresh 0..n-1 index in both steps.
concat_options = dict(ignore_index=True, sort=True)
long_trans_full = pd.concat([long_trans, long_trans_dup], **concat_options)
df = pd.concat([df, long_trans_full], **concat_options)

#### Horario de funcionamiento de los parquímetros

In [17]:
# Drop the transactions that both start and end before 08:00.
# A boolean mask replaces Index.get_values() (removed from pandas) and the
# drop(index=..., axis=1) call, in which axis=1 had no effect.
before_opening = (df.transaction_date_time.dt.hour < 8) & (df.final_date_time.dt.hour < 8)
print(int(before_opening.sum()))  # number of rows removed
df = df.loc[~before_opening]

55413


In [18]:
# Drop the transactions that both start and end at 20:00 or later.
# A boolean mask replaces Index.get_values() (removed from pandas) and the
# drop(index=..., axis=1) call, in which axis=1 had no effect.
after_closing = (df.transaction_date_time.dt.hour >= 20) & (df.final_date_time.dt.hour >= 20)
print(int(after_closing.sum()))  # number of rows removed
df = df.loc[~after_closing]

17107


In [19]:
# Truncate the start time to the whole hour and raise anything earlier than 08:00 to 08:00.
# Vectorised form of the former row-wise Timestamp.replace(...) calls: midnight of the same
# day plus the (clipped) hour gives the same timestamps without a Python call per row.
start_times = df.transaction_date_time
df.transaction_date_time = (
    start_times.dt.normalize()
    + pd.to_timedelta(start_times.dt.hour.clip(lower=8), unit='h'))

In [20]:
# Truncate the end time to the whole hour and cap anything later than 20:00 at 20:00.
# Vectorised form of the former row-wise Timestamp.replace(...) calls: midnight of the same
# day plus the (clipped) hour gives the same timestamps without a Python call per row.
end_times = df.final_date_time
df.final_date_time = (
    end_times.dt.normalize()
    + pd.to_timedelta(end_times.dt.hour.clip(upper=20), unit='h'))

In [21]:
# Sanity check: no transaction starts or ends outside the meters' operating hours (08-20h).
start_hour = df.transaction_date_time.dt.hour
end_hour = df.final_date_time.dt.hour
start_outside = (start_hour > 20) | (start_hour < 8)
end_outside = (end_hour > 20) | (end_hour < 8)
df.element_key.loc[start_outside | end_outside].count()

0

#### Domingos o festivos

In [22]:
# Some transactions were made, by mistake, on a Sunday (weekday 6)
df.transaction_date_time.loc[df.transaction_date_time.dt.weekday == 6].count()

239

In [23]:
# Remove the Sunday transactions as well
is_sunday = df.transaction_date_time.dt.weekday == 6
df = df.loc[~is_sunday]

In [24]:
# Also look at the transactions that start on a public holiday
import holidays

# Washington State (US) holidays for 2016, as a date-sorted list
hol = holidays.US(state='WA', years=[2016]).items()
hol_dates = [dat for dat, name in sorted(hol)]

starts_on_holiday = df.transaction_date_time.dt.date.isin(hol_dates)
df.transaction_date_time.loc[starts_on_holiday].count()

32871

In [25]:
# Remove the holiday transactions as well
starts_on_holiday = df.transaction_date_time.dt.date.isin(hol_dates)
df = df.loc[~starts_on_holiday]

In [26]:
df.shape  

(10822236, 5)

In [27]:
# The cleaning has reduced the initial dataset by about 1%.
# 10935395 is the row count of the raw file reported above; the long_trans_dup rows are
# subtracted because they are extra end-day segments added by the split, not original rows.
(10935395-(df.shape[0]-long_trans_dup.shape[0]))/10935395*100

1.0592575759723357

## Extracción de datos de capacidad de plazas

In [28]:
# Blockface records: one row per pay-station blockface, including its number of parking spaces.
blocks = pd.read_csv('./data/Blockface_cleaned.csv')

In [29]:
blocks.shape

(13706, 39)

In [30]:
# Normalise the blockface column labels to snake_case.
blocks.columns = blocks.columns.map(camelToSnake)

In [31]:
# Example: a single element_key with several blockface records whose parking_spaces values differ.
blocks.loc[blocks.element_key == 1001]

Unnamed: 0,pay_station_blockface_id,element_key,parking_spaces,paid_parking_area,parking_time_limit_category,peak_hour_start1,peak_hour_end1,peak_hour_start2,peak_hour_end2,paid_area_start_time,...,saturday_start1,saturday_end1,saturday_rate2,saturday_start2,saturday_end2,saturday_rate3,saturday_start3,saturday_end3,start_time_saturday,end_time_saturday
469,7576,1001,5.0,Pioneer Square,120.0,06:00:00,09:00:00,15:00:00,18:00:00,08:00:00,...,08:00:00,11:00:00,4.0,11:00:00,18:00:00,,,,08:00:00,18:00:00
2783,10071,1001,5.0,Pioneer Square,120.0,06:00:00,09:00:00,15:00:00,18:00:00,08:00:00,...,08:00:00,11:00:00,4.5,11:00:00,18:00:00,,,,08:00:00,18:00:00
3017,10262,1001,4.0,Pioneer Square,120.0,06:00:00,09:00:00,15:00:00,18:00:00,08:00:00,...,08:00:00,11:00:00,4.5,11:00:00,18:00:00,,,,08:00:00,18:00:00
4648,11976,1001,4.0,,120.0,06:00:00,09:00:00,15:00:00,18:00:00,,...,00:00:00,00:00:00,0.0,00:00:00,00:00:00,0.0,00:00:00,00:00:00,,
5989,13670,1001,4.0,Pioneer Square,120.0,06:00:00,09:00:00,15:00:00,18:00:00,08:00:00,...,08:00:00,11:00:00,4.5,11:00:00,18:00:00,,,,08:00:00,18:00:00
6301,14348,1001,5.0,Pioneer Square,120.0,06:00:00,09:00:00,15:00:00,18:00:00,08:00:00,...,08:00:00,11:00:00,5.0,11:00:00,18:00:00,,,,08:00:00,18:00:00
7077,13939,1001,5.0,Pioneer Square,120.0,06:00:00,09:00:00,15:00:00,18:00:00,08:00:00,...,08:00:00,11:00:00,4.5,11:00:00,18:00:00,,,,08:00:00,18:00:00
8059,3405,1001,5.0,Pioneer Square,120.0,06:00:00,09:00:00,15:00:00,18:00:00,08:00:00,...,,,,,,,,,,
10576,1352,1001,7.0,Pioneer Square,120.0,06:00:00,09:00:00,15:00:00,18:00:00,08:00:00,...,,,,,,,,,,
11332,6046,1001,5.0,Pioneer Square,120.0,06:00:00,09:00:00,15:00:00,18:00:00,08:00:00,...,08:00:00,11:00:00,3.5,11:00:00,18:00:00,,,,08:00:00,18:00:00


In [32]:
# Null count per blockface column.
blocks.isnull().sum()

pay_station_blockface_id           0
element_key                        0
parking_spaces                    14
paid_parking_area               1831
parking_time_limit_category     1330
peak_hour_start1               12368
peak_hour_end1                 12368
peak_hour_start2               13209
peak_hour_end2                 13209
paid_area_start_time            1831
paid_area_end_time              1831
effective_start_date               0
effective_end_date              1707
paid_parking_rate               9057
parking_category                   0
load                               0
zone                               0
weekday_rate1                   4677
weekday_start1                  4677
weekday_end1                    4677
weekday_rate2                   4677
weekday_start2                  4677
weekday_end2                    4677
weekday_rate3                   7532
weekday_start3                  7532
weekday_end3                    7532
start_time_weekday              6480
e

Dado que hay algunos casos como el anterior, donde la información de plazas asociada a un mismo element_key diverge entre registros, nos quedamos con el valor máximo de `parking_spaces` para cada combinación de element_key y paid_parking_area.

In [33]:
# For every (element_key, paid_parking_area) pair keep the largest parking_spaces value
# reported; rows with a null in any of the three columns are discarded first.
park_spaces = (
    blocks[['element_key', 'parking_spaces', 'paid_parking_area']]
    .dropna()
    .groupby(['element_key', 'paid_parking_area'])['parking_spaces']
    .max()
)

In [34]:
# Turn the (element_key, paid_parking_area) group index back into regular columns.
park_spaces = park_spaces.reset_index()

In [35]:
# Each (element_key, paid_parking_area) pair now appears only once.
park_spaces[['element_key','paid_parking_area']].duplicated().sum()

0

In [36]:
park_spaces.shape

(1709, 3)

In [37]:
# element_keys that still appear more than once, i.e. under more than one paid_parking_area.
park_spaces[park_spaces.element_key.duplicated(keep=False)]

Unnamed: 0,element_key,paid_parking_area,parking_spaces
194,9762,Commercial Core,4.0
195,9762,Pioneer Square,4.0
1439,78114,Belltown,15.0
1440,78114,Commercial Core,17.0


In [38]:
# Summary statistics of the capacity per row, converted to strings for display.
park_spaces.parking_spaces.describe().astype(str)

count               1709.0
mean     8.537741369221767
std      5.441127087493119
min                    0.0
25%                    5.0
50%                    8.0
75%                   10.0
max                   63.0
Name: parking_spaces, dtype: object

In [39]:
park_spaces.parking_spaces.sort_values(ascending=False).head(10)  # the value of 63 could be an outlier

1161    63.0
1685    50.0
754     48.0
1683    47.0
753     45.0
1673    40.0
536     39.0
919     36.0
1682    36.0
1365    34.0
Name: parking_spaces, dtype: float64

In [40]:
# Number of rows whose capacity is 0.
park_spaces.element_key.loc[park_spaces.parking_spaces == 0].count()

6

In [41]:
# Remove the rows that carry no capacity information (parking_spaces equal to 0)
has_capacity = park_spaces['parking_spaces'] != 0
park_spaces = park_spaces.loc[has_capacity]

In [42]:
park_spaces.head()

Unnamed: 0,element_key,paid_parking_area,parking_spaces
0,1001,Pioneer Square,7.0
1,1002,Pioneer Square,9.0
2,1005,Commercial Core,8.0
3,1006,Commercial Core,6.0
4,1009,Commercial Core,5.0


## Extracción de datos de ubicación geográfica

In [43]:
# Latitude / longitude of each element_key.
coord = pd.read_csv('./data/Coord_EK.csv')

In [44]:
coord.head()

Unnamed: 0,element_key,latitude,longitude
0,1001,47.602862,-122.334703
1,1002,47.602997,-122.334538
2,1005,47.603602,-122.335382
3,1006,47.603725,-122.335171
4,1009,47.60501,-122.336669


In [45]:
coord.shape

(1517, 3)

In [46]:
coord.duplicated().sum()  # no duplicated rows

0

In [47]:
coord.element_key.duplicated().sum()  # no duplicated element_key either

0

In [48]:
coord[['latitude','longitude']].duplicated().sum()  # and no repeated locations

0

#### Análisis de ubicación de element_key duplicados en park_spaces

In [49]:
# Coordinates of the two element_keys that park_spaces lists under two different areas.
coord.loc[coord.element_key.isin([9762, 78114])]

Unnamed: 0,element_key,latitude,longitude
175,9762,47.602669,-122.336182
1270,78114,47.609074,-122.343332


Utilizamos la web https://www.coordenadas-gps.com para analizar las ubicaciones y decidimos asociar ambos segmentos de calle con el distrito Commercial Core:

In [50]:
# element_keys 9762 and 78114 are listed under two paid_parking_area values; keep only their
# 'Commercial Core' row. Selecting by key and area (instead of the positional index labels
# 195 and 1439) removes the same rows, stays correct if the index changes, and is safe to re-run.
is_ambiguous_key = park_spaces.element_key.isin([9762, 78114])
in_commercial_core = park_spaces.paid_parking_area == 'Commercial Core'
park_spaces = park_spaces.loc[~is_ambiguous_key | in_commercial_core]

## Relación de transacciones, ubicaciones y capacidad

In [51]:
# Distinct element_keys present in the cleaned transactions.
df.element_key.unique().size

1514

In [52]:
# Distinct element_keys that have coordinates.
coord.element_key.unique().size

1517

In [53]:
# Distinct element_keys that have a capacity value.
park_spaces.element_key.unique().size

1701

In [54]:
# element_keys present both in the coordinates and in the transactions.
len(set(coord.element_key).intersection(set(df.element_key)))

1445

In [55]:
# element_keys present both in the capacity table and in the coordinates.
len(set(park_spaces.element_key).intersection(set(coord.element_key)))

1498

**Mezclamos los datasets:**

In [56]:
# Transactions joined with their coordinates. The inner join drops transactions whose
# element_key has no coordinates; validate checks that coord has one row per key.
df_c = df.merge(coord, on='element_key', how='inner', validate='many_to_one')

In [57]:
# Transactions with coordinates, joined with the available parking capacity. The inner join
# drops keys without a capacity value; validate checks that park_spaces has one row per key.
df_cp = df_c.merge(park_spaces, on='element_key', how='inner', validate='many_to_one')

In [58]:
# Row/column counts before the merges, after adding coordinates, and after adding capacity.
df.shape, df_c.shape, df_cp.shape

((10822236, 5), (10657805, 7), (10652430, 9))

In [59]:
# Preview of the merged frame (transactions + coordinates + capacity).
df_cp.head()

Unnamed: 0,duration_mins,element_key,final_date_time,transaction_date_time,transaction_id,latitude,longitude,paid_parking_area,parking_spaces
0,598,35693,2016-01-02 10:00:00,2016-01-02 08:00:00,64059668,47.619158,-122.346457,Uptown Triangle,10.0
1,598,35693,2016-01-02 10:00:00,2016-01-02 08:00:00,64059669,47.619158,-122.346457,Uptown Triangle,10.0
2,370,35693,2016-01-02 14:00:00,2016-01-02 08:00:00,64084672,47.619158,-122.346457,Uptown Triangle,10.0
3,60,35693,2016-01-02 09:00:00,2016-01-02 08:00:00,183563751,47.619158,-122.346457,Uptown Triangle,10.0
4,120,35693,2016-01-02 10:00:00,2016-01-02 08:00:00,64084673,47.619158,-122.346457,Uptown Triangle,10.0


In [60]:
# Distinct element_keys left after both inner joins.
df_cp.element_key.unique().size

1443

## Construcción de la serie de ocupación por timestamp

Construimos primero la tabla corta que usaremos para montar las series temporales:

In [61]:
# Column selections for the two event tables; they differ only in which timestamp is taken.
in_cols = ['element_key', 'latitude', 'longitude', 'transaction_date_time', 'parking_spaces','paid_parking_area']
out_cols = ['element_key', 'latitude', 'longitude', 'final_date_time', 'parking_spaces','paid_parking_area']

# Arrival events: one row per transaction at its start time, signed +1.
df_in = (
    df_cp[in_cols]
    .reset_index(drop=True)
    .assign(timestamp_sign=1.0)
    .rename(columns={'transaction_date_time': 'timestamp'})
)

# Departure events: one row per transaction at its end time, signed -1.
df_out = (
    df_cp[out_cols]
    .reset_index(drop=True)
    .assign(timestamp_sign=-1.0)
    .rename(columns={'final_date_time': 'timestamp'})
)

In [62]:
df_in.head()

Unnamed: 0,element_key,latitude,longitude,timestamp,parking_spaces,paid_parking_area,timestamp_sign
0,35693,47.619158,-122.346457,2016-01-02 08:00:00,10.0,Uptown Triangle,1.0
1,35693,47.619158,-122.346457,2016-01-02 08:00:00,10.0,Uptown Triangle,1.0
2,35693,47.619158,-122.346457,2016-01-02 08:00:00,10.0,Uptown Triangle,1.0
3,35693,47.619158,-122.346457,2016-01-02 08:00:00,10.0,Uptown Triangle,1.0
4,35693,47.619158,-122.346457,2016-01-02 08:00:00,10.0,Uptown Triangle,1.0


In [63]:
df_out.head()

Unnamed: 0,element_key,latitude,longitude,timestamp,parking_spaces,paid_parking_area,timestamp_sign
0,35693,47.619158,-122.346457,2016-01-02 10:00:00,10.0,Uptown Triangle,-1.0
1,35693,47.619158,-122.346457,2016-01-02 10:00:00,10.0,Uptown Triangle,-1.0
2,35693,47.619158,-122.346457,2016-01-02 14:00:00,10.0,Uptown Triangle,-1.0
3,35693,47.619158,-122.346457,2016-01-02 09:00:00,10.0,Uptown Triangle,-1.0
4,35693,47.619158,-122.346457,2016-01-02 10:00:00,10.0,Uptown Triangle,-1.0


In [64]:
# Stack arrivals and departures into one event table, ordered chronologically.
events = [df_in, df_out]
full_transactions = pd.concat(events).sort_values('timestamp', ascending=True)

In [65]:
full_transactions.head()

Unnamed: 0,element_key,latitude,longitude,timestamp,parking_spaces,paid_parking_area,timestamp_sign
0,35693,47.619158,-122.346457,2016-01-02 08:00:00,10.0,Uptown Triangle,1.0
1878023,46254,47.611498,-122.343153,2016-01-02 08:00:00,10.0,Belltown,1.0
1878022,46254,47.611498,-122.343153,2016-01-02 08:00:00,10.0,Belltown,1.0
1878021,46254,47.611498,-122.343153,2016-01-02 08:00:00,10.0,Belltown,1.0
3727491,25710,47.613757,-122.344972,2016-01-02 08:00:00,11.0,Belltown,1.0


In [66]:
# The stacked table has exactly the rows of df_in plus those of df_out.
df_in.shape, df_out.shape, full_transactions.shape

((10652430, 7), (10652430, 7), (21304860, 7))

#### Añadimos porcentaje de ocupación

In [67]:
# Running sum of the +1/-1 signs inside each (element_key, timestamp) group; the last row of
# a group therefore holds that hour's net change (arrivals minus departures).
# The built-in GroupBy.cumsum() replaces transform(lambda s: s.cumsum()): same values, but
# without one Python-level call per group, which is what made this cell slow.
full_transactions['occupation'] = (
    full_transactions.groupby(['element_key', 'timestamp']).timestamp_sign.cumsum())

In [68]:
# Replace the index inherited from df_in / df_out (which contains repeated labels) with 0..n-1.
full_transactions = full_transactions.reset_index(drop=True)

In [69]:
# Keep one row per (element_key, timestamp): the last one, whose running 'occupation'
# value is the net change for that timestamp.
full_transactions = full_transactions.drop_duplicates(
    subset=['element_key', 'timestamp'], keep='last')

In [70]:
# Day of the year (1-366) of each timestamp; used below to group by day.
full_transactions['day_year'] = full_transactions.timestamp.dt.dayofyear

In [71]:
# Occupied spaces at each timestamp: running total of the net changes within each
# (element_key, day). Relies on the rows being in chronological order (sorted earlier).
per_key_and_day = full_transactions.groupby(['element_key', 'day_year'])
full_transactions['occu_cum'] = per_key_and_day.occupation.cumsum()

In [72]:
# Occupancy as a percentage of the capacity, kept with two decimals.
occupancy_ratio = full_transactions.occu_cum / full_transactions.parking_spaces * 100.0
full_transactions['occupation_perc'] = occupancy_ratio.map("{0:.2f}".format).astype(float)

In [73]:
# Rows whose computed occupancy exceeds 100% of the capacity.
full_transactions.element_key.loc[full_transactions.occupation_perc > 100].count()

204324

In [74]:
# Distinct element_keys that exceed 100% occupancy at least once.
full_transactions.element_key.loc[full_transactions.occupation_perc > 100].unique().size

1174

In [75]:
full_transactions.shape

(4124933, 11)

## Corrección de la capacidad de plazas disponibles

In [76]:
# Alternative capacity source, with a total_spaces figure per element_key.
new_park_spaces = pd.read_csv('./data/StreetParking.csv')

In [77]:
new_park_spaces.head()

Unnamed: 0,element_key,parking_category,total_nopark,total_zones,parking_spaces,total_spaces
0,32018,Paid Parking,8,1,6,15
1,1042,No Parking Allowed,5,1,0,6
2,48129,Paid Parking,6,1,4,11
3,47962,Paid Parking,8,2,7,17
4,47966,Paid Parking,5,1,4,10


In [78]:
# Remove the rows that carry no capacity information (total_spaces equal to 0)
has_total_spaces = new_park_spaces['total_spaces'] != 0
new_park_spaces = new_park_spaces.loc[has_total_spaces]

In [79]:
# Put both capacity sources side by side: parking_spaces_x comes from StreetParking,
# parking_spaces_y from the blockface-derived park_spaces table.
new_park_spaces = new_park_spaces.merge(
    park_spaces, on='element_key', how='inner', validate='many_to_one')

In [80]:
new_park_spaces.head()

Unnamed: 0,element_key,parking_category,total_nopark,total_zones,parking_spaces_x,total_spaces,paid_parking_area,parking_spaces_y
0,32018,Paid Parking,8,1,6,15,Belltown,6.0
1,48129,Paid Parking,6,1,4,11,Pioneer Square,6.0
2,47962,Paid Parking,8,2,7,17,Commercial Core,8.0
3,47966,Paid Parking,5,1,4,10,Commercial Core,9.0
4,8617,Paid Parking,7,2,4,13,Denny Triangle,4.0


In [81]:
# Rows on which both sources report the same number of parking spaces.
new_park_spaces.element_key.loc[new_park_spaces.parking_spaces_x == new_park_spaces.parking_spaces_y].count()

1055

In [82]:
new_park_spaces.shape

(1694, 8)

In [83]:
# Keep only element_key, parking_category and total_spaces.
discarded_columns = [
    'total_nopark', 'total_zones', 'parking_spaces_x', 'parking_spaces_y',
    'paid_parking_area',
]
new_park_spaces = new_park_spaces.drop(columns=discarded_columns)

In [84]:
new_park_spaces.head()

Unnamed: 0,element_key,parking_category,total_spaces
0,32018,Paid Parking,15
1,48129,Paid Parking,11
2,47962,Paid Parking,17
3,47966,Paid Parking,10
4,8617,Paid Parking,13


In [85]:
# Replace the blockface-derived capacity with total_spaces and recompute the percentage.
superseded_columns = ['occupation_perc','timestamp_sign','occupation','parking_spaces']
full_transactions = (
    full_transactions
    .drop(columns=superseded_columns)
    .merge(new_park_spaces, on='element_key', how='inner', validate='many_to_one')
)
corrected_ratio = full_transactions.occu_cum / full_transactions.total_spaces * 100.0
full_transactions['occupation_perc'] = corrected_ratio.map("{0:.2f}".format).astype(float)

In [86]:
# Rows still above 100% occupancy with the corrected capacity.
full_transactions.element_key.loc[full_transactions.occupation_perc > 100].count()

12691

In [87]:
# Distinct element_keys still above 100% occupancy with the corrected capacity.
full_transactions.element_key.loc[full_transactions.occupation_perc > 100].unique().size

194

In [88]:
# Remove every row of any element_key that exceeds 100% occupancy at least once
over_capacity = full_transactions.occupation_perc > 100
ek_del = full_transactions.loc[over_capacity, 'element_key'].unique()
belongs_to_removed_key = full_transactions.element_key.isin(ek_del)
full_transactions = full_transactions.loc[~belongs_to_removed_key]

In [89]:
# Distinct element_keys remaining after the removal.
full_transactions.element_key.unique().size

1249

In [90]:
# Share of rows per parking category.
full_transactions.parking_category.value_counts()/full_transactions.shape[0]

Paid Parking               0.933928
Restricted Parking Zone    0.046733
Carpool Parking            0.011140
No Parking Allowed         0.008199
Name: parking_category, dtype: float64

In [91]:
# Keep only the majority category, Paid Parking
is_paid_parking = full_transactions['parking_category'] == 'Paid Parking'
full_transactions = full_transactions.loc[is_paid_parking]

## Completamos la serie con datos meteorológicos

In [92]:
# Daily Seattle weather records; column labels are normalised and the date is parsed.
meteo = pd.read_csv(os.path.join('./data/seattleWeather_1948-2017.csv'))
meteo.columns = meteo.columns.map(camelToSnake)
meteo['date'] = pd.to_datetime(meteo['date'], format="%Y-%m-%d")

# Keep 2016 only. The explicit .copy() makes the filtered frame independent, so adding
# day_year below does not write into a slice of the full table (SettingWithCopyWarning).
meteo = meteo.loc[meteo['date'].dt.year == 2016].copy()
# Day of the year: the key used to join the weather onto the occupancy series.
meteo['day_year'] = meteo['date'].dt.dayofyear

In [93]:
meteo.shape

(366, 6)

In [94]:
# Attach the weather of the matching day to every occupancy row; validate checks that
# meteo has a single row per day_year.
full_transactions_meteo = full_transactions.merge(
    meteo, on='day_year', how='inner', validate='many_to_one')

In [97]:
# Write the final occupancy + weather series to CSV, without the index column.
export_columns = [
    'element_key', 'latitude', 'longitude', 'timestamp', 'paid_parking_area', 'day_year',
    'occupation_perc', 'prcp', 'tmax', 'tmin',
]
full_transactions_meteo[export_columns].to_csv('./data/Serie_Total2016.csv', index=False)