# Parking Data aggregation

## Prueba para 1 dataset (proceso descriptivo)

In [1]:
import pandas as pd
from datetime import datetime

In [2]:
# Load the raw July export for one parking lot to inspect its layout
# (the file is semicolon-delimited).
df = pd.read_csv(
    './data/parkings_JULIO/101Libertad.csv',
    delimiter=";",
)
print(df)

         _id  recvTimeTs             recvTime          entityId  attrValue
0     222747  1594637257  2020-07-13T10:47:37  Aparcamiento:101         46
1     222748  1594637498  2020-07-13T10:51:38  Aparcamiento:101         51
2     222749  1594637618  2020-07-13T10:53:38  Aparcamiento:101         52
3     222750  1594637738  2020-07-13T10:55:38  Aparcamiento:101         49
4     222751  1594637858  2020-07-13T10:57:38  Aparcamiento:101         55
...      ...         ...                  ...               ...        ...
7038  229785  1596235592  2020-07-31T22:46:32  Aparcamiento:101        316
7039  229786  1596235953  2020-07-31T22:52:33  Aparcamiento:101        317
7040  229787  1596236673  2020-07-31T23:04:33  Aparcamiento:101        316
7041  229788  1596236793  2020-07-31T23:06:33  Aparcamiento:101        319
7042  229789  1596239553  2020-07-31T23:52:33  Aparcamiento:101        320

[7043 rows x 5 columns]


In [3]:
# Convert one of the dataset's timestamp strings into a datetime object,
# to check the format string before applying it to the whole column.
datetime_str = '2020-07-13T10:47:37'
datetime_object = datetime.strptime(
    datetime_str,
    '%Y-%m-%dT%H:%M:%S',
)
print(datetime_object)

2020-07-13 10:47:37


In [4]:
# Helper that turns the dataset's timestamp strings into datetime objects.
def parser(x):
    """Parse a 'YYYY-MM-DDTHH:MM:SS' string into a ``datetime``.

    Raises ValueError when ``x`` does not match that layout.
    """
    timestamp_layout = '%Y-%m-%dT%H:%M:%S'
    return datetime.strptime(x, timestamp_layout)

# Read the raw file and convert the timestamp column with an explicit format.
# `read_csv(..., date_parser=...)` is deprecated since pandas 2.0, so the
# conversion is done with a vectorised `pd.to_datetime` call instead; it yields
# the same datetime column and works on old and new pandas alike.
df = pd.read_csv('./data/parkings_JULIO/101Libertad.csv', sep=";")
df['recvTime'] = pd.to_datetime(df['recvTime'], format='%Y-%m-%dT%H:%M:%S')
print(df)


         _id  recvTimeTs            recvTime          entityId  attrValue
0     222747  1594637257 2020-07-13 10:47:37  Aparcamiento:101         46
1     222748  1594637498 2020-07-13 10:51:38  Aparcamiento:101         51
2     222749  1594637618 2020-07-13 10:53:38  Aparcamiento:101         52
3     222750  1594637738 2020-07-13 10:55:38  Aparcamiento:101         49
4     222751  1594637858 2020-07-13 10:57:38  Aparcamiento:101         55
...      ...         ...                 ...               ...        ...
7038  229785  1596235592 2020-07-31 22:46:32  Aparcamiento:101        316
7039  229786  1596235953 2020-07-31 22:52:33  Aparcamiento:101        317
7040  229787  1596236673 2020-07-31 23:04:33  Aparcamiento:101        316
7041  229788  1596236793 2020-07-31 23:06:33  Aparcamiento:101        319
7042  229789  1596239553 2020-07-31 23:52:33  Aparcamiento:101        320

[7043 rows x 5 columns]


In [5]:
# Aggregate the readings into hourly bins using the mean.
# Only numeric columns are kept before resampling: averaging the text column
# `entityId` raises a TypeError on pandas >= 2.0, where non-numeric columns are
# no longer dropped silently. A Timedelta is used as the rule because the 'H'
# alias is deprecated in recent pandas releases.
df2 = (
    df.set_index('recvTime')
    .select_dtypes(include='number')
    .resample(pd.Timedelta(hours=1))
    .mean()
)
# Fill the missing hourly values by interpolation.
# `limit` is the maximum number of consecutive NaNs that interpolate() will fill.
df3 = df2['attrValue'].interpolate(limit=6)
nas2 = df2['attrValue'].isna().sum()
nas3 = df3.isna().sum()
# NOTE: the percentage is relative to the number of raw rows (df.shape[0]),
# not to the number of hourly bins.
print(str((nas2-nas3)) + " rows were interpolated, representing a " + str(round((nas2-nas3)/df.shape[0]*100,3))+ "% of the whole dataset")

44 rows were interpolated, representing a 0.625% of the whole dataset


In [15]:
# functions
def parser(x):
    """Return the ``datetime`` encoded by a 'YYYY-MM-DDTHH:MM:SS' string.

    A string in any other layout makes ``datetime.strptime`` raise ValueError.
    """
    layout = '%Y-%m-%dT%H:%M:%S'
    parsed = datetime.strptime(x, layout)
    return parsed

def read_and_resample_and_interpol(datapath, interp = 0, pr=False, sk=-1, separador = ";", colfechas= 2):
    """Read a parking CSV, average it into hourly bins and optionally interpolate gaps.

    Parameters
    ----------
    datapath : str or path-like
        CSV file to read.
    interp : int, default 0
        Maximum number of consecutive missing hourly values to fill by
        interpolation of the ``attrValue`` column. ``0`` disables interpolation.
    pr : bool, default False
        When interpolating, print how many hourly values were filled.
    sk : int, list-like or -1, default -1
        Forwarded to ``read_csv(skiprows=...)``. ``-1`` (or ``None``) skips nothing.
    separador : str, default ";"
        Field delimiter of the file.
    colfechas : int, default 2
        Position of the timestamp column ('YYYY-MM-DDTHH:MM:SS' strings).

    Returns
    -------
    pandas.DataFrame
        Indexed by the hourly timestamp. With ``interp == 0`` it holds the
        hourly mean of every numeric column; otherwise it holds only the
        interpolated ``attrValue`` column.
    """
    # -1 is this function's "skip nothing" sentinel; read_csv expresses that as None.
    no_skip = sk is None or (isinstance(sk, int) and sk == -1)
    df = pd.read_csv(datapath, sep=separador, skiprows=None if no_skip else sk)

    # Parse the timestamps with an explicit format instead of the deprecated
    # `date_parser` argument of read_csv.
    date_col = df.columns[colfechas]
    df[date_col] = pd.to_datetime(df[date_col], format='%Y-%m-%dT%H:%M:%S')

    # Index by the column selected through `colfechas` (it is 'recvTime' for the
    # default layout) and keep only numeric columns: averaging a text column
    # such as `entityId` raises a TypeError on pandas >= 2.0.
    # The Timedelta rule replaces the deprecated 'H' alias.
    df2 = (
        df.set_index(date_col)
        .select_dtypes(include='number')
        .resample(pd.Timedelta(hours=1))
        .mean()
    )
    if interp == 0:
        return df2

    df4 = df2['attrValue'].interpolate(limit=interp)
    if pr:
        nas2 = df2['attrValue'].isna().sum()
        nas4 = df4.isna().sum()
        # NOTE: the percentage is relative to the raw row count, not the hourly bins.
        print(str((nas2-nas4)) + " rows were interpolated, representing a " + str(round((nas2-nas4)/df.shape[0]*100,3))+ "% of the whole dataset")
    return df4.to_frame()

## Hacemos el resample y agregamos los datasets por zonas y fechas

In [10]:
# Zone 1: La Vega and La Libertad -- total space = 312 + 330 = 642
lavega = read_and_resample_and_interpol(
    './data/parkings_JULIO/102LaVega.csv', interp=100, pr=True
)
libertad = read_and_resample_and_interpol(
    './data/parkings_JULIO/101Libertad.csv', interp=100, pr=True
)
# Keep only the hours present in both series; the clashing value columns come
# out as attrValue_x (libertad) and attrValue_y (lavega).
merge = pd.merge(libertad, lavega, how='inner', on='recvTime')
merge['free'] = merge['attrValue_x'].add(merge['attrValue_y'])
zone1 = merge[['free']]
print(zone1)
zone1.to_csv(
    "./data/parkings_JULIO/zone1.csv",
    sep=";",
    index=True,
)

86 rows were interpolated, representing a 0.874% of the whole dataset
58 rows were interpolated, representing a 0.824% of the whole dataset
                           free
recvTime                       
2020-07-13 10:00:00  167.363636
2020-07-13 11:00:00  246.157895
2020-07-13 12:00:00  351.465909
2020-07-13 13:00:00  404.080000
2020-07-13 14:00:00  362.413238
...                         ...
2020-07-31 19:00:00  512.900285
2020-07-31 20:00:00  554.235294
2020-07-31 21:00:00  572.875000
2020-07-31 22:00:00  586.000000
2020-07-31 23:00:00  592.333333

[446 rows x 1 columns]


In [40]:
# Zone 2: MoralesMeseguer -- total space = 220
# AlfonsoX is left out: its data is just a few alternating numbers, so it is not trusted.
# CentroFama is left out: it has almost no data.
mm = read_and_resample_and_interpol(
    './data/parkings_JULIO/105MoralesMeseguer.csv',
    interp=10,
    sk=range(1, 7),
)
# Keep the value column only, published under the name "free".
mm2 = mm[['attrValue']].rename(columns={'attrValue': 'free'})
mm2.to_csv(
    "./data/parkings_JULIO/zone2.csv",
    sep=";",
    index=True,
)

In [16]:
# September/October export of La Vega, read as comma-separated with the
# timestamp expected in the first column (instead of the default ';' and
# third-column layout used for the July files).
lavega = read_and_resample_and_interpol(
    './data/parkings_SEPT_OCT/102LaVega.csv',
    interp=100,
    pr=True,
    separador=",",
    colfechas=0,
)



IndexError: list index out of range

In [7]:
# Zone 1: La Vega and La Libertad -- total space = 312 + 330 = 642
lavega = read_and_resample_and_interpol(
    './data/parkings_SEPT_OCT/102LaVega.csv', interp=100, pr=True
)
libertad = read_and_resample_and_interpol(
    './data/parkings_SEPT_OCT/101Libertad.csv', interp=100, pr=True
)
# Keep only the hours present in both series; the clashing value columns come
# out as attrValue_x (libertad) and attrValue_y (lavega).
merge = pd.merge(libertad, lavega, how='inner', on='recvTime')
merge['free'] = merge['attrValue_x'].add(merge['attrValue_y'])
zone1 = merge[['free']]
print(zone1)
# Export intentionally disabled for this period:
#zone1.to_csv("./data/parkings_SEPT_OCT/zone1.csv",sep=";",index=True)

FileNotFoundError: [Errno 2] No such file or directory: './data/parkings_SEPT_OCT/102LaVega.csv'