In [1]:
import pandas as pd
import numpy as np

In [81]:
from functools import reduce

# Lectura de Datos

In [2]:
# Load the trip-level records from a parquet file; the path is relative to the working directory.
df = pd.read_parquet('df_viajes_G28.parquet')

In [3]:
# (rows, columns) of the loaded trips frame.
df.shape

(17104571, 7)

In [4]:
# Preview the first rows of the loaded trips frame.
df.head()

Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Ciclo_Estacion_Arribo,fh_retiro,fh_arribo
0,M,44,10225,169,447,2017-11-01 00:00:09,2017-11-01 00:11:47
1,M,35,8106,359,338,2017-11-01 00:02:16,2017-11-01 00:05:54
2,M,27,10899,84,150,2017-11-01 00:02:54,2017-11-01 00:06:30
3,M,27,9931,137,182,2017-11-01 00:03:12,2017-11-01 00:19:02
4,M,23,8934,134,116,2017-11-01 00:03:54,2017-11-01 00:10:24


In [5]:
# Column dtypes of the loaded trips frame.
df.dtypes

Genero_Usuario                   object
Edad_Usuario                      int32
Bici                             object
Ciclo_Estacion_Retiro             int32
Ciclo_Estacion_Arribo             int32
fh_retiro                datetime64[ns]
fh_arribo                datetime64[ns]
dtype: object

# Ingeniería de Variables

* Unidad muestral: cicloestación en una fecha y hora específicas
* Generar variables
* Matriz de predictoras
* Variables objetivo
* Ventanas de tiempo
* TAD

In [6]:
# Sample-unit key: pickup station plus the date-hour bucket column.
um = [
    'Ciclo_Estacion_Retiro',
    'hrs',
]

In [8]:
# Date-hour bucket of the pickup timestamp, encoded as the integer YYYYMMDDHH
# (e.g. 2017-11-01 00:00:09 -> 2017110100). Formatting goes through the
# vectorised .dt accessor, so no Python callback runs per row.
df['hrs'] = df['fh_retiro'].dt.strftime('%Y%m%d%H').astype(int)

In [9]:
# Preview the frame again, now including the 'hrs' column.
df.head()

Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Ciclo_Estacion_Arribo,fh_retiro,fh_arribo,hrs
0,M,44,10225,169,447,2017-11-01 00:00:09,2017-11-01 00:11:47,2017110100
1,M,35,8106,359,338,2017-11-01 00:02:16,2017-11-01 00:05:54,2017110100
2,M,27,10899,84,150,2017-11-01 00:02:54,2017-11-01 00:06:30,2017110100
3,M,27,9931,137,182,2017-11-01 00:03:12,2017-11-01 00:19:02,2017110100
4,M,23,8934,134,116,2017-11-01 00:03:54,2017-11-01 00:10:24,2017110100


In [11]:
# Demand per sample unit: count of trips picked up at each station in each date-hour bucket.
df_g = (
    df
    .groupby(um)['Bici']
    .count()
    .reset_index()
)

In [13]:
# Spot-check three random sample units; the fixed seed makes the same rows
# come back on every Restart & Run All.
df_g.sample(3, random_state=0)

Unnamed: 0,Ciclo_Estacion_Retiro,hrs,Bici
638548,56,2019080611,3
730480,64,2019060710,10
3349418,336,2019062706,1
2841515,280,2019091812,1
3452842,349,2019010416,4
1502022,138,2018101817,3
2038863,189,2018041316,2
820435,72,2018112108,13
1675795,153,2019022416,4
668893,59,2018112214,8


### Definición variables

###### y = H( X )

* Promedio de la edad en Ciclo estación fecha hora (por unidad muestral)
* Porcentaje de hombres por UM
* Número de hombres
* Número de mujeres
* Promedio de minutos de los viajes a nivel UM 
* Promedio, Max, min, desv, .... 

### Nomenclatura de variables:

* v_ -> numéricas
* c_ -> categóricas
* fh_ -> fechas
* bl_ -> booleanas

------------------------------------------------------------

# Construcción TAD

## Variable Objetivo

In [17]:
# The per-unit trip count becomes the numeric variable v_num_viajes.
df_g = df_g.rename(columns={'Bici': 'v_num_viajes'})

In [19]:
# Preview the aggregated frame after the rename.
df_g.head()

Unnamed: 0,Ciclo_Estacion_Retiro,hrs,v_num_viajes
0,1,2017103119,1
1,1,2017110105,3
2,1,2017110106,9
3,1,2017110107,14
4,1,2017110108,42


### Promedio de la edad

In [50]:
# Mean rider age for every station / date-hour sample unit.
df_prom_edad = (
    df
    .groupby(um)['Edad_Usuario']
    .mean()
    .reset_index()
)

In [66]:
# Apply the v_ (numeric) naming convention to the mean-age column.
df_prom_edad = df_prom_edad.rename(columns={'Edad_Usuario': 'v_edad_prom'})

In [67]:
# Preview the first two rows of the mean-age table.
df_prom_edad.head(2)

Unnamed: 0,Ciclo_Estacion_Retiro,hrs,v_edad_prom
0,1,2017103119,35.0
1,1,2017110105,35.666667


### Numeralia de género

In [None]:
# Trip counts per gender for every sample unit: one column per Genero_Usuario
# value, filled with 0 where a gender has no trips in that unit.
aux = df.pivot_table(
    index=um,
    columns=['Genero_Usuario'],
    values=['Bici'],
    aggfunc='count',
    fill_value=0,
)
# Row total across the gender count columns.
aux['v_personas_tot'] = aux.sum(axis=1)
# Flatten the two-level column labels: ('Bici', 'F') -> 'Bici_F'; a label whose
# second level is empty keeps only its first level.
aux.columns = [
    col[0] if col[1] == '' else col[0] + '_' + col[1]
    for col in aux.columns
]
# Share of each gender within the unit; set to 0 where the total is 0.
aux['v_pct_F'] = np.where(
    aux['v_personas_tot'].eq(0), 0, aux['Bici_F'].div(aux['v_personas_tot'])
)
aux['v_pct_M'] = np.where(
    aux['v_personas_tot'].eq(0), 0, aux['Bici_M'].div(aux['v_personas_tot'])
)

In [46]:
# Move the sample-unit keys out of the index into regular columns.
df_pct_genero = aux.reset_index()

In [70]:
# Rename the per-gender counts to the v_ (numeric) convention.
df_pct_genero = df_pct_genero.rename(
    columns={'Bici_F': 'v_F_tot', 'Bici_M': 'v_M_tot'}
)

In [71]:
# Preview the first two rows of the gender counts and shares.
df_pct_genero.head(2)

Unnamed: 0,Ciclo_Estacion_Retiro,hrs,v_F_tot,v_M_tot,v_personas_tot,v_pct_F,v_pct_M
0,1,2017103119,0,1,1,0.0,1.0
1,1,2017110105,0,3,3,0.0,1.0


### Tiempo de viajes

In [55]:
# Trip duration in minutes: arrival minus pickup, divided by a one-minute timedelta.
df['v_tiempo_viaje'] = df['fh_arribo'].sub(df['fh_retiro']) / np.timedelta64(1, 'm')

In [58]:
# Min / mean / max / std of trip duration per sample unit.
# std comes out NaN for units that contain a single trip.
df_tiempos = (
    df
    .groupby(um)
    .agg({'v_tiempo_viaje': ['min', 'mean', 'max', 'std']})
    .reset_index()
)

In [62]:
# Flatten the two-level labels left by agg(): ('v_tiempo_viaje', 'min') ->
# 'v_tiempo_viaje_min'; key columns (empty second level) keep their plain name.
df_tiempos.columns = [
    col[0] if col[1] == '' else col[0] + '_' + col[1]
    for col in df_tiempos.columns
]

In [64]:
# Preview the first two rows of the trip-duration statistics.
df_tiempos.head(2)

Unnamed: 0,Ciclo_Estacion_Retiro,hrs,v_tiempo_viaje_min,v_tiempo_viaje_mean,v_tiempo_viaje_max,v_tiempo_viaje_std
0,1,2017103119,663.666667,663.666667,663.666667,
1,1,2017110105,5.316667,8.077778,11.883333,3.405646


### Nivel demanda

In [None]:
# Demand level by number of trips:
#   trips <= 5        -> BAJA
#   5 < trips <= 10   -> MEDIA
#   trips > 10        -> ALTA

In [73]:
def func_demanda(num_viajes, umbral_baja=5, umbral_media=10):
    """Classify a trip count into a demand level.

    Parameters
    ----------
    num_viajes : number
        Number of trips observed for one sample unit.
    umbral_baja : number, default 5
        Largest trip count still labelled "BAJA".
    umbral_media : number, default 10
        Largest trip count still labelled "MEDIA"; anything above is "ALTA".

    Returns
    -------
    str
        "BAJA", "MEDIA" or "ALTA"; "ERROR" when the value satisfies none of
        the comparisons (e.g. NaN).
    """
    if num_viajes <= umbral_baja:
        return "BAJA"
    if num_viajes <= umbral_media:
        return "MEDIA"
    if num_viajes > umbral_media:
        return "ALTA"
    # Reached only when every comparison above is False, as happens with NaN.
    return "ERROR"

In [78]:
# Label every sample unit with the demand level returned by func_demanda.
df_g['c_nivel_demanda'] = df_g['v_num_viajes'].map(func_demanda)

In [79]:
# Preview the aggregated frame with the new demand-level column.
df_g.head()

Unnamed: 0,Ciclo_Estacion_Retiro,hrs,v_num_viajes,c_nivel_demanda
0,1,2017103119,1,BAJA
1,1,2017110105,3,BAJA
2,1,2017110106,9,MEDIA
3,1,2017110107,14,ALTA
4,1,2017110108,42,ALTA


# Matriz de predictoras

In [82]:
# Assemble the sample-unit table: outer-join every feature table on the
# sample-unit keys, in the order age -> gender -> trip time -> demand.
X_ = (
    df_prom_edad
    .merge(df_pct_genero, on=um, how='outer')
    .merge(df_tiempos, on=um, how='outer')
    .merge(df_g, on=um, how='outer')
)

In [83]:
# (rows, columns) of the merged sample-unit table.
X_.shape

(4409852, 14)

In [89]:
# The trip count is the target variable; expose it under the name y.
X_ = X_.rename(columns={'v_num_viajes': 'y'})

In [90]:
# Persist the merged sample-unit table (features plus target y) to a parquet file in the working directory.
X_.to_parquet("df_um_G28.parquet")

In [95]:
# Every column except the target y (the sample-unit keys are kept).
# NOTE(review): c_nivel_demanda is computed from the trip count that becomes y,
# and v_personas_tot matches y in the rows displayed in this notebook. Confirm
# that both are meant to stay among the predictors (possible target leakage).
cols_X = [nombre for nombre in X_.columns if nombre != 'y']

In [96]:
# Sample-unit keys + predictor variables = predictor matrix
X_[cols_X]

Unnamed: 0,Ciclo_Estacion_Retiro,hrs,v_edad_prom,v_F_tot,v_M_tot,v_personas_tot,v_pct_F,v_pct_M,v_tiempo_viaje_min,v_tiempo_viaje_mean,v_tiempo_viaje_max,v_tiempo_viaje_std,c_nivel_demanda
0,1,2017103119,35.000000,0,1,1,0.000000,1.000000,663.666667,663.666667,663.666667,,BAJA
1,1,2017110105,35.666667,0,3,3,0.000000,1.000000,5.316667,8.077778,11.883333,3.405646,BAJA
2,1,2017110106,29.555556,2,7,9,0.222222,0.777778,3.200000,31.333333,206.033333,65.718493,MEDIA
3,1,2017110107,37.285714,1,13,14,0.071429,0.928571,4.333333,11.029762,24.633333,6.082493,ALTA
4,1,2017110108,35.857143,11,31,42,0.261905,0.738095,3.450000,11.424603,35.066667,6.260715,ALTA
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4409847,3001,2019031411,59.000000,0,4,4,0.000000,1.000000,4.133333,5.029167,5.850000,0.774402,BAJA
4409848,3001,2019080911,25.000000,0,1,1,0.000000,1.000000,4222.033333,4222.033333,4222.033333,,BAJA
4409849,3001,2019090206,29.000000,0,1,1,0.000000,1.000000,33.550000,33.550000,33.550000,,BAJA
4409850,3002,2019101014,32.000000,0,1,1,0.000000,1.000000,72.233333,72.233333,72.233333,,BAJA


In [99]:
# Sample-unit keys + target variable
X_[um + ['y']]

Unnamed: 0,Ciclo_Estacion_Retiro,hrs,y
0,1,2017103119,1
1,1,2017110105,3
2,1,2017110106,9
3,1,2017110107,14
4,1,2017110108,42
...,...,...,...
4409847,3001,2019031411,4
4409848,3001,2019080911,1
4409849,3001,2019090206,1
4409850,3002,2019101014,1


# Catálogo de tiempo