In [50]:
import geopandas as gpd
import numpy as np
import pandas as pd
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [128]:
def prepare_dataframe(df, modo='predict'):
    
    if ( modo == 'predict' ):
        df = df.drop(columns='p13v_condujo_bicicleta').dropna()
    elif (modo == 'train' ):
        df = df.drop(['id_hogar', 'id_persona', 'f_exp'], axis=1).dropna()
        
    df = pd.concat([df, pd.get_dummies(df['p6_id_ocupacion'])], axis=1)
    df = pd.concat([df, pd.get_dummies(df['p5_id_nivel_educativo'])], axis=1)
    df = df.drop(['p5_id_nivel_educativo', 'p6_id_ocupacion'], axis=1)
    df['Sexo'] = df['Sexo'].map({'Mujer': 0, 'Hombre': 1})
    df['vehiculo'] = df['vehiculo'].astype(int)
    
    
    if ( modo == 'predict' ):
        return df[['p4_edad', 'Sexo', 'vehiculo', 'Estudia', 'Otro', 'Trabaja', 'Media', 'Ninguno', 'Posgrado', 'Primaria', 'Superior']]
        
    elif (modo == 'train' ):
        df['p13v_condujo_bicicleta'] = df['p13v_condujo_bicicleta'].astype(int)
        return df

In [158]:
cols = ['id_hogar', 'id_persona', 'p4_edad', 'p5_id_nivel_educativo', 'p6_id_ocupacion', 
        'p13v_condujo_bicicleta', 'f_exp', 'Sexo']

df_hogares = pd.read_csv('../data/csv/HogaresEODH_2019.csv', sep=';')
df_personas = pd.read_csv('../data/csv/PersonasEODH_2019.csv', sep=';', usecols=cols)
df_viajes = pd.read_csv('../data/csv/ViajesEODH_2019.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [99]:
df_hogares['vehiculo'] = df_hogares[['p1mc_automovil', 'p1mc_camionetas', 'p1mc_pickup', 'p1mc_motocicleta']].sum(axis=1).astype(bool)

In [100]:
df = df_personas.merge(df_hogares[['Id_Hogar', 'vehiculo']], left_on='id_hogar', right_on='Id_Hogar').drop(['Id_Hogar'], axis=1)

In [101]:
df['p13v_condujo_bicicleta'] = df['p13v_condujo_bicicleta'].map({1: True, 2: False})

In [102]:
est_cod = list(range(1, 6))
est_dicc = dict.fromkeys(est_cod, 'Estudia')

trab_cod = list(range(11, 22))
trab_dicc = dict.fromkeys(trab_cod, 'Trabaja')

otro_cod = list(range(31, 39))
otro_dicc = dict.fromkeys(otro_cod, 'Otro')

d = {**est_dicc, **trab_dicc, **otro_dicc}

dicc_nivel_ed = {
    1: 'Primaria',
    2: 'Primaria',
    3: 'Primaria',
    4: 'Primaria',
    5: 'Primaria',
    6: 'Primaria',
    7: 'Media',
    8: 'Media',
    9: 'Superior',
    10: 'Media',
    11: 'Superior',
    12: 'Superior',
    13: 'Posgrado',
    14: 'Ninguno'
}

In [103]:
df['p6_id_ocupacion'] = df['p6_id_ocupacion'].map(d)
df['p5_id_nivel_educativo'] = df['p5_id_nivel_educativo'].map(dicc_nivel_ed)

In [113]:
df1 = prepare_dataframe(df, 'train')

In [114]:
X_train, X_test, y_train, y_test = train_test_split(df1.drop(['p13v_condujo_bicicleta'], axis=1), df1['p13v_condujo_bicicleta'], random_state=42)

In [117]:
clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)

In [118]:
clf.score(X_test, y_test)

0.9200918666349885

In [119]:
result = clf.predict(X_test)

In [120]:
(result == y_test.values).sum()

11618

In [121]:
(result == y_test.values)

array([ True,  True,  True, ...,  True,  True, False])

In [123]:
probas = clf.predict_proba(df1.drop(['p13v_condujo_bicicleta'], axis=1))

In [135]:
df1['proba'] = probas[..., 1]

In [25]:
X_test['prediccion'] = result
X_test['real'] = y_test

In [137]:
proba2 = clf.predict_proba(prepare_dataframe(df))

In [138]:
proba2.shape

(63244, 2)

In [140]:
df2['proba'] = proba2[..., 1]

In [150]:
df_personas_prob = pd.merge(df2, df_hogares[['Id_Hogar', 'zat_hogar']], left_on='id_hogar', right_on='Id_Hogar')

In [151]:
df_personas_prob['bici_prob'] = df_personas_prob.proba*df_personas.f_exp

Personas que viajaron en bicla (Modelo):

In [154]:
df_personas_prob.groupby('zat_hogar').sum().bici_prob.sum()

688835.876454457

Personas que viajaron en bicicleta (EODH): 

In [163]:
df_viajes.query('modo_principal == "Bicicleta"').groupby(['id_hogar', 'id_persona']).first().f_exp.sum()

562030.6208830468

Viajes totales en bicicleta (EODH):

In [164]:
df_viajes.query('modo_principal == "Bicicleta"').f_exp.sum()

1177867.731996233