In [17]:
#coding: utf8

from sklearn.cross_validation import LeaveOneLabelOut

from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV

from sklearn.ensemble import ExtraTreesClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd
import statsmodels.api as sm

In [2]:
def scale_and_combine(df, scaler, cols_to_scale, cols_to_combine):
    """Scale some columns and replace others by their interactions.

    Works on a copy of ``df``; the frame passed in is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing every column named in ``cols_to_scale`` and
        ``cols_to_combine``.
    scaler : object with a ``transform`` method
        Already-fitted transformer; its output overwrites ``cols_to_scale``.
    cols_to_scale : list of str
        Columns passed through ``scaler.transform``.
    cols_to_combine : list of str
        Columns multiplied with each scaled column and then removed.

    Returns
    -------
    pandas.DataFrame
        The copy with ``cols_to_scale`` scaled, one extra column
        ``'<combine>:<scale>'`` per pair (appended in combine-major order),
        and the ``cols_to_combine`` columns deleted.
    """
    result = df.copy()
    result[cols_to_scale] = scaler.transform(df[cols_to_scale].copy())

    # Each product uses the *original* combine column and the *scaled* column.
    pairs = [(left, right) for left in cols_to_combine for right in cols_to_scale]
    for left, right in pairs:
        result[left + ':' + right] = df[left] * result[right]

    # The raw combine columns only survive through their interaction terms.
    for left in cols_to_combine:
        del result[left]
    return result

In [3]:
def train_test_scale(df, y, users, cols_to_scale, cols_to_combine):
    """Yield scaled/combined train and test frames for each split over ``users``.

    The (train, test) positional index pairs come from
    ``LeaveOneLabelOut(users)``; per scikit-learn's documentation each split
    holds out all rows sharing one label. For every split a fresh
    ``StandardScaler`` is fitted on the training rows only and then applied
    to both partitions through ``scale_and_combine``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame, indexed positionally via ``iloc``.
    y : pandas.DataFrame or pandas.Series
        Targets aligned row-for-row with ``df`` (also sliced with ``iloc``).
    users : array-like
        Label per row handed to ``LeaveOneLabelOut``.
    cols_to_scale, cols_to_combine : list of str
        Forwarded unchanged to ``scale_and_combine``.

    Yields
    ------
    tuple
        ``(train_features, test_features, y_train, y_test)``.
    """
    for train_idx, test_idx in LeaveOneLabelOut(users):
        train_rows = df.iloc[train_idx]
        test_rows = df.iloc[test_idx]
        # Scaling statistics are taken from the training rows only.
        fold_scaler = StandardScaler().fit(train_rows[cols_to_scale].copy())
        yield (
            scale_and_combine(train_rows, fold_scaler, cols_to_scale, cols_to_combine),
            scale_and_combine(test_rows, fold_scaler, cols_to_scale, cols_to_combine),
            y.iloc[train_idx],
            y.iloc[test_idx],
        )

In [4]:
# Read the space-separated data file and one-hot encode its categoricals.
fpath = 'seg.dat'
df_orig = pd.read_csv(fpath, sep=' ')

# age_cat, gender and inc_cat all share the 'user' prefix; bair_cat gets 'bairro'.
df_user = pd.get_dummies(
    df_orig,
    columns=['age_cat', 'gender', 'inc_cat'],
    prefix='user',
    drop_first=False,
)
df_user_bairro = pd.get_dummies(
    df_user,
    columns=['bair_cat'],
    prefix='bairro',
    drop_first=False,
)

# Keep the user identifiers aside before removing them from the features.
user_ids = df_user_bairro['userID'].copy()

# Besides the id, one dummy per encoded variable is removed
# (presumably the reference levels, so they do not duplicate the constant
# added below -- TODO confirm).
df_user_bairro = df_user_bairro.drop(
    ['userID', 'bairro_mesmo', 'user_jovem', 'user_feminino', 'user_baixa'],
    axis=1,
)
df_user_bairro = sm.add_constant(df_user_bairro)

# Targets are read from the companion file 'y_seg.dat'.
y = pd.read_csv('y_' + fpath)

In [5]:
# Group the feature columns by name prefix: user dummies, bairro dummies,
# and the 'd_' columns that get standardised later.
user_cols, bairro_cols, other_cols = (
    [col for col in df_user_bairro.columns if col.startswith(prefix)]
    for prefix in ('user_', 'bairro_', 'd_')
)

In [6]:
# Full-sample fit: standardise the 'd_' columns, interact them with the user
# dummies, then fit a statsmodels Logit of `y` on the resulting frame.
scaler = StandardScaler()
scaler.fit(df_user_bairro[other_cols].copy())
df_david = scale_and_combine(
    df=df_user_bairro,
    scaler=scaler,
    cols_to_scale=other_cols,
    cols_to_combine=user_cols,
)
model = sm.Logit(endog=y, exog=df_david)
fitted = model.fit()
fitted.summary()

Optimization terminated successfully.
         Current function value: 0.569632
         Iterations 6


0,1,2,3
Dep. Variable:,choice,No. Observations:,3529.0
Model:,Logit,Df Residuals:,3478.0
Method:,MLE,Df Model:,50.0
Date:,"Wed, 12 Oct 2016",Pseudo R-squ.:,0.1742
Time:,18:57:40,Log-Likelihood:,-2010.2
converged:,True,LL-Null:,-2434.3
,,LLR p-value:,1.3589999999999998e-145

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,-0.3066,0.072,-4.272,0.000,-0.447 -0.166
d_swidth,-0.1797,0.119,-1.510,0.131,-0.413 0.054
d_mvcars,0.3397,0.114,2.977,0.003,0.116 0.563
d_pcars,0.1003,0.101,0.991,0.322,-0.098 0.299
d_trees,0.2027,0.130,1.558,0.119,-0.052 0.458
d_mvciclyst,0.0971,0.108,0.896,0.370,-0.115 0.310
d_lands,0.7759,0.120,6.491,0.000,0.542 1.010
d_bid,-0.0739,0.101,-0.728,0.466,-0.273 0.125
d_bheig,0.0497,0.102,0.487,0.626,-0.150 0.250


In [7]:
# Show only the coefficients whose p-value is below 0.05.
fitted.params[fitted.pvalues.lt(0.05)]

const                     -0.306608
d_mvcars                   0.339652
d_lands                    0.775906
d_graff                   -0.238095
user_adulto:d_swidth       0.280239
user_adulto:d_dbuild       0.443095
user_adulto:d_people       0.578021
user_masculino:d_swidth    0.257427
user_masculino:d_mvcars   -0.327440
user_masculino:d_trees    -0.290441
user_masculino:d_people   -0.285174
user_media:d_pcars         0.237808
user_media:d_trees         0.242951
dtype: float64

In [22]:
def do_class(df, y, user_ids, cols_to_scale, cols_to_combine,
             param_grid=None, cv=2, random_state=0):
    """Evaluate a grid-searched ExtraTreesClassifier across the user splits.

    For every (train, test) split produced by ``train_test_scale`` a
    ``GridSearchCV`` over ``ExtraTreesClassifier`` is fitted on the training
    part and used to predict the held-out part. Predictions from all splits
    are pooled, then the classification report and the accuracy are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame forwarded to ``train_test_scale``.
    y : pandas.DataFrame
        Targets; only the first column is used.
    user_ids : array-like
        Per-row labels that define the outer splits.
    cols_to_scale, cols_to_combine : list of str
        Forwarded to ``train_test_scale``.
    param_grid : dict, optional
        Grid handed to ``GridSearchCV``. Defaults to the original 7 x 7 x 7
        grid (343 candidates). That grid is refitted ``cv`` times for every
        outer split, so the default is slow; pass a smaller grid for quick runs.
    cv : int, optional
        Inner cross-validation setting for ``GridSearchCV`` (default 2).
    random_state : int or None, optional
        Seed for ``ExtraTreesClassifier`` so repeated runs give the same
        predictions (default 0). Pass ``None`` for unseeded behaviour.

    Returns
    -------
    tuple of list
        ``(y_true_all, y_pred_all)`` pooled over all outer splits. They are
        returned rather than printed so the cell output stays readable.
    """
    if param_grid is None:
        # NOTE(review): n_estimators skips 8 in the original grid; kept as-is.
        param_grid = {
            'min_samples_split': [2, 4, 8, 16, 32, 64, 128],
            'min_samples_leaf': [2, 4, 8, 16, 32, 64, 128],
            'n_estimators': [1, 2, 4, 16, 32, 64, 128],
        }

    y_true_all = []
    y_pred_all = []
    folds = train_test_scale(df, y, user_ids, cols_to_scale, cols_to_combine)
    for df_train, df_test, y_train, y_test in folds:
        model = GridSearchCV(
            ExtraTreesClassifier(n_jobs=-1, random_state=random_state),
            param_grid=param_grid,
            cv=cv)
        model.fit(df_train.values, y_train.values[:, 0])
        y_pred_all.extend(model.predict(df_test.values))
        y_true_all.extend(y_test.values[:, 0])

    print(classification_report(y_true_all, y_pred_all))
    print(accuracy_score(y_true_all, y_pred_all))
    return y_true_all, y_pred_all


# Assign the result so the pooled label lists are not echoed as cell output.
cv_true, cv_pred = do_class(df_user_bairro, y, user_ids, other_cols, user_cols)

KeyboardInterrupt: 