In [None]:
! wget https://andres-comp-datasets.s3.us-east-2.amazonaws.com/aws_Processed2.tar.gz
! tar -xzvf aws_Processed2.tar.gz
! mv Processed2.csv ../csvs/

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

df = pd.read_csv('../csvs/Processed2.csv',sep=';',nrows=100000)

categorical = ['genero','ciiu_categ','work_mobility','tenencia_tc','has_tc_cupo0','rechazo_credito',
               'tiene_crediagil','tiene_consumo','tiene_ctas_activas','tiene_ctas_embargadas',
               'tiene_cred_hipo_1','cat_ingreso','rep_calif_cred','profesion_cat','estado_civil',
               'nivel_academico','profesion','ocupacion','tipo_vivienda','nro_tot_cuentas', 
               'ctas_activas','ctas_embargadas','codigo_ciiu']

categ_num = []

# Rename columns so that no two columns have the same name
col_names = []
for i,col in enumerate(df.columns):
    if col not in ['train','gasto_familiar','gasto_familiar_log']:
        col_names.append(str(col) + f'_{i}')
        if col in categorical:
            categ_num.append(str(col) + f'_{i}')
            
    else: col_names.append(col)
        
df.columns = col_names

### Start generating mean encodings

I included 'nro_tot_cuentas', 'ctas_activas','ctas_embargadas' as categorical variables as these seem to correlate good with target.

In [2]:
# Train/val split
from sklearn.model_selection import train_test_split

train = df[(df.train == 1) & (~df.gasto_familiar_log.isna()) & (df.gasto_familiar_log!=0)].copy()
test = df[df.train == 0].copy()

train,valid = train_test_split(train,test_size=0.05)

In [3]:
# Estimate encodings on X_train and map to X_valid

glob_mean_log = train.gasto_familiar_log.mean()
glob_mean = train.gasto_familiar.mean()

for col in categ_num:
    valid[col+'_mean_enc'] = (valid[col]
                              .map(train.groupby(col)
                                   .gasto_familiar.mean())
                              .fillna(glob_mean))
    
    valid[col+'_mean_enc_log'] = (valid[col]
                                  .map(train.groupby(col)
                                       .gasto_familiar_log.mean())
                                  .fillna(glob_mean_log))

In [4]:
# Regularize X_train
from sklearn.model_selection import KFold
kf = KFold(5)

for col in categ_num:
    train[col+'_mean_enc'] = glob_mean
    train[col+'_mean_enc_log'] = glob_mean_log
    
for i,(idx_train, idx_valid) in enumerate(kf.split(train)):
    print(f"Fold {i}...")
    for col in categ_num:
        # Normal space
        train_mean = (train
                      .iloc[idx_train]
                      .groupby(col)
                      .gasto_familiar.mean())

        train.iloc[idx_valid].loc[col+'_mean_enc'] = (train.iloc[idx_valid]
                                                      .loc[:,col].map(train_mean))
        # Log transform
        train_mean = (train
                      .iloc[idx_train]
                      .groupby(col)
                      .gasto_familiar_log.mean())

        train.iloc[idx_valid].loc[col+'_mean_enc_log'] = (train.iloc[idx_valid]
                                                          .loc[:,col].map(train_mean))


Fold 0...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Fold 1...
Fold 2...
Fold 3...
Fold 4...


In [None]:
# Train and validate model

# Remove any columns that are not numeric
train = train.loc[:,(train.dtypes!='object')&(train.dtypes!='category')]
valid = valid.loc[:,(valid.dtypes!='object')&(valid.dtypes!='category')]

# Separate X from y
y_train = train.gasto_familiar_log
y_valid = valid.gasto_familiar_log

# Drop target variable from X
X_train = train.drop(['gasto_familiar','gasto_familiar_log','train'],axis=1)
X_valid = valid.drop(['gasto_familiar','gasto_familiar_log','train'],axis=1)

In [None]:
# xgboost model
import xgboost as xgb

dmat_train = xgb.DMatrix(X_train,label=y_train)
dmat_valid = xgb.DMatrix(X_valid,label=y_valid)

def mape_obj1(preds, dtrain):
    labels = dtrain.get_label()
    grad = (preds-labels)/(0.25+labels*np.abs(preds-labels))
    hess = 0.0002*np.ones(len(preds));
    return grad,hess

def mape(preds, dmat):
    actuals = np.exp(dmat.get_label())-1
    preds = np.exp(preds)-1
    err = 100.*np.mean(np.abs(actuals - preds)/actuals)
    return "MAPE", err

In [None]:
booster = xgb.train({"booster":"gbtree",'eta': 0.003,'max_depth': 20,
    'subsample': 1.0,'colsample_bytree': 0.8,'colsample_bylevel': 0.05,"alpha": 0.0,
    "lambda": 0.5,'seed': 1,"gamma":0.0,"max_delta_step":0,'nthread':4,
    'disable_default_eval_metric': 1},
                    dmat_train,
                    evals=[(dmat_train, "train"),(dmat_valid, "valid")],
                    feval=mape,
                    num_boost_round=800,
                    early_stopping_rounds=3000,
                    obj=mape_obj1,
                   )#xgb_model=booster)

In [1]:
import pandas as pd
import numpy as np

newdf = pd.read_csv('../csvs/processedDS.csv',sep=';',nrows=300000)

newdf

Unnamed: 0,cant_cast_ult_12m_sr,cant_moras_30_ult_12_meses,cant_moras_60_ult_12_meses,cant_moras_90_ult_12_meses,cant_oblig_tot_sf,categoria,ctas_activas,ctas_embargadas,cuota_cred_hipot,cuota_de_consumo,...,cupo_tc_mdo_log,saldo_prom3_tdc_mdo_log,cuota_tc_mdo_log,saldo_no_rot_mdo_log,cuota_libranza_sf_log,cant_oblig_tot_sf_log,cant_cast_ult_12m_sr_log,edad_log,ciudad_laboral_freq,departamento_laboral_freq
0,0.0,0.0,0.0,0.0,3.0,2.0,1.0,0.0,,1484000.0,...,16.118096,16.045095,13.348704,18.406562,0.09531,1.410987,0.09531,4.140786,346595,284992
1,0.0,0.0,0.0,0.0,3.0,2.0,1.0,0.0,,1484000.0,...,16.118096,16.045095,13.348704,18.406562,0.09531,1.410987,0.09531,4.137907,346595,284992
2,0.0,-1.0,-1.0,-1.0,0.0,1.0,1.0,0.0,,0.0,...,0.095310,0.095310,0.095310,11.385105,0.09531,0.095310,0.09531,4.077539,346595,284992
3,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,,0.0,...,0.095310,0.095310,0.095310,10.950826,0.09531,0.095310,0.09531,4.114168,346595,284992
4,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,,0.0,...,0.095310,0.095310,0.095310,0.095310,0.09531,0.095310,0.09531,4.112915,346595,284992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,,0.0,...,0.095310,0.095310,0.095310,10.736421,0.09531,0.741937,0.09531,3.614355,346595,89426
299996,0.0,0.0,0.0,0.0,3.0,2.0,1.0,0.0,,502000.0,...,0.095310,0.095310,0.095310,12.095147,0.09531,1.410987,0.09531,3.577858,346595,89426
299997,0.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,,0.0,...,0.095310,0.095310,0.095310,0.095310,0.09531,0.741937,0.09531,3.596048,346595,89426
299998,0.0,0.0,0.0,0.0,3.0,1.0,1.0,0.0,,0.0,...,13.845070,0.095310,0.095310,0.095310,0.09531,1.410987,0.09531,3.743083,346595,89426


In [1]:
import pandas as pd
import numpy as np

newdf = pd.read_csv('../csvs/processedDS.csv',sep=';',nrows=300000)

import datetime as dt

# Convert datetimes to numerical
for col in ['periodo_date', 'date_ult_actual', 'periodo_dt']:
    newdf[col] = ((pd.to_datetime(newdf[col]) - dt.datetime(1970, 1, 1)).dt.total_seconds()-1.56e9)/1e6
    
newdf = newdf.fillna(-1)

# Train/valid/test split
from sklearn.model_selection import train_test_split

test = newdf[newdf.train == 0]
train = newdf[(newdf.train == 1) & (newdf.gasto_familiar != 0.)]

train_x = train.drop(['gasto_familiar','gasto_familiar_log','train'],axis=1)
col_names = [str(i) for i in range(len(train_x.columns))]
train_x.columns = col_names
                     
X_train,X_valid,y_train,y_valid = train_test_split(train_x,
                                                   train.gasto_familiar,test_size=0.1)

X_test = test.drop(['gasto_familiar','gasto_familiar_log','train'],axis=1)

X_test.columns = col_names
y_test = test.gasto_familiar

In [2]:
# Start with xgboost 
import xgboost as xgb

In [70]:
dmat_train = xgb.DMatrix(X_train,label=np.log(1+y_train))
dmat_valid = xgb.DMatrix(X_valid,label=np.log(1+y_valid))

In [104]:
def mape_obj1(preds, dtrain):
    labels = dtrain.get_label()
    grad = (preds-labels)/(0.25+labels*np.abs(preds-labels))
    hess = 0.0002*np.ones(len(preds));
    return grad,hess

def mape(preds, dmat):
    actuals = np.exp(dmat.get_label())-1
    preds = np.exp(preds)-1
    err = 100.*np.mean(np.abs(actuals - preds)/actuals)
    return "MAPE", err

In [118]:
booster = xgb.train({'objective' : 'reg:linear',
        "booster":"gbtree",'eta': 0.0005,'max_depth': 15,'min_child_weight': 0.01,
    'subsample': 1.0,'colsample_bytree': 0.8,'colsample_bylevel': 0.05,"alpha": 0.0,
    "lambda": 0.5,'seed': 1,"gamma":0.0,"max_delta_step":0,'nthread':4,
    'disable_default_eval_metric': 1},
                    dmat_train,
                    evals=[(dmat_train, "train"),(dmat_valid, "valid")],
                    feval=mape,
                    num_boost_round=500,
                    early_stopping_rounds=3000,
                    obj=mape_obj1,
                    xgb_model=booster)

[0]	train-MAPE:60.85319	valid-MAPE:2268.83125
[1]	train-MAPE:60.89107	valid-MAPE:2247.56031
[2]	train-MAPE:60.81445	valid-MAPE:2249.66145
[3]	train-MAPE:60.76548	valid-MAPE:2253.80631
[4]	train-MAPE:60.75419	valid-MAPE:2246.08345
[5]	train-MAPE:60.68096	valid-MAPE:2235.79941
[6]	train-MAPE:60.64209	valid-MAPE:2220.33749
[7]	train-MAPE:60.61218	valid-MAPE:2213.65623
[8]	train-MAPE:60.56128	valid-MAPE:2216.70666
[9]	train-MAPE:60.52537	valid-MAPE:2221.85707
[10]	train-MAPE:60.49563	valid-MAPE:2217.04292
[11]	train-MAPE:60.45496	valid-MAPE:2203.79353
[12]	train-MAPE:60.43264	valid-MAPE:2200.52662
[13]	train-MAPE:60.41509	valid-MAPE:2196.11874
[14]	train-MAPE:60.39808	valid-MAPE:2193.38341
[15]	train-MAPE:60.36592	valid-MAPE:2192.73891
[16]	train-MAPE:60.33322	valid-MAPE:2183.97274
[17]	train-MAPE:60.31517	valid-MAPE:2176.37959
[18]	train-MAPE:60.30320	valid-MAPE:2170.78609
[19]	train-MAPE:60.27160	valid-MAPE:2167.78965
[20]	train-MAPE:60.24776	valid-MAPE:2164.64005
[21]	train-MAPE:60.2328

[173]	train-MAPE:57.10373	valid-MAPE:2137.61978
[174]	train-MAPE:57.08488	valid-MAPE:2136.62357
[175]	train-MAPE:57.06844	valid-MAPE:2142.01279
[176]	train-MAPE:57.03986	valid-MAPE:2139.89296
[177]	train-MAPE:57.03688	valid-MAPE:2147.79892
[178]	train-MAPE:57.00897	valid-MAPE:2144.13185
[179]	train-MAPE:56.98954	valid-MAPE:2145.87231
[180]	train-MAPE:56.95540	valid-MAPE:2149.87640
[181]	train-MAPE:56.94409	valid-MAPE:2151.41468
[182]	train-MAPE:56.91084	valid-MAPE:2145.50934
[183]	train-MAPE:56.90649	valid-MAPE:2140.49912
[184]	train-MAPE:56.87837	valid-MAPE:2149.04728
[185]	train-MAPE:56.86337	valid-MAPE:2148.82927
[186]	train-MAPE:56.83525	valid-MAPE:2152.07176
[187]	train-MAPE:56.81489	valid-MAPE:2146.72222
[188]	train-MAPE:56.78881	valid-MAPE:2146.19255
[189]	train-MAPE:56.77941	valid-MAPE:2143.49117
[190]	train-MAPE:56.75231	valid-MAPE:2146.90628
[191]	train-MAPE:56.74077	valid-MAPE:2152.32983
[192]	train-MAPE:56.70664	valid-MAPE:2147.88017
[193]	train-MAPE:56.68292	valid-MAPE:214

[344]	train-MAPE:53.89873	valid-MAPE:2104.53339
[345]	train-MAPE:53.86637	valid-MAPE:2109.09748
[346]	train-MAPE:53.85806	valid-MAPE:2109.43260
[347]	train-MAPE:53.82624	valid-MAPE:2112.63676
[348]	train-MAPE:53.81153	valid-MAPE:2109.57279
[349]	train-MAPE:53.78728	valid-MAPE:2106.36177
[350]	train-MAPE:53.77249	valid-MAPE:2102.65274
[351]	train-MAPE:53.75623	valid-MAPE:2112.21390
[352]	train-MAPE:53.74795	valid-MAPE:2111.98025
[353]	train-MAPE:53.72540	valid-MAPE:2109.08299
[354]	train-MAPE:53.72001	valid-MAPE:2110.69355
[355]	train-MAPE:53.68713	valid-MAPE:2106.09379
[356]	train-MAPE:53.69839	valid-MAPE:2108.04405
[357]	train-MAPE:53.67854	valid-MAPE:2107.22980
[358]	train-MAPE:53.66126	valid-MAPE:2101.86882
[359]	train-MAPE:53.62814	valid-MAPE:2105.59788
[360]	train-MAPE:53.62346	valid-MAPE:2107.17373
[361]	train-MAPE:53.60331	valid-MAPE:2097.86072
[362]	train-MAPE:53.57959	valid-MAPE:2093.40210
[363]	train-MAPE:53.56578	valid-MAPE:2098.04192
[364]	train-MAPE:53.54886	valid-MAPE:209