In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
np.random.seed(2018)

In [2]:
trn = pd.read_csv('train_ver2.csv')
tst = pd.read_csv('test_ver2.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
prods = trn.columns[24:].tolist()
trn[prods] = trn[prods].fillna(0.0).astype(np.int8)

In [4]:
no_product = trn[prods].sum(axis=1) == 0
trn = trn[~no_product]

In [5]:
for col in trn.columns[24:]:
    tst[col] = 0
df = pd.concat ([trn, tst], axis=0)

In [6]:
features = []
categorical_cols = ['ind_empleado', 'pais_residencia', 'sexo', 'tiprel_1mes', 'indresi', 'indext', 'conyuemp', 'canal_entrada', 'indfall','tipodom', 'nomprov', 'segmento']
for col in categorical_cols:
    df[col], _ = df[col].factorize(na_sentinel=-99)
features += categorical_cols


In [7]:
df['age'].replace(' NA', -99, inplace=True)
df['age'] = df['age'].astype(np.int8)

In [8]:
df['antiguedad'].replace('     NA', -99, inplace=True)
df['antiguedad'] = df['antiguedad'].astype(np.int8)

In [9]:
df['renta'].replace('         NA', -99, inplace=True)
df['renta'].fillna(-99, inplace=True)
df['renta'] = df['renta'].astype(float).astype(np.int8)


In [10]:
df['indrel_1mes'].replace('P', 5, inplace=True)
df['indrel_1mes'].fillna(-99, inplace=True)
df['indrel_1mes'] = df['indrel_1mes'].astype(float).astype(np.int8)


In [11]:
features += ['age', 'antiguedad', 'renta', 'ind_nuevo', 'indrel', 'indrel_1mes', 'ind_actividad_cliente']

In [12]:
df['fecha_alta_month'] = df['fecha_alta'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)

In [13]:
df['fecha_alta_year'] = df['fecha_alta'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)

In [14]:
features += ['fecha_alta_month', 'fecha_alta_year']

In [15]:
df['ult_fec_cli_lt_month'] = df['ult_fec_cli_1t'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
df['ult_fec_cli_lt_year'] = df['ult_fec_cli_1t'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
features += ['ult_fec_cli_lt_month', 'ult_fec_cli_lt_year']

In [16]:
df.fillna(-99, inplace=True)

In [17]:
def date_to_int(str_date):
    Y, M, D = [int(a) for a in str_date.strip().split('-')]
    int_date = (int(Y)-2015)*12 + int(M)
    return int_date

In [18]:
df['int_date'] = df['fecha_dato'].map(date_to_int).astype(np.int8)

In [19]:
df_lag = df.copy()
df_lag.columns = [col +'_prev' if col not in ['ncodpers', 'int_date'] else col for col in df_lag.columns]

In [20]:
df_lag['int_date'] += 1
df_trn = df.merge(df_lag, on=['ncodpers', 'int_date'], how='left')
del df, df_lag

In [21]:
for prod in prods:
    prev = prod + '_prev'
    df_trn[prev].fillna(0, inplace=True)
df_trn.fillna(-99, inplace=True)

features += [feature + '_prev' for feature in features]
features += [prod +'_prev' for prod in prods]

In [22]:
use_dates = ['2016-01-28', '2016-02-28', '2016-03-28', '2016-04-28', '2016-05-28']
trn = df_trn[df_trn['fecha_dato'].isin(use_dates)]
tst = df_trn[df_trn['fecha_dato'] == '2016-06-28']
del df_trn

In [23]:
X = []
Y = []
for i, prod in enumerate(prods):
    prev = prod + '_prev'
    prX = trn[(trn[prod] == 1) & (trn[prev] == 0)]
    prY = np.zeros(prX.shape[0], dtype=np.int8) + i
    X.append(prX)
    Y.append(prY)

In [24]:
XY = pd.concat(X)
Y = np.hstack(Y)

In [25]:
XY['y'] = Y


In [27]:
vld_date = '2016-04-28'
XY_trn = XY[XY['fecha_dato'] != vld_date]
XY_vld = XY[XY['fecha_dato'] == vld_date]

In [28]:
params = {
    'booster': 'gbtree',
    'max_depth': 8,
    'nthread': 4,
    'num_class': len(prods),
    'objective': 'multi:softprob',
    'silent': 1,
    'eval_metric': 'mlogloss',
    'eta': 0.1,
    'min_child_weight': 10,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.9,
    'seed': 2018
}
X_trn = XY_trn[features].to_numpy()
Y_trn = XY_trn['y'].to_numpy()
dtrn= xgb.DMatrix(X_trn, label=Y_trn, feature_names=features)
X_vld = XY_vld[features].to_numpy()
Y_vld = XY_vld['y'].to_numpy()


In [29]:
dvld = xgb.DMatrix(X_vld, label=Y_vld, feature_names=features)
watch_list = [(dtrn, 'train'), (dvld, 'eval')]

In [None]:
model = xgb.train(params, dtrn, num_boost_round=1000, evals=watch_list, early_stopping_rounds=20)

import pickle
pickle.dump(model, open('model.pkl', 'wb'))
best_ntree_limit = model.best_ntree_limit


In [56]:
vld = trn[trn['fecha_dato'] == vld_date]


In [57]:
ncodpers_vld = vld['ncodpers'].to_numpy()
for prod in prods:
    prev = prod + '_prev'
    padd = prod + '_add'
    vld[padd] = vld[prod] - vld[prev]
add_vld = vld[[prod + '_add' for prod in prods]].to_numpy()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vld[padd] = vld[prod] - vld[prev]


In [58]:
add_vld_list = [list() for i in range(len(ncodpers_vld))]

In [59]:
count_vld = 0
for ncodper in range(len(ncodpers_vld)):
    for prod in range(len(prods)):
        if add_vld[ncodper, prod] > 0:
            add_vld_list[ncodper].append(prod)
            count_vld += 1


In [60]:
import mapk

mapk.mapk(add_vld_list, add_vld_list,7,0.0)

0.041302421141683116

In [61]:
X_vld = vld[features].to_numpy()
#Y_vld = vld['y'].to_numpy()
#dvld = xgb.DMatrix(X_vld, label=Y_vld, feature_names=features)
dvld = xgb.DMatrix(X_vld,  feature_names=features)
preds = model.predict(dvld, ntree_limit=best_ntree_limit)



In [62]:
result_vld=[]
for ncodper, pred in zip(ncodpers_vld, preds):
    y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda x: x[0], reverse=True)[:7]
    result_vld.append([ip for y,p,ip in y_prods])


In [63]:
print(mapk.mapk(add_vld_list, result_vld,7,0.0))

0.03456823020134479
