In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb

np.random.seed(42)

In [105]:
trn = pd.read_csv('./data/train_ver2.csv')
tst = pd.read_csv('./data/test_ver2.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [106]:
# 제품 변수들의 결측값을 0으로 대체한다.
prods = trn.columns[24:].tolist()
trn[prods] = trn[prods].fillna(0.0).astype(np.int8)

# 모든 제품이 0인 행 제거
no_product = trn[prods].sum(axis=1) == 0
trn = trn[-no_product]

# train / test 데이터를 합친다. test 데이터의 제품 부분은 모두 0으로 채운다.
for col in trn.columns[24:]:
    tst[col] = 0
df = pd.concat([trn, tst], axis=0)

# 학습에 사용할 변수를 담는 리스트
features = []

# 번주형 변수들을 label encoding한다.
categorical_cols = ['ind_empleado', 'pais_residencia', 'sexo', 'tiprel_1mes',
                    'indresi', 'indext', 'conyuemp', 'canal_entrada', 'indfall',
                    'tipodom', 'nomprov', 'segmento']
for col in categorical_cols:
    df[col], _ = df[col].factorize(na_sentinel=-99)
features += categorical_cols


# 수치형 변수들의 특이값과 결측값을 -99로 대체하고 int형으로 바꾼다.
df['age'].replace(' NA', -99, inplace=True)
df['age'] = df['age'].astype(np.int8)

df['antiguedad'].replace('     NA', -99, inplace=True)
df['antiguedad'] = df['antiguedad'].astype(np.int8)

df['renta'].replace('         NA', -99, inplace=True)
df['renta'].fillna(-99, inplace=True)
df['renta'] = df['renta'].astype(float).astype(np.int8)

df['indrel_1mes'].replace('P', 5, inplace=True)
df['indrel_1mes'].fillna(-99, inplace=True)
df['indrel_1mes'] = df['indrel_1mes'].astype(float).astype(np.int8)

# 학습에 사용할 수치형 변수를 추가한다.
features += ['age', 'antiguedad', 'renta', 'ind_nuevo', 'indrel',
             'indrel_1mes', 'ind_actividad_cliente']

## 피쳐 엔지니어링

In [107]:
# 두 날짜 변수에서 연도와 월 정보를 추출한다.
df['fecha_alta_month'] = df['fecha_alta'].map(lambda x: 0.0 if x.__class__ is float
                                              else float(x.split('-')[1])).astype(np.int8)
df['fecha_alta_year'] = df['fecha_alta'].map(lambda x: 0.0 if x.__class__ is float
                                              else float(x.split('-')[0])).astype(np.int16)
features += ['fecha_alta_month', 'fecha_alta_year']

df['ult_fec_cli_1t_month'] = df['ult_fec_cli_1t'].map(lambda x: 0.0 if x.__class__ is float
                                              else float(x.split('-')[1])).astype(np.int8)
df['ult_fec_cli_1t_year'] = df['ult_fec_cli_1t'].map(lambda x: 0.0 if x.__class__ is float
                                              else float(x.split('-')[0])).astype(np.int16)
features += ['ult_fec_cli_1t_month', 'ult_fec_cli_1t_year']

# 그 외 변수들의 결측값을 -99로 대체한다.
df.fillna(-99, inplace=True)

# lag - 1 데이터를 생성한다.

def date_to_int(str_data):
    Y, M, D = [int(a) for a in str_data.strip().split('-')]
    int_date = (int(Y) - 2015) * 12 + int(M)
    return int_date

df['int_date'] = df['fecha_dato'].map(date_to_int).astype(np.int8)

df_lag = df.copy()
df_lag.columns = [col + '_prev' if col not in ['ncodpers', 'int_date']
                   else col for col in df.columns]
df_lag['int_date'] += 1

df_trn = df.merge(df_lag, on=['ncodpers', 'int_date'], how='left')

# del df, df_lag

for prod in prods:
    prev = prod + '_prev'
    df_trn[prev].fillna(0, inplace=True)
df_trn.fillna(-99, inplace=True)

features += [feature + '_prev' for feature in features]
features += [prod + '_prev' for prod in prods]

## 모델 학습

In [108]:
use_dates = ['2016-01-28', '2016-02-28', '2016-03-28', '2016-04-28', '2016-05-28']
trn = df_trn[df_trn['fecha_dato'].isin(use_dates)]
tst = df_trn[df_trn['fecha_dato'] == '2016-06-28']
# del df_trn

# 훈련 데이터에서 신규 구매 건수만 추출한다.
X = []
Y = []
for i, prod in enumerate(prods):
    prev = prod + '_prev'
    prX = trn[(trn[prod] == 1) & (trn[prev] == 0)]
    prY = np.zeros(prX.shape[0], dtype=np.int8) + i
    X.append(prX)
    Y.append(prY)

XY = pd.concat(X)
Y = np.hstack(Y)
XY['y'] = Y

# 훈련, 검증 데이터로 분리한다. 
vld_date = '2016-05-28'
XY_trn = XY[XY['fecha_dato'] != vld_date]
XY_vld = XY[XY['fecha_dato'] == vld_date]

In [98]:
XY['y']

10597872     0
7658069      1
7628180      2
7628198      2
7628482      2
            ..
11090455    23
11090468    23
11090667    23
11090696    23
11090782    23
Name: y, Length: 199034, dtype: int8

In [33]:
# XGBoost 모델 parameter를 설정한다.
param = {
    'booster': 'gbtree',
    'max_depth': 8,
    'nthread': 4,
    'num_class': len(prods),
    'objective': 'multi:softprob',
    'silent': 1,
    'eval_metric': 'mlogloss',
    'eta': 0.1,
    'min_child_weight': 10,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.9,
    'seed': 2018,
    }

# 훈련, 검증 데이터를 XGBoost 형태로 변환한다.
X_trn = XY_trn[features].values
Y_trn = XY_trn['y'].values
dtrn = xgb.DMatrix(X_trn, label=Y_trn, feature_names=features)

X_vld = XY_vld[features].values
Y_vld = XY_vld['y'].values
dvld = xgb.DMatrix(X_vld, label=Y_vld, feature_names=features)

# XGBoost 모델을 훈련 데이터로 학습한다!
watch_list = [(dtrn, 'train'), (dvld, 'eval')]
model = xgb.train(param, dtrn, num_boost_round=1000, evals=watch_list, early_stopping_rounds=20)

# 학습한 모델을 저장한다.
import pickle
pickle.dump(model, open("./model/xgb.baseline.pkl", "wb"))
best_ntree_limit = model.best_ntree_limit

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-mlogloss:2.72501	eval-mlogloss:2.73551
[1]	train-mlogloss:2.45697	eval-mlogloss:2.47072
[2]	train-mlogloss:2.27435	eval-mlogloss:2.28931
[3]	train-mlogloss:2.13564	eval-mlogloss:2.15204
[4]	train-mlogloss:2.02201	eval-mlogloss:2.03829
[5]	train-mlogloss:1.92567	eval-mlogloss:1.94298
[6]	train-mlogloss:1.84739	eval-mlogloss:1.86529
[7]	train-mlogloss:1.77948	eval-mlogloss:1.79788
[8]	train-mlogloss:1.71764	eval-mlogloss:1.73641
[9]	train-mlogloss:1.66492	eval-mlogloss:1.68415
[10]	train-mlogloss:1.61811	eval-mlogloss:1.63771
[11]	train-mlogloss:1.57482	eval-mlogloss:1.59515
[12]	train-mlogloss:1.53556	eval-mlogloss:1.55617
[13]	train-mlogloss:1.50161	eval-mlogloss:1.52287
[14]	train-mlogloss:1.47083	

[157]	train-mlogloss:1.00202	eval-mlogloss:1.08743
[158]	train-mlogloss:1.00143	eval-mlogloss:1.08740
[159]	train-mlogloss:1.00078	eval-mlogloss:1.08729
[160]	train-mlogloss:1.00035	eval-mlogloss:1.08729
[161]	train-mlogloss:0.99988	eval-mlogloss:1.08728
[162]	train-mlogloss:0.99934	eval-mlogloss:1.08731
[163]	train-mlogloss:0.99871	eval-mlogloss:1.08726
[164]	train-mlogloss:0.99818	eval-mlogloss:1.08725
[165]	train-mlogloss:0.99774	eval-mlogloss:1.08723
[166]	train-mlogloss:0.99712	eval-mlogloss:1.08715
[167]	train-mlogloss:0.99661	eval-mlogloss:1.08712
[168]	train-mlogloss:0.99606	eval-mlogloss:1.08709
[169]	train-mlogloss:0.99559	eval-mlogloss:1.08702
[170]	train-mlogloss:0.99505	eval-mlogloss:1.08702
[171]	train-mlogloss:0.99447	eval-mlogloss:1.08697
[172]	train-mlogloss:0.99388	eval-mlogloss:1.08696
[173]	train-mlogloss:0.99324	eval-mlogloss:1.08693
[174]	train-mlogloss:0.99274	eval-mlogloss:1.08687
[175]	train-mlogloss:0.99216	eval-mlogloss:1.08682
[176]	train-mlogloss:0.99160	ev

In [110]:
from mapk import mapk

# MAP@7 평가 척도를 위한 준비작업이다.
# 고객 식별 번호를 추출한다.
vld = trn[trn['fecha_dato'] == vld_date]
ncodpers_vld = vld['ncodpers'].values

# 검증 데이터에서 신규 구매를 구한다.
for prod in prods:
    prev = prod + '_prev'
    padd = prod + '_add'
    vld[padd] = vld[prod] - vld[prev]
add_vld = vld[[prod + '_add' for prod in prods]].values
add_vld_list = [list() for i in range(len(ncodpers_vld))]

# 고객별 신규 구매 정답 값을 add_vld_list에 저장하고, 총 count를 count_vld에 저장한다.
count_vld = 0
for ncodper in range(len(ncodpers_vld)):
    for prod in range(len(prods)):
        if add_vld[ncodper, prod] > 0:
            add_vld_list[ncodper].append(prod)
            count_vld += 1

# 검증 데이터에서 얻을 수 있는 MAP@7 최고점을 미리 구한다. (0.042663)
print(mapk(add_vld_list, add_vld_list, 7, 0.0))

# 검증 데이터에 대한 예측 값을 구한다.
X_vld = vld[features].values
dvld = xgb.DMatrix(X_vld, feature_names=features)
preds_vld = model.predict(dvld, ntree_limit=best_ntree_limit)

# 저번 달에 보유한 제품은 신규 구매가 불가하기 때문에, 확률값에서 미리 1을 빼준다
preds_vld = preds_vld - vld[[prod + '_prev' for prod in prods]].values

# 검증 데이터 예측 상위 7개를 추출한다.
result_vld = []
for ncodper, pred in zip(ncodpers_vld, preds_vld):
    y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    result_vld.append([ip for y,p,ip in y_prods])
    
# 검증 데이터에서의 MAP@7 점수를 구한다. (0.036483)
print(mapk(add_vld_list, result_vld, 7, 0.0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vld[padd] = vld[prod] - vld[prev]


0.04266379915553903
0.03648288874172132


## 전체 데이터 학습 후 제출 파일 만들기

In [138]:
# XGBoost 모델을 전체 훈련 데이터로 재학습한다!
X_all = XY[features].values
Y_all = XY['y'].values
dall = xgb.DMatrix(X_all, label=Y_all, feature_names=features)
watch_list = [(dall, 'train')]
# 트리 개수를 늘어난 데이터 양만큼 비례해서 증가한다.
best_ntree_limit = int(best_ntree_limit * (len(XY_trn) + len(XY_vld)) / len(XY_trn))
# XGBoost 모델 재학습!
model = xgb.train(param, dall, num_boost_round=best_ntree_limit, evals=watch_list)

# 변수 중요도를 출력해본다. 예상하던 변수가 상위로 올라와 있는가?
print("Feature importance:")
for kv in sorted([(k,v) for k,v in model.get_fscore().items()], key=lambda kv: kv[1], reverse=True):
    print(kv)

# 캐글 제출을 위하여 테스트 데이터에 대한 예측 값을 구한다.
X_tst = tst[features].values
dtst = xgb.DMatrix(X_tst, feature_names=features)
preds_tst = model.predict(dtst, ntree_limit=best_ntree_limit)
ncodpers_tst = tst['ncodpers'].values
preds_tst = preds_tst - tst[[prod + '_prev' for prod in prods]].values

# 제출 파일을 생성한다.
submit_file = open('./model/xgb.baseline.2015-06-28', 'w')
submit_file.write('ncodpers,added_products\n')
for ncodper, pred in zip(ncodpers_tst, preds_tst):
    y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    y_prods = [p for y,p,ip in y_prods]
    submit_file.write('{},{}\n'.format(int(ncodper), ' '.join(y_prods)))

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-mlogloss:2.72653
[1]	train-mlogloss:2.45788
[2]	train-mlogloss:2.27489
[3]	train-mlogloss:2.13560
[4]	train-mlogloss:2.02158
[5]	train-mlogloss:1.92519
[6]	train-mlogloss:1.84686
[7]	train-mlogloss:1.77856
[8]	train-mlogloss:1.71674
[9]	train-mlogloss:1.66394
[10]	train-mlogloss:1.61715
[11]	train-mlogloss:1.57391
[12]	train-mlogloss:1.53472
[13]	train-mlogloss:1.50084
[14]	train-mlogloss:1.47000
[15]	train-mlogloss:1.44046
[16]	train-mlogloss:1.41425
[17]	train-mlogloss:1.39089
[18]	train-mlogloss:1.36909
[19]	train-mlogloss:1.34917
[20]	train-mlogloss:1.33089
[21]	train-mlogloss:1.31322
[22]	train-mlogloss:1.29747
[23]	train-mlogloss:1.28293
[24]	train-mlogloss:1.26957
[25]	train-mlogloss:1.25662


[276]	train-mlogloss:0.95729
[277]	train-mlogloss:0.95700
[278]	train-mlogloss:0.95666
[279]	train-mlogloss:0.95624
[280]	train-mlogloss:0.95592
[281]	train-mlogloss:0.95565
[282]	train-mlogloss:0.95528
[283]	train-mlogloss:0.95486
[284]	train-mlogloss:0.95460
[285]	train-mlogloss:0.95415
[286]	train-mlogloss:0.95369
[287]	train-mlogloss:0.95337
[288]	train-mlogloss:0.95289
[289]	train-mlogloss:0.95236
[290]	train-mlogloss:0.95189
[291]	train-mlogloss:0.95153
[292]	train-mlogloss:0.95122
[293]	train-mlogloss:0.95092
[294]	train-mlogloss:0.95053
[295]	train-mlogloss:0.95016
[296]	train-mlogloss:0.94969
[297]	train-mlogloss:0.94924
[298]	train-mlogloss:0.94895
[299]	train-mlogloss:0.94865
[300]	train-mlogloss:0.94822
[301]	train-mlogloss:0.94773
[302]	train-mlogloss:0.94729
[303]	train-mlogloss:0.94691
[304]	train-mlogloss:0.94656
[305]	train-mlogloss:0.94628
[306]	train-mlogloss:0.94593
[307]	train-mlogloss:0.94561
[308]	train-mlogloss:0.94529
[309]	train-mlogloss:0.94494
[310]	train-ml