In [2]:
import os
import csv
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report

def get_recomendations(recommendations, test_data_loc, filename):
  '''
  This function takes recomendation, test data location and file name 
  creates a csv file in the submission file format

  recommendations : list of recomendations
  test_data_loc : location of test data
  filename : Name of the submission file
  '''
  test_data_file = csv.DictReader(open(test_data_loc)) #Reading the test data
  test_ids = np.array([int(row['ncodpers']) for row in test_data_file]) #Get the user ids of customers in test data
  fields = ['ncodpers', 'added_products'] #Column names
  rows = np.vstack((test_ids, np.array(recommendations))).T #Creating an array of test ids and recomendations

  with open(filename, 'w') as csvfile: #Creating a csv file with given name
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields) #Writing column names tothe file
    csvwriter.writerows(rows) #Writing the rows to the file

  print(filename, 'is created')

test_x = np.load("test_x.npy", allow_pickle=True)
test_x_with_code = np.load("test_x_with_code.npy", allow_pickle=True)
train_x = np.load("train_x.npy", allow_pickle=True)

# Y 是原始資料的商品(multi-label)
train_Y = np.load("train_y.npy", allow_pickle=True)
# y 是原始商品資料去做 argmax 取出的
train_y = np.zeros(len(train_Y))

df = pd.DataFrame(test_x_with_code)
code = df[7]

table = pd.read_csv('product.csv')
table = table.drop('product_name', axis=1)
table = table['product_id'].to_dict()

for idx, data in enumerate(train_Y):
    train_y[idx] = np.argmax(data)

valid_size = 1000000
print(f"original train_x shape: {train_x.shape}")
print(f"original train_y shape: {train_y.shape}\n")

valid_x = train_x[:valid_size]  # 100,000 samples for validation
valid_y = train_y[:valid_size]  # 100,000 samples for validation
valid_Y = train_Y[:valid_size]

train_x = train_x[valid_size:]
train_y = train_y[valid_size:]
train_Y = train_Y[valid_size:]

print(f"final train_x shape: {train_x.shape}")
print(f"final train_y shape: {train_y.shape}")
print(f"final valid_x shape: {valid_x.shape}")
print(f"final valid_y shape: {valid_y.shape}")
print(f"final test_x shape: {test_x.shape}")

original train_x shape: (13391269, 7)
original train_y shape: (13391269,)

final train_x shape: (12391269, 7)
final train_y shape: (12391269,)
final valid_x shape: (1000000, 7)
final valid_y shape: (1000000,)
final test_x shape: (929615, 7)


In [None]:
rf = RandomForestClassifier()

In [None]:
xgb_model = XGBClassifier(num_class=24,
                        random_state=0,
                        objective='multi:softmax',
                        learning_rate=0.1,
                        max_depth=3    
                        )

xgb_model.fit(train_x, train_y, verbose=1)
y_pred = xgb_model.predict(valid_x)
print(classification_report(valid_y, y_pred))

In [None]:
gbm_model = LGBMClassifier(learning_rate=0.05, 
                            n_estimators=100, 
                            random_state=0,
                            objective='multiclass',
                            num_class=24,
                            max_depth=3,
                            )

gbm_model.fit(train_x, train_y, verbose=1)
y_pred = gbm_model.predict(valid_x)
print(classification_report(valid_y, y_pred))

In [3]:
cat_model = CatBoostClassifier(iterations=150,
                           learning_rate=0.01,
                           depth=5,
                           loss_function='MultiLogloss',
                           random_seed=0)

cat_model.fit(train_x, train_Y, eval_set=(valid_x, valid_Y), verbose=1, plot=True)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6845142	test: 0.6849254	best: 0.6849254 (0)	total: 21.5s	remaining: 53m 17s
1:	learn: 0.6760560	test: 0.6768498	best: 0.6768498 (1)	total: 46.6s	remaining: 57m 29s
2:	learn: 0.6677734	test: 0.6689672	best: 0.6689672 (2)	total: 1m 11s	remaining: 58m 28s
3:	learn: 0.6596442	test: 0.6612235	best: 0.6612235 (3)	total: 1m 37s	remaining: 59m 17s
4:	learn: 0.6516786	test: 0.6536459	best: 0.6536459 (4)	total: 2m 5s	remaining: 1h 27s
5:	learn: 0.6438576	test: 0.6461965	best: 0.6461965 (5)	total: 2m 33s	remaining: 1h 1m 19s
6:	learn: 0.6361817	test: 0.6388636	best: 0.6388636 (6)	total: 2m 59s	remaining: 1h 1m 1s
7:	learn: 0.6286586	test: 0.6317062	best: 0.6317062 (7)	total: 3m 24s	remaining: 1h 27s
8:	learn: 0.6212640	test: 0.6246644	best: 0.6246644 (8)	total: 3m 49s	remaining: 1h 2s
9:	learn: 0.6140065	test: 0.6177526	best: 0.6177526 (9)	total: 4m 17s	remaining: 1h 2s
10:	learn: 0.6068845	test: 0.6109724	best: 0.6109724 (10)	total: 4m 43s	remaining: 59m 41s
11:	learn: 0.5998893	test

90:	learn: 0.2888266	test: 0.3088202	best: 0.3088202 (90)	total: 48m 28s	remaining: 31m 25s
91:	learn: 0.2867076	test: 0.3068269	best: 0.3068269 (91)	total: 49m 9s	remaining: 30m 59s
92:	learn: 0.2846088	test: 0.3048469	best: 0.3048469 (92)	total: 49m 47s	remaining: 30m 30s
93:	learn: 0.2825400	test: 0.3028960	best: 0.3028960 (93)	total: 50m 25s	remaining: 30m 2s
94:	learn: 0.2804960	test: 0.3009695	best: 0.3009695 (94)	total: 51m 6s	remaining: 29m 35s
95:	learn: 0.2784808	test: 0.2990537	best: 0.2990537 (95)	total: 51m 44s	remaining: 29m 6s
96:	learn: 0.2764946	test: 0.2971727	best: 0.2971727 (96)	total: 52m 23s	remaining: 28m 37s
97:	learn: 0.2745302	test: 0.2953165	best: 0.2953165 (97)	total: 53m 3s	remaining: 28m 9s
98:	learn: 0.2725912	test: 0.2934895	best: 0.2934895 (98)	total: 53m 43s	remaining: 27m 40s
99:	learn: 0.2706844	test: 0.2916744	best: 0.2916744 (99)	total: 54m 20s	remaining: 27m 10s
100:	learn: 0.2687976	test: 0.2899002	best: 0.2899002 (100)	total: 54m 59s	remaining: 

In [81]:
y_pred = cat_model.predict(valid_x)
print(classification_report(valid_Y, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       177
           1       0.00      0.00      0.00        37
           2       0.81      0.80      0.81    757564
           3       0.00      0.00      0.00       589
           4       0.00      0.00      0.00    105695
           5       1.00      0.77      0.87     13539
           6       0.00      0.00      0.00      9657
           7       0.75      0.06      0.10    209892
           8       0.00      0.00      0.00     71597
           9       0.00      0.00      0.00      2007
          10       0.00      0.00      0.00      3095
          11       0.00      0.00      0.00     66580
          12       0.00      0.00      0.00    105612
          13       0.00      0.00      0.00     26899
          14       0.00      0.00      0.00      9859
          15       0.00      0.00      0.00     14446
          16       0.00      0.00      0.00      4116
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [80]:
y_pred_prob = cat_model.predict_proba(test_x)
print(y_pred_prob)

[[0.11150065 0.11143341 0.58545341 ... 0.3332142  0.3621869  0.50150399]
 [0.11126589 0.1112457  0.36245645 ... 0.11158269 0.11161931 0.11278237]
 [0.11124714 0.11124701 0.80868015 ... 0.15010091 0.15145947 0.21965541]
 ...
 [0.1113647  0.11128037 0.6695814  ... 0.21451197 0.2214628  0.36627236]
 [0.11130248 0.11124504 0.40700711 ... 0.11136857 0.11138935 0.11206975]
 [0.11132172 0.11125583 0.42485597 ... 0.12016158 0.12397419 0.13330625]]


In [72]:
k = 7
indice = np.zeros(7)
pred_prob_res = np.zeros((len(y_pred_prob), 7))
for idx, data in enumerate(y_pred_prob):
    indice = data.argsort()[-k:][::-1]
    pred_prob_res[idx] = indice
print(len(pred_prob_res))
print(pred_prob_res)
    

929615
[[ 2. 23. 12. ... 11.  7. 22.]
 [ 2. 12.  8. ...  6. 17.  4.]
 [ 2. 23.  4. ... 21. 12. 17.]
 ...
 [ 2. 23.  4. ...  7. 22. 21.]
 [ 2.  7.  8. ... 17.  6. 20.]
 [ 2.  7. 12. ... 23. 17.  4.]]


In [73]:
def map_dict(x):
    return table[x]

#pred_prob_res = pred_prob_res.astype(int)
print(pred_prob_res)

function = np.vectorize(map_dict)
pred_prob_res = function(pred_prob_res)

print(pred_prob_res)

[[ 2. 23. 12. ... 11.  7. 22.]
 [ 2. 12.  8. ...  6. 17.  4.]
 [ 2. 23.  4. ... 21. 12. 17.]
 ...
 [ 2. 23.  4. ...  7. 22. 21.]
 [ 2.  7.  8. ... 17.  6. 20.]
 [ 2.  7. 12. ... 23. 17.  4.]]
[['ind_cco_fin_ult1' 'ind_recibo_ult1' 'ind_ecue_fin_ult1' ...
  'ind_dela_fin_ult1' 'ind_ctop_fin_ult1' 'ind_nom_pens_ult1']
 ['ind_cco_fin_ult1' 'ind_ecue_fin_ult1' 'ind_ctpp_fin_ult1' ...
  'ind_ctma_fin_ult1' 'ind_reca_fin_ult1' 'ind_cno_fin_ult1']
 ['ind_cco_fin_ult1' 'ind_recibo_ult1' 'ind_cno_fin_ult1' ...
  'ind_nomina_ult1' 'ind_ecue_fin_ult1' 'ind_reca_fin_ult1']
 ...
 ['ind_cco_fin_ult1' 'ind_recibo_ult1' 'ind_cno_fin_ult1' ...
  'ind_ctop_fin_ult1' 'ind_nom_pens_ult1' 'ind_nomina_ult1']
 ['ind_cco_fin_ult1' 'ind_ctop_fin_ult1' 'ind_ctpp_fin_ult1' ...
  'ind_reca_fin_ult1' 'ind_ctma_fin_ult1' 'ind_viv_fin_ult1']
 ['ind_cco_fin_ult1' 'ind_ctop_fin_ult1' 'ind_ecue_fin_ult1' ...
  'ind_recibo_ult1' 'ind_reca_fin_ult1' 'ind_cno_fin_ult1']]


In [74]:
# pred_prob_res = pred_prob_res.tolist()
prediction = list()
for i in pred_prob_res:
    a = list()
    a.append(i)
    prediction.append(a)

print(prediction)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [75]:
prediction = pd.DataFrame(prediction)

print(prediction)

result = pd.concat([code, prediction], axis=1)

print(result)

                                                        0
0       [ind_cco_fin_ult1, ind_recibo_ult1, ind_ecue_f...
1       [ind_cco_fin_ult1, ind_ecue_fin_ult1, ind_ctpp...
2       [ind_cco_fin_ult1, ind_recibo_ult1, ind_cno_fi...
3       [ind_cco_fin_ult1, ind_reca_fin_ult1, ind_ecue...
4       [ind_cco_fin_ult1, ind_reca_fin_ult1, ind_ecue...
...                                                   ...
929610  [ind_cco_fin_ult1, ind_recibo_ult1, ind_ecue_f...
929611  [ind_cco_fin_ult1, ind_ctop_fin_ult1, ind_ctpp...
929612  [ind_cco_fin_ult1, ind_recibo_ult1, ind_cno_fi...
929613  [ind_cco_fin_ult1, ind_ctop_fin_ult1, ind_ctpp...
929614  [ind_cco_fin_ult1, ind_ctop_fin_ult1, ind_ecue...

[929615 rows x 1 columns]
              7                                                  0
0         15889  [ind_cco_fin_ult1, ind_recibo_ult1, ind_ecue_f...
1       1170544  [ind_cco_fin_ult1, ind_ecue_fin_ult1, ind_ctpp...
2       1170545  [ind_cco_fin_ult1, ind_recibo_ult1, ind_cno_fi...
3       1

In [76]:
for idx, data in enumerate(result[0]):
    s = ''
    for i in data:
        s += i
        s += ' '
    print(s)
    result[0][idx] = s

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [77]:
print(result.head())

         7                                                  0
0    15889  ind_cco_fin_ult1 ind_recibo_ult1 ind_ecue_fin_...
1  1170544  ind_cco_fin_ult1 ind_ecue_fin_ult1 ind_ctpp_fi...
2  1170545  ind_cco_fin_ult1 ind_recibo_ult1 ind_cno_fin_u...
3  1170547  ind_cco_fin_ult1 ind_reca_fin_ult1 ind_ecue_fi...
4  1170548  ind_cco_fin_ult1 ind_reca_fin_ult1 ind_ecue_fi...


In [78]:
result.to_csv('predict.csv', index=False)