In [1]:
# if used pip install package
# !pip install xgboost
# !pip install lightgbm
# !pip install wget

In [2]:
import os
import sys
import wget
import time
import tarfile
import zipfile
import random

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import lightgbm as lgb


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
DDIR = "data"
UDDIR = "user_data"
RESDIR = "prediction_result"

In [4]:
UID = "user_id"

# Load data (Only once)

In [5]:
# train_fname = wget.download("https://tesla-ap-shanghai-1256322946.cos.ap-shanghai.myqcloud.com/cephfs/tesla_common/deeplearning/dataset/algo_contest/train_preliminary.zip", out=DDIR)
# test_fname = wget.download("https://tesla-ap-shanghai-1256322946.cos.ap-shanghai.myqcloud.com/cephfs/tesla_common/deeplearning/dataset/algo_contest/test.zip", out=DDIR)

In [6]:
# def myunzip(filename):
#     zFile = zipfile.ZipFile(filename, "r")
#     for fileM in zFile.namelist(): 
#         zFile.extract(fileM, DDIR)
#         print(fileM)
#     zFile.close()

In [7]:
# myunzip(train_fname)
# myunzip(test_fname)

# Utils

In [8]:
def bch_rencol(values, prefix="", suffix=""):
    return list(map(lambda x: f"{prefix}"+"_".join(list(map(lambda y: str(y), x)))+f"{suffix}" 
                    if hasattr(x, "__iter__") and not isinstance(x, str) 
                    else f"{prefix}"+str(x)+f"{suffix}", values))

In [9]:
def mynunique(values):
    return values.nunique(dropna=False)

In [10]:
def getidxmax(x):
    return x.idxmax()[1]

# Data Exploration

In [11]:
# read train data
train_click_log = pd.read_csv(f"{DDIR}/train_preliminary/click_log.csv")
train_ad = pd.read_csv(f"{DDIR}/train_preliminary/ad.csv")
# tag
train_user = pd.read_csv(f"{DDIR}/train_preliminary/user.csv")

In [12]:
# read test data
test_click_log = pd.read_csv(f"{DDIR}/test/click_log.csv")
test_ad = pd.read_csv(f"{DDIR}/test/ad.csv")

In [13]:
train_ad["product_id"][0] == "\\N"

True

In [14]:
train_ad["product_id"] = train_ad["product_id"].replace({"\\N": np.nan})
test_ad["product_id"] = test_ad["product_id"].replace({"\\N": np.nan})

# Feature engineering

In [15]:
train_feat = pd.DataFrame()
test_feat = pd.DataFrame()

In [16]:
train_feat[UID] = train_user[UID]
test_feat[UID] = np.sort(test_click_log[UID].unique())

In [17]:
tol_train = pd.merge(train_click_log, train_ad, how="left", on="creative_id")
tol_test = pd.merge(test_click_log, test_ad, how="left", on="creative_id")

In [18]:
tol_train.head()

Unnamed: 0,time,user_id,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
0,9,30920,567330,1,504423,30673.0,3,32638,319
1,65,30920,3072255,1,2642300,1261.0,2,6783,6
2,56,30920,2361327,1,2035918,1261.0,2,6783,6
3,6,309204,325532,1,292523,27081.0,3,32066,242
4,59,309204,2746730,1,2362208,,18,14682,88


## User Click log (Order 1)

In [19]:
# for train
tmp = tol_train.groupby([UID], sort=False).agg(
    {
        UID: ["count"], 
        "click_times": ["sum", "max", "mean", "std"],
        "time": [mynunique, "mean", "max", "min"],
        "creative_id": [mynunique],
        "ad_id": [mynunique],
        "product_id": [mynunique],
        "product_category": [mynunique],
        "advertiser_id": [mynunique],
        "industry": [mynunique],
    }
)

In [20]:
tmp.columns = bch_rencol(tmp.columns)
tmp.reset_index(inplace=True)
train_feat = pd.merge(train_feat, tmp, how="left", on=UID)

In [21]:
tmp = None

In [22]:
# for test
tmp = tol_test.groupby([UID], sort=False).agg(
    {
        UID: ["count"], 
        "click_times": ["sum", "max", "mean", "std"],
        "time": [mynunique, "mean", "max", "min"],
        "creative_id": [mynunique],
        "ad_id": [mynunique],
        "product_id": [mynunique],
        "product_category": [mynunique],
        "advertiser_id": [mynunique],
        "industry": [mynunique],
    }
)

In [23]:
tmp.columns = bch_rencol(tmp.columns)
tmp.reset_index(inplace=True)
test_feat = pd.merge(test_feat, tmp, how="left", on=UID)

In [24]:
tmp = None

In [25]:
train_feat.to_pickle(f"{UDDIR}/train_feat_order1.pkl")
test_feat.to_pickle(f"{UDDIR}/test_feat_order1.pkl")

## User Click log (Order 2)

In [26]:
tol_train.head()

Unnamed: 0,time,user_id,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
0,9,30920,567330,1,504423,30673.0,3,32638,319
1,65,30920,3072255,1,2642300,1261.0,2,6783,6
2,56,30920,2361327,1,2035918,1261.0,2,6783,6
3,6,309204,325532,1,292523,27081.0,3,32066,242
4,59,309204,2746730,1,2362208,,18,14682,88


In [27]:
train_feat_o2 = list()

In [28]:
OP_SET = ["max", "mean", "std", getidxmax, mynunique]

In [29]:
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    tmp = tol_train.groupby([UID, col], sort=False)[[col]].agg(["count"]).groupby([UID]).agg(OP_SET)
    tmp.columns = bch_rencol(tmp.columns)
    train_feat_o2.append(tmp)
    tmp = None

In [36]:
test_feat_o2 = list()

In [37]:
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    tmp = tol_test.groupby([UID, col], sort=False)[[col]].agg(["count"]).groupby([UID]).agg(OP_SET)
    tmp.columns = bch_rencol(tmp.columns)
    test_feat_o2.append(tmp)
    tmp = None

In [44]:
for fep in train_feat_o2:
    train_feat = pd.merge(train_feat, fep, how="left", on=UID)

In [46]:
for fep in test_feat_o2:
    test_feat = pd.merge(test_feat, fep, how="left", on=UID)

In [45]:
train_feat.shape

(900000, 46)

In [47]:
test_feat.shape

(1000000, 46)

In [66]:
train_feat.to_pickle(f"{UDDIR}/train_feat_order2.pkl")
test_feat.to_pickle(f"{UDDIR}/test_feat_order2.pkl")

In [49]:
del train_feat_o2, test_feat_o2

In [64]:
train_feat["product_id_count_getidxmax"] = train_feat["product_id_count_getidxmax"].replace("\\N", np.nan).astype(np.float)
train_feat["industry_count_getidxmax"] = train_feat["industry_count_getidxmax"].replace("\\N", np.nan).astype(np.float)

In [65]:
test_feat["product_id_count_getidxmax"] = test_feat["product_id_count_getidxmax"].replace("\\N", np.nan).astype(np.float)
test_feat["industry_count_getidxmax"] = test_feat["industry_count_getidxmax"].replace("\\N", np.nan).astype(np.float)

## Time Windows

In [30]:
for i in range(1, 92, 10):
    print(i)

1
11
21
31
41
51
61
71
81
91


In [None]:
tol_train[tol_train["time"] >=1 & tol_train["time"] <= 10]

In [None]:
trainproduct_id_count_getidxmax, industry_count_getidxmax

## Conact

In [67]:
train_all = pd.merge(train_feat, train_user, how="left", on=UID)

In [70]:
train_tag = train_all[[UID, "age", "gender"]]
train_feat = train_all.drop(["user_id", "age", "gender"], axis=1)

In [71]:
train_feat

Unnamed: 0,user_id_count,click_times_sum,click_times_max,click_times_mean,click_times_std,time_mynunique,time_mean,time_max,time_min,creative_id_mynunique,...,advertiser_id_count_max,advertiser_id_count_mean,advertiser_id_count_std,advertiser_id_count_getidxmax,advertiser_id_count_mynunique,industry_count_max,industry_count_mean,industry_count_std,industry_count_getidxmax,industry_count_mynunique
0,13,14,2,1.076923,0.277350,10,47.461538,76,20,12,...,2,1.083333,0.288675,188,2,3,1.444444,0.726483,326.0,3
1,45,46,2,1.022222,0.149071,28,53.822222,90,10,42,...,4,1.250000,0.603561,42272,3,9,3.000000,2.507133,6.0,7
2,30,30,1,1.000000,0.000000,23,50.266667,88,12,30,...,3,1.071429,0.377964,8371,2,8,3.750000,2.815772,322.0,5
3,29,29,1,1.000000,0.000000,15,57.724138,84,8,29,...,2,1.115385,0.325813,19056,2,9,2.900000,2.558211,,6
4,33,34,2,1.030303,0.174078,26,44.424242,87,3,33,...,2,1.100000,0.305129,13558,2,8,1.833333,1.757338,6.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899995,14,14,1,1.000000,0.000000,12,45.428571,75,8,13,...,2,1.166667,0.389249,6433,2,5,2.800000,1.483240,54.0,4
899996,18,20,2,1.111111,0.323381,14,61.166667,87,2,17,...,4,1.384615,0.869718,24274,3,4,1.800000,1.032796,321.0,4
899997,14,15,2,1.071429,0.267261,10,64.142857,91,25,14,...,5,1.555556,1.333333,14681,3,8,2.800000,3.033150,6.0,3
899998,22,22,1,1.000000,0.000000,17,39.954545,83,3,18,...,4,1.375000,0.806226,10690,3,4,1.571429,1.089410,6.0,3


# Training&Prediction

In [72]:
# split data
train_feat_tr, train_feat_val, train_tag_tr, train_tag_val = train_test_split(train_feat, train_tag, test_size=0.2)

## For Age

In [73]:
lgbds_train_tr_age = lgb.Dataset(train_feat_tr, train_tag_tr["age"]-1)
lgbds_train_val_age = lgb.Dataset(train_feat_val, train_tag_val["age"]-1)

In [76]:
params_age = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 10,
    "metric": "multi_error",
    "learning_rate": 0.1,
}

In [77]:
model_lgb_multi_age = lgb.train(params_age, lgbds_train_tr_age, num_boost_round=1000, valid_sets=[lgbds_train_val_age], verbose_eval=50, early_stopping_rounds=100)

Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.761489
[100]	valid_0's multi_error: 0.757567
[150]	valid_0's multi_error: 0.756022
[200]	valid_0's multi_error: 0.755661
[250]	valid_0's multi_error: 0.755033
[300]	valid_0's multi_error: 0.754439
[350]	valid_0's multi_error: 0.754233
[400]	valid_0's multi_error: 0.754433
[450]	valid_0's multi_error: 0.754322
Early stopping, best iteration is:
[366]	valid_0's multi_error: 0.754161


In [None]:
# offline

In [78]:
train_val_age_prob = model_lgb_multi_age.predict(train_feat_val, num_iteration=model_lgb_multi_age.best_iteration)

In [79]:
train_val_age_pred = [list(x).index(max(x))+1 for x in train_val_age_prob]

In [80]:
accuracy_score(train_val_age_pred, train_tag_val["age"])

0.24583888888888888

In [87]:
0.7391055555555556+0.24583888888888888

0.9849444444444444

## For Gender

In [81]:
lgbds_train_tr_gender = lgb.Dataset(train_feat_tr, train_tag_tr["gender"]-1)
lgbds_train_val_gender = lgb.Dataset(train_feat_val, train_tag_val["gender"]-1)

In [82]:
params_gender = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 2,
    "metric": "multi_error",
    "learning_rate": 0.1,
}

In [83]:
model_lgb_multi_gender = lgb.train(params_gender, lgbds_train_tr_gender, num_boost_round=1000, valid_sets=[lgbds_train_val_gender], verbose_eval=50, early_stopping_rounds=100)

Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.284928
[100]	valid_0's multi_error: 0.274089
[150]	valid_0's multi_error: 0.269372
[200]	valid_0's multi_error: 0.267539
[250]	valid_0's multi_error: 0.265611
[300]	valid_0's multi_error: 0.264172
[350]	valid_0's multi_error: 0.263706
[400]	valid_0's multi_error: 0.263494
[450]	valid_0's multi_error: 0.262978
[500]	valid_0's multi_error: 0.262711
[550]	valid_0's multi_error: 0.262739
[600]	valid_0's multi_error: 0.262439
[650]	valid_0's multi_error: 0.26245
[700]	valid_0's multi_error: 0.262222
[750]	valid_0's multi_error: 0.261772
[800]	valid_0's multi_error: 0.261806
[850]	valid_0's multi_error: 0.261522
[900]	valid_0's multi_error: 0.261222
[950]	valid_0's multi_error: 0.26105
[1000]	valid_0's multi_error: 0.261167
Did not meet early stopping. Best iteration is:
[961]	valid_0's multi_error: 0.260894


In [None]:
# offline

In [84]:
train_val_gender_prob = model_lgb_multi_gender.predict(train_feat_val, num_iteration=model_lgb_multi_gender.best_iteration)

In [85]:
train_val_gender_pred = [list(x).index(max(x))+1 for x in train_val_gender_prob]

In [86]:
accuracy_score(train_val_gender_pred, train_tag_val["gender"])

0.7391055555555556

# Generate Prediction Result

In [88]:
res = test_feat[[UID]]

In [90]:
res["predicted_age"] = [list(x).index(max(x))+1 for x in model_lgb_multi_age.predict(test_feat.drop("user_id", axis=1), num_iteration=model_lgb_multi_age.best_iteration)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [91]:
res["predicted_gender"] = [list(x).index(max(x))+1 for x in model_lgb_multi_gender.predict(test_feat.drop("user_id", axis=1), num_iteration=model_lgb_multi_gender.best_iteration)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [92]:
res_suffix = time.strftime("%Y%m%d%H%M%S", time.localtime(int(time.time())))
res.to_csv(f"{RESDIR}/res-{res_suffix}.csv", index=False)

# Cent result to COS

In [93]:
from ti import session
ti_session = session.Session()

In [94]:
inputs = ti_session.upload_data(path=f"{RESDIR}/res-20200509035457.csv", bucket="etveritas-1252104022", key_prefix=RESDIR)