In [None]:
# # Uncomment the lines below if the packages need to be installed with pip
# !pip install xgboost
# !pip install lightgbm
# !pip install wget

In [None]:
import os
import sys
import gc
import wget
import time
import tarfile
import zipfile
import random
import copy
from tqdm import tqdm_notebook, tqdm

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import xgboost as xgb
import lightgbm as lgb

from joblib import Parallel, delayed

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Directory layout; every path is relative to the notebook's working directory.
DDIR = "data"                        # raw train/test CSV files
UDDIR = "user_data"                  # derived artefacts
UFEDIR = f"{UDDIR}/feat_data_v03"    # one pickle per feature group
UMDIR = f"{UDDIR}/model_data"        # saved LightGBM models
RESDIR = "prediction_result"         # prediction CSV files

In [None]:
# Name of the user key column; used as the groupby and merge key throughout.
UID = "user_id"

# Load data (Only once)

In [None]:
# train_fname = wget.download("https://tesla-ap-shanghai-1256322946.cos.ap-shanghai.myqcloud.com/cephfs/tesla_common/deeplearning/dataset/algo_contest/train_preliminary.zip", out=DDIR)
# test_fname = wget.download("https://tesla-ap-shanghai-1256322946.cos.ap-shanghai.myqcloud.com/cephfs/tesla_common/deeplearning/dataset/algo_contest/test.zip", out=DDIR)

In [None]:
# def myunzip(filename):
#     zFile = zipfile.ZipFile(filename, "r")
#     for fileM in zFile.namelist(): 
#         zFile.extract(fileM, DDIR)
#         print(fileM)
#     zFile.close()

In [None]:
# myunzip(train_fname)
# myunzip(test_fname)

# Utils

In [None]:
def bch_rencol(values, prefix="", suffix=""):
    """Flatten column labels into plain strings.

    Each label in ``values`` is turned into one string: a non-string iterable
    (e.g. a MultiIndex tuple) has its parts joined with "_", anything else is
    passed through ``str``. ``prefix`` and ``suffix`` are added around every
    result.

    Returns a list with one string per input label.
    """
    def _flatten(label):
        if isinstance(label, str) or not hasattr(label, "__iter__"):
            core = str(label)
        else:
            core = "_".join(str(part) for part in label)
        return f"{prefix}{core}{suffix}"

    return [_flatten(label) for label in values]

In [None]:
def mynunique(values):
    """Count the distinct entries of ``values``, treating missing values as a value."""
    n_distinct = values.nunique(dropna=False)
    return n_distinct

In [None]:
def getidxmax(x):
    """Return the second element of the index label at which ``x`` is largest.

    Intended for data whose index labels are tuples, such as a two-level
    MultiIndex.
    """
    top_label = x.idxmax()
    return top_label[1]

In [None]:
# for time series
def at_len(x):
    """Return the number of elements in ``x``."""
    n_items = len(x)
    return n_items

def at_sum(x):
    """Return the sum of the values in ``x`` (via ``np.sum``)."""
    total = np.sum(x)
    return total

def at_max(x):
    """Return the largest value in ``x`` (via ``np.max``)."""
    largest = np.max(x)
    return largest

def at_min(x):
    """Return the smallest value in ``x`` (via ``np.min``)."""
    smallest = np.min(x)
    return smallest

def at_mean(x):
    """Return the arithmetic mean of ``x`` (via ``np.mean``)."""
    average = np.mean(x)
    return average

def at_range(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    upper = at_max(x)
    lower = at_min(x)
    return upper - lower

def at_nunq(x):
    """Return how many distinct values ``x`` contains (set-based count)."""
    distinct = set(x)
    return len(distinct)

def at_lenDrange(x):
    """Return the element count divided by (range + 1).

    The +1 keeps the denominator non-zero when every value in ``x`` is equal.
    """
    span = at_range(x) + 1
    return at_len(x) / span

def at_lenDnunq(x):
    """Return the element count divided by the number of distinct values."""
    n_items = at_len(x)
    n_distinct = at_nunq(x)
    return n_items / n_distinct

def at_percentile(n):
    """Build an aggregation function that returns the ``n``-th percentile.

    ``n`` is passed straight to ``np.percentile``, so it is on the 0-100 scale
    (75 means the upper quartile, 0.75 does not).

    The returned function is named ``at_percentile_<n>`` so that several
    percentiles can sit in the same aggregation list under distinct names.
    """
    def at_percentile_(x):
        return np.percentile(x, n)

    label = f"at_percentile_{n}"
    at_percentile_.__name__ = label
    return at_percentile_


In [None]:
# Aggregations applied per user to the per-(user, value) click counts in the
# order-2 feature cells; getidxmax yields the value with the highest count.
OP_SET = ["sum", "max", "min", "mean", "std", getidxmax, "nunique"]

# Data Exploration (todo)

In [None]:
# Load the three training tables; user.csv carries the age/gender labels.
train_click_log, train_ad, train_user = (
    pd.read_csv(f"{DDIR}/train_preliminary/{table}.csv")
    for table in ("click_log", "ad", "user")
)

In [None]:
# Load the two test tables (the test set has no user/label file).
test_click_log, test_ad = (
    pd.read_csv(f"{DDIR}/test/{table}.csv")
    for table in ("click_log", "ad")
)

In [None]:
train_click_log.shape

In [None]:
train_ad.shape

In [None]:
train_user.shape

In [None]:
test_click_log.shape

In [None]:
test_ad.shape

In [None]:
# Replace the "\N" placeholder (presumably a missing-value marker) with -1 so
# both columns can be stored as integers.
for ad_col in ("product_id", "industry"):
    train_ad[ad_col] = train_ad[ad_col].replace("\\N", -1).astype(int)

In [None]:
# Same "\N" -> -1 clean-up as for the training ad table.
for ad_col in ("product_id", "industry"):
    test_ad[ad_col] = test_ad[ad_col].replace("\\N", -1).astype(int)

In [None]:
# creative id in train (creative id is unique in train_ad)
len(train_ad)

In [None]:
# creative id in test (creative id is unique in test_ad)
len(test_ad)

In [None]:
# check whether the same creative_id in train and test have same ad info
insect1d = np.intersect1d(train_click_log.creative_id.unique(), test_click_log.creative_id.unique())
print("Same creative id: ", insect1d.shape)
# Sort both subsets by creative_id so that rows are compared like-for-like
# instead of relying on the two ad tables happening to share the same row order.
train_shared = train_ad[train_ad.creative_id.isin(insect1d)].sort_values("creative_id")
test_shared = test_ad[test_ad.creative_id.isin(insect1d)].sort_values("creative_id")
print("Diff number: ", np.sum(train_shared.values != test_shared.values))
# checked: they all have same ad info (result is 0)

In [None]:
# check whether click and ad have diff creative_id
print("Diff list: ", np.setdiff1d(train_click_log.creative_id.unique(), train_ad.creative_id))
print("Diff list: ", np.setdiff1d(train_ad.creative_id, train_click_log.creative_id.unique()))

In [None]:
# check whether click and ad have diff creative_id
print("Diff list: ", np.setdiff1d(test_click_log.creative_id.unique(), test_ad.creative_id))
print("Diff list: ", np.setdiff1d(test_ad.creative_id, test_click_log.creative_id.unique()))

In [None]:
# click time: number of click records per value of `time` (train)
train_time_counts = train_click_log.time.value_counts()
sns.lineplot(x=train_time_counts.index, y=train_time_counts)

In [None]:
# number of click records per value of `time` (test)
test_time_counts = test_click_log.time.value_counts()
sns.lineplot(x=test_time_counts.index, y=test_time_counts)

In [None]:
# data_grouped = data.groupby(data.index)
# results = Parallel(n_jobs=8)(delayed(key_func)(group) for name, group in data_grouped)
# data = pd.concat(results)

# Feature engineering

In [None]:
# Attach the ad attributes to every click record (left join keeps all clicks).
tol_train = train_click_log.merge(train_ad, how="left", on="creative_id")
tol_test = test_click_log.merge(test_ad, how="left", on="creative_id")

In [None]:
del train_click_log, train_ad
del test_click_log, test_ad

In [None]:
gc.collect()

## Creative id map (bad)

In [None]:
# One row per training click: user id, creative id and that user's columns from user.csv.
user_creative_pairs = tol_train[[UID, "creative_id"]]
train_ucid = user_creative_pairs.merge(train_user, how="left", on=UID)

In [None]:
cid_map = train_ucid.groupby(["creative_id"])[["age", "gender"]].agg(["mean"])
cid_map.columns = bch_rencol(cid_map.columns, prefix="creative_id_")

In [None]:
# Join the per-creative label means onto the merged click/ad table.
# train_click_log was deleted after the click/ad merge, so referencing it here
# raised NameError; rebuilding tol_train from it would also drop the ad columns
# that the later feature cells group by.
tol_train = pd.merge(tol_train, cid_map, how="left", on="creative_id")

In [None]:
# Join the per-creative label means (computed on train) onto the merged test table.
# test_click_log was deleted after the click/ad merge, so referencing it here
# raised NameError; rebuilding tol_test from it would also drop the ad columns
# that the later feature cells group by.
tol_test = pd.merge(tol_test, cid_map, how="left", on="creative_id")

In [None]:
del cid_map

In [None]:
# Per-user statistics of the creative-level label means.
# Columns are selected with a list: selecting several columns from a groupby
# with a bare tuple is deprecated and no longer accepted by recent pandas.
tmp = tol_train.groupby([UID])[["creative_id_age_mean", "creative_id_gender_mean"]].agg(["sum", "max", "mean", "min", "std"])
tmp.columns = bch_rencol(tmp.columns)

In [None]:
tmp.to_pickle(f"{UFEDIR}/train_o1_cid_map.pkl")
tmp = None

In [None]:
# Per-user statistics of the creative-level label means (test set).
# Columns are selected with a list: selecting several columns from a groupby
# with a bare tuple is deprecated and no longer accepted by recent pandas.
tmp = tol_test.groupby([UID])[["creative_id_age_mean", "creative_id_gender_mean"]].agg(["sum", "max", "mean", "min", "std"])
tmp.columns = bch_rencol(tmp.columns)

In [None]:
tmp.to_pickle(f"{UFEDIR}/test_o1_cid_map.pkl")
tmp = None

## User Click log (Order 1)

In [None]:
# Column -> list of aggregations computed per user for the order-1 features.
# np.percentile expects its rank on the 0-100 scale, so the quartiles are
# requested as 75 and 25 (0.75 / 0.25 would return values next to the minimum).
ops_dic = {
        UID: ["count"],
        "click_times": ["sum", "max", "mean", "std", at_nunq, at_range, at_lenDnunq, at_lenDrange, at_percentile(75), at_percentile(25)],
        "time": ["nunique", "mean", "max", "min", at_range, at_lenDnunq, at_lenDrange, at_percentile(75), at_percentile(25)],
        "creative_id": ["nunique", at_lenDnunq],
        "ad_id": ["nunique", at_lenDnunq],
        "product_id": ["nunique", at_lenDnunq],
        "product_category": ["nunique", at_lenDnunq],
        "advertiser_id": ["nunique", at_lenDnunq],
        "industry": ["nunique", at_lenDnunq],
}

In [None]:
# One pickle per source column: per-user aggregates of that column (train).
for col, col_ops in ops_dic.items():
    tmp = tol_train.groupby([UID], sort=False)[[col]].agg(col_ops)
    tmp.columns = bch_rencol(tmp.columns)
    tmp.to_pickle(f"{UFEDIR}/train_o1_{col}.pkl")
    tmp = None  # drop the reference so the frame can be garbage collected

In [None]:
# One pickle per source column: per-user aggregates of that column (test).
for col, col_ops in ops_dic.items():
    tmp = tol_test.groupby([UID], sort=False)[[col]].agg(col_ops)
    tmp.columns = bch_rencol(tmp.columns)
    tmp.to_pickle(f"{UFEDIR}/test_o1_{col}.pkl")
    tmp = None  # drop the reference so the frame can be garbage collected

In [None]:
# TODO(etveritas): for every user in every day, make features. 2020.05.12

In [None]:
# Per (user, time) distinct-value statistics of the id columns, then summarised per user.
tmp = tol_train.groupby([UID, "time"], sort=False)[["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]].agg(["nunique", at_lenDnunq])\
.groupby([UID]).agg(["sum", "max", "min", "mean", "std"])
# Flatten the three-level column index like every other feature file, so the
# frame can be merged with the flat feature table in the concat step.
tmp.columns = bch_rencol(tmp.columns)
tmp.to_pickle(f"{UFEDIR}/train_o1_time2columns.pkl")
tmp = None

In [None]:
# Per (user, time) distinct-value statistics of the id columns, then summarised per user.
tmp = tol_test.groupby([UID, "time"], sort=False)[["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]].agg(["nunique", at_lenDnunq])\
.groupby([UID]).agg(["sum", "max", "min", "mean", "std"])
# Flatten the three-level column index like every other feature file, so the
# frame can be merged with the flat feature table in the concat step.
tmp.columns = bch_rencol(tmp.columns)
tmp.to_pickle(f"{UFEDIR}/test_o1_time2columns.pkl")
tmp = None

In [None]:
# # # before features v03
# # for train
# tmp = tol_train.groupby([UID], sort=False).agg(
#     {
#         UID: ["count"], 
#         "click_times": ["sum", "max", "mean", "std"],
#         "time": ["nunique", "mean", "max", "min"],
#         "creative_id": ["nunique"],
#         "ad_id": ["nunique"],
#         "product_id": ["nunique"],
#         "product_category": ["nunique"],
#         "advertiser_id": ["nunique"],
#         "industry": ["nunique"],
#     }
# )

In [None]:
# # # before features v03
# tmp.columns = bch_rencol(tmp.columns)
# tmp.to_pickle(f"{UFEDIR}/train_o1.pkl")
# tmp = None

In [None]:
# # # before features v03
# # for test
# tmp = tol_test.groupby([UID], sort=False).agg(
#     {
#         UID: ["count"], 
#         "click_times": ["sum", "max", "mean", "std"],
#         "time": ["nunique", "mean", "max", "min"],
#         "creative_id": ["nunique"],
#         "ad_id": ["nunique"],
#         "product_id": ["nunique"],
#         "product_category": ["nunique"],
#         "advertiser_id": ["nunique"],
#         "industry": ["nunique"],
#     }
# )

In [None]:
# # # before features v03
# tmp.columns = bch_rencol(tmp.columns)
# tmp.to_pickle(f"{UFEDIR}/test_o1.pkl")
# tmp = None

In [None]:
gc.collect()

## User Click log (Order 2)

In [None]:
# Column -> aggregations computed per (user, id value) for the order-2 features.
# np.percentile expects its rank on the 0-100 scale, so the quartiles are
# requested as 75 and 25 (0.75 / 0.25 would return values next to the minimum).
o2_ops_dic = {
    "click_times": ["sum", "max", "mean", "std", at_nunq, at_range, at_lenDnunq, at_lenDrange, at_percentile(75), at_percentile(25)],
    "time": ["nunique", "mean", "max", "min", at_range, at_lenDnunq, at_lenDrange, at_percentile(75), at_percentile(25)],
}

In [None]:
hold_cols = ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]

In [None]:
# feature v03
for col in hold_cols:
    # Click count per (user, value of `col`), summarised per user.
    tmp = tol_train.groupby([UID, col], sort=False)[[col]].agg(["count"]).groupby([UID]).agg(OP_SET)
    tmp.columns = bch_rencol(tmp.columns)
    tmp.to_pickle(f"{UFEDIR}/train_o2_{col}2{col}.pkl")
    tmp = None
    for ocol in o2_ops_dic:
        # Statistics of `ocol` per (user, value of `col`), summarised per user.
        tmp = tol_train.groupby([UID, col], sort=False)[[ocol]].agg(o2_ops_dic[ocol]).groupby([UID]).agg(["sum", "max", "min", "mean", "std"])
        # The flattened names only contain `ocol` and the two aggregation names,
        # so they would be identical for every `col`; prefix them with the
        # grouping column to keep the features distinct when files are merged.
        tmp.columns = bch_rencol(tmp.columns, prefix=f"{col}2")
        tmp.to_pickle(f"{UFEDIR}/train_o2_{col}2{ocol}.pkl")
        tmp = None

In [None]:
# feature v03
for col in hold_cols:
    # Click count per (user, value of `col`), summarised per user.
    tmp = tol_test.groupby([UID, col], sort=False)[[col]].agg(["count"]).groupby([UID]).agg(OP_SET)
    tmp.columns = bch_rencol(tmp.columns)
    tmp.to_pickle(f"{UFEDIR}/test_o2_{col}2{col}.pkl")
    tmp = None
    for ocol in o2_ops_dic:
        # Statistics of `ocol` per (user, value of `col`), summarised per user.
        tmp = tol_test.groupby([UID, col], sort=False)[[ocol]].agg(o2_ops_dic[ocol]).groupby([UID]).agg(["sum", "max", "min", "mean", "std"])
        # The flattened names only contain `ocol` and the two aggregation names,
        # so they would be identical for every `col`; prefix them with the
        # grouping column to keep the features distinct when files are merged.
        tmp.columns = bch_rencol(tmp.columns, prefix=f"{col}2")
        tmp.to_pickle(f"{UFEDIR}/test_o2_{col}2{ocol}.pkl")
        tmp = None

In [None]:
# # # before v03
# # train
# for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
#     tmp = tol_train.groupby([UID, col], sort=False)[[col]].agg(["count"]).groupby([UID]).agg(OP_SET)
#     tmp.columns = bch_rencol(tmp.columns)
#     tmp.to_pickle(f"{UFEDIR}/train_o2_{col}.pkl")
#     tmp = None

In [None]:
# # # before v03
# # test
# for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
#     tmp = tol_test.groupby([UID, col], sort=False)[[col]].agg(["count"]).groupby([UID]).agg(OP_SET)
#     tmp.columns = bch_rencol(tmp.columns)
#     tmp.to_pickle(f"{UFEDIR}/test_o2_{col}.pkl")
#     tmp = None

In [None]:
gc.collect()

## One-Hot

In [None]:
# train: click count per (user, product_category), pivoted to one column per category
train_cat_counts = tol_train.groupby([UID, "product_category"], sort=False)[["product_category"]].agg(["count"])
tmp = train_cat_counts.unstack().fillna(0)
tmp.columns = bch_rencol(tmp.columns)

In [None]:
tmp.to_pickle(f"{UFEDIR}/train_onehot.pkl")
tmp = None

In [None]:
# test: click count per (user, product_category), pivoted to one column per category
test_cat_counts = tol_test.groupby([UID, "product_category"], sort=False)[["product_category"]].agg(["count"])
tmp = test_cat_counts.unstack().fillna(0)
tmp.columns = bch_rencol(tmp.columns)

In [None]:
tmp.to_pickle(f"{UFEDIR}/test_onehot.pkl")
tmp = None

In [None]:
gc.collect()

## Time Windows (Time Bins)

In [None]:
# Cut the combined train+test time values into `bins` equal-width intervals,
# labelled 0 .. bins-1, so both sets share the same bin edges.
bins = 9
all_times = pd.concat([tol_train["time"], tol_test["time"]])
tol_bins = pd.cut(all_times, bins, labels=range(bins))

In [None]:
# The first len(tol_train) labels belong to train, the remainder to test.
n_train_rows = len(tol_train)
bin_col = f"bins{bins}"
tol_train[bin_col] = tol_bins.iloc[:n_train_rows]
tol_test[bin_col] = tol_bins.iloc[n_train_rows:]

In [None]:
# Preview the table with the new bin column (the bare `tol_train.` left here was a syntax error).
tol_train.head()

### Order 1

In [None]:
# train
# Aggregates per (user, time bin); .unstack() then moves the bin level into the
# columns so each user ends up on a single row.
# NOTE(review): UID is both a grouping key and an aggregated column here; whether
# pandas accepts that appears to depend on the pandas version - confirm.
tmp = tol_train.groupby([UID, f"bins{bins}"], sort=False).agg(
    {
        UID: ["count"], 
        "click_times": ["sum", "max", "mean", "std"],
        "time": ["nunique", "mean", "max", "min"],
        "creative_id": ["nunique"],
        "ad_id": ["nunique"],
        "product_id": ["nunique"],
        "product_category": ["nunique"],
        "advertiser_id": ["nunique"],
        "industry": ["nunique"],
    }
).unstack()

In [None]:
tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_")

In [None]:
tmp.to_pickle(f"{UFEDIR}/train_bins{bins}_o1.pkl")
tmp = None

In [None]:
# test
# Aggregates per (user, time bin); .unstack() then moves the bin level into the
# columns so each user ends up on a single row.
# NOTE(review): UID is both a grouping key and an aggregated column here; whether
# pandas accepts that appears to depend on the pandas version - confirm.
tmp = tol_test.groupby([UID, f"bins{bins}"], sort=False).agg(
    {
        UID: ["count"], 
        "click_times": ["sum", "max", "mean", "std"],
        "time": ["nunique", "mean", "max", "min"],
        "creative_id": ["nunique"],
        "ad_id": ["nunique"],
        "product_id": ["nunique"],
        "product_category": ["nunique"],
        "advertiser_id": ["nunique"],
        "industry": ["nunique"],
    }
).unstack()

In [None]:
tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_")

In [None]:
tmp.to_pickle(f"{UFEDIR}/test_bins{bins}_o1.pkl")
tmp = None

In [None]:
gc.collect()

### Order 2

In [None]:
# train: for every id column and every time bin, summarise per user the click
# counts of each distinct value seen inside that bin.
bin_col = f"bins{bins}"
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    for i in range(bins):
        tmp = (
            tol_train[tol_train[bin_col] == i]
            .groupby([UID, col], sort=False)[[col]]
            .agg(["count"])
            .groupby([UID])
            .agg(OP_SET)
        )
        tmp.columns = bch_rencol(tmp.columns, prefix=f"{bin_col}_{i}_")
        tmp.to_pickle(f"{UFEDIR}/train_{bin_col}_{i}_o2_{col}.pkl")
        tmp = None

In [None]:
# test: for every id column and every time bin, summarise per user the click
# counts of each distinct value seen inside that bin.
bin_col = f"bins{bins}"
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    for i in range(bins):
        tmp = (
            tol_test[tol_test[bin_col] == i]
            .groupby([UID, col], sort=False)[[col]]
            .agg(["count"])
            .groupby([UID])
            .agg(OP_SET)
        )
        tmp.columns = bch_rencol(tmp.columns, prefix=f"{bin_col}_{i}_")
        tmp.to_pickle(f"{UFEDIR}/test_{bin_col}_{i}_o2_{col}.pkl")
        tmp = None

In [None]:
gc.collect()

## Concat

In [None]:
# Start each feature table from the list of user ids: train follows the row
# order of train_user, test uses the sorted distinct ids found in the click log.
train_feat = pd.DataFrame({UID: train_user[UID]})
test_feat = pd.DataFrame({UID: np.sort(tol_test[UID].unique())})

In [None]:
# os.listdir returns entries in arbitrary order. Sorting makes the merge order
# reproducible and guarantees that the train_* and test_* files are visited in
# the same relative order, so both feature tables get their columns in the
# same positions.
feat_fname = sorted(os.listdir(UFEDIR))

In [None]:
# Left-join every saved feature file onto the matching table, keyed by user id.
for fname in feat_fname:
    feat_path = f"{UFEDIR}/{fname}"
    if fname.startswith("train_"):
        train_feat = train_feat.merge(pd.read_pickle(feat_path), how="left", on=UID)
    elif fname.startswith("test_"):
        test_feat = test_feat.merge(pd.read_pickle(feat_path), how="left", on=UID)

In [None]:
# Sanity check: features and labels must list users in the same order.
# The displayed count of mismatching rows must be 0.
(train_feat[UID] != train_user[UID]).sum()

In [None]:
train_feat.shape

In [None]:
test_feat.shape

In [None]:
# del tol_test

In [None]:
train_feat.to_pickle(f"{UDDIR}/feat_ing/train_feat_tol_v02.pkl")
test_feat.to_pickle(f"{UDDIR}/feat_ing/test_feat_tol_v02.pkl")

In [None]:
train_feat = pd.read_pickle(f"{UDDIR}/feat_ing/train_feat_tol_v02.pkl")
test_feat = pd.read_pickle(f"{UDDIR}/feat_ing/test_feat_tol_v02.pkl")
train_user = pd.read_csv(f"{DDIR}/train_preliminary/user.csv")

In [None]:
train_feat.columns

In [None]:
# Remove every feature derived from the per-creative gender mean.
gender_map_cols = [name for name in train_feat.columns if "creative_id_gender_" in name]
train_feat.drop(gender_map_cols, axis=1, inplace=True)

In [None]:
# Remove every feature derived from the per-creative gender mean.
gender_map_cols = [name for name in test_feat.columns if "creative_id_gender_" in name]
test_feat.drop(gender_map_cols, axis=1, inplace=True)

In [None]:
# list(train_feat.columns)

In [None]:
# train_feat["industry_count_getidxmax"] = train_feat["industry_count_getidxmax"].replace("\\N", -1).astype(float)
# train_feat["product_id_count_getidxmax"] = train_feat["product_id_count_getidxmax"].replace("\\N", -1).astype(float)
# test_feat["industry_count_getidxmax"] = test_feat["industry_count_getidxmax"].replace("\\N", -1).astype(float)
# test_feat["product_id_count_getidxmax"] = test_feat["product_id_count_getidxmax"].replace("\\N", -1).astype(float)

# Training&Prediction

In [None]:
# split data
train_feat_tr, train_feat_val, train_tag_tr, train_tag_val = train_test_split(train_feat, train_user, test_size=0.2, random_state=2020)

In [None]:
train_feat.drop(UID, axis=1, inplace=True)

In [None]:
train_feat_tr.drop(UID, axis=1, inplace=True)

In [None]:
train_feat_val.drop(UID, axis=1, inplace=True)

In [None]:
gc.collect()

## For Age

In [None]:
lgbds_train_tr_age = lgb.Dataset(train_feat_tr, train_tag_tr["age"]-1)
lgbds_train_val_age = lgb.Dataset(train_feat_val, train_tag_val["age"]-1)

In [None]:
# LightGBM settings for the age model: 10-class classification scored by error rate.
params_age = dict(
    boosting_type="gbdt",
    objective="multiclass",
    num_class=10,
    metric="multi_error",
    learning_rate=0.1,
    seed=2020,
    n_jobs=-1,
)

In [None]:
model_lgb_multi_age_off = lgb.train(params_age, lgbds_train_tr_age, num_boost_round=1000, valid_sets=[lgbds_train_val_age], verbose_eval=50, early_stopping_rounds=100)

In [None]:
# Class probabilities on the hold-out split, using the early-stopped iteration.
train_val_age_prob = model_lgb_multi_age_off.predict(train_feat_val, num_iteration=model_lgb_multi_age_off.best_iteration)
# argmax over the class axis gives the 0-based class; +1 maps back to the
# 1-based labels (the model was trained on age-1). Vectorised replacement for
# the per-row list search.
train_val_age_pred = np.argmax(train_val_age_prob, axis=1) + 1
# accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the value is unchanged.
age_acy = accuracy_score(train_tag_val["age"], train_val_age_pred)

In [None]:
# online
lgbds_train_age = lgb.Dataset(train_feat, train_user["age"]-1)

In [None]:
model_lgb_multi_age = lgb.train(params_age, lgbds_train_age, num_boost_round=1000, verbose_eval=50)

In [None]:
# Save the age model under a file name stamped with the current local time.
ndt = time.strftime("%Y%m%d%H%M%S")
age_model_path = f"{UMDIR}/lgb_multi_age_{ndt}.model"
model_lgb_multi_age.save_model(age_model_path)

In [None]:
# Online 

## For Gender

In [None]:
lgbds_train_tr_gender = lgb.Dataset(train_feat_tr, train_tag_tr["gender"]-1)
lgbds_train_val_gender = lgb.Dataset(train_feat_val, train_tag_val["gender"]-1)

In [None]:
# LightGBM settings for the gender model: 2-class classification scored by error rate.
params_gender = dict(
    boosting_type="gbdt",
    objective="multiclass",
    num_class=2,
    metric="multi_error",
    learning_rate=0.1,
    seed=2020,
    n_jobs=-1,
)

In [None]:
model_lgb_multi_gender_off = lgb.train(params_gender, lgbds_train_tr_gender, num_boost_round=1000, valid_sets=[lgbds_train_val_gender], verbose_eval=50, early_stopping_rounds=100)

In [None]:
# Class probabilities on the hold-out split, using the early-stopped iteration.
train_val_gender_prob = model_lgb_multi_gender_off.predict(train_feat_val, num_iteration=model_lgb_multi_gender_off.best_iteration)
# argmax over the class axis gives the 0-based class; +1 maps back to the
# 1-based labels (the model was trained on gender-1). Vectorised replacement
# for the per-row list search.
train_val_gender_pred = np.argmax(train_val_gender_prob, axis=1) + 1
# accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the value is unchanged.
gender_acy = accuracy_score(train_tag_val["gender"], train_val_gender_pred)

In [None]:
# online
lgbds_train_gender = lgb.Dataset(train_feat, train_user["gender"]-1)

In [None]:
model_lgb_multi_gender = lgb.train(params_gender, lgbds_train_gender, num_boost_round=1000, verbose_eval=50)

In [None]:
# Save the gender model under a file name stamped with the current local time.
ndt = time.strftime("%Y%m%d%H%M%S")
gender_model_path = f"{UMDIR}/lgb_multi_gender_{ndt}.model"
model_lgb_multi_gender.save_model(gender_model_path)

In [None]:
# total accuracy
age_acy + gender_acy

# Generate Prediction Result

In [None]:
# model_lgb_multi_age = lgb.Booster(model_file=f"{UMDIR}/lgb_multi_age_20200511045531.model")

In [None]:
# model_lgb_multi_gender = lgb.Booster(model_file=f"{UMDIR}/lgb_multi_gender_20200511034408.model")

In [None]:
lgb.plot_importance(model_lgb_multi_gender, max_num_features=10)

In [None]:
lgb.plot_importance(model_lgb_multi_age, max_num_features=10)

In [None]:
# Copy the id column so the prediction columns added afterwards are written to
# an independent frame rather than to a slice of test_feat (which triggers
# pandas' SettingWithCopyWarning).
res = test_feat[[UID]].copy()

In [None]:
test_feat.drop(UID, axis=1, inplace=True)

In [None]:
# Most probable class per test user; argmax is 0-based, +1 restores the 1-based
# age label. Vectorised replacement for the per-row list search.
test_age_prob = model_lgb_multi_age.predict(test_feat, num_iteration=model_lgb_multi_age.best_iteration)
res["predicted_age"] = np.argmax(test_age_prob, axis=1) + 1

In [None]:
# Most probable class per test user; argmax is 0-based, +1 restores the 1-based
# gender label. Vectorised replacement for the per-row list search.
test_gender_prob = model_lgb_multi_gender.predict(test_feat, num_iteration=model_lgb_multi_gender.best_iteration)
res["predicted_gender"] = np.argmax(test_gender_prob, axis=1) + 1

In [None]:
# Write the predictions to a CSV whose name carries the current local time.
res_suffix = time.strftime("%Y%m%d%H%M%S")
res_path = f"{RESDIR}/res-{res_suffix}.csv"
res.to_csv(res_path, index=False)

In [None]:
res["predicted_age"].value_counts()

In [None]:
res["predicted_gender"].value_counts()

In [None]:
o1_res = pd.read_csv(f"{RESDIR}/res-20200509035457.csv")

In [None]:
o1_res["predicted_gender"] = res["predicted_gender"]

In [None]:
res = o1_res

# Send result to COS

In [None]:
from ti import session
ti_session = session.Session()

In [None]:
# Upload one result file to the bucket under the RESDIR key prefix.
# NOTE(review): the file name is hard-coded and is not derived from res_suffix -
# confirm it points at the submission that should be uploaded.
inputs = ti_session.upload_data(path=f"{RESDIR}/res-20200512010412.csv", bucket="etveritas-1252104022", key_prefix=RESDIR)