In [1]:
# Uncomment the lines below to install the required packages with pip
# !pip install xgboost
# !pip install lightgbm
# !pip install wget

In [2]:
import os
import sys
import gc
import wget
import time
import tarfile
import zipfile
import random

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import xgboost as xgb
import lightgbm as lgb

from joblib import Parallel, delayed

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Project directory layout (all paths are relative to the working directory).
DDIR = "data"                      # raw input CSVs are read from here
UDDIR = "user_data"                # merged feature tables are written here
UFEDIR = f"{UDDIR}/feat_data_v01"  # one pickle per feature group
UMDIR = f"{UDDIR}/model_data"      # saved LightGBM models
RESDIR = "prediction_result"       # prediction CSVs

# Create the output directories up front: the pickle / model / CSV writers used
# later do not create missing parent directories, so on a fresh checkout the
# first write would otherwise fail.
for _out_dir in (UDDIR, UFEDIR, UMDIR, RESDIR):
    os.makedirs(_out_dir, exist_ok=True)

In [4]:
# Name of the user key column; used as the groupby key and the merge key below.
UID = "user_id"

# Load data (Only once)

In [5]:
# train_fname = wget.download("https://tesla-ap-shanghai-1256322946.cos.ap-shanghai.myqcloud.com/cephfs/tesla_common/deeplearning/dataset/algo_contest/train_preliminary.zip", out=DDIR)
# test_fname = wget.download("https://tesla-ap-shanghai-1256322946.cos.ap-shanghai.myqcloud.com/cephfs/tesla_common/deeplearning/dataset/algo_contest/test.zip", out=DDIR)

In [6]:
# def myunzip(filename):
#     zFile = zipfile.ZipFile(filename, "r")
#     for fileM in zFile.namelist(): 
#         zFile.extract(fileM, DDIR)
#         print(fileM)
#     zFile.close()

In [7]:
# myunzip(train_fname)
# myunzip(test_fname)

# Utils

In [8]:
def bch_rencol(values, prefix="", suffix=""):
    """Flatten a sequence of column labels into plain strings.

    A label that is a non-string iterable (for example a MultiIndex tuple) has
    each part converted with ``str`` and joined with "_"; any other label is
    converted with ``str`` directly. ``prefix`` and ``suffix`` are placed around
    every resulting name.

    Returns a list with one string per entry of ``values``.
    """
    def _flatten(label):
        # Strings are iterable too, but must be kept whole.
        if isinstance(label, str) or not hasattr(label, "__iter__"):
            return f"{prefix}{label!s}{suffix}"
        joined = "_".join(str(part) for part in label)
        return f"{prefix}{joined}{suffix}"

    return [_flatten(label) for label in values]

In [9]:
def mynunique(values):
    """Return ``values.nunique(dropna=False)``, i.e. missing values are counted too."""
    distinct_count = values.nunique(dropna=False)
    return distinct_count
def getidxmax(x):
    """Return element 1 of the index label at which ``x`` takes its maximum."""
    top_label = x.idxmax()
    return top_label[1]
# for time series
def at_len(x):
    """Number of elements in ``x``."""
    n_items = len(x)
    return n_items

def at_sum(x):
    """Sum of ``x`` as computed by ``np.sum``."""
    total = np.sum(x)
    return total

def at_max(x):
    """Largest value of ``x`` as computed by ``np.max``."""
    largest = np.max(x)
    return largest

def at_min(x):
    """Smallest value of ``x`` as computed by ``np.min``."""
    smallest = np.min(x)
    return smallest

def at_mean(x):
    """Arithmetic mean of ``x`` as computed by ``np.mean``."""
    average = np.mean(x)
    return average

def at_range(x):
    """Spread of ``x``: ``at_max(x)`` minus ``at_min(x)``."""
    highest = at_max(x)
    lowest = at_min(x)
    return highest - lowest

def at_nunq(x):
    """Number of distinct elements of ``x`` (size of ``set(x)``)."""
    distinct = set(x)
    return len(distinct)

def at_lenDrange(x):
    """Element count divided by (value range + 1); the +1 keeps the divisor non-zero when all values are equal."""
    n_items = at_len(x)
    span = at_range(x)
    return n_items / (span + 1)

def at_lenDnunq(x):
    """Element count divided by the number of distinct elements."""
    n_items = at_len(x)
    n_distinct = at_nunq(x)
    return n_items / n_distinct

def at_percentile(n):
    """Build a function that returns the ``n``-th percentile of its argument.

    The returned function is renamed to ``at_percentile_<n>`` so that functions
    built for different ``n`` carry distinct ``__name__`` values.
    """
    def at_percentile_(x):
        pct_value = np.percentile(x, n)
        return pct_value

    at_percentile_.__name__ = "at_percentile_{}".format(n)
    return at_percentile_

In [None]:
# Aggregations applied to each user's per-value click counts (order-2 features).
# The list previously ended with the bare name `percentile`, which is not
# defined in the visible notebook and raised NameError here. The at_percentile
# factory defined above returns distinctly named percentile aggregators, so it
# is used instead.
# NOTE(review): the quartiles (25, 75) are an assumed choice — confirm which
# percentiles were intended.
OP_SET = [
    "sum", "max", "min", "mean", "std", "median", "skew",
    at_percentile(25), at_percentile(75),
]

# Data Exploration (todo)

In [None]:
# read train data: click log and ad attribute table
train_click_log = pd.read_csv(f"{DDIR}/train_preliminary/click_log.csv")
train_ad = pd.read_csv(f"{DDIR}/train_preliminary/ad.csv")
# user table; its "age" and "gender" columns are used as training targets below
train_user = pd.read_csv(f"{DDIR}/train_preliminary/user.csv")

In [None]:
# read test data: click log and ad attribute table (no user/label file is read for test)
test_click_log = pd.read_csv(f"{DDIR}/test/click_log.csv")
test_ad = pd.read_csv(f"{DDIR}/test/ad.csv")

In [None]:
train_click_log.shape

In [None]:
train_ad.shape

In [None]:
train_user.shape

In [None]:
test_click_log.shape

In [None]:
test_ad.shape

In [None]:
# Replace the "\N" placeholder with -1 so both columns can be cast to int.
for ad_col in ("product_id", "industry"):
    train_ad[ad_col] = train_ad[ad_col].replace("\\N", -1).astype(int)

In [None]:
# Replace the "\N" placeholder with -1 so both columns can be cast to int.
for ad_col in ("product_id", "industry"):
    test_ad[ad_col] = test_ad[ad_col].replace("\\N", -1).astype(int)

In [None]:
train_ad.count()

In [None]:
test_ad.count()

# Feature engineering

In [None]:
# Attach the ad attributes to every click row (left join keeps all clicks).
tol_train = train_click_log.merge(train_ad, how="left", on="creative_id")
tol_test = test_click_log.merge(test_ad, how="left", on="creative_id")

In [None]:
del train_click_log, train_ad
del test_click_log, test_ad

In [None]:
gc.collect()

## User Click log (Order 1)

In [None]:
# for train
# Order-1 features: one row per user with the row count, click_times
# statistics, time statistics, and the number of distinct values of each ad
# attribute. Each aggregated column is given a list of aggregation names.
tmp = tol_train.groupby([UID], sort=False).agg(
    {
        UID: ["count"], 
        "click_times": ["sum", "max", "mean", "std"],
        "time": ["nunique", "mean", "max", "min"],
        "creative_id": ["nunique"],
        "ad_id": ["nunique"],
        "product_id": ["nunique"],
        "product_category": ["nunique"],
        "advertiser_id": ["nunique"],
        "industry": ["nunique"],
    }
)

In [None]:
tmp.columns = bch_rencol(tmp.columns)

In [None]:
tmp.to_pickle(f"{UFEDIR}/train_o1.pkl")
tmp = None

In [None]:
# for test
# Same order-1 aggregation spec as used for the train log, applied to the test log.
tmp = tol_test.groupby([UID], sort=False).agg(
    {
        UID: ["count"], 
        "click_times": ["sum", "max", "mean", "std"],
        "time": ["nunique", "mean", "max", "min"],
        "creative_id": ["nunique"],
        "ad_id": ["nunique"],
        "product_id": ["nunique"],
        "product_category": ["nunique"],
        "advertiser_id": ["nunique"],
        "industry": ["nunique"],
    }
)

In [None]:
tmp.columns = bch_rencol(tmp.columns)

In [None]:
tmp.to_pickle(f"{UFEDIR}/test_o1.pkl")
tmp = None

In [None]:
gc.collect()

## User Click log (Order 2)

In [None]:
# train
# Order-2 features: count clicks per (user, value) of each ad attribute, then
# summarise those counts per user with every aggregation in OP_SET.
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    tmp = (
        tol_train
        .groupby([UID, col], sort=False)[[col]]
        .agg(["count"])
        .groupby([UID])
        .agg(OP_SET)
    )
    tmp.columns = bch_rencol(tmp.columns)
    tmp.to_pickle(f"{UFEDIR}/train_o2_{col}.pkl")
    # release the frame before the next attribute is processed
    tmp = None

In [None]:
# test
# Order-2 features: count clicks per (user, value) of each ad attribute, then
# summarise those counts per user with every aggregation in OP_SET.
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    tmp = (
        tol_test
        .groupby([UID, col], sort=False)[[col]]
        .agg(["count"])
        .groupby([UID])
        .agg(OP_SET)
    )
    tmp.columns = bch_rencol(tmp.columns)
    tmp.to_pickle(f"{UFEDIR}/test_o2_{col}.pkl")
    # release the frame before the next attribute is processed
    tmp = None

In [None]:
gc.collect()

## One-Hot

In [None]:
# train
# Per-user click counts for every product_category value, one column per
# value; (user, category) pairs that never occur are filled with 0.
tmp = (
    tol_train
    .groupby([UID, "product_category"], sort=False)[["product_category"]]
    .agg(["count"])
    .unstack()
    .fillna(0)
)
tmp.columns = bch_rencol(tmp.columns)

In [None]:
tmp.to_pickle(f"{UFEDIR}/train_onehot.pkl")
tmp = None

In [None]:
# test
# Per-user click counts for every product_category value, one column per
# value; (user, category) pairs that never occur are filled with 0.
tmp = (
    tol_test
    .groupby([UID, "product_category"], sort=False)[["product_category"]]
    .agg(["count"])
    .unstack()
    .fillna(0)
)
tmp.columns = bch_rencol(tmp.columns)

In [None]:
tmp.to_pickle(f"{UFEDIR}/test_onehot.pkl")
tmp = None

In [None]:
gc.collect()

## Time Windows (Time Bins)

In [None]:
# Number of time windows. The bin edges are computed once over train and test
# "time" values together, so both logs share the same windows.
bins = 9
tol_bins = pd.cut(
    x=pd.concat([tol_train["time"], tol_test["time"]]),
    bins=bins,
    labels=range(bins),
)

In [None]:
# tol_bins holds the train rows first and the test rows after them, so it is
# split positionally at the train row count.
n_train_rows = len(tol_train)
bin_col = f"bins{bins}"
tol_train[bin_col] = tol_bins.iloc[:n_train_rows]
tol_test[bin_col] = tol_bins.iloc[n_train_rows:]

### Order 1

In [None]:
# train
# Order-1 aggregates computed per (user, time bin); unstack() then moves the
# bin level into the columns so the result is indexed by user only.
tmp = tol_train.groupby([UID, f"bins{bins}"], sort=False).agg(
    {
        UID: ["count"],
        "click_times": ["sum", "max", "mean", "std"],
        "time": ["nunique", "mean", "max", "min"],
        "creative_id": ["nunique"],
        "ad_id": ["nunique"],
        "product_id": ["nunique"],
        "product_category": ["nunique"],
        "advertiser_id": ["nunique"],
        "industry": ["nunique"],
    }
).unstack()

In [None]:
tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_")

In [None]:
tmp.to_pickle(f"{UFEDIR}/train_bins{bins}_o1.pkl")
tmp = None

In [None]:
# test
# Order-1 aggregates computed per (user, time bin); unstack() then moves the
# bin level into the columns so the result is indexed by user only.
tmp = tol_test.groupby([UID, f"bins{bins}"], sort=False).agg(
    {
        UID: ["count"], 
        "click_times": ["sum", "max", "mean", "std"],
        "time": ["nunique", "mean", "max", "min"],
        "creative_id": ["nunique"],
        "ad_id": ["nunique"],
        "product_id": ["nunique"],
        "product_category": ["nunique"],
        "advertiser_id": ["nunique"],
        "industry": ["nunique"],
    }
).unstack()

In [None]:
tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_")

In [None]:
tmp.to_pickle(f"{UFEDIR}/test_bins{bins}_o1.pkl")
tmp = None

In [None]:
gc.collect()

### Order 2

In [None]:
# space consuming
# # train
# for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
#     tmp = tol_train.groupby([UID, f"bins{bins}", col], sort=False)[[col]].agg(["count"]).groupby([UID, f"bins{bins}"]).agg(OP_SET).unstack()
#     tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_")
#     tmp.to_pickle(f"{UFEDIR}/train_bins{bins}_o2_{col}.pkl")
#     tmp = None

In [None]:
# space consuming
# # test
# for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
#     tmp = tol_test.groupby([UID, f"bins{bins}", col], sort=False)[[col]].agg(["count"]).groupby([UID, f"bins{bins}"]).agg(OP_SET).unstack()
#     tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_")
#     tmp.to_pickle(f"{UFEDIR}/test_bins{bins}_o2_{col}.pkl")
#     tmp = None

In [None]:
# train
# Order-2 features per time bin: each bin's rows are selected first and written
# to their own pickle, one file per (attribute, bin).
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    for i in range(bins):
        tmp = (
            tol_train[tol_train[f"bins{bins}"] == i]
            .groupby([UID, col], sort=False)[[col]]
            .agg(["count"])
            .groupby([UID])
            .agg(OP_SET)
        )
        tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_{i}_")
        tmp.to_pickle(f"{UFEDIR}/train_bins{bins}_{i}_o2_{col}.pkl")
        # release the frame before the next bin is processed
        tmp = None

In [None]:
# test
# Order-2 features per time bin: each bin's rows are selected first and written
# to their own pickle, one file per (attribute, bin).
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    for i in range(bins):
        tmp = (
            tol_test[tol_test[f"bins{bins}"] == i]
            .groupby([UID, col], sort=False)[[col]]
            .agg(["count"])
            .groupby([UID])
            .agg(OP_SET)
        )
        tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_{i}_")
        tmp.to_pickle(f"{UFEDIR}/test_bins{bins}_{i}_o2_{col}.pkl")
        # release the frame before the next bin is processed
        tmp = None

In [None]:
gc.collect()

## Concat

In [None]:
# Start each feature table from its user-id column: train ids in the order of
# the user table, test ids as the sorted distinct ids of the test click log.
train_feat = pd.DataFrame({UID: train_user[UID]})
test_feat = pd.DataFrame({UID: np.sort(tol_test[UID].unique())})

In [None]:
# os.listdir returns entries in arbitrary order. The merge loop below appends
# feature columns in listing order, so an unsorted listing could give the train
# and test tables different column orders. Sorting places every train_<x>
# file in the same relative position as its test_<x> counterpart.
feat_fname = sorted(os.listdir(UFEDIR))

In [None]:
# Left-join every saved feature pickle onto the matching table by user id;
# files whose name starts with neither prefix are ignored.
for fname in feat_fname:
    feat_path = f"{UFEDIR}/{fname}"
    if fname.startswith("train_"):
        train_feat = train_feat.merge(pd.read_pickle(feat_path), how="left", on=UID)
    elif fname.startswith("test_"):
        test_feat = test_feat.merge(pd.read_pickle(feat_path), how="left", on=UID)

In [None]:
# Remove every column whose name contains "getidxmax" from both tables.
train_idxmax_cols = [name for name in train_feat.columns if "getidxmax" in name]
train_feat.drop(train_idxmax_cols, axis=1, inplace=True)
test_idxmax_cols = [name for name in test_feat.columns if "getidxmax" in name]
test_feat.drop(test_idxmax_cols, axis=1, inplace=True)

In [None]:
# Sanity check: the feature rows must be in the same user order as the label
# table, because the two frames are split together and used as features/targets.
# The displayed value is the number of mismatching positions; it must be 0.
np.sum(train_feat[UID] != train_user[UID])

In [None]:
train_feat.shape

In [None]:
test_feat.shape

In [None]:
del tol_test

In [None]:
train_feat.to_pickle(f"{UDDIR}/train_feat_tol.pkl")
test_feat.to_pickle(f"{UDDIR}/test_feat_tol.pkl")

In [None]:
# train_feat = pd.read_pickle(f"{UDDIR}/train_feat_tol.pkl")
test_feat = pd.read_pickle(f"{UDDIR}/test_feat_tol.pkl")
# train_user = pd.read_csv(f"{DDIR}/train_preliminary/user.csv")

In [None]:
# train_feat["industry_count_getidxmax"] = train_feat["industry_count_getidxmax"].replace("\\N", -1).astype(float)
# train_feat["product_id_count_getidxmax"] = train_feat["product_id_count_getidxmax"].replace("\\N", -1).astype(float)
# test_feat["industry_count_getidxmax"] = test_feat["industry_count_getidxmax"].replace("\\N", -1).astype(float)
# test_feat["product_id_count_getidxmax"] = test_feat["product_id_count_getidxmax"].replace("\\N", -1).astype(float)

# Training&Prediction

In [None]:
# split data
# Hold out 20% of the rows for validation. random_state is fixed so the split,
# and therefore the validation accuracy reported below, is reproducible across
# kernel restarts; without it every run shuffled differently.
train_feat_tr, train_feat_val, train_tag_tr, train_tag_val = train_test_split(
    train_feat, train_user, test_size=0.2, random_state=42
)

In [None]:
del train_feat, train_user

In [None]:
gc.collect()

## For Age

In [None]:
# LightGBM datasets for the age model. The id column is not a feature, and the
# age labels are shifted by -1 before being passed as class labels.
lgbds_train_tr_age = lgb.Dataset(
    data=train_feat_tr.drop(UID, axis=1),
    label=train_tag_tr["age"] - 1,
)
lgbds_train_val_age = lgb.Dataset(
    data=train_feat_val.drop(UID, axis=1),
    label=train_tag_val["age"] - 1,
)

In [None]:
# LightGBM settings for the 10-class age model; "multi_error" is the metric
# evaluated on the validation set.
params_age = dict(
    boosting_type="gbdt",
    objective="multiclass",
    num_class=10,
    metric="multi_error",
    learning_rate=0.1,
)

In [None]:
# Train the age model for up to 1000 rounds, evaluating on the validation
# Dataset, with verbose_eval=50 and early_stopping_rounds=100.
# NOTE(review): newer LightGBM releases replaced the verbose_eval /
# early_stopping_rounds keyword arguments of lgb.train with callbacks —
# confirm against the installed version.
model_lgb_multi_age = lgb.train(params_age, lgbds_train_tr_age, num_boost_round=1000, valid_sets=[lgbds_train_val_age], verbose_eval=50, early_stopping_rounds=100)

In [None]:
# Save the model under a name stamped with the current local time
# (YYYYmmddHHMMSS) so successive runs do not overwrite each other.
ndt = time.strftime("%Y%m%d%H%M%S")
age_model_path = f"{UMDIR}/lgb_multi_age_{ndt}.model"
model_lgb_multi_age.save_model(age_model_path)

In [None]:
train_val_age_prob = model_lgb_multi_age.predict(train_feat_val.drop(UID, axis=1), num_iteration=model_lgb_multi_age.best_iteration)

In [None]:
# Predicted class = position of the largest probability in each row, shifted by
# +1 to undo the -1 applied to the age labels for training. np.argmax does this
# in one vectorised call instead of building and scanning a Python list per
# row; like list.index(max(...)) it returns the first maximum on ties.
train_val_age_pred = np.argmax(train_val_age_prob, axis=1) + 1

In [None]:
# Validation accuracy of the age model. Arguments are given in the conventional
# (y_true, y_pred) order; the fraction of matching entries is the same either way.
age_acy = accuracy_score(train_tag_val["age"], train_val_age_pred)

In [None]:
# 20:05-21

## For Gender

In [None]:
# LightGBM datasets for the gender model. The id column is not a feature, and
# the gender labels are shifted by -1 before being passed as class labels.
lgbds_train_tr_gender = lgb.Dataset(
    data=train_feat_tr.drop(UID, axis=1),
    label=train_tag_tr["gender"] - 1,
)
lgbds_train_val_gender = lgb.Dataset(
    data=train_feat_val.drop(UID, axis=1),
    label=train_tag_val["gender"] - 1,
)

In [None]:
# LightGBM settings for the gender model, trained as a 2-class multiclass
# problem; "multi_error" is the metric evaluated on the validation set.
params_gender = dict(
    boosting_type="gbdt",
    objective="multiclass",
    num_class=2,
    metric="multi_error",
    learning_rate=0.1,
)

In [None]:
# Train the gender model for up to 1000 rounds, evaluating on the validation
# Dataset, with verbose_eval=50 and early_stopping_rounds=100.
# NOTE(review): newer LightGBM releases replaced the verbose_eval /
# early_stopping_rounds keyword arguments of lgb.train with callbacks —
# confirm against the installed version.
model_lgb_multi_gender = lgb.train(params_gender, lgbds_train_tr_gender, num_boost_round=1000, valid_sets=[lgbds_train_val_gender], verbose_eval=50, early_stopping_rounds=100)

In [None]:
# Save the model under a name stamped with the current local time
# (YYYYmmddHHMMSS) so successive runs do not overwrite each other.
ndt = time.strftime("%Y%m%d%H%M%S")
gender_model_path = f"{UMDIR}/lgb_multi_gender_{ndt}.model"
model_lgb_multi_gender.save_model(gender_model_path)

In [None]:
train_val_gender_prob = model_lgb_multi_gender.predict(train_feat_val.drop(UID, axis=1), num_iteration=model_lgb_multi_gender.best_iteration)

In [None]:
# Predicted class = position of the largest probability in each row, shifted by
# +1 to undo the -1 applied to the gender labels for training. np.argmax does
# this in one vectorised call instead of building and scanning a Python list
# per row; like list.index(max(...)) it returns the first maximum on ties.
train_val_gender_pred = np.argmax(train_val_gender_prob, axis=1) + 1

In [None]:
# Validation accuracy of the gender model. Arguments are given in the conventional
# (y_true, y_pred) order; the fraction of matching entries is the same either way.
gender_acy = accuracy_score(train_tag_val["gender"], train_val_gender_pred)

In [None]:
gender_acy

In [None]:
# total accuracy
age_acy + gender_acy

# Generate Prediction Result

In [None]:
# Reload an age model from a hard-coded, timestamped file. This rebinds
# model_lgb_multi_age, so the model trained above is no longer the one used below.
# NOTE(review): the file name must be updated by hand to the run being submitted.
model_lgb_multi_age = lgb.Booster(model_file=f"{UMDIR}/lgb_multi_age_20200510133141.model")

In [None]:
# Reload a gender model from a hard-coded, timestamped file. This rebinds
# model_lgb_multi_gender, so the model trained above is no longer the one used below.
# NOTE(review): the file name must be updated by hand to the run being submitted.
model_lgb_multi_gender = lgb.Booster(model_file=f"{UMDIR}/lgb_multi_gender_20200510120440.model")

In [None]:
lgb.plot_importance(model_lgb_multi_gender, max_num_features=10)

In [None]:
lgb.plot_importance(model_lgb_multi_age, max_num_features=10)

In [None]:
# Result table starts from the test user ids. Take an explicit copy: prediction
# columns are added to `res` below, and assigning columns to a frame obtained
# by selecting from test_feat triggers pandas' SettingWithCopyWarning.
res = test_feat[[UID]].copy()

In [None]:
# Remove the id column so only model features are passed to predict().
# Rebinding with errors="ignore" (instead of an in-place drop) lets this cell
# be re-run without a KeyError once the column is already gone.
test_feat = test_feat.drop(columns=[UID], errors="ignore")

In [None]:
# Predicted age = position of the largest class probability per row, plus 1
# (labels were shifted by -1 for training). Vectorised with np.argmax instead
# of a per-row Python list scan; ties resolve to the first maximum as before.
res["predicted_age"] = np.argmax(
    model_lgb_multi_age.predict(test_feat, num_iteration=model_lgb_multi_age.best_iteration),
    axis=1,
) + 1

In [None]:
# Predicted gender = position of the largest class probability per row, plus 1
# (labels were shifted by -1 for training). Vectorised with np.argmax instead
# of a per-row Python list scan; ties resolve to the first maximum as before.
res["predicted_gender"] = np.argmax(
    model_lgb_multi_gender.predict(test_feat, num_iteration=model_lgb_multi_gender.best_iteration),
    axis=1,
) + 1

In [None]:
# Write the predictions to a CSV named with the current local time
# (YYYYmmddHHMMSS); the index is not written.
res_suffix = time.strftime("%Y%m%d%H%M%S")
res_path = f"{RESDIR}/res-{res_suffix}.csv"
res.to_csv(res_path, index=False)

In [None]:
res["predicted_age"].value_counts()

In [None]:
res["predicted_gender"].value_counts()

In [None]:
res.shape

# Send result to COS

In [None]:
# Open a session with the `ti` package, used below to upload the result file.
# NOTE(review): `ti` is imported here rather than in the import cell at the top
# and is not among the packages listed in the install cell — confirm it is
# available in the target environment.
from ti import session
ti_session = session.Session()

In [None]:
# Upload one result CSV to the bucket under the RESDIR key prefix.
# NOTE(review): both the file name (a fixed timestamp, not res_suffix from the
# cell that writes the CSV) and the bucket name are hard-coded — confirm they
# match the run being submitted.
inputs = ti_session.upload_data(path=f"{RESDIR}/res-20200510140336.csv", bucket="etveritas-1252104022", key_prefix=RESDIR)

In [None]:
# scratch note: uid --> unique

In [None]:
# NOTE(review): scratch sketch of a higher-order aggregation. The names uid, tid,
# cid, aid, pid and iid are not defined in the visible notebook, so this cell
# raises NameError as written — confirm whether it should be kept.
tol_train.groupby([uid,tid, cid, aid, pid, iid])[iid].agg(["count"]).groupby([uid, tid, cid, aid, pid]).agg(["max", "min", "mean", "sum", "std"])

In [None]:
# scratch example of per-(uid, aid, cid) counts:
# uid aid cid count
# 1 2 4 1
# 1 2 5 2
# 1 3 4 2
# 1 3 3 3
# 1 3 2 3

In [None]:
# NOTE(review): unfinished scratch sketch. uid, tid, cid, aid and pid are not
# defined in the visible notebook and the trailing groupby() is given no keys,
# so this cell fails as written — confirm whether it should be kept.
tol_train.groupby([uid, tid, cid, aid, pid])[pid].agg(["count"]).groupby()

In [None]:
# NOTE(review): scratch sketch. uid, tid, cid, aid, pid, iid and click_times are
# used as bare names that are not defined in the visible notebook, so this cell
# raises NameError as written — confirm whether it should be kept.
tol_train.groupby([uid,tid, cid, aid, pid, iid])[click_times].agg(["max"])

In [None]:
9**3