In [4]:
# Uncomment the lines below to install the required packages with pip
# !pip install xgboost
# !pip install lightgbm
# !pip install wget

In [5]:
import os
import sys
import gc
import wget
import time
import tarfile
import zipfile
import random
from itertools import product, combinations
from tqdm import tqdm

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import xgboost as xgb
import lightgbm as lgb

from joblib import Parallel, delayed

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [6]:
# Directory layout; all paths are relative to the notebook's working directory.
DDIR = "data"                        # raw input CSVs are read from here
UDDIR = "user_data"                  # root for intermediate artefacts
UFEDIR = "user_data/feat_data_v01"   # one pickle per generated feature group
UMDIR = "user_data/model_data"       # saved LightGBM model files
RESDIR = "prediction_result"         # exported prediction CSVs

# Create the output directories up front so that later to_pickle / to_csv
# calls do not fail when the notebook is run in a fresh working directory.
for _out_dir in (UFEDIR, UMDIR, RESDIR):
    os.makedirs(_out_dir, exist_ok=True)

In [7]:
# Name of the user key column; used as the groupby / merge key throughout the notebook.
UID = "user_id"

In [8]:
from scipy.special import comb, perm

In [9]:
# Sum C(63, i) for i = 2..63. scipy's comb returns a float by default, so the
# total is accumulated in floating point rather than as an exact integer.
cnt = 0
for i in range(2, 64):
    cnt += comb(63,i)

In [10]:
comb(63,2)

1953.0

In [11]:
int(cnt/(8*3))

384307168202282176

In [12]:
int(cnt)

9223372036854772736

# Load data (Only once)

In [13]:
# train_fname = wget.download("https://tesla-ap-shanghai-1256322946.cos.ap-shanghai.myqcloud.com/cephfs/tesla_common/deeplearning/dataset/algo_contest/train_preliminary.zip", out=DDIR)
# test_fname = wget.download("https://tesla-ap-shanghai-1256322946.cos.ap-shanghai.myqcloud.com/cephfs/tesla_common/deeplearning/dataset/algo_contest/test.zip", out=DDIR)

In [14]:
# def myunzip(filename):
#     zFile = zipfile.ZipFile(filename, "r")
#     for fileM in zFile.namelist(): 
#         zFile.extract(fileM, DDIR)
#         print(fileM)
#     zFile.close()

In [15]:
# myunzip(train_fname)
# myunzip(test_fname)

# Utils

In [16]:
def bch_rencol(values, prefix="", suffix=""):
    """Flatten a sequence of column labels into plain strings.

    Each label that is a non-string iterable (e.g. a MultiIndex tuple) has its
    parts converted with ``str`` and joined by ``"_"``; any other label is
    converted with ``str`` directly. ``prefix`` and ``suffix`` are added
    around every resulting name.

    Returns a list of strings in the same order as ``values``.
    """
    renamed = []
    for label in values:
        if hasattr(label, "__iter__") and not isinstance(label, str):
            core = "_".join(str(part) for part in label)
        else:
            core = str(label)
        renamed.append(f"{prefix}{core}{suffix}")
    return renamed

In [17]:
def mynunique(values):
    """Return the number of distinct entries in ``values``, counting missing values as one entry."""
    distinct_count = values.nunique(dropna=False)
    return distinct_count
def getidxmax(x):
    """Return element 1 of the index label at which ``x`` attains its maximum.

    The label returned by ``idxmax`` must be indexable (e.g. a MultiIndex tuple).
    """
    top_label = x.idxmax()
    return top_label[1]
# for time series
def at_len(x):
    """Number of elements in ``x``."""
    n_items = len(x)
    return n_items

def at_sum(x):
    """Sum of the elements of ``x`` (via ``np.sum``)."""
    total = np.sum(x)
    return total

def at_max(x):
    """Largest element of ``x`` (via ``np.max``)."""
    largest = np.max(x)
    return largest

def at_min(x):
    """Smallest element of ``x`` (via ``np.min``)."""
    smallest = np.min(x)
    return smallest

def at_mean(x):
    """Arithmetic mean of ``x`` (via ``np.mean``)."""
    average = np.mean(x)
    return average

def at_range(x):
    """Spread of ``x``: its maximum minus its minimum."""
    upper = at_max(x)
    lower = at_min(x)
    return upper - lower

def at_nunq(x):
    """Number of distinct elements in ``x`` (elements must be hashable)."""
    distinct = set(x)
    return len(distinct)

def at_lenDrange(x):
    """Element count of ``x`` divided by (its range + 1)."""
    span = at_range(x) + 1
    return at_len(x) / span

def at_lenDnunq(x):
    """Element count of ``x`` divided by its number of distinct elements."""
    n_distinct = at_nunq(x)
    return at_len(x) / n_distinct

def at_percentile(n):
    """Build an aggregation function that evaluates ``np.percentile(x, n)``.

    The returned function's ``__name__`` is set to ``at_percentile_<n>`` so
    that functions built for different ``n`` carry distinguishable names.

    ``n`` is forwarded unchanged as the ``q`` argument of ``np.percentile``,
    which numpy documents on a 0-100 scale.
    """
    def at_percentile_(x):
        return np.percentile(x, n)

    setattr(at_percentile_, "__name__", f"at_percentile_{n}")
    return at_percentile_

In [18]:
# Aggregations applied to grouped columns. OP_SET1 is OP_SET2 plus "nunique".
# np.percentile takes q on a 0-100 scale, so the lower / upper quartiles are
# requested as 25 and 75 (0.25 / 0.75 would be the 0.25th / 0.75th percentiles,
# i.e. almost the minimum).
OP_SET1 = ["nunique", "sum", "max", "min", "mean", "std", "median", "skew", at_percentile(25), at_percentile(75)]
OP_SET2 = ["sum", "max", "min", "mean", "std", "median", "skew", at_percentile(25), at_percentile(75)]

In [19]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their value range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered. Float columns may be stored as float16, which keeps only
    about three significant decimal digits; values are rounded accordingly.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column whose extreme value equals the dtype
            # limit (e.g. 127 for int8) still fits in that dtype.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max not comparable (NaN / inf).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-byte frame so the report never divides by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Data Exploration (todo)

In [20]:
# read train data
train_click_log = pd.read_csv(f"{DDIR}/train_preliminary/click_log.csv")
train_ad = pd.read_csv(f"{DDIR}/train_preliminary/ad.csv")
# tag
train_user = pd.read_csv(f"{DDIR}/train_preliminary/user.csv")

In [21]:
# read test data
test_click_log = pd.read_csv(f"{DDIR}/test/click_log.csv")
test_ad = pd.read_csv(f"{DDIR}/test/ad.csv")

In [22]:
train_click_log.shape

(30082771, 4)

In [23]:
train_ad.shape

(2481135, 6)

In [24]:
train_user.shape

(900000, 3)

In [25]:
test_click_log.shape

(33585512, 4)

In [26]:
test_ad.shape

(2618159, 6)

In [27]:
# Map the literal string "\N" to -1 so both columns can be cast to int.
# NOTE(review): "\N" is presumably the export's marker for a missing value — confirm.
train_ad["product_id"] = train_ad["product_id"].replace("\\N", -1).astype(int)
train_ad["industry"] = train_ad["industry"].replace("\\N", -1).astype(int)

In [28]:
# Same clean-up as for the training ads: replace the literal "\N" with -1, then cast to int.
test_ad["product_id"] = test_ad["product_id"].replace("\\N", -1).astype(int)
test_ad["industry"] = test_ad["industry"].replace("\\N", -1).astype(int)

In [29]:
train_ad.count()

creative_id         2481135
ad_id               2481135
product_id          2481135
product_category    2481135
advertiser_id       2481135
industry            2481135
dtype: int64

In [30]:
test_ad.count()

creative_id         2618159
ad_id               2618159
product_id          2618159
product_category    2618159
advertiser_id       2618159
industry            2618159
dtype: int64

# Feature engineering

In [31]:
# Attach the ad attributes to every click row (left join keeps all clicks).
tol_train = train_click_log.merge(train_ad, how="left", on="creative_id")
tol_test = test_click_log.merge(test_ad, how="left", on="creative_id")

In [32]:
# The raw tables are no longer needed once merged; drop the references.
del train_click_log, train_ad, test_click_log, test_ad

In [33]:
gc.collect()

0

## One key(O1)

In [None]:
# for train
# Per-user aggregates: row count, OP_SET1 statistics of click_times / time,
# and the number of distinct values of every id column.
tmp = tol_train.groupby([UID], sort=False).agg(
    {
        UID: ["count"],
        **{stat_col: OP_SET1 for stat_col in ("click_times", "time")},
        **{
            id_col: ["nunique"]
            for id_col in (
                "creative_id",
                "ad_id",
                "product_id",
                "product_category",
                "advertiser_id",
                "industry",
            )
        },
    }
)

In [None]:
# Flatten the (column, aggregation) MultiIndex produced by agg into single string names.
tmp.columns = bch_rencol(tmp.columns)

In [None]:
# Persist the feature group, then drop the reference so the frame can be garbage-collected.
tmp.to_pickle(f"{UFEDIR}/train_o1.pkl")
tmp = None

In [None]:
# for test
# Same per-user aggregates as for the training set, computed on the test clicks.
tmp = tol_test.groupby([UID], sort=False).agg(
    {
        UID: ["count"],
        **{stat_col: OP_SET1 for stat_col in ("click_times", "time")},
        **{
            id_col: ["nunique"]
            for id_col in (
                "creative_id",
                "ad_id",
                "product_id",
                "product_category",
                "advertiser_id",
                "industry",
            )
        },
    }
)

In [None]:
# Flatten the (column, aggregation) MultiIndex produced by agg into single string names.
tmp.columns = bch_rencol(tmp.columns)

In [None]:
# Persist the feature group, then drop the reference so the frame can be garbage-collected.
tmp.to_pickle(f"{UFEDIR}/test_o1.pkl")
tmp = None

In [None]:
gc.collect()

## Comb Key(O2)

In [None]:
# Id columns whose combinations (sizes 1..6) are used as extra grouping keys next to the user id.
id_list = ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]

In [None]:
# train
for i in tqdm(range(1, 7)):
    for cl in combinations(id_list, i):
        key_set = [UID, *cl]
        # Step 1: count of the last key column per (user, key combination) group.
        tmp = tol_train.groupby(key_set, sort=False)[key_set[-1:]].agg(["count"])
        # Step 2: summarise those per-group counts for each user.
        tmp = tmp.groupby([UID]).agg(OP_SET2)
        tmp.columns = bch_rencol(tmp.columns, prefix=f"{'C'.join(key_set[1:-1])}_")
        tmp.to_pickle(f"{UFEDIR}/train_o2_{'C'.join(key_set)}.pkl")
        tmp = None

In [None]:
# test
for i in tqdm(range(1, 7)):
    for cl in combinations(id_list, i):
        key_set = [UID, *cl]
        # Step 1: count of the last key column per (user, key combination) group.
        tmp = tol_test.groupby(key_set, sort=False)[key_set[-1:]].agg(["count"])
        # Step 2: summarise those per-group counts for each user.
        tmp = tmp.groupby([UID]).agg(OP_SET2)
        tmp.columns = bch_rencol(tmp.columns, prefix=f"{'C'.join(key_set[1:-1])}_")
        tmp.to_pickle(f"{UFEDIR}/test_o2_{'C'.join(key_set)}.pkl")
        tmp = None

In [None]:
gc.collect()

## Time Windows

### Time Bins

In [3]:
# Bin counts passed to pd.cut below; one "bins<N>" column is created per entry.
bins_set = [3, 7, 10, 30]

In [None]:
for bins in bins_set:
    # Bin train and test times together so both sets share the same bin edges.
    tol_bins = pd.concat([tol_train["time"], tol_test["time"]])
    tol_bins = pd.cut(tol_bins, bins, labels=range(bins))
    # Split the result back by position: the first len(tol_train) rows are train, the rest test.
    tol_train[f"bins{bins}"] = tol_bins.iloc[:len(tol_train)]
    tol_test[f"bins{bins}"] = tol_bins.iloc[len(tol_train):]

#### One Key(O1)

In [None]:
# For train
for bins in bins_set:
    for cb in tqdm(range(bins)):
        # Restrict to the clicks that fall in time bin `cb`, then aggregate per user.
        tmp = tol_train[tol_train[f"bins{bins}"] == cb]
        tmp = tmp.groupby([UID], sort=False).agg(
            {
                UID: ["count"],
                **{stat_col: OP_SET1 for stat_col in ("click_times", "time")},
                **{
                    id_col: ["nunique"]
                    for id_col in (
                        "creative_id",
                        "ad_id",
                        "product_id",
                        "product_category",
                        "advertiser_id",
                        "industry",
                    )
                },
            }
        )
        tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_{cb}_")
        tmp.to_pickle(f"{UFEDIR}/train_o1_bins{bins}_{cb}.pkl")
        tmp = None

In [None]:
# For test
for bins in bins_set:
    for cb in tqdm(range(bins)):
        # Restrict to the clicks that fall in time bin `cb`, then aggregate per user.
        tmp = tol_test[tol_test[f"bins{bins}"] == cb]
        tmp = tmp.groupby([UID], sort=False).agg(
            {
                UID: ["count"],
                **{stat_col: OP_SET1 for stat_col in ("click_times", "time")},
                **{
                    id_col: ["nunique"]
                    for id_col in (
                        "creative_id",
                        "ad_id",
                        "product_id",
                        "product_category",
                        "advertiser_id",
                        "industry",
                    )
                },
            }
        )
        tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_{cb}_")
        tmp.to_pickle(f"{UFEDIR}/test_o1_bins{bins}_{cb}.pkl")
        tmp = None

In [None]:
gc.collect()

#### Comb Key(O2)

In [None]:
# Id columns whose combinations are used as extra grouping keys next to the user id.
id_list = ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]

In [None]:
# train
for bins in bins_set:
    for cb in tqdm(range(bins)):
        # The row filter only depends on (bins, cb): compute it once here
        # instead of once per key combination inside the inner loops.
        bin_rows = tol_train[tol_train[f"bins{bins}"] == cb]
        for i in tqdm(range(1, len(id_list) + 1)):
            for cl in combinations(id_list, i):
                key_set = [UID] + list(cl)
                # Count of the last key column per (user, key combination)
                # group, then per-user summary statistics of those counts.
                tmp = bin_rows.groupby(key_set, sort=False)[key_set[-1:]].agg(["count"]).groupby([UID]).agg(OP_SET2)
                tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_{cb}_{'C'.join(key_set[1:-1])}_")
                tmp.to_pickle(f"{UFEDIR}/train_o2_bins{bins}_{cb}_{'C'.join(key_set)}.pkl")
                tmp = None
        # Release the filtered subset before moving on to the next bin.
        bin_rows = None

In [None]:
# test
for bins in bins_set:
    for cb in tqdm(range(bins)):
        # The row filter only depends on (bins, cb): compute it once here
        # instead of once per key combination inside the inner loops.
        bin_rows = tol_test[tol_test[f"bins{bins}"] == cb]
        for i in tqdm(range(1, len(id_list) + 1)):
            for cl in combinations(id_list, i):
                key_set = [UID] + list(cl)
                # Count of the last key column per (user, key combination)
                # group, then per-user summary statistics of those counts.
                tmp = bin_rows.groupby(key_set, sort=False)[key_set[-1:]].agg(["count"]).groupby([UID]).agg(OP_SET2)
                tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_{cb}_{'C'.join(key_set[1:-1])}_")
                tmp.to_pickle(f"{UFEDIR}/test_o2_bins{bins}_{cb}_{'C'.join(key_set)}.pkl")
                tmp = None
        # Release the filtered subset before moving on to the next bin.
        bin_rows = None

### Time Slides

In [2]:
# Window lengths for the trailing windows below; each window keeps rows with time >= (user's max time - win).
slide_set = [1, 2, 3, 7, 14, 21, 30, 60, 90]

In [39]:
# Broadcast each user's latest "time" value onto all of that user's rows;
# the sliding-window filters below compare every row against it.
tol_train["max_time"] = tol_train.groupby([UID], sort=False)["time"].transform("max")
tol_test["max_time"] = tol_test.groupby([UID], sort=False)["time"].transform("max")

#### One Key(O1)

In [None]:
# For train
for win in slide_set:
    # Keep only each user's clicks within `win` time units of their latest click.
    tmp = tol_train[tol_train["time"] >= tol_train["max_time"] - win]
    tmp = tmp.groupby([UID], sort=False).agg(
        {
            UID: ["count"],
            **{stat_col: OP_SET1 for stat_col in ("click_times", "time")},
            **{
                id_col: ["nunique"]
                for id_col in (
                    "creative_id",
                    "ad_id",
                    "product_id",
                    "product_category",
                    "advertiser_id",
                    "industry",
                )
            },
        }
    )
    tmp.columns = bch_rencol(tmp.columns, prefix=f"slide{win}_")
    tmp.to_pickle(f"{UFEDIR}/train_o1_slide{win}.pkl")
    tmp = None

In [None]:
# For test
for win in slide_set:
    # Keep only each user's clicks within `win` time units of their latest click.
    tmp = tol_test[tol_test["time"] >= tol_test["max_time"] - win]
    tmp = tmp.groupby([UID], sort=False).agg(
        {
            UID: ["count"],
            **{stat_col: OP_SET1 for stat_col in ("click_times", "time")},
            **{
                id_col: ["nunique"]
                for id_col in (
                    "creative_id",
                    "ad_id",
                    "product_id",
                    "product_category",
                    "advertiser_id",
                    "industry",
                )
            },
        }
    )
    tmp.columns = bch_rencol(tmp.columns, prefix=f"slide{win}_")
    tmp.to_pickle(f"{UFEDIR}/test_o1_slide{win}.pkl")
    tmp = None

#### Comb Key(O2)

In [None]:
# Id columns whose combinations are used as extra grouping keys next to the user id.
id_list = ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]

In [None]:
# train
for win in slide_set:
    # The window filter only depends on `win`: compute it once per window
    # instead of once per key combination inside the inner loops.
    recent_rows = tol_train[tol_train["time"] >= tol_train["max_time"] - win]
    for i in tqdm(range(1, len(id_list) + 1)):
        for cl in combinations(id_list, i):
            key_set = [UID] + list(cl)
            # Count of the last key column per (user, key combination) group,
            # then per-user summary statistics of those counts.
            tmp = recent_rows.groupby(key_set, sort=False)[key_set[-1:]].agg(["count"]).groupby([UID]).agg(OP_SET2)
            tmp.columns = bch_rencol(tmp.columns, prefix=f"slide{win}_{'C'.join(key_set[1:-1])}_")
            tmp.to_pickle(f"{UFEDIR}/train_o2_slide{win}_{'C'.join(key_set)}.pkl")
            tmp = None
    # Release the filtered subset before moving on to the next window.
    recent_rows = None

In [None]:
# test
for win in slide_set:
    # The window filter only depends on `win`: compute it once per window
    # instead of once per key combination inside the inner loops.
    recent_rows = tol_test[tol_test["time"] >= tol_test["max_time"] - win]
    for i in tqdm(range(1, len(id_list) + 1)):
        for cl in combinations(id_list, i):
            key_set = [UID] + list(cl)
            # Count of the last key column per (user, key combination) group,
            # then per-user summary statistics of those counts.
            tmp = recent_rows.groupby(key_set, sort=False)[key_set[-1:]].agg(["count"]).groupby([UID]).agg(OP_SET2)
            tmp.columns = bch_rencol(tmp.columns, prefix=f"slide{win}_{'C'.join(key_set[1:-1])}_")
            tmp.to_pickle(f"{UFEDIR}/test_o2_slide{win}_{'C'.join(key_set)}.pkl")
            tmp = None
    # Release the filtered subset before moving on to the next window.
    recent_rows = None

## One-Hot

In [None]:
# train
# Per-user click counts for every product_category value, one column per category.
tmp = tol_train.groupby([UID, "product_category"], sort=False)[["product_category"]].agg(["count"])
# Pivot the category level into columns; user/category pairs that never occur become 0.
tmp = tmp.unstack().fillna(0)
tmp.columns = bch_rencol(tmp.columns)

In [None]:
# Persist the feature group, then drop the reference so the frame can be garbage-collected.
tmp.to_pickle(f"{UFEDIR}/train_onehot.pkl")
tmp = None

In [None]:
# test
# Per-user click counts for every product_category value, one column per category.
tmp = tol_test.groupby([UID, "product_category"], sort=False)[["product_category"]].agg(["count"])
# Pivot the category level into columns; user/category pairs that never occur become 0.
tmp = tmp.unstack().fillna(0)
tmp.columns = bch_rencol(tmp.columns)

In [None]:
# Persist the feature group, then drop the reference so the frame can be garbage-collected.
tmp.to_pickle(f"{UFEDIR}/test_onehot.pkl")
tmp = None

In [None]:
gc.collect()

## Concat

In [None]:
# Start each feature table with just the user-id column: the training ids in
# the order of train_user, the test ids as the sorted unique ids of the test clicks.
train_feat = train_user[[UID]].copy()
test_feat = pd.DataFrame({UID: np.sort(tol_test[UID].unique())})

In [None]:
# Every file currently in the feature directory, in sorted name order (this order fixes the column order after merging).
feat_fname = sorted(os.listdir(UFEDIR))

In [None]:
# Left-join every saved feature group onto the matching user table, keyed on the user id.
# Files are routed by name prefix: "train*" -> train_feat, "test*" -> test_feat; anything else is ignored.
# NOTE(review): every matching file in UFEDIR is merged, including any left over from earlier runs.
for fname in feat_fname:
#     (disabled alternative: only O1 files and single-key O2 files)
#     if fname.startswith("train_o1") or (fname.startswith("train_o2") and fname.count("C") == 1):
    if fname.startswith("train"):
        train_feat = pd.merge(train_feat, pd.read_pickle(f"{UFEDIR}/{fname}"), how="left", on=UID)
#     elif fname.startswith("test_o1") or (fname.startswith("test_o2") and fname.count("C") == 1):
    elif fname.startswith("test"):
        test_feat = pd.merge(test_feat, pd.read_pickle(f"{UFEDIR}/{fname}"), how="left", on=UID)

In [None]:
# to make sure feat and user(target) have same order
# if true --> sum == 0
np.sum(train_feat[UID] != train_user[UID])

In [None]:
# to make sure feat and user(target) have same order
# if true --> sum == 0
np.sum(test_feat[UID] != np.sort(tol_test[UID].unique()))

In [None]:
np.sum(train_feat.columns != test_feat.columns)

In [None]:
train_feat.shape

In [None]:
test_feat.shape

In [None]:
train_feat.head()

In [None]:
test_feat.head()

In [None]:
# del tol_test

In [None]:
gc.collect()

In [None]:
# train_feat.to_pickle(f"{UDDIR}/train_feat_tol.pkl")
# test_feat.to_pickle(f"{UDDIR}/test_feat_tol.pkl")

In [None]:
# train_feat = pd.read_pickle(f"{UDDIR}/train_feat_tol.pkl")
# test_feat = pd.read_pickle(f"{UDDIR}/test_feat_tol.pkl")
# train_user = pd.read_csv(f"{DDIR}/train_preliminary/user.csv")

In [None]:
# train_feat["industry_count_getidxmax"] = train_feat["industry_count_getidxmax"].replace("\\N", -1).astype(float)
# train_feat["product_id_count_getidxmax"] = train_feat["product_id_count_getidxmax"].replace("\\N", -1).astype(float)
# test_feat["industry_count_getidxmax"] = test_feat["industry_count_getidxmax"].replace("\\N", -1).astype(float)
# test_feat["product_id_count_getidxmax"] = test_feat["product_id_count_getidxmax"].replace("\\N", -1).astype(float)

# Training&Prediction

In [None]:
# Split features and labels into train / validation parts (80% / 20%).
# random_state is fixed so the split, and therefore the reported validation
# accuracy, is reproducible across kernel restarts.
train_feat_tr, train_feat_val, train_tag_tr, train_tag_val = train_test_split(train_feat, train_user, test_size=0.2, random_state=42)

In [None]:
# del train_feat, train_user

In [None]:
gc.collect()

## For Age

In [None]:
# LightGBM datasets for the age model: features without the user id, labels shifted by -1 to start at 0.
lgbds_train_tr_age = lgb.Dataset(train_feat_tr.drop(columns=UID), label=train_tag_tr["age"] - 1)
lgbds_train_val_age = lgb.Dataset(train_feat_val.drop(columns=UID), label=train_tag_val["age"] - 1)

In [None]:
# LightGBM parameters for the age model: 10-class classification, evaluated by multi-class error rate.
params_age = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 10,
    "metric": "multi_error",
    "learning_rate": 0.1,
}

In [None]:
# Train for up to 1000 rounds, logging every 50 rounds and stopping early after 100 rounds without validation improvement.
# NOTE(review): the verbose_eval / early_stopping_rounds keyword arguments were removed from lgb.train in LightGBM 4.0 in favour of callbacks — confirm the installed version.
model_lgb_multi_age = lgb.train(params_age, lgbds_train_tr_age, num_boost_round=1000, valid_sets=[lgbds_train_val_age], verbose_eval=50, early_stopping_rounds=100)

In [None]:
# ndt = time.strftime("%Y%m%d%H%M%S", time.localtime(int(time.time())))
# model_lgb_multi_age.save_model(f"{UMDIR}/lgb_multi_age_{ndt}.model")

In [None]:
# Class probabilities for the validation rows, using the best iteration found during training.
train_val_age_prob = model_lgb_multi_age.predict(
    train_feat_val.drop(columns=UID),
    num_iteration=model_lgb_multi_age.best_iteration,
)

In [None]:
# Most probable class per row (first one on ties), shifted back by +1 to the original label range.
train_val_age_pred = (np.argmax(train_val_age_prob, axis=1) + 1).tolist()

In [None]:
# Validation accuracy of the age model.
age_acy = accuracy_score(y_true=train_val_age_pred, y_pred=train_tag_val["age"])

In [None]:
# 20:05-21

## For Gender

In [None]:
# LightGBM datasets for the gender model: features without the user id, labels shifted by -1 to start at 0.
lgbds_train_tr_gender = lgb.Dataset(train_feat_tr.drop(columns=UID), label=train_tag_tr["gender"] - 1)
lgbds_train_val_gender = lgb.Dataset(train_feat_val.drop(columns=UID), label=train_tag_val["gender"] - 1)

In [None]:
# LightGBM parameters for the gender model: 2-class problem set up with the multiclass objective, evaluated by multi-class error rate.
params_gender = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 2,
    "metric": "multi_error",
    "learning_rate": 0.1,
}

In [None]:
# Train for up to 1000 rounds, logging every 50 rounds and stopping early after 100 rounds without validation improvement.
# NOTE(review): the verbose_eval / early_stopping_rounds keyword arguments were removed from lgb.train in LightGBM 4.0 in favour of callbacks — confirm the installed version.
model_lgb_multi_gender = lgb.train(params_gender, lgbds_train_tr_gender, num_boost_round=1000, valid_sets=[lgbds_train_val_gender], verbose_eval=50, early_stopping_rounds=100)

In [None]:
# ndt = time.strftime("%Y%m%d%H%M%S", time.localtime(int(time.time())))
# model_lgb_multi_gender.save_model(f"{UMDIR}/lgb_multi_gender_{ndt}.model")

In [None]:
# Class probabilities for the validation rows, using the best iteration found during training.
train_val_gender_prob = model_lgb_multi_gender.predict(
    train_feat_val.drop(columns=UID),
    num_iteration=model_lgb_multi_gender.best_iteration,
)

In [None]:
# Most probable class per row (first one on ties), shifted back by +1 to the original label range.
train_val_gender_pred = (np.argmax(train_val_gender_prob, axis=1) + 1).tolist()

In [None]:
# Validation accuracy of the gender model.
gender_acy = accuracy_score(y_true=train_val_gender_pred, y_pred=train_tag_val["gender"])

In [None]:
age_acy

In [None]:
gender_acy

In [None]:
# total accuracy
age_acy + gender_acy

# Generate Prediction Result

In [None]:
# Replace the in-memory age booster with a model loaded from a saved file.
# NOTE(review): the cell that saves models to UMDIR is commented out, so this exact file must already exist on disk.
model_lgb_multi_age = lgb.Booster(model_file=f"{UMDIR}/lgb_multi_age_20200510133141.model")

In [None]:
# Replace the in-memory gender booster with a model loaded from a saved file.
# NOTE(review): the cell that saves models to UMDIR is commented out, so this exact file must already exist on disk.
model_lgb_multi_gender = lgb.Booster(model_file=f"{UMDIR}/lgb_multi_gender_20200510120440.model")

In [None]:
lgb.plot_importance(model_lgb_multi_age, max_num_features=10)

In [None]:
lgb.plot_importance(model_lgb_multi_gender, max_num_features=10)

In [None]:
# Names of the 20 features with the highest importance in the age model.
(
    pd.DataFrame(
        {
            "feature": model_lgb_multi_age.feature_name(),
            "importance": model_lgb_multi_age.feature_importance(),
        }
    )
    .sort_values(by="importance", ascending=False)
    .head(20)["feature"]
    .reset_index(drop=True)
)

In [None]:
# Names of the 20 features with the highest importance in the gender model.
(
    pd.DataFrame(
        {
            "feature": model_lgb_multi_gender.feature_name(),
            "importance": model_lgb_multi_gender.feature_importance(),
        }
    )
    .sort_values(by="importance", ascending=False)
    .head(20)["feature"]
    .reset_index(drop=True)
)

In [None]:
gc.collect()

In [None]:
# Result table keyed by user id. Take an explicit copy so the prediction
# columns added afterwards are written to an independent frame rather than to
# a slice of test_feat (avoids pandas' SettingWithCopyWarning).
res = test_feat[[UID]].copy()

In [None]:
# Predicted age label per test user: most probable class (first one on ties), shifted by +1.
res["predicted_age"] = (
    np.argmax(
        model_lgb_multi_age.predict(
            test_feat.drop(UID, axis=1),
            num_iteration=model_lgb_multi_age.best_iteration,
        ),
        axis=1,
    )
    + 1
).tolist()

In [None]:
# Predicted gender label per test user: most probable class (first one on ties), shifted by +1.
res["predicted_gender"] = (
    np.argmax(
        model_lgb_multi_gender.predict(
            test_feat.drop(UID, axis=1),
            num_iteration=model_lgb_multi_gender.best_iteration,
        ),
        axis=1,
    )
    + 1
).tolist()

In [None]:
# Timestamp (local time) used to give each exported result file a distinct name.
res_suffix = time.strftime("%Y%m%d%H%M%S")
res.to_csv(f"{RESDIR}/res-o1o2all{res_suffix}.csv", index=False)

In [None]:
res["predicted_age"].value_counts()

In [None]:
res["predicted_gender"].value_counts()

In [None]:
res.shape

In [None]:
test_feat.shape

In [None]:
# Drop the merged click tables; any cell that still needs tol_train / tol_test must run before this one.
del tol_test, tol_train

In [None]:
gc.collect()

# Send result to COS

In [None]:
# Open a session with the `ti` SDK; it is used below to upload the result file.
from ti import session
ti_session = session.Session()

In [None]:
# Upload a result CSV to the bucket under the RESDIR key prefix.
# NOTE(review): this file name is hard-coded and does not match the name written by the export cell (res-o1o2all<timestamp>.csv) — confirm which file should be uploaded.
inputs = ti_session.upload_data(path=f"{RESDIR}/res-20200510140336.csv", bucket="etveritas-1252104022", key_prefix=RESDIR)

In [None]:
# Scratch note (was free text in a code cell, which is a SyntaxError):
# uid --> unique

In [None]:
# Scratch idea, kept as a comment: uid/tid/cid/aid/pid/iid are not defined in
# this notebook and tol_train has been deleted by this point, so running it raises NameError.
# tol_train.groupby([uid,tid, cid, aid, pid, iid])[iid].agg(["count"]).groupby([uid, tid, cid, aid, pid]).agg(["max", "min", "mean", "sum", "std"])

In [None]:
# Scratch example table (was free text in a code cell, which is a SyntaxError):
# uid aid cid count
# 1 2 4 1
# 1 2 5 2
# 1 3 4 2
# 1 3 3 3
# 1 3 2 3

In [None]:
# Unfinished scratch idea, kept as a comment: the names are undefined and the
# trailing groupby() has no keys, so the expression cannot run as written.
# tol_train.groupby([uid, tid, cid, aid, pid])[pid].agg(["count"]).groupby()

In [None]:
# Scratch idea, kept as a comment: uid/tid/cid/aid/pid/iid/click_times are not
# defined as names in this notebook, so running it raises NameError.
# tol_train.groupby([uid,tid, cid, aid, pid, iid])[click_times].agg(["max"])

In [None]:
9**3