In [1]:
# # if used pip install package
# !pip install xgboost
# !pip install lightgbm
# !pip install wget
# !pip install gensim

In [2]:
import os
import sys
import gc
import wget
import time
import tarfile
import zipfile
import random
import copy
from tqdm import tqdm_notebook, tqdm

import scipy
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold, KFold

import gensim
import xgboost as xgb
import lightgbm as lgb

from joblib import Parallel, delayed

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Directory layout used throughout the notebook (all paths are relative).
DDIR = "data"  # raw competition CSVs are read from here
UDDIR = "user_data"  # intermediate artifacts (e.g. the imd/ sequence pickles)
UFEDIR = "user_data/feat_data_v04"  # feature pickles, one file per feature group
UMDIR = "user_data/model_data"  # saved Word2Vec / LightGBM models
RESDIR = "prediction_result"  # presumably the output folder for predictions; unused in the cells shown

In [4]:
# Name of the user key column; every groupby and merge below is keyed on it.
UID = "user_id"

# Load data (Only once)

In [None]:
# train_fname = wget.download("https://tesla-ap-shanghai-1256322946.cos.ap-shanghai.myqcloud.com/cephfs/tesla_common/deeplearning/dataset/algo_contest/train_preliminary.zip", out=DDIR)
# test_fname = wget.download("https://tesla-ap-shanghai-1256322946.cos.ap-shanghai.myqcloud.com/cephfs/tesla_common/deeplearning/dataset/algo_contest/test.zip", out=DDIR)

In [None]:
# def myunzip(filename):
#     zFile = zipfile.ZipFile(filename, "r")
#     for fileM in zFile.namelist(): 
#         zFile.extract(fileM, DDIR)
#         print(fileM)
#     zFile.close()

In [None]:
# myunzip(train_fname)
# myunzip(test_fname)

# Utils

In [None]:
def bch_rencol(values, prefix="", suffix=""):
    """Flatten a sequence of column labels into plain strings.

    Each label that is a non-string iterable (e.g. a MultiIndex tuple) has its
    parts converted with ``str`` and joined by "_"; any other label is simply
    converted with ``str``. ``prefix`` and ``suffix`` are added around every
    resulting name.

    Returns a list of strings in the same order as ``values``.
    """
    def _flatten(label):
        # Strings are iterable too, but must be kept whole.
        if isinstance(label, str) or not hasattr(label, "__iter__"):
            return str(label)
        return "_".join(str(part) for part in label)

    return [f"{prefix}{_flatten(label)}{suffix}" for label in values]

In [None]:
def mynunique(values):
    """Count distinct values in ``values``, treating missing values as a value of their own."""
    distinct_count = values.nunique(dropna=False)
    return distinct_count

In [None]:
def getidxmax(x):
    """Return the second component of the index label at which ``x`` is largest.

    Intended for Series whose index labels are tuples (e.g. a two-level
    MultiIndex); element ``[1]`` of the label returned by ``idxmax`` is returned.
    """
    peak_label = x.idxmax()
    return peak_label[1]

In [None]:
# Aggregation helpers for per-group value sequences.
# The function names matter: they are what bch_rencol later sees as the
# aggregation label when these callables are passed to ``.agg``.
def at_len(x):
    """Number of elements in ``x``."""
    return len(x)


def at_sum(x):
    """Sum of ``x``."""
    return np.sum(x)


def at_max(x):
    """Largest value in ``x``."""
    return np.max(x)


def at_min(x):
    """Smallest value in ``x``."""
    return np.min(x)


def at_mean(x):
    """Arithmetic mean of ``x``."""
    return np.mean(x)


def at_range(x):
    """Spread of ``x``: maximum minus minimum."""
    return np.max(x) - np.min(x)


def at_nunq(x):
    """Number of distinct elements in ``x``."""
    return len(set(x))


def at_lenDrange(x):
    """Element count divided by (range + 1); the +1 keeps the divisor non-zero for constant input."""
    return len(x) / (np.max(x) - np.min(x) + 1)


def at_lenDnunq(x):
    """Element count divided by the number of distinct elements."""
    return len(x) / len(set(x))


def at_percentile(n):
    """Build an aggregator returning the ``n``-th percentile of its input.

    ``n`` is handed straight to ``np.percentile``, which expects a value on
    the 0-100 scale. The returned function is named ``at_percentile_{n}``.
    """
    def _nth_percentile(x):
        return np.percentile(x, n)

    _nth_percentile.__name__ = f"at_percentile_{n}"
    return _nth_percentile


In [None]:
# Aggregations applied per user to the per-(user, value) counts in the order-2 features.
# getidxmax contributes the second index-level label of the largest count.
OP_SET = ["sum", "max", "min", "mean", "std", getidxmax, "nunique"]

# Data Exploration (todo)

In [None]:
# Load the three training tables; user.csv is the label table (age / gender are read from it later).
train_dir = f"{DDIR}/train_preliminary"
train_click_log, train_ad, train_user = (
    pd.read_csv(f"{train_dir}/{table}.csv") for table in ("click_log", "ad", "user")
)

In [None]:
# Load the three test tables from the test folder.
test_dir = f"{DDIR}/test"
test_click_log, test_ad, test_user = (
    pd.read_csv(f"{test_dir}/{table}.csv") for table in ("click_log", "ad", "user")
)

In [None]:
train_click_log.shape

In [None]:
train_ad.shape

In [None]:
train_user.shape

In [None]:
test_click_log.shape

In [None]:
test_ad.shape

In [None]:
# "\N" is presumably the raw export's missing-value marker; map it to -1 so both columns can be integer-typed.
for ad_col in ("product_id", "industry"):
    train_ad[ad_col] = train_ad[ad_col].replace("\\N", -1).astype(int)

In [None]:
# Same cleanup as for the train ad table: replace the "\N" placeholder with -1 and cast to int.
for ad_col in ("product_id", "industry"):
    test_ad[ad_col] = test_ad[ad_col].replace("\\N", -1).astype(int)

In [None]:
# creative id in train (creative id is unique in train_ad)
len(train_ad)

In [None]:
# creative id in test (creative id is unique in test_ad)
len(test_ad)

In [None]:
# check whether the same creative_id in train and test have same ad info
insect1d = np.intersect1d(train_click_log.creative_id.unique(), test_click_log.creative_id.unique())
print("Same creative id: ", insect1d.shape)
# Sort both sides by creative_id so the element-wise comparison lines up row by row
# regardless of the order in which the two ad tables are stored.
shared_train_ad = train_ad[train_ad.creative_id.isin(insect1d)].sort_values("creative_id")
shared_test_ad = test_ad[test_ad.creative_id.isin(insect1d)].sort_values("creative_id")
print("Diff number: ", np.sum(shared_train_ad.values != shared_test_ad.values))
# checked: they all have same ad info (result is 0)

In [None]:
# Compare the creative_id sets of the train click log and the train ad table, in both directions.
train_click_cids = train_click_log.creative_id.unique()
print("Diff list: ", np.setdiff1d(train_click_cids, train_ad.creative_id))
print("Diff list: ", np.setdiff1d(train_ad.creative_id, train_click_cids))

In [None]:
# Compare the creative_id sets of the test click log and the test ad table, in both directions.
test_click_cids = test_click_log.creative_id.unique()
print("Diff list: ", np.setdiff1d(test_click_cids, test_ad.creative_id))
print("Diff list: ", np.setdiff1d(test_ad.creative_id, test_click_cids))

In [None]:
# click time: number of click rows per time value (train)
train_time_counts = train_click_log.time.value_counts()  # computed once, reused for both axes
sns.lineplot(x=train_time_counts.index, y=train_time_counts)

In [None]:
# click time: number of click rows per time value (test)
test_time_counts = test_click_log.time.value_counts()  # computed once, reused for both axes
sns.lineplot(x=test_time_counts.index, y=test_time_counts)

In [None]:
# data_grouped = data.groupby(data.index)
# results = Parallel(n_jobs=8)(delayed(key_func)(group) for name, group in data_grouped)
# data = pd.concat(results)

# Feature engineering

In [None]:
# # Edit
# train_click_log.sort_values(by="time", inplace=True)
# test_click_log.sort_values(by="time", inplace=True)

In [None]:
# Attach the ad attributes to every click row; the left join keeps every click row.
tol_train = train_click_log.merge(train_ad, on="creative_id", how="left")
tol_test = test_click_log.merge(test_ad, on="creative_id", how="left")

In [None]:
del train_click_log, train_ad
del test_click_log, test_ad

In [None]:
# tol_data = pd.concat([tol_train, tol_test]).sort_values(by="time").reset_index(drop=True)
# pd.DataFrame(np.sort(tol_test[UID].unique()), columns=[UID]).to_csv(f"{DDIR}/test/user.csv", index=False)
# tol_data.to_csv(f"{DDIR}/tol_data.csv", index=False)

In [None]:
# tmp = tol_train.groupby([UID], sort=False)[["creative_id", "ad_id"]].agg(lambda x: [f"word_{y}" for y in x])
# tmp.columns = bch_rencol(tmp.columns)
# tmp.to_pickle(f"{UDDIR}/imd/train_cidnaid_seq.pkl")
# tmp = None

In [None]:
# tmp = tol_test.groupby([UID], sort=False)[["creative_id", "ad_id"]].agg(lambda x: [f"word_{y}" for y in x])
# tmp.columns = bch_rencol(tmp.columns)
# tmp.to_pickle(f"{UDDIR}/imd/test_cidnaid_seq.pkl")
# tmp = None

In [None]:
gc.collect()

## Sequence features

In [None]:
# Per-user creative_id / ad_id token sequences. These pickles are written by the
# commented-out groupby cells in the "Feature engineering" section, so those cells
# must have been run once before this one.
train_cidnaid_seq = pd.read_pickle(f"{UDDIR}/imd/train_cidnaid_seq.pkl")
test_cidnaid_seq = pd.read_pickle(f"{UDDIR}/imd/test_cidnaid_seq.pkl")

In [None]:
tol_cidnaid_seq = pd.concat([train_cidnaid_seq, test_cidnaid_seq])

In [None]:
# Turn each user's token list into one space-separated string so it can be fed to a text vectorizer.
for seq_col in ("creative_id", "ad_id"):
    tol_cidnaid_seq[seq_col] = tol_cidnaid_seq[seq_col].apply(" ".join)

In [None]:
del train_cidnaid_seq, test_cidnaid_seq
gc.collect()

In [None]:
# Unigram + bigram TF-IDF; vocabulary capped at 2,000,000 terms, and terms that
# appear in more than 80% of documents are dropped.
tfidf_enc = TfidfVectorizer(ngram_range=(1, 2), max_features=2000000, max_df=0.8)

In [None]:
tfidf_vec = tfidf_enc.fit_transform(tol_cidnaid_seq["ad_id"].values)

In [None]:
tfidf_vec.shape

In [None]:
# scipy.sparse.save_npz(f"{UDDIR}/imd/sparse_ad_id.npz", tfidf_vec)
# tmp = scipy.sparse.load_npz(f"{UDDIR}/imd/sparse_creative_id.npz")

In [None]:
# tfidf_vec = scipy.sparse.load_npz(f"{UDDIR}/imd/sparse_ad_id.npz")

In [None]:
# Reduce the sparse TF-IDF matrix to `ncpt` dense components, one named column per component.
ncpt = 64
svd_enc = TruncatedSVD(n_components=ncpt, n_iter=20, random_state=2020)
mode_svd = pd.DataFrame(
    svd_enc.fit_transform(tfidf_vec),
    columns=[f"svd_ad_id_{i}" for i in range(ncpt)],
)

In [None]:
mode_svd.index = tol_cidnaid_seq.index

In [None]:
mode_svd.to_pickle(f"{UFEDIR}/ad_id_tfidf_svd.pkl")

In [None]:
gc.collect()

## Word2Vec

In [None]:
train_cidnaid_seq = pd.read_pickle(f"{UDDIR}/imd/train_cidnaid_seq.pkl")
test_cidnaid_seq = pd.read_pickle(f"{UDDIR}/imd/test_cidnaid_seq.pkl")

In [None]:
tol_cidnaid_seq = pd.concat([train_cidnaid_seq, test_cidnaid_seq])

In [None]:
vector_size = 128

In [None]:
# Train CBOW (sg=0) Word2Vec embeddings on the per-user ad_id token sequences.
# gensim 4.0 renamed `size` -> `vector_size` and `iter` -> `epochs`; pick the spelling
# that matches the installed version so the cell runs on both 3.x and 4.x.
w2v_params = dict(window=5, min_count=5, workers=12, sg=0)
if int(gensim.__version__.split(".")[0]) >= 4:
    w2v_params.update(vector_size=vector_size, epochs=10)
else:
    w2v_params.update(size=vector_size, iter=10)
model = gensim.models.Word2Vec(sentences=tol_cidnaid_seq["ad_id"], **w2v_params)

In [None]:
model.save(f"{UMDIR}/w2v_ad_id.model")

In [None]:
ad_id_w2v = list()

In [None]:
# Sum the Word2Vec vectors of every in-vocabulary token in each user's ad_id sequence.
# Vectors are read through `model.wv`: direct `model[word]` / `word in model` access
# was deprecated in gensim 3.x and removed in 4.0.
wv = model.wv
ad_id_w2v = []  # start fresh so re-running this cell does not append duplicate rows
for col in tqdm(tol_cidnaid_seq["ad_id"]):
    tmp = np.zeros(vector_size)
    for wd in col:
        if wd in wv:  # tokens below min_count have no vector and are skipped
            tmp += wv[wd]
    ad_id_w2v.append(list(tmp))

In [None]:
cum_ad_id_w2v = pd.DataFrame(ad_id_w2v)

In [None]:
cum_ad_id_w2v.index = tol_cidnaid_seq.index

In [None]:
cum_ad_id_w2v.to_pickle(f"{UFEDIR}/cum_ad_id_w2v.pkl")

In [None]:
gc.collect()

In [None]:
# wv = model.wv
# vocab_list = wv.index2word
# word_idx_dict = {}
# for idx, word in enumerate(vocab_list):
#     word_idx_dict[word] = idx
    
# vectors_arr = wv.vectors
# vectors_arr = np.concatenate((np.zeros(vector_size)[np.newaxis, :], vectors_arr), axis=0)  # the vector at index 0 is the 'unk' vector

# f_wordidx = open(feature_path + 'word_seg_word_idx_dict.pkl', 'wb')
# f_vectors = open(feature_path + 'word_seg_vectors_arr.pkl', 'wb')
# pickle.dump(word_idx_dict, f_wordidx)
# pickle.dump(vectors_arr, f_vectors)
# f_wordidx.close()
# f_vectors.close()

## Creative id map (use k-fold)

In [None]:
# (user, creative_id) click pairs; the train side also gets the user's labels attached.
train_ucid = tol_train[[UID, "creative_id"]].merge(train_user, on=UID, how="left")
test_ucid = tol_test[[UID, "creative_id"]]

In [None]:
# Contiguous (unshuffled) folds. `random_state` has no effect without shuffling, and
# recent scikit-learn versions raise ValueError when it is combined with shuffle=False,
# so it is omitted here; the resulting splits are unchanged.
kfold = KFold(n_splits=5, shuffle=False)

In [None]:
kf_cid_map_dic = dict()

In [None]:
re_train_ucid = pd.DataFrame()
re_test_ucid = pd.DataFrame()

In [None]:
# Out-of-fold target encoding of creative_id: for each fold, the mean age / gender per
# creative_id is computed on the other folds and joined onto the held-out rows.
# NOTE(review): the folds are over click rows, not users, so a user's own label can
# contribute to the encoding of that user's held-out rows — confirm this is intended.
test_cids = test_ucid["creative_id"].unique()  # loop-invariant, computed once
fold_parts = []  # collected and concatenated once; concat inside the loop is quadratic
for kfn, (tr_idx, val_idx) in enumerate(kfold.split(train_ucid)):
    tr_ucid, val_ucid = train_ucid.iloc[tr_idx], train_ucid.iloc[val_idx]
    kf_cid_map = tr_ucid.groupby(["creative_id"])[["age", "gender"]].agg(["mean"])
    kf_cid_map.columns = bch_rencol(kf_cid_map.columns, prefix="creative_id_")
    # Keep only creative_ids that also occur in the test click log.
    kf_cid_map = kf_cid_map[kf_cid_map.index.isin(test_cids)]
    kf_cid_map_dic[kfn] = kf_cid_map.copy()  # reused below to encode the test rows
    val_ucid = pd.merge(val_ucid, kf_cid_map, how="left", on="creative_id")
    fold_parts.append(val_ucid)
# Rebuilt from scratch on every run, so re-executing the cell does not duplicate rows.
re_train_ucid = pd.concat(fold_parts)

In [None]:
# Encode the test rows: the test frame is cut into the same number of contiguous chunks,
# and chunk k is joined with the creative_id map learned in train fold k.
test_parts = []  # collected and concatenated once instead of growing a frame in the loop
for kfn, (_, val_idx) in enumerate(kfold.split(test_ucid)):
    val_ucid = test_ucid.iloc[val_idx]
    val_ucid = pd.merge(val_ucid, kf_cid_map_dic[kfn], how="left", on="creative_id")
    test_parts.append(val_ucid)
# Rebuilt from scratch on every run, so re-executing the cell does not duplicate rows.
re_test_ucid = pd.concat(test_parts)

In [None]:
len(train_ucid) == len(re_train_ucid)

In [None]:
len(test_ucid) == len(re_test_ucid)

In [None]:
# Per-user summary statistics of the encoded creative_id means.
# The columns are selected with a list: tuple-style selection on a groupby
# (`gb["a", "b"]`) was deprecated in pandas 1.0 and removed in 2.0.
tmp = re_train_ucid.groupby([UID])[["creative_id_age_mean", "creative_id_gender_mean"]].agg(["sum", "max", "mean", "min", "std"])
tmp.columns = bch_rencol(tmp.columns)  # flatten (column, op) pairs to "column_op"

In [None]:
tmp.to_pickle(f"{UFEDIR}/train_o1_cid_map.pkl")
tmp = None

In [None]:
# Per-user summary statistics of the encoded creative_id means (test side).
# The columns are selected with a list: tuple-style selection on a groupby
# (`gb["a", "b"]`) was deprecated in pandas 1.0 and removed in 2.0.
tmp = re_test_ucid.groupby([UID])[["creative_id_age_mean", "creative_id_gender_mean"]].agg(["sum", "max", "mean", "min", "std"])
tmp.columns = bch_rencol(tmp.columns)  # flatten (column, op) pairs to "column_op"

In [None]:
tmp.to_pickle(f"{UFEDIR}/test_o1_cid_map.pkl")
tmp = None

In [None]:
gc.collect()

## User Click log (Order 1)

In [None]:
# Per-user (order-1) aggregations, keyed by source column.
# Percentiles are given on np.percentile's 0-100 scale: 75 / 25 are the upper / lower
# quartiles (the previous .75 / .25 requested the 0.75th / 0.25th percentile, i.e.
# values very close to the minimum). The generated column suffixes are therefore
# "at_percentile_75" / "at_percentile_25".
ops_dic = {
        UID: ["count"],
        "click_times": ["sum", "max", "mean", "std", at_nunq, at_range, at_lenDnunq, at_lenDrange, at_percentile(75), at_percentile(25)],
        "time": ["nunique", "mean", "max", "min", at_range, at_lenDnunq, at_lenDrange, at_percentile(75), at_percentile(25)],
        "creative_id": ["nunique", at_lenDnunq],
        "ad_id": ["nunique", at_lenDnunq],
        "product_id": ["nunique", at_lenDnunq],
        "product_category": ["nunique", at_lenDnunq],
        "advertiser_id": ["nunique", at_lenDnunq],
        "industry": ["nunique", at_lenDnunq],
}

In [None]:
# Order-1 train features: for each source column, aggregate per user with the
# operations listed in ops_dic and write one pickle per column.
for col in ops_dic:
    tmp = tol_train.groupby([UID], sort=False)[[col]].agg(ops_dic[col])
    tmp.columns = bch_rencol(tmp.columns)  # flatten (column, op) pairs to "column_op"
    tmp.to_pickle(f"{UFEDIR}/train_o1_{col}.pkl")
    tmp = None  # drop the reference so the frame can be garbage-collected

In [None]:
# Order-1 test features: same per-user aggregations as for train, one pickle per column.
for col in ops_dic:
    tmp = tol_test.groupby([UID], sort=False)[[col]].agg(ops_dic[col])
    tmp.columns = bch_rencol(tmp.columns)  # flatten (column, op) pairs to "column_op"
    tmp.to_pickle(f"{UFEDIR}/test_o1_{col}.pkl")
    tmp = None  # drop the reference so the frame can be garbage-collected

In [None]:
# Two-stage aggregation (train): first, per (user, time value), the number of distinct ids
# and rows-per-distinct-id for each id column; then per-user summary statistics of
# those per-time values.
tmp = tol_train.groupby([UID, "time"], sort=False)[["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]].agg(["nunique", at_lenDnunq])\
.groupby([UID]).agg(["sum", "max", "min", "mean", "std"])
tmp.columns = bch_rencol(tmp.columns)  # flatten the three-level column labels
tmp.to_pickle(f"{UFEDIR}/train_o1_time2columns.pkl")
tmp = None  # drop the reference so the frame can be garbage-collected

In [None]:
# Two-stage aggregation (test): first, per (user, time value), the number of distinct ids
# and rows-per-distinct-id for each id column; then per-user summary statistics of
# those per-time values.
tmp = tol_test.groupby([UID, "time"], sort=False)[["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]].agg(["nunique", at_lenDnunq])\
.groupby([UID]).agg(["sum", "max", "min", "mean", "std"])
tmp.columns = bch_rencol(tmp.columns)  # flatten the three-level column labels
tmp.to_pickle(f"{UFEDIR}/test_o1_time2columns.pkl")
tmp = None  # drop the reference so the frame can be garbage-collected

In [None]:
tmp = pd.read_pickle(f"{UFEDIR}/test_o1_time2columns.pkl")

In [None]:
tmp.columns = bch_rencol(tmp.columns)

In [None]:
tmp.to_pickle(f"{UFEDIR}/test_o1_time2columns.pkl")

In [None]:
# # # before features v03
# # for train
# tmp = tol_train.groupby([UID], sort=False).agg(
#     {
#         UID: ["count"], 
#         "click_times": ["sum", "max", "mean", "std"],
#         "time": ["nunique", "mean", "max", "min"],
#         "creative_id": ["nunique"],
#         "ad_id": ["nunique"],
#         "product_id": ["nunique"],
#         "product_category": ["nunique"],
#         "advertiser_id": ["nunique"],
#         "industry": ["nunique"],
#     }
# )

In [None]:
# # # before features v03
# tmp.columns = bch_rencol(tmp.columns)
# tmp.to_pickle(f"{UFEDIR}/train_o1.pkl")
# tmp = None

In [None]:
# # # before features v03
# # for test
# tmp = tol_test.groupby([UID], sort=False).agg(
#     {
#         UID: ["count"], 
#         "click_times": ["sum", "max", "mean", "std"],
#         "time": ["nunique", "mean", "max", "min"],
#         "creative_id": ["nunique"],
#         "ad_id": ["nunique"],
#         "product_id": ["nunique"],
#         "product_category": ["nunique"],
#         "advertiser_id": ["nunique"],
#         "industry": ["nunique"],
#     }
# )

In [None]:
# # # before features v03
# tmp.columns = bch_rencol(tmp.columns)
# tmp.to_pickle(f"{UFEDIR}/test_o1.pkl")
# tmp = None

In [None]:
gc.collect()

## User Click log (Order 2)

In [None]:
# Aggregations applied per (user, id value) in the order-2 features.
# Percentiles are given on np.percentile's 0-100 scale: 75 / 25 are the upper / lower
# quartiles (the previous .75 / .25 requested the 0.75th / 0.25th percentile). The
# generated column suffixes are therefore "at_percentile_75" / "at_percentile_25".
o2_ops_dic = {
    "click_times": ["sum", "max", "mean", "std", at_nunq, at_range, at_lenDnunq, at_lenDrange, at_percentile(75), at_percentile(25)],
    "time": ["nunique", "mean", "max", "min", at_range, at_lenDnunq, at_lenDrange, at_percentile(75), at_percentile(25)],
}

In [None]:
hold_cols = ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]

In [None]:
# feature v03
# Order-2 train features, one pass per id column in hold_cols:
#   1. "{col}2{col}": per-user OP_SET statistics of how many click rows each distinct
#      value of the column has;
#   2. "{col}2{ocol}": per-(user, value) aggregates of click_times / time (o2_ops_dic),
#      summarised again per user.
for col in hold_cols:
    tmp = tol_train.groupby([UID, col], sort=False)[[col]].agg(["count"]).groupby([UID]).agg(OP_SET)
    tmp.columns = bch_rencol(tmp.columns)
    tmp.to_pickle(f"{UFEDIR}/train_o2_{col}2{col}.pkl")
    tmp = None  # drop the reference so the frame can be garbage-collected
    for ocol in o2_ops_dic:
        tmp = tol_train.groupby([UID, col], sort=False)[[ocol]].agg(o2_ops_dic[ocol]).groupby([UID]).agg(["sum", "max", "min", "mean", "std"])
        tmp.columns = bch_rencol(tmp.columns)
        tmp.to_pickle(f"{UFEDIR}/train_o2_{col}2{ocol}.pkl")
        tmp = None

In [None]:
# feature v03
# Order-2 test features, one pass per id column in hold_cols:
#   1. "{col}2{col}": per-user OP_SET statistics of how many click rows each distinct
#      value of the column has;
#   2. "{col}2{ocol}": per-(user, value) aggregates of click_times / time (o2_ops_dic),
#      summarised again per user.
for col in hold_cols:
    tmp = tol_test.groupby([UID, col], sort=False)[[col]].agg(["count"]).groupby([UID]).agg(OP_SET)
    tmp.columns = bch_rencol(tmp.columns)
    tmp.to_pickle(f"{UFEDIR}/test_o2_{col}2{col}.pkl")
    tmp = None  # drop the reference so the frame can be garbage-collected
    for ocol in o2_ops_dic:
        tmp = tol_test.groupby([UID, col], sort=False)[[ocol]].agg(o2_ops_dic[ocol]).groupby([UID]).agg(["sum", "max", "min", "mean", "std"])
        tmp.columns = bch_rencol(tmp.columns)
        tmp.to_pickle(f"{UFEDIR}/test_o2_{col}2{ocol}.pkl")
        tmp = None

In [None]:
# # # before v03
# # train
# for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
#     tmp = tol_train.groupby([UID, col], sort=False)[[col]].agg(["count"]).groupby([UID]).agg(OP_SET)
#     tmp.columns = bch_rencol(tmp.columns)
#     tmp.to_pickle(f"{UFEDIR}/train_o2_{col}.pkl")
#     tmp = None

In [None]:
# # # before v03
# # test
# for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
#     tmp = tol_test.groupby([UID, col], sort=False)[[col]].agg(["count"]).groupby([UID]).agg(OP_SET)
#     tmp.columns = bch_rencol(tmp.columns)
#     tmp.to_pickle(f"{UFEDIR}/test_o2_{col}.pkl")
#     tmp = None

In [None]:
gc.collect()

## One-Hot

In [None]:
# train
# Per-user click-row counts for each product_category value, pivoted to one column per
# category; (user, category) pairs that never occur are filled with 0. Despite the
# section title these are counts, not 0/1 indicators.
tmp = tol_train.groupby([UID, "product_category"], sort=False)[["product_category"]].agg(["count"]).unstack().fillna(0)
tmp.columns = bch_rencol(tmp.columns)

In [None]:
tmp.to_pickle(f"{UFEDIR}/train_onehot.pkl")
tmp = None

In [None]:
# test
# Per-user click-row counts for each product_category value, pivoted to one column per
# category; (user, category) pairs that never occur are filled with 0.
# NOTE(review): the resulting column set depends on which categories occur in the test
# data — confirm it matches the train columns.
tmp = tol_test.groupby([UID, "product_category"], sort=False)[["product_category"]].agg(["count"]).unstack().fillna(0)
tmp.columns = bch_rencol(tmp.columns)

In [None]:
tmp.to_pickle(f"{UFEDIR}/test_onehot.pkl")
tmp = None

In [None]:
gc.collect()

## Time Windows (Time Bins)

In [None]:
# Cut the combined train+test time values into `bins` equal-width intervals labelled
# 0..bins-1, so both data sets share the same bin edges.
bins = 9
tol_bins = pd.cut(pd.concat([tol_train["time"], tol_test["time"]]), bins, labels=range(bins))

In [None]:
# The first len(tol_train) entries of tol_bins belong to train, the remainder to test.
n_train_rows = len(tol_train)
bin_col = f"bins{bins}"
tol_train[bin_col] = tol_bins.iloc[:n_train_rows]
tol_test[bin_col] = tol_bins.iloc[n_train_rows:]

In [None]:
# Quick look at the frame after adding the bin column (the original cell was an
# unfinished `tol_train.` expression, which is a SyntaxError).
tol_train.head()

### Order 1

In [None]:
# train
# Per (user, time bin) aggregates; unstack() then moves the bin level into the columns,
# giving one row per user with a separate set of columns for each bin.
tmp = tol_train.groupby([UID, f"bins{bins}"], sort=False).agg(
    {
        UID: ["count"], 
        "click_times": ["sum", "max", "mean", "std"],
        "time": ["nunique", "mean", "max", "min"],
        "creative_id": ["nunique"],
        "ad_id": ["nunique"],
        "product_id": ["nunique"],
        "product_category": ["nunique"],
        "advertiser_id": ["nunique"],
        "industry": ["nunique"],
    }
).unstack()

In [None]:
tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_")

In [None]:
tmp.to_pickle(f"{UFEDIR}/train_bins{bins}_o1.pkl")
tmp = None

In [None]:
# test
# Per (user, time bin) aggregates; unstack() then moves the bin level into the columns,
# giving one row per user with a separate set of columns for each bin.
tmp = tol_test.groupby([UID, f"bins{bins}"], sort=False).agg(
    {
        UID: ["count"], 
        "click_times": ["sum", "max", "mean", "std"],
        "time": ["nunique", "mean", "max", "min"],
        "creative_id": ["nunique"],
        "ad_id": ["nunique"],
        "product_id": ["nunique"],
        "product_category": ["nunique"],
        "advertiser_id": ["nunique"],
        "industry": ["nunique"],
    }
).unstack()

In [None]:
tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_")

In [None]:
tmp.to_pickle(f"{UFEDIR}/test_bins{bins}_o1.pkl")
tmp = None

In [None]:
gc.collect()

### Order 2

In [None]:
# train
# For every id column and every time bin: restrict to the rows of that bin, count the click
# rows per (user, value), and summarise those counts per user with OP_SET.
# One pickle is written per (column, bin).
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    for i in range(bins):
        tmp = tol_train[tol_train[f"bins{bins}"] == i].groupby([UID, col], sort=False)[[col]].agg(["count"]).groupby([UID]).agg(OP_SET)
        tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_{i}_")
        tmp.to_pickle(f"{UFEDIR}/train_bins{bins}_{i}_o2_{col}.pkl")
        tmp = None  # drop the reference so the frame can be garbage-collected

In [None]:
# test
# For every id column and every time bin: restrict to the rows of that bin, count the click
# rows per (user, value), and summarise those counts per user with OP_SET.
# One pickle is written per (column, bin).
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    for i in range(bins):
        tmp = tol_test[tol_test[f"bins{bins}"] == i].groupby([UID, col], sort=False)[[col]].agg(["count"]).groupby([UID]).agg(OP_SET)
        tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_{i}_")
        tmp.to_pickle(f"{UFEDIR}/test_bins{bins}_{i}_o2_{col}.pkl")
        tmp = None  # drop the reference so the frame can be garbage-collected

In [None]:
gc.collect()

## Concat

In [5]:
train_user = pd.read_csv(f"{DDIR}/train_preliminary/user.csv")
test_user = pd.read_csv(f"{DDIR}/test/user.csv")

In [6]:
# Start each feature table as a single user_id column, in the same row order as the user tables.
train_feat = train_user[[UID]].copy()
test_feat = test_user[[UID]].copy()

In [7]:
# NOTE(review): no visible cell writes this file — only the ad_id variant is built above.
# Presumably the Word2Vec section was re-run on the creative_id column; confirm.
cum_creative_id_w2v = pd.read_pickle(f"{UFEDIR}/cum_creative_id_w2v.pkl")

In [8]:
cum_ad_id_w2v = pd.read_pickle(f"{UFEDIR}/cum_ad_id_w2v.pkl")

In [9]:
# NOTE(review): no visible cell writes this file — only the ad_id variant is built above.
# Presumably the TF-IDF/SVD section was re-run on the creative_id column; confirm.
creative_id_tfidf_svd = pd.read_pickle(f"{UFEDIR}/creative_id_tfidf_svd.pkl")

In [10]:
ad_id_tfidf_svd = pd.read_pickle(f"{UFEDIR}/ad_id_tfidf_svd.pkl")

In [11]:
train_feat = pd.merge(train_feat, cum_creative_id_w2v, how="left", on=UID)
test_feat = pd.merge(test_feat, cum_creative_id_w2v, how="left", on=UID)

In [12]:
train_feat = pd.merge(train_feat, cum_ad_id_w2v, how="left", on=UID)
test_feat = pd.merge(test_feat, cum_ad_id_w2v, how="left", on=UID)

In [13]:
train_feat = pd.merge(train_feat, creative_id_tfidf_svd, how="left", on=UID)
test_feat = pd.merge(test_feat, creative_id_tfidf_svd, how="left", on=UID)

In [14]:
train_feat = pd.merge(train_feat, ad_id_tfidf_svd, how="left", on=UID)
test_feat = pd.merge(test_feat, ad_id_tfidf_svd, how="left", on=UID)

In [15]:
feat_fname = os.listdir(UFEDIR)

In [16]:
# Join every train_* / test_* feature pickle onto the matching feature table.
# Files are visited in sorted order: os.listdir() returns names in arbitrary order,
# and train and test must receive their feature groups in the same sequence so that
# both tables end up with the same column order.
for fname in sorted(feat_fname):
    if fname.startswith("train_"):
        print("current filename: ", fname)
        train_feat = pd.merge(train_feat, pd.read_pickle(f"{UFEDIR}/{fname}"), how="left", on=UID)
    elif fname.startswith("test_"):
        print("current filename: ", fname)
        test_feat = pd.merge(test_feat, pd.read_pickle(f"{UFEDIR}/{fname}"), how="left", on=UID)

current filename:  train_o1_cid_map.pkl
current filename:  test_o1_cid_map.pkl


In [17]:
# to make sure feat and user(target) have same order
# if true --> sum == 0
np.sum(train_feat[UID] != train_user[UID])

0

In [18]:
train_feat.shape

(900000, 395)

In [19]:
test_feat.shape

(1000000, 395)

In [20]:
gc.collect()

0

In [None]:
# train_feat.to_pickle(f"{UDDIR}/feat_ing/train_feat_tol_v05.pkl")
# test_feat.to_pickle(f"{UDDIR}/feat_ing/test_feat_tol_v05.pkl")

In [None]:
# train_feat = pd.read_pickle(f"{UDDIR}/feat_ing/train_feat_tol_v02.pkl")
# test_feat = pd.read_pickle(f"{UDDIR}/feat_ing/test_feat_tol_v02.pkl")
# train_user = pd.read_csv(f"{DDIR}/train_preliminary/user.csv")

In [None]:
# train_feat.drop([col for col in train_feat.columns if col.find("creative_id_gender_") != -1], axis=1, inplace=True)

In [None]:
# test_feat.drop([col for col in test_feat.columns if col.find("creative_id_gender_") != -1], axis=1, inplace=True)

In [None]:
# list(train_feat.columns)

# Training&Prediction

## LightGBM

In [None]:
# split data
train_feat_tr, train_feat_val, train_tag_tr, train_tag_val = train_test_split(train_feat, train_user, test_size=0.2, random_state=2020)

In [None]:
# errors="ignore" keeps this re-runnable: the KFold section drops the same column again,
# which would otherwise raise KeyError once it is already gone.
train_feat.drop(UID, axis=1, inplace=True, errors="ignore")

In [None]:
# Reassign instead of dropping in place: the split output may be flagged by pandas as a
# copy of train_feat (SettingWithCopyWarning). errors="ignore" makes the cell re-runnable.
train_feat_tr = train_feat_tr.drop(UID, axis=1, errors="ignore")

In [None]:
# Reassign instead of dropping in place: the split output may be flagged by pandas as a
# copy of train_feat (SettingWithCopyWarning). errors="ignore" makes the cell re-runnable.
train_feat_val = train_feat_val.drop(UID, axis=1, errors="ignore")

In [None]:
gc.collect()

## For Age

### Offline

In [None]:
lgbds_train_tr_age = lgb.Dataset(train_feat_tr, train_tag_tr["age"]-1)
lgbds_train_val_age = lgb.Dataset(train_feat_val, train_tag_val["age"]-1)

In [None]:
params_age = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 10,
    "metric": "multi_error",
    "learning_rate": 0.1,
    "seed": 2020,
    "n_jobs": -1,
}

In [None]:
model_lgb_multi_age_off = lgb.train(params_age, lgbds_train_tr_age, num_boost_round=1000, valid_sets=[lgbds_train_val_age], verbose_eval=50, early_stopping_rounds=100)

In [None]:
# Class probabilities on the hold-out split, taken at the early-stopped iteration.
train_val_age_prob = model_lgb_multi_age_off.predict(train_feat_val, num_iteration=model_lgb_multi_age_off.best_iteration)
# Most probable class per row, shifted back by +1 to undo the -1 applied to the training
# labels (same vectorised argmax as in the KFold cells).
train_val_age_pred = np.argmax(train_val_age_prob, axis=1) + 1
age_acy = accuracy_score(train_tag_val["age"], train_val_age_pred)

### Online

In [None]:
lgbds_train_age = lgb.Dataset(train_feat, train_user["age"]-1)

In [None]:
params_age = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 10,
    "metric": "multi_error",
    "learning_rate": 0.1,
    "seed": 2020,
    "n_jobs": -1,
}

In [None]:
model_lgb_multi_age = lgb.train(params_age, lgbds_train_age, num_boost_round=1000, verbose_eval=50)

In [None]:
ndt = time.strftime("%Y%m%d%H%M%S", time.localtime(int(time.time())))
model_lgb_multi_age.save_model(f"{UMDIR}/lgb_multi_age_{ndt}.model")

### KFold

In [29]:
# Remove the key column before training. errors="ignore" keeps the cell idempotent:
# train_feat may already have lost the column in the LightGBM split section above.
train_feat.drop(UID, axis=1, inplace=True, errors="ignore")
test_feat.drop(UID, axis=1, inplace=True, errors="ignore")

In [30]:
# Result table keyed by user_id. Copied so that adding prediction columns later writes
# to an independent frame rather than a slice of test_user (avoids SettingWithCopyWarning).
res = test_user[[UID]].copy()

In [31]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)

In [32]:
params_age = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 10,
    "metric": "multi_error",
    "learning_rate": 0.1,
    "seed": 2020,
    "n_jobs": -1,
    "min_child_weight": 30,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.9,
    "bagging_freq": 5,     
    
}

In [33]:
scores = []
result_proba = []

In [None]:
# Cross-validated age model: for each split produced by `kfold`, train with early stopping
# on the held-out part, record the held-out accuracy, and keep the test-set class
# probabilities so they can be averaged across folds afterwards.
for tr_idx, val_idx in kfold.split(train_feat, train_user["age"]-1):
    # Labels are shifted by -1, presumably so classes start at 0 — TODO confirm raw age labels are 1..10.
    tr_x, tr_y, val_x, val_y = train_feat.iloc[tr_idx], train_user["age"].iloc[tr_idx]-1, train_feat.iloc[val_idx], train_user["age"].iloc[val_idx]-1
    train_set = lgb.Dataset(tr_x, tr_y)
    val_set = lgb.Dataset(val_x, val_y)
    # num_boost_round is only an upper bound; early stopping on the validation set ends training.
    lgb_model = lgb.train(params_age, train_set,
                          valid_sets=[val_set], early_stopping_rounds=100, num_boost_round=40000, verbose_eval=50)
    # Predicted class = index of the largest probability, taken at the best iteration.
    val_pred = np.argmax(lgb_model.predict(
        val_x, num_iteration=lgb_model.best_iteration), axis=1)
    val_score = accuracy_score(val_pred, val_y)
    result_proba.append(lgb_model.predict(
        test_feat, num_iteration=lgb_model.best_iteration))
    scores.append(val_score)
    print("current validation score: ", val_score)
print("accuracy score: ", np.mean(scores))

Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.638411
[100]	valid_0's multi_error: 0.629528
[150]	valid_0's multi_error: 0.624606
[200]	valid_0's multi_error: 0.621739
[250]	valid_0's multi_error: 0.619917
[300]	valid_0's multi_error: 0.618433
[350]	valid_0's multi_error: 0.617589
[400]	valid_0's multi_error: 0.617078
[450]	valid_0's multi_error: 0.616794
[500]	valid_0's multi_error: 0.616589
[550]	valid_0's multi_error: 0.616833
Early stopping, best iteration is:
[491]	valid_0's multi_error: 0.616228
current validation score:  0.38377222222222224
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.639794
[100]	valid_0's multi_error: 0.629994
[150]	valid_0's multi_error: 0.625639
[200]	valid_0's multi_error: 0.6226
[250]	valid_0's multi_error: 0.620467
[300]	valid_0's multi_error: 0.619817
[350]	valid_0's multi_error: 0.618394
[400]	valid_0's multi_error: 0.617717
[450]	valid_0's multi_error: 0.6173
[5

In [None]:
predicted_age = np.argmax(np.mean(result_proba, axis=0), axis=1) + 1

In [46]:
res["predicted_age"] = predicted_age

In [41]:
print("hello")

hello


## For Gender

### Offline

In [None]:
lgbds_train_tr_gender = lgb.Dataset(train_feat_tr, train_tag_tr["gender"]-1)
lgbds_train_val_gender = lgb.Dataset(train_feat_val, train_tag_val["gender"]-1)

In [None]:
params_gender = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 2,
    "metric": "multi_error",
    "learning_rate": 0.1,
    "seed": 2020,
    "n_jobs": -1,
    "min_child_weight": 30,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.9,
    "bagging_freq": 5,
}

In [None]:
model_lgb_multi_gender_off = lgb.train(params_gender, lgbds_train_tr_gender, num_boost_round=1000, valid_sets=[lgbds_train_val_gender], verbose_eval=50, early_stopping_rounds=100)

In [None]:
# Class probabilities on the hold-out split, taken at the early-stopped iteration.
train_val_gender_prob = model_lgb_multi_gender_off.predict(train_feat_val, num_iteration=model_lgb_multi_gender_off.best_iteration)
# Most probable class per row, shifted back by +1 to undo the -1 applied to the training
# labels (same vectorised argmax as in the KFold cells).
train_val_gender_pred = np.argmax(train_val_gender_prob, axis=1) + 1
gender_acy = accuracy_score(train_tag_val["gender"], train_val_gender_pred)

### Online

In [None]:
lgbds_train_gender = lgb.Dataset(train_feat, train_user["gender"]-1)

In [None]:
model_lgb_multi_gender = lgb.train(params_gender, lgbds_train_gender, num_boost_round=1000, verbose_eval=50)

In [None]:
ndt = time.strftime("%Y%m%d%H%M%S", time.localtime(int(time.time())))
model_lgb_multi_gender.save_model(f"{UMDIR}/lgb_multi_gender_{ndt}.model")

### KFold

In [104]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)

In [105]:
params_gender = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 2,
    "metric": "multi_error",
    "learning_rate": 0.1,
    "seed": 2020,
    "n_jobs": -1,
    "min_child_weight": 30,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.9,
    "bagging_freq": 5,
}

In [108]:
gender_scores = []
gender_result_proba = []

In [109]:
# Cross-validated gender model: for each split produced by `kfold`, train with early stopping
# on the held-out part, record the held-out accuracy, and keep the test-set class
# probabilities so they can be averaged across folds afterwards.
for tr_idx, val_idx in kfold.split(train_feat, train_user["gender"]-1):
    # Labels are shifted by -1, presumably so classes start at 0 — TODO confirm raw gender labels are 1..2.
    tr_x, tr_y, val_x, val_y = train_feat.iloc[tr_idx], train_user["gender"].iloc[tr_idx]-1, train_feat.iloc[val_idx], train_user["gender"].iloc[val_idx]-1
    train_set = lgb.Dataset(tr_x, tr_y)
    val_set = lgb.Dataset(val_x, val_y)
    # num_boost_round is only an upper bound; early stopping on the validation set ends training.
    lgb_model = lgb.train(params_gender, train_set,
                          valid_sets=[val_set], early_stopping_rounds=100, num_boost_round=40000, verbose_eval=50)
    # Predicted class = index of the largest probability, taken at the best iteration.
    val_pred = np.argmax(lgb_model.predict(
        val_x, num_iteration=lgb_model.best_iteration), axis=1)
    print(val_pred.shape)  # debugging aid: one prediction per validation row
    val_score = accuracy_score(val_pred, val_y)
    gender_result_proba.append(lgb_model.predict(
        test_feat, num_iteration=lgb_model.best_iteration))
    gender_scores.append(val_score)
    print("current validation score: ", val_score)
print("accuracy score: ", np.mean(gender_scores))

Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.0769889
[100]	valid_0's multi_error: 0.0740056
[150]	valid_0's multi_error: 0.0725722
[200]	valid_0's multi_error: 0.0719167
[250]	valid_0's multi_error: 0.0712667
[300]	valid_0's multi_error: 0.0709833
[350]	valid_0's multi_error: 0.07065
[400]	valid_0's multi_error: 0.0704556
[450]	valid_0's multi_error: 0.0705111
[500]	valid_0's multi_error: 0.0702056
[550]	valid_0's multi_error: 0.0702778
[600]	valid_0's multi_error: 0.0703556
Early stopping, best iteration is:
[533]	valid_0's multi_error: 0.0700889
(180000,)
current validation score:  0.9299111111111111
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.0780278
[100]	valid_0's multi_error: 0.0749444
[150]	valid_0's multi_error: 0.0738611
[200]	valid_0's multi_error: 0.0729833
[250]	valid_0's multi_error: 0.0724833
[300]	valid_0's multi_error: 0.0722667
[350]	valid_0's multi_error: 0.0721722
[400]	val

In [None]:
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.0769889
[100]	valid_0's multi_error: 0.0740056
[150]	valid_0's multi_error: 0.0725722
[200]	valid_0's multi_error: 0.0719167
[250]	valid_0's multi_error: 0.0712667
[300]	valid_0's multi_error: 0.0709833
[350]	valid_0's multi_error: 0.07065
[400]	valid_0's multi_error: 0.0704556
[450]	valid_0's multi_error: 0.0705111
[500]	valid_0's multi_error: 0.0702056
[550]	valid_0's multi_error: 0.0702778
[600]	valid_0's multi_error: 0.0703556
Early stopping, best iteration is:
[533]	valid_0's multi_error: 0.0700889
current validation score:  0.9299111111111111
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.0780278
[100]	valid_0's multi_error: 0.0749444
[150]	valid_0's multi_error: 0.0738611
[200]	valid_0's multi_error: 0.0729833
[250]	valid_0's multi_error: 0.0724833
[300]	valid_0's multi_error: 0.0722667
[350]	valid_0's multi_error: 0.0721722
[400]	valid_0's multi_error: 0.0719389
[450]	valid_0's multi_error: 0.072
[500]	valid_0's multi_error: 0.0718778
[550]	valid_0's multi_error: 0.0719444
[600]	valid_0's multi_error: 0.0717833
[650]	valid_0's multi_error: 0.0716056
[700]	valid_0's multi_error: 0.0715778
[750]	valid_0's multi_error: 0.0714444
[800]	valid_0's multi_error: 0.0715
[850]	valid_0's multi_error: 0.0714389
[900]	valid_0's multi_error: 0.0714
[950]	valid_0's multi_error: 0.0712778
[1000]	valid_0's multi_error: 0.0711056
[1050]	valid_0's multi_error: 0.0711222
[1100]	valid_0's multi_error: 0.0711056
Early stopping, best iteration is:
[1015]	valid_0's multi_error: 0.0710556
current validation score:  0.9289444444444445
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.0769889
[100]	valid_0's multi_error: 0.0738944
[150]	valid_0's multi_error: 0.0724889
[200]	valid_0's multi_error: 0.0717111
[250]	valid_0's multi_error: 0.0712667
[300]	valid_0's multi_error: 0.0709222
[350]	valid_0's multi_error: 0.0709667
[400]	valid_0's multi_error: 0.0706278
[450]	valid_0's multi_error: 0.0706222
[500]	valid_0's multi_error: 0.0703778
[550]	valid_0's multi_error: 0.0703278
[600]	valid_0's multi_error: 0.07025
[650]	valid_0's multi_error: 0.0702
[700]	valid_0's multi_error: 0.0701333
[750]	valid_0's multi_error: 0.0701111
[800]	valid_0's multi_error: 0.0701222
[850]	valid_0's multi_error: 0.0699333
[900]	valid_0's multi_error: 0.0699611
[950]	valid_0's multi_error: 0.0700056
Early stopping, best iteration is:
[869]	valid_0's multi_error: 0.0699056
current validation score:  0.9300944444444444
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.0785556
[100]	valid_0's multi_error: 0.0755944
[150]	valid_0's multi_error: 0.0741278
[200]	valid_0's multi_error: 0.0734722
[250]	valid_0's multi_error: 0.07285
[300]	valid_0's multi_error: 0.0724444
[350]	valid_0's multi_error: 0.0723333
[400]	valid_0's multi_error: 0.0722167
[450]	valid_0's multi_error: 0.0720944
[500]	valid_0's multi_error: 0.0720889
[550]	valid_0's multi_error: 0.0719889
[600]	valid_0's multi_error: 0.0718722
[650]	valid_0's multi_error: 0.0719389
[700]	valid_0's multi_error: 0.0717889
[750]	valid_0's multi_error: 0.0717556
[800]	valid_0's multi_error: 0.0717722
[850]	valid_0's multi_error: 0.0716722
[900]	valid_0's multi_error: 0.0716944
[950]	valid_0's multi_error: 0.0714667
[1000]	valid_0's multi_error: 0.0715222
[1050]	valid_0's multi_error: 0.0715056
Early stopping, best iteration is:
[955]	valid_0's multi_error: 0.0714167
current validation score:  0.9285833333333333
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.0775389
[100]	valid_0's multi_error: 0.0746444
[150]	valid_0's multi_error: 0.0731056
[200]	valid_0's multi_error: 0.0726222
[250]	valid_0's multi_error: 0.0721111
[300]	valid_0's multi_error: 0.0718667
[350]	valid_0's multi_error: 0.0714333
[400]	valid_0's multi_error: 0.0712111
[450]	valid_0's multi_error: 0.0710889
[500]	valid_0's multi_error: 0.0707389
[550]	valid_0's multi_error: 0.0708778
[600]	valid_0's multi_error: 0.0706722
[650]	valid_0's multi_error: 0.07065
[700]	valid_0's multi_error: 0.07045
[750]	valid_0's multi_error: 0.0702611
[800]	valid_0's multi_error: 0.0703389
Early stopping, best iteration is:
[735]	valid_0's multi_error: 0.0702222
current validation score:  0.9297777777777778
accuracy score:  0.3834677777777778

In [110]:
# Element-wise sum of the two per-fold score lists.
# NOTE(review): presumably gender accuracy + age accuracy per fold — confirm what `scores` holds.
np.add(np.array(gender_scores), np.array(scores))

array([1.31368333, 1.31263889, 1.31427222, 1.30891111, 1.31514444])

In [111]:
# Shape of the first probability array stored in `gender_result_proba`.
np.shape(gender_result_proba[0])

(1000000, 2)

In [78]:
# Shape of the probabilities after averaging over axis 0 of `gender_result_proba`.
# NOTE(review): the recorded output of this cell shows 10 columns while
# `gender_result_proba[0].shape` is recorded with 2 columns — the saved output looks
# stale (older execution count); re-run to refresh it.
np.shape(np.mean(gender_result_proba, axis=0))

(1000000, 10)

In [62]:
# Average the arrays in `gender_result_proba` along axis 0, pick the column with the
# highest mean per row, and shift by 1 so the smallest label is 1.
predicted_gender = np.mean(gender_result_proba, axis=0).argmax(axis=1) + 1

In [63]:
# Store the `predicted_gender` labels as a column of the result frame.
# NOTE(review): `res` is only created further down (`res = test_feat[[UID]]`), so on a
# fresh kernel this cell fails unless that later cell has already been run.
res["predicted_gender"] = predicted_gender

# Generate Prediction Result

In [None]:
# model_lgb_multi_age = lgb.Booster(model_file=f"{UMDIR}/lgb_multi_age_20200511045531.model")

In [None]:
# model_lgb_multi_gender = lgb.Booster(model_file=f"{UMDIR}/lgb_multi_gender_20200511034408.model")

In [None]:
# Feature-importance chart for the gender model, limited to 10 features.
lgb.plot_importance(booster=model_lgb_multi_gender, max_num_features=10)

In [None]:
# Feature-importance chart for the age model, limited to 10 features.
lgb.plot_importance(booster=model_lgb_multi_age, max_num_features=10)

In [None]:
# Result frame: starts with only the id column of the test features.
# An explicit copy is taken so that the prediction columns assigned to `res` are
# written to an independent frame rather than to a slice of `test_feat`
# (avoids pandas' SettingWithCopyWarning).
res = test_feat[[UID]].copy()

In [None]:
# Remove the id column from the test features before they are passed to `predict`.
# `errors="ignore"` keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
test_feat.drop(columns=[UID], inplace=True, errors="ignore")

In [None]:
# Predicted age label per row: index of the highest score returned by the age model,
# shifted by 1 so the smallest label is 1. `np.argmax` along axis 1 replaces the
# per-row Python `list(x).index(max(x))` loop (same first-maximum tie-breaking, but
# vectorised) and matches how `predicted_gender` is derived from averaged probabilities.
res["predicted_age"] = np.argmax(
    model_lgb_multi_age.predict(test_feat, num_iteration=model_lgb_multi_age.best_iteration),
    axis=1,
) + 1

In [None]:
# Predicted gender label per row: index of the highest score returned by the gender
# model, shifted by 1 so the smallest label is 1. `np.argmax` along axis 1 replaces the
# per-row Python `list(x).index(max(x))` loop (same first-maximum tie-breaking, but
# vectorised).
res["predicted_gender"] = np.argmax(
    model_lgb_multi_gender.predict(test_feat, num_iteration=model_lgb_multi_gender.best_iteration),
    axis=1,
) + 1

In [95]:
# Write the result frame to a timestamped CSV so each run produces a distinct file.
# `time.strftime` without a time tuple formats the current local time, which is what
# the previous `time.localtime(int(time.time()))` argument supplied explicitly.
res_suffix = time.strftime("%Y%m%d%H%M%S")
# `to_csv` does not create missing directories, so make sure the target exists.
os.makedirs(RESDIR, exist_ok=True)
res.to_csv(f"{RESDIR}/res-{res_suffix}.csv", index=False)

In [93]:
# Number of rows assigned to each predicted age label.
res.loc[:, "predicted_age"].value_counts()

3     311174
2     157088
5     140865
4     138148
6     122965
7      57124
1      23535
8      20220
9      16601
10     12280
Name: predicted_age, dtype: int64

In [94]:
# Number of rows assigned to each predicted gender label.
res.loc[:, "predicted_gender"].value_counts()

1    674121
2    325879
Name: predicted_gender, dtype: int64

In [51]:
# Run a full garbage collection (generation 2 is the default full pass); the cell
# displays the number of unreachable objects found.
gc.collect(generation=2)

350

# Send result to COS

In [96]:
# `ti` is imported here rather than in the notebook's top import cell.
# NOTE(review): presumably the package is only available on the hosted training
# platform — confirm before moving the import to the top.
from ti import session
# Session object used by the upload cell that follows.
ti_session = session.Session()

In [97]:
# Upload the CSV written by the save cell (`res.to_csv(...)`) to the bucket.
# The path is rebuilt from `res_suffix` instead of a hardcoded timestamp, so a
# re-run uploads the file that was just written rather than a stale (or missing) one.
inputs = ti_session.upload_data(path=f"{RESDIR}/res-{res_suffix}.csv", bucket="etveritas-1252104022", key_prefix=RESDIR)