In [1]:
# # if used pip install package
# !pip install xgboost
# !pip install lightgbm
# !pip install wget
# !pip install gensim
# !pip install catboost
# !pip install cython

In [2]:
import os
import sys
import logging
import gc
import wget
import time
import tarfile
import zipfile
import functools
import random
import copy
from tqdm import tqdm_notebook, tqdm

import scipy
import numpy as np
import pandas as pd
import catboost as cbt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold, KFold

import gensim
import xgboost as xgb
import lightgbm as lgb

from joblib import Parallel, delayed

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
class Logger(object):
    def __init__(self, filename='default.log', stream=sys.stdout):
        self.terminal = stream
        self.log = open(filename, 'a')
        
    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)
        
    def flush(self):
        pass

In [4]:
# sys.stdout = Logger("logs/default.log", sys.stdout)
# sys.stderr = Logger("logs/default_err.log", sys.stderr)

In [5]:
DDIR = "data"
UDDIR = "user_data"
UFEDIR = "user_data/feat_data_v05"
UMDIR = "user_data/model_data"
RESDIR = "prediction_result"

In [6]:
UID = "user_id"

# Load data (Only once)

In [None]:
# train_fname = wget.download("https://tesla-ap-shanghai-1256322946.cos.ap-shanghai.myqcloud.com/cephfs/tesla_common/deeplearning/dataset/algo_contest/train_preliminary.zip", out=DDIR)
# test_fname = wget.download("https://tesla-ap-shanghai-1256322946.cos.ap-shanghai.myqcloud.com/cephfs/tesla_common/deeplearning/dataset/algo_contest/test.zip", out=DDIR)

In [None]:
# def myunzip(filename):
#     zFile = zipfile.ZipFile(filename, "r")
#     for fileM in zFile.namelist(): 
#         zFile.extract(fileM, DDIR)
#         print(fileM)
#     zFile.close()

In [None]:
# myunzip(train_fname)
# myunzip(test_fname)

# Utils

In [7]:
def bch_rencol(values, prefix="", suffix=""):
    return list(map(lambda x: f"{prefix}"+"_".join(list(map(lambda y: str(y), x)))+f"{suffix}" 
                    if hasattr(x, "__iter__") and not isinstance(x, str) 
                    else f"{prefix}"+str(x)+f"{suffix}", values))

In [8]:
def mynunique(values):
    return values.nunique(dropna=False)
def getidxmax(x):
    return x.idxmax()[1]
# for time series
def at_len(x):
    return len(x)

def at_sum(x):
    return np.sum(x)

def at_max(x):
    return np.max(x)

def at_min(x):
    return np.min(x)

def at_mean(x):
    return np.mean(x)

def at_range(x):
    return at_max(x) - at_min(x)

def at_nunq(x):
    return len(set(x))

def at_lenDrange(x):
    return at_len(x)/(at_range(x)+1)

def at_lenDnunq(x):
    return at_len(x)/at_nunq(x)

def at_percentile(n):
    def at_percentile_(x):
        return np.percentile(x, n)
    at_percentile_.__name__ = f"at_percentile_{n}"
    return at_percentile_

In [9]:
OP_SET = ["sum", "max", "min", "mean", "std"]

In [10]:
nesting_level = 0
is_start = None
class Timer:
    def __init__(self):
        self.start = time.time()
        self.history = [self.start]

    def check(self, info):
        current = time.time()
        print(f"[{info}] spend {current - self.history[-1]:0.2f} sec")
        self.history.append(current)

def log(entry):
    global nesting_level
    space = "-" * (4 * nesting_level)
    print(f"{space}{entry}")

def timeit(method, start_log=None):
    @functools.wraps(method)
    def timed(*args, **kw):
        global is_start
        global nesting_level

        if not is_start:
            print()

        is_start = True
        log(f"Start [{method.__name__}]:" + (start_log if start_log else ""))
        log(f'Start time: {time.strftime("%Y-%m-%d %H:%M:%S")}')
        nesting_level += 1

        start_time = time.time()
        result = method(*args, **kw)
        end_time = time.time()

        nesting_level -= 1
        log(f"End   [{method.__name__}]. Time elapsed: {end_time - start_time:0.2f} sec.")
        is_start = False

        return result

    return timed

# Data Exploration (todo)

In [None]:
# read train data
train_click_log = pd.read_csv(f"{DDIR}/train_preliminary/click_log.csv")
train_ad = pd.read_csv(f"{DDIR}/train_preliminary/ad.csv")
# tag
train_user = pd.read_csv(f"{DDIR}/train_preliminary/user.csv")

In [None]:
# read test data
test_click_log = pd.read_csv(f"{DDIR}/test/click_log.csv")
test_ad = pd.read_csv(f"{DDIR}/test/ad.csv")
# pd.DataFrame(np.sort(test_click_log[UID].unique()), columns=[UID]).to_csv(f"{DDIR}/test/user.csv", index=False)
test_user = pd.read_csv(f"{DDIR}/test/user.csv")

In [None]:
train_click_log.shape

In [None]:
train_ad.shape

In [None]:
train_user.shape

In [None]:
test_click_log.shape

In [None]:
test_ad.shape

In [None]:
train_ad["product_id"] = train_ad["product_id"].replace("\\N", -1).astype(int)
train_ad["industry"] = train_ad["industry"].replace("\\N", -1).astype(int)

In [None]:
test_ad["product_id"] = test_ad["product_id"].replace("\\N", -1).astype(int)
test_ad["industry"] = test_ad["industry"].replace("\\N", -1).astype(int)

In [None]:
# creative id in train (creative id is unique in train_ad)
len(train_ad)

In [None]:
# creative id in test (creative id is unique in test_ad)
len(test_ad)

In [None]:
# check whether the same creative_id in train and test have same ad info
insect1d = np.intersect1d(train_click_log.creative_id.unique(), test_click_log.creative_id.unique())
print("Same creative id: ", insect1d.shape)
print("Diff number: ", np.sum(train_ad[train_ad.creative_id.isin(insect1d)].values != test_ad[test_ad.creative_id.isin(insect1d)].values))
# checked: they all have same ad info (result is 0)

In [None]:
# check whether click and ad have diff creative_id
print("Diff list: ", np.setdiff1d(train_click_log.creative_id.unique(), train_ad.creative_id))
print("Diff list: ", np.setdiff1d(train_ad.creative_id, train_click_log.creative_id.unique()))

In [None]:
# check whether click and ad have diff creative_id
print("Diff list: ", np.setdiff1d(test_click_log.creative_id.unique(), test_ad.creative_id))
print("Diff list: ", np.setdiff1d(test_ad.creative_id, test_click_log.creative_id.unique()))

In [None]:
# click time
sns.lineplot(x=train_click_log.time.value_counts().index, y=train_click_log.time.value_counts())

In [None]:
sns.lineplot(x=test_click_log.time.value_counts().index, y=test_click_log.time.value_counts())

In [None]:
# data_grouped = data.groupby(data.index)
# results = Parallel(n_jobs=8)(delayed(key_func)(group) for name, group in data_grouped)
# data = pd.concat(results)

# Feature engineering

## Get ID sequence (Only once)

In [None]:
# # sort by time, for time series
# train_click_log.sort_values(by="time", inplace=True)
# test_click_log.sort_values(by="time", inplace=True)

In [None]:
# tol_train = pd.merge(train_click_log, train_ad, how="left", on="creative_id")
# tol_test = pd.merge(test_click_log, test_ad, how="left", on="creative_id")

In [None]:
# del train_click_log, test_click_log
# gc.collect()

In [None]:
# @timeit
# def gen_id_series(data, dtyp="train"):
#     for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
#         tmp = data.groupby([UID], sort=False)[[col]].agg(lambda x: [f"word_{y}" for y in x])
#         tmp.columns = bch_rencol(tmp.columns)
#         tmp.to_pickle(f"{UDDIR}/imd/{dtyp}_{col}_seq.pkl")
#         tmp = None

In [None]:
# gen_id_series(tol_train, "train")

In [None]:
# gen_id_series(tol_test, "test")

## TF-IDF&SVD

In [None]:
@timeit
def gen_tfidf(col, nr_max=1):
    train_seq = pd.read_pickle(f"{UDDIR}/imd/train_{col}_seq.pkl")
    test_seq = pd.read_pickle(f"{UDDIR}/imd/test_{col}_seq.pkl")
    tol_seq = pd.concat([train_seq, test_seq])
    tol_seq[col] = tol_seq[col].apply(lambda x: " ".join(x))
    
    train_seq = None
    test_seq = None
    
    tfidf_enc = TfidfVectorizer(ngram_range=(1, nr_max))
    tfidf_vec = tfidf_enc.fit_transform(tol_seq[col].values)
    log(f"TF-IDF shape: {tfidf_vec.shape}")
    
    # save sparse matrix
    scipy.sparse.save_npz(f"{UDDIR}/imd/sparse_{col}.npz", tfidf_vec)
    
    return tol_seq.index

In [None]:
@timeit
def gen_svd(col, index, prefix="tfidf", n_cpt=64):
    tfidf_vec = scipy.sparse.load_npz(f"{UDDIR}/imd/sparse_{col}.npz")
    if tfidf_vec.shape[1] > n_cpt:
        svd_enc = TruncatedSVD(n_components=n_cpt, n_iter=20, random_state=2020)
        mode_svd = svd_enc.fit_transform(tfidf_vec)
    else:
        n_cpt = tfidf_vec.shape[1]
        mode_svd = tfidf_vec.todense()
    mode_svd = pd.DataFrame(mode_svd)
    mode_svd.columns = [f"{prefix}_svd_{col}_{i}" for i in range(n_cpt)]
    mode_svd.index = index
    # save svd pkl
    mode_svd.to_pickle(f"{UFEDIR}/{prefix}_svd_{col}.pkl")

In [None]:
tol_idx_dic = dict()

In [None]:
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    tol_idx_dic[col] = copy.deepcopy(gen_tfidf(col))

In [None]:
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    gen_svd(col, tol_idx_dic[col])

In [None]:
gc.collect()

## Word2Vec

In [None]:
@timeit
def gen_w2v(col, vesize=300, win=20):
    train_seq = pd.read_pickle(f"{UDDIR}/imd/train_{col}_seq.pkl")
    test_seq = pd.read_pickle(f"{UDDIR}/imd/test_{col}_seq.pkl")
    tol_seq = pd.concat([train_seq, test_seq])
    
    train_seq = None
    test_seq = None
    log("Get Here")
    if os.path.exists(f"{UMDIR}/vectors/w2v_{col}.model"):
        model = gensim.models.Word2Vec.load(f"{UMDIR}/vectors/w2v_{col}.model")
    else:
        model = gensim.models.Word2Vec(sentences=tol_seq[col], size=vesize, window=win, workers=32, sg=0)
        model.save(f"{UMDIR}/vectors/w2v_{col}.model")
    
    w2v_list = list()
    
    for it in tqdm(tol_seq[col]):
        tmp = np.zeros(vesize)
        cnt = 0
        for wd in it:
            cnt += 1
            if wd in model:
                tmp += model[wd]
        w2v_list.append(list(tmp/cnt))
    
    w2v_avg = pd.DataFrame(w2v_list)
    w2v_avg.index = tol_seq.index
    w2v_avg.columns = [f"w2v_avg_{col}_{i}" for i in range(vesize)]
    w2v_avg.to_pickle(f"{UFEDIR}/w2v_avg_{col}.pkl")

In [None]:
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    gen_w2v(col)

In [None]:
print("Done")

## Stats features

In [None]:
tol_train = pd.merge(train_click_log, train_ad, how="left", on="creative_id")
tol_test = pd.merge(test_click_log, test_ad, how="left", on="creative_id")

In [None]:
del train_click_log, train_ad
del test_click_log, test_ad

### Category map to Label (use k-fold)

In [None]:
@timeit
def gen_catemlb(train_data, test_data, tag, col):
    train_use = pd.merge(train_data[[UID, col]], tag, how="left", on=UID)
    test_use = test_data[[UID, col]]
    
    kfold = KFold(n_splits=5, shuffle=False, random_state=2020)
    
    kf_map_dic = dict()

    re_train_use = pd.DataFrame()
    re_test_use = pd.DataFrame()

    kfn = 0
    for tr_idx, val_idx in kfold.split(train_use):
        tr_ucid, val_ucid = train_use.iloc[tr_idx], train_use.iloc[val_idx]
        kf_map = tr_ucid.groupby([col])[["age", "gender"]].agg(["mean"])
        kf_map.columns = bch_rencol(kf_map.columns, prefix=f"{col}_")
        # only use intersect between train and test
        kf_map.drop(np.setdiff1d(kf_map.index.unique(), test_use[col].unique()), inplace=True)
        kf_map_dic[kfn] = copy.deepcopy(kf_map)
        val_ucid = pd.merge(val_ucid, kf_map, how="left", on=col)
        re_train_use = pd.concat([re_train_use, val_ucid])
        kfn += 1

    # use same KFold split genator
    kfn = 0
    for _, val_idx in kfold.split(test_use):
        val_ucid = test_use.iloc[val_idx]
        val_ucid = pd.merge(val_ucid, kf_map_dic[kfn], how="left", on=col)
        re_test_use = pd.concat([re_test_use, val_ucid])
        kfn += 1

    assert len(train_use) == len(re_train_use)
    assert len(test_use) == len(re_test_use)

    train_use = None
    test_use = None

    tmp = re_train_use.groupby([UID], sort=False)[f"{col}_age_mean", f"{col}_gender_mean"].agg(["sum", "max", "mean", "min", "std"])
    tmp.columns = bch_rencol(tmp.columns)
    tmp.to_pickle(f"{UFEDIR}/train_stats_{col}_catemlb.pkl")
    tmp = None

    tmp = re_test_use.groupby([UID], sort=False)[f"{col}_age_mean", f"{col}_gender_mean"].agg(["sum", "max", "mean", "min", "std"])
    tmp.columns = bch_rencol(tmp.columns)
    tmp.to_pickle(f"{UFEDIR}/test_stats_{col}_catemlb.pkl")
    tmp = None
    

In [None]:
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    gen_catemlb(tol_train, tol_test, train_user, col)

### Group by UID

In [None]:
tm_gid = Timer()

In [None]:
ops_dic = {
        UID: ["count"], 
        "click_times": ["sum", "max", "mean", "std", at_percentile(.75), ],
        "time": ["nunique", at_range, at_lenDnunq, at_lenDrange, ],
}

In [None]:
for col in ops_dic:
    tmp = tol_train.groupby([UID], sort=False)[[col]].agg(ops_dic[col])
    tmp.columns = bch_rencol(tmp.columns)
    tmp.to_pickle(f"{UFEDIR}/train_stats_{col}.pkl")
    tmp = None

In [None]:
tm_gid.check("Group by UID for train")

In [None]:
for col in ops_dic:
    tmp = tol_test.groupby([UID], sort=False)[[col]].agg(ops_dic[col])
    tmp.columns = bch_rencol(tmp.columns)
    tmp.to_pickle(f"{UFEDIR}/test_stats_{col}.pkl")
    tmp = None

In [None]:
gc.collect()

In [None]:
tm_gid.check("Group by UID for test")

### Group by UID and time

In [None]:
tm_gidt = Timer()

In [None]:
tmp = tol_train.groupby([UID, "time"])[["time"]].agg(["count"]).groupby([UID]).agg(["sum", "max", "min", "mean", "std"])
tmp.columns = bch_rencol(tmp.columns)
tmp.to_pickle(f"{UFEDIR}/train_stats_time2{UID}.pkl")
tmp = None

In [None]:
tmp = tol_train.groupby([UID, "time"])[["click_times"]].agg(["sum", "max", "min", "mean", "std"]).groupby([UID]).agg(["sum", "max", "min", "mean", "std"])
tmp.columns = bch_rencol(tmp.columns)
tmp.to_pickle(f"{UFEDIR}/train_stats_click_times2{UID}.pkl")
tmp = None

In [None]:
tm_gidt.check("Group by columns to UID for train")

In [None]:
tmp = tol_test.groupby([UID, "time"])[["time"]].agg(["count"]).groupby([UID]).agg(["sum", "max", "min", "mean", "std"])
tmp.columns = bch_rencol(tmp.columns)
tmp.to_pickle(f"{UFEDIR}/test_stats_time2{UID}.pkl")
tmp = None

In [None]:
tmp = tol_test.groupby([UID, "time"])[["click_times"]].agg(["sum", "max", "min", "mean", "std"]).groupby([UID]).agg(["sum", "max", "min", "mean", "std"])
tmp.columns = bch_rencol(tmp.columns)
tmp.to_pickle(f"{UFEDIR}/test_stats_click_times2{UID}.pkl")
tmp = None

In [None]:
tm_gidt.check("Group by columns to UID for test")

In [None]:
gc.collect()

### One-Hot

In [None]:
# train
tmp = tol_train.groupby([UID, "product_category"], sort=False)[["product_category"]].agg(["count"]).unstack().fillna(0)
tmp.columns = bch_rencol(tmp.columns)

In [None]:
tmp.to_pickle(f"{UFEDIR}/train_onehot.pkl")
tmp = None

In [None]:
# test
tmp = tol_test.groupby([UID, "product_category"], sort=False)[["product_category"]].agg(["count"]).unstack().fillna(0)
tmp.columns = bch_rencol(tmp.columns)

In [None]:
tmp.to_pickle(f"{UFEDIR}/test_onehot.pkl")
tmp = None

In [None]:
gc.collect()

## Time Windows (Time slide)

## Concat

In [11]:
train_user = pd.read_csv(f"{DDIR}/train_preliminary/user.csv")
test_user = pd.read_csv(f"{DDIR}/test/user.csv")

In [12]:
train_feat = pd.DataFrame()
test_feat = pd.DataFrame()
train_feat[UID] = train_user[UID]
test_feat[UID] = test_user[UID]

In [13]:
feat_fname = os.listdir(UFEDIR)

In [14]:
for fname in feat_fname:
    if fname.startswith("w2v_"):
        print("current filename: ", fname)
        cur_w2v = pd.read_pickle(f"{UFEDIR}/{fname}")
        train_feat = pd.merge(train_feat, cur_w2v, how="left", on=UID)
        test_feat = pd.merge(test_feat, cur_w2v, how="left", on=UID)

current filename:  w2v_avg_ad_id.pkl
current filename:  w2v_avg_creative_id.pkl


In [15]:
for fname in feat_fname:
    if fname.startswith("train_stats_"):
        print("current filename: ", fname)
        train_feat = pd.merge(train_feat, pd.read_pickle(f"{UFEDIR}/{fname}"), how="left", on=UID)
    elif fname.startswith("test_stats_"):
        print("current filename: ", fname)
        test_feat = pd.merge(test_feat, pd.read_pickle(f"{UFEDIR}/{fname}"), how="left", on=UID)

current filename:  train_stats_click_times.pkl
current filename:  test_stats_industry_catemlb.pkl
current filename:  train_stats_product_category_catemlb.pkl
current filename:  test_stats_ad_id_catemlb.pkl
current filename:  train_stats_industry_catemlb.pkl
current filename:  train_stats_time2user_id.pkl
current filename:  test_stats_time2user_id.pkl
current filename:  train_stats_creative_id_catemlb.pkl
current filename:  test_stats_time.pkl
current filename:  test_stats_advertiser_id_catemlb.pkl
current filename:  test_stats_user_id.pkl
current filename:  test_stats_click_times2user_id.pkl
current filename:  test_stats_product_category_catemlb.pkl
current filename:  train_stats_user_id.pkl
current filename:  train_stats_product_id_catemlb.pkl
current filename:  test_stats_click_times.pkl
current filename:  train_stats_ad_id_catemlb.pkl
current filename:  train_stats_advertiser_id_catemlb.pkl
current filename:  test_stats_creative_id_catemlb.pkl
current filename:  train_stats_click_ti

In [20]:
for fname in feat_fname:
    if fname.startswith("tfidf_svd_"):
        print("current filename: ", fname)
        cur_w2v = pd.read_pickle(f"{UFEDIR}/{fname}")
        train_feat = pd.merge(train_feat, cur_w2v, how="left", on=UID)
        test_feat = pd.merge(test_feat, cur_w2v, how="left", on=UID)

current filename:  tfidf_svd_industry.pkl
current filename:  tfidf_svd_advertiser_id.pkl
current filename:  tfidf_svd_product_category.pkl
current filename:  tfidf_svd_creative_id.pkl
current filename:  tfidf_svd_product_id.pkl
current filename:  tfidf_svd_ad_id.pkl


In [21]:
# to make sure feat and user(target) have same order
# if true --> sum == 0
np.sum(train_feat[UID] != train_user[UID])

0

In [22]:
train_feat.shape

(900000, 1039)

In [23]:
test_feat.shape

(1000000, 1039)

In [None]:
res = test_feat[[UID]]

In [None]:
gc.collect()

In [None]:
# train_feat.to_pickle(f"{UDDIR}/feat_ing/train_feat_tol_v05.pkl")
# test_feat.to_pickle(f"{UDDIR}/feat_ing/test_feat_tol_v05.pkl")

In [None]:
# train_feat = pd.read_pickle(f"{UDDIR}/feat_ing/train_feat_tol_v02.pkl")
# test_feat = pd.read_pickle(f"{UDDIR}/feat_ing/test_feat_tol_v02.pkl")
# train_user = pd.read_csv(f"{DDIR}/train_preliminary/user.csv")

In [None]:
# train_feat.drop([col for col in train_feat.columns if col.find("creative_id_gender_") != -1], axis=1, inplace=True)

In [None]:
# test_feat.drop([col for col in test_feat.columns if col.find("creative_id_gender_") != -1], axis=1, inplace=True)

In [None]:
# list(train_feat.columns)

# Training&Prediction

## LightGBM

In [None]:
# split data
train_feat_tr, train_feat_val, train_tag_tr, train_tag_val = train_test_split(train_feat, train_user, test_size=0.2, random_state=2020)

In [None]:
train_feat.drop(UID, axis=1, inplace=True)

In [None]:
train_feat_tr.drop(UID, axis=1, inplace=True)

In [None]:
train_feat_val.drop(UID, axis=1, inplace=True)

In [None]:
gc.collect()

## For Age

### Offline

In [None]:
lgbds_train_tr_age = lgb.Dataset(train_feat_tr, train_tag_tr["age"]-1)
lgbds_train_val_age = lgb.Dataset(train_feat_val, train_tag_val["age"]-1)

In [None]:
params_age = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 10,
    "metric": "multi_error",
    "learning_rate": 0.1,
    "seed": 2020,
    "n_jobs": -1,
}

In [None]:
model_lgb_multi_age_off = lgb.train(params_age, lgbds_train_tr_age, num_boost_round=1000, valid_sets=[lgbds_train_val_age], verbose_eval=50, early_stopping_rounds=100)

In [None]:
train_val_age_prob = model_lgb_multi_age_off.predict(train_feat_val, num_iteration=model_lgb_multi_age_off.best_iteration)
train_val_age_pred = [list(x).index(max(x))+1 for x in train_val_age_prob]
age_acy = accuracy_score(train_val_age_pred, train_tag_val["age"])

### Online

In [None]:
lgbds_train_age = lgb.Dataset(train_feat, train_user["age"]-1)

In [None]:
params_age = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 10,
    "metric": "multi_error",
    "learning_rate": 0.1,
    "seed": 2020,
    "n_jobs": -1,
}

In [None]:
model_lgb_multi_age = lgb.train(params_age, lgbds_train_age, num_boost_round=1000, verbose_eval=50)

In [None]:
ndt = time.strftime("%Y%m%d%H%M%S", time.localtime(int(time.time())))
model_lgb_multi_age.save_model(f"{UMDIR}/lgb_multi_age_{ndt}.model")

### KFold

In [19]:
tm_mage = Timer()

In [24]:
train_feat.drop(UID, axis=1, inplace=True)
test_feat.drop(UID, axis=1, inplace=True)

In [25]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)

In [26]:
params_age = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 10,
    "metric": "multi_error",
    "learning_rate": 0.1,
    "seed": 2020,
    "n_jobs": -1,
    "min_child_weight": 30,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.9,
    "bagging_freq": 5,     
    
}

In [27]:
scores = []
result_proba = []

In [28]:
for tr_idx, val_idx in kfold.split(train_feat, train_user["age"]-1):
    tr_x, tr_y, val_x, val_y = train_feat.iloc[tr_idx], train_user["age"].iloc[tr_idx]-1, train_feat.iloc[val_idx], train_user["age"].iloc[val_idx]-1
    train_set = lgb.Dataset(tr_x, tr_y)
    val_set = lgb.Dataset(val_x, val_y)
    lgb_model = lgb.train(params_age, train_set,
                          valid_sets=[val_set], early_stopping_rounds=100, num_boost_round=40000, verbose_eval=50)
    val_pred = np.argmax(lgb_model.predict(
        val_x, num_iteration=lgb_model.best_iteration), axis=1)
    val_score = accuracy_score(val_pred, val_y)
    result_proba.append(lgb_model.predict(
        test_feat, num_iteration=lgb_model.best_iteration))
    scores.append(val_score)
    print("current validation score: ", val_score)
print("accuracy score: ", np.mean(scores))

Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.622717
[100]	valid_0's multi_error: 0.611333
[150]	valid_0's multi_error: 0.6051
[200]	valid_0's multi_error: 0.601183
[250]	valid_0's multi_error: 0.598694
[300]	valid_0's multi_error: 0.597394
[350]	valid_0's multi_error: 0.596811
[400]	valid_0's multi_error: 0.596261
[450]	valid_0's multi_error: 0.5956
[500]	valid_0's multi_error: 0.595
[550]	valid_0's multi_error: 0.594772
[600]	valid_0's multi_error: 0.594094
[650]	valid_0's multi_error: 0.594022
[700]	valid_0's multi_error: 0.5936
[750]	valid_0's multi_error: 0.594011
Early stopping, best iteration is:
[685]	valid_0's multi_error: 0.59345
current validation score:  0.40655
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.622711
[100]	valid_0's multi_error: 0.612144
[150]	valid_0's multi_error: 0.606367
[200]	valid_0's multi_error: 0.603317
[250]	valid_0's multi_error: 0.600594
[300]	valid_0's mult

In [29]:
predicted_age = np.argmax(np.mean(result_proba, axis=0), axis=1) + 1

In [31]:
tm_mage.check("K-Fold train costs")

[K-Fold train costs] spend 23247.82 sec


In [None]:
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.626389
[100]	valid_0's multi_error: 0.614306
[150]	valid_0's multi_error: 0.609206
[200]	valid_0's multi_error: 0.605878
[250]	valid_0's multi_error: 0.6038
[300]	valid_0's multi_error: 0.601978
[350]	valid_0's multi_error: 0.601167
[400]	valid_0's multi_error: 0.600128
[450]	valid_0's multi_error: 0.600083
[500]	valid_0's multi_error: 0.600483
Early stopping, best iteration is:
[425]	valid_0's multi_error: 0.599928
current validation score:  0.4000722222222222
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.627061

## For Gender

### Offline

In [None]:
lgbds_train_tr_gender = lgb.Dataset(train_feat_tr, train_tag_tr["gender"]-1)
lgbds_train_val_gender = lgb.Dataset(train_feat_val, train_tag_val["gender"]-1)

In [None]:
params_gender = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 2,
    "metric": "multi_error",
    "learning_rate": 0.1,
    "seed": 2020,
    "n_jobs": -1,
    "min_child_weight": 30,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.9,
    "bagging_freq": 5,
}

In [None]:
model_lgb_multi_gender_off = lgb.train(params_gender, lgbds_train_tr_gender, num_boost_round=1000, valid_sets=[lgbds_train_val_gender], verbose_eval=50, early_stopping_rounds=100)

In [None]:
train_val_gender_prob = model_lgb_multi_gender_off.predict(train_feat_val, num_iteration=model_lgb_multi_gender_off.best_iteration)
train_val_gender_pred = [list(x).index(max(x))+1 for x in train_val_gender_prob]
gender_acy = accuracy_score(train_val_gender_pred, train_tag_val["gender"])

### Online

In [None]:
lgbds_train_gender = lgb.Dataset(train_feat, train_user["gender"]-1)

In [None]:
model_lgb_multi_gender = lgb.train(params_gender, lgbds_train_gender, num_boost_round=1000, verbose_eval=50)

In [None]:
ndt = time.strftime("%Y%m%d%H%M%S", time.localtime(int(time.time())))
model_lgb_multi_gender.save_model(f"{UMDIR}/lgb_multi_gender_{ndt}.model")

### KFold

In [32]:
tm_mgender = Timer()

In [33]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)

In [34]:
params_gender = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 2,
    "metric": "multi_error",
    "learning_rate": 0.1,
    "seed": 2020,
    "n_jobs": -1,
    "min_child_weight": 30,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.9,
    "bagging_freq": 5,
}

In [35]:
gender_scores = []
gender_result_proba = []

In [36]:
for tr_idx, val_idx in kfold.split(train_feat, train_user["gender"]-1):
    tr_x, tr_y, val_x, val_y = train_feat.iloc[tr_idx], train_user["gender"].iloc[tr_idx]-1, train_feat.iloc[val_idx], train_user["gender"].iloc[val_idx]-1
    train_set = lgb.Dataset(tr_x, tr_y)
    val_set = lgb.Dataset(val_x, val_y)
    lgb_model = lgb.train(params_gender, train_set,
                          valid_sets=[val_set], early_stopping_rounds=100, num_boost_round=40000, verbose_eval=50)
    val_pred = np.argmax(lgb_model.predict(
        val_x, num_iteration=lgb_model.best_iteration), axis=1)
    val_score = accuracy_score(val_pred, val_y)
    gender_result_proba.append(lgb_model.predict(
        test_feat, num_iteration=lgb_model.best_iteration))
    gender_scores.append(val_score)
    print("current validation score: ", val_score)
print("accuracy score: ", np.mean(gender_scores))

Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.0678611
[100]	valid_0's multi_error: 0.0654944
[150]	valid_0's multi_error: 0.0642944
[200]	valid_0's multi_error: 0.0638722
[250]	valid_0's multi_error: 0.0635222
[300]	valid_0's multi_error: 0.0632833
[350]	valid_0's multi_error: 0.0630389
[400]	valid_0's multi_error: 0.0629444
[450]	valid_0's multi_error: 0.0628056
[500]	valid_0's multi_error: 0.0627833
[550]	valid_0's multi_error: 0.0628111
[600]	valid_0's multi_error: 0.0627667
Early stopping, best iteration is:
[532]	valid_0's multi_error: 0.0626167
current validation score:  0.9373833333333333
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.0697056
[100]	valid_0's multi_error: 0.0672389
[150]	valid_0's multi_error: 0.0662889
[200]	valid_0's multi_error: 0.0657833
[250]	valid_0's multi_error: 0.0654389
[300]	valid_0's multi_error: 0.0650833
[350]	valid_0's multi_error: 0.0648556
[400]	valid_0's m

In [37]:
predicted_gender = np.argmax(np.mean(gender_result_proba, axis=0), axis=1) + 1

In [38]:
tm_mgender.check("K-Fold train costs")

[K-Fold train costs] spend 1407.34 sec


In [None]:
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.0683278
[100]	valid_0's multi_error: 0.0656667
[150]	valid_0's multi_error: 0.0646556
[200]	valid_0's multi_error: 0.0643056
[250]	valid_0's multi_error: 0.0639611
[300]	valid_0's multi_error: 0.0638111
[350]	valid_0's multi_error: 0.0636944
[400]	valid_0's multi_error: 0.0635778
[450]	valid_0's multi_error: 0.0633944
[500]	valid_0's multi_error: 0.0634667
[550]	valid_0's multi_error: 0.0634167
[600]	valid_0's multi_error: 0.06325
[650]	valid_0's multi_error: 0.0632056
Early stopping, best iteration is:
[568]	valid_0's multi_error: 0.06315
current validation score:  0.93685
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.0700167
[100]	valid_0's multi_error: 0.0679833
[150]	valid_0's multi_error: 0.0668444
[200]	valid_0's multi_error: 0.0664833

In [48]:
np.array(gender_scores) + np.array(scores)

array([1.34393333, 1.34138889, 1.34196111, 1.3393    , 1.34263889])

## NN

# Generate Prediction Result

In [None]:
# model_lgb_multi_age = lgb.Booster(model_file=f"{UMDIR}/lgb_multi_age_20200511045531.model")

In [None]:
# model_lgb_multi_gender = lgb.Booster(model_file=f"{UMDIR}/lgb_multi_gender_20200511034408.model")

In [None]:
lgb.plot_importance(model_lgb_multi_gender, max_num_features=10)

In [None]:
lgb.plot_importance(model_lgb_multi_age, max_num_features=10)

In [None]:
test_feat.drop(UID, axis=1, inplace=True)

In [43]:
res["predicted_age"] = predicted_age
# res["predicted_age"] = [list(x).index(max(x))+1 for x in model_lgb_multi_age.predict(test_feat, num_iteration=model_lgb_multi_age.best_iteration)]

In [44]:
res["predicted_gender"] = predicted_gender
# res["predicted_gender"] = [list(x).index(max(x))+1 for x in model_lgb_multi_gender.predict(test_feat, num_iteration=model_lgb_multi_gender.best_iteration)]

In [49]:
res_suffix = time.strftime("%Y%m%d%H%M%S", time.localtime(int(time.time())))
res.to_csv(f"{RESDIR}/res-{res_suffix}.csv", index=False)

In [46]:
res["predicted_age"].value_counts()

2    526243
1    394096
3     76065
4      3168
5       401
7        20
6         7
Name: predicted_age, dtype: int64

In [47]:
res["predicted_gender"].value_counts()

1    993180
2      6820
Name: predicted_gender, dtype: int64

In [45]:
res

Unnamed: 0,user_id,predicted_age,predicted_gender
0,3000001,1,1
1,3000002,1,1
2,3000003,1,1
3,3000004,1,1
4,3000005,2,1
...,...,...,...
999995,3999996,1,1
999996,3999997,2,1
999997,3999998,1,1
999998,3999999,2,1


In [None]:
gc.collect()

# Cent result to COS

In [None]:
from ti import session
ti_session = session.Session()

In [None]:
inputs = ti_session.upload_data(path=f"{RESDIR}/res-20200515004850.csv", bucket="etveritas-1252104022", key_prefix=RESDIR)