In [1]:
# # if used pip install package
# !pip install xgboost
# !pip install lightgbm
# !pip install wget
# !pip install gensim
# !pip install catboost
# !pip install cython

In [2]:
import os
import sys
import logging
import gc
import wget
import time
import tarfile
import zipfile
import functools
import random
import copy
from tqdm import tqdm_notebook, tqdm

import scipy
import numpy as np
import pandas as pd
import catboost as cbt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold

import gensim
import xgboost as xgb
import lightgbm as lgb

from joblib import Parallel, delayed

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
class Logger(object):
    def __init__(self, filename='default.log', stream=sys.stdout):
        self.terminal = stream
        self.log = open(filename, 'a')
        
    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)
        
    def flush(self):
        pass

In [4]:
# sys.stdout = Logger("logs/default.log", sys.stdout)
# sys.stderr = Logger("logs/default_err.log", sys.stderr)

In [5]:
DDIR = "data"
UDDIR = "user_data"
UFEDIR = "user_data/feat_data_v05"
UMDIR = "user_data/model_data"
RESDIR = "prediction_result"

In [6]:
UID = "user_id"

# Load data (Only once)

In [7]:
# train_fname = wget.download("https://tesla-ap-shanghai-1256322946.cos.ap-shanghai.myqcloud.com/cephfs/tesla_common/deeplearning/dataset/algo_contest/train_preliminary.zip", out=DDIR)
# test_fname = wget.download("https://tesla-ap-shanghai-1256322946.cos.ap-shanghai.myqcloud.com/cephfs/tesla_common/deeplearning/dataset/algo_contest/test.zip", out=DDIR)

In [8]:
# def myunzip(filename):
#     zFile = zipfile.ZipFile(filename, "r")
#     for fileM in zFile.namelist(): 
#         zFile.extract(fileM, DDIR)
#         print(fileM)
#     zFile.close()

In [9]:
# myunzip(train_fname)
# myunzip(test_fname)

# Utils

In [10]:
def bch_rencol(values, prefix="", suffix=""):
    return list(map(lambda x: f"{prefix}"+"_".join(list(map(lambda y: str(y), x)))+f"{suffix}" 
                    if hasattr(x, "__iter__") and not isinstance(x, str) 
                    else f"{prefix}"+str(x)+f"{suffix}", values))

In [11]:
def mynunique(values):
    return values.nunique(dropna=False)
def getidxmax(x):
    return x.idxmax()[1]
# for time series
def at_len(x):
    return len(x)

def at_sum(x):
    return np.sum(x)

def at_max(x):
    return np.max(x)

def at_min(x):
    return np.min(x)

def at_mean(x):
    return np.mean(x)

def at_range(x):
    return at_max(x) - at_min(x)

def at_nunq(x):
    return len(set(x))

def at_lenDrange(x):
    return at_len(x)/(at_range(x)+1)

def at_lenDnunq(x):
    return at_len(x)/at_nunq(x)

def at_percentile(n):
    def at_percentile_(x):
        return np.percentile(x, n)
    at_percentile_.__name__ = f"at_percentile_{n}"
    return at_percentile_

In [12]:
OP_SET = ["sum", "max", "min", "mean", "std"]

In [13]:
nesting_level = 0
is_start = None
class Timer:
    def __init__(self):
        self.start = time.time()
        self.history = [self.start]

    def check(self, info):
        current = time.time()
        print(f"[{info}] spend {current - self.history[-1]:0.2f} sec")
        self.history.append(current)

def log(entry):
    global nesting_level
    space = "-" * (4 * nesting_level)
    print(f"{space}{entry}")

def timeit(method, start_log=None):
    @functools.wraps(method)
    def timed(*args, **kw):
        global is_start
        global nesting_level

        if not is_start:
            print()

        is_start = True
        log(f"Start [{method.__name__}]:" + (start_log if start_log else ""))
        log(f'Start time: {time.strftime("%Y-%m-%d %H:%M:%S")}')
        nesting_level += 1

        start_time = time.time()
        result = method(*args, **kw)
        end_time = time.time()

        nesting_level -= 1
        log(f"End   [{method.__name__}]. Time elapsed: {end_time - start_time:0.2f} sec.")
        is_start = False

        return result

    return timed

In [14]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

# Data Exploration (todo)

In [None]:
# read train data
train_click_log = pd.read_csv(f"{DDIR}/train_preliminary/click_log.csv")
train_ad = pd.read_csv(f"{DDIR}/train_preliminary/ad.csv")
# tag
train_user = pd.read_csv(f"{DDIR}/train_preliminary/user.csv")

In [None]:
# read test data
test_click_log = pd.read_csv(f"{DDIR}/test/click_log.csv")
test_ad = pd.read_csv(f"{DDIR}/test/ad.csv")
# pd.DataFrame(np.sort(test_click_log[UID].unique()), columns=[UID]).to_csv(f"{DDIR}/test/user.csv", index=False)
test_user = pd.read_csv(f"{DDIR}/test/user.csv")

In [None]:
train_click_log.shape

In [None]:
train_ad.shape

In [None]:
train_user.shape

In [None]:
test_click_log.shape

In [None]:
test_ad.shape

In [None]:
train_ad["product_id"] = train_ad["product_id"].replace("\\N", -1).astype(int)
train_ad["industry"] = train_ad["industry"].replace("\\N", -1).astype(int)

In [None]:
test_ad["product_id"] = test_ad["product_id"].replace("\\N", -1).astype(int)
test_ad["industry"] = test_ad["industry"].replace("\\N", -1).astype(int)

In [None]:
# creative id in train (creative id is unique in train_ad)
len(train_ad)

In [None]:
# creative id in test (creative id is unique in test_ad)
len(test_ad)

In [None]:
# check whether the same creative_id in train and test have same ad info
insect1d = np.intersect1d(train_click_log.creative_id.unique(), test_click_log.creative_id.unique())
print("Same creative id: ", insect1d.shape)
print("Diff number: ", np.sum(train_ad[train_ad.creative_id.isin(insect1d)].values != test_ad[test_ad.creative_id.isin(insect1d)].values))
# checked: they all have same ad info (result is 0)

In [None]:
# check whether click and ad have diff creative_id
print("Diff list: ", np.setdiff1d(train_click_log.creative_id.unique(), train_ad.creative_id))
print("Diff list: ", np.setdiff1d(train_ad.creative_id, train_click_log.creative_id.unique()))

In [None]:
# check whether click and ad have diff creative_id
print("Diff list: ", np.setdiff1d(test_click_log.creative_id.unique(), test_ad.creative_id))
print("Diff list: ", np.setdiff1d(test_ad.creative_id, test_click_log.creative_id.unique()))

In [None]:
# click time
sns.lineplot(x=train_click_log.time.value_counts().index, y=train_click_log.time.value_counts())

In [None]:
sns.lineplot(x=test_click_log.time.value_counts().index, y=test_click_log.time.value_counts())

In [None]:
# data_grouped = data.groupby(data.index)
# results = Parallel(n_jobs=8)(delayed(key_func)(group) for name, group in data_grouped)
# data = pd.concat(results)

# Feature engineering

## Get ID sequence (Only once)

In [None]:
# # sort by time, for time series
# train_click_log.sort_values(by="time", inplace=True)
# test_click_log.sort_values(by="time", inplace=True)

In [None]:
# tol_train = pd.merge(train_click_log, train_ad, how="left", on="creative_id")
# tol_test = pd.merge(test_click_log, test_ad, how="left", on="creative_id")

In [None]:
# del train_click_log, test_click_log
# gc.collect()

In [None]:
# @timeit
# def gen_id_series(data, dtyp="train"):
#     for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
#         tmp = data.groupby([UID], sort=False)[[col]].agg(lambda x: [f"word_{y}" for y in x])
#         tmp.columns = bch_rencol(tmp.columns)
#         tmp.to_pickle(f"{UDDIR}/imd/{dtyp}_{col}_seq.pkl")
#         tmp = None

In [None]:
# gen_id_series(tol_train, "train")

In [None]:
# gen_id_series(tol_test, "test")

## TF-IDF

In [None]:
@timeit
def gen_tfidf(col, nr_max=1, only_get_index=True):
    train_seq = pd.read_pickle(f"{UDDIR}/imd/train_{col}_seq.pkl")
    test_seq = pd.read_pickle(f"{UDDIR}/imd/test_{col}_seq.pkl")
    tol_seq = pd.concat([train_seq, test_seq])
    tol_seq[col] = tol_seq[col].apply(lambda x: " ".join(x))
    
    train_seq = None
    test_seq = None
    if not only_get_index:
        tfidf_enc = TfidfVectorizer(ngram_range=(1, nr_max))
        tfidf_vec = tfidf_enc.fit_transform(tol_seq[col].values)
        log(f"TF-IDF shape: {tfidf_vec.shape}")

        # save sparse matrix
        scipy.sparse.save_npz(f"{UDDIR}/imd/sparse_{col}.npz", tfidf_vec)
    
    return tol_seq.index

### SVD

In [None]:
@timeit
def gen_svd(col, index, prefix="tfidf", n_cpt=64):
    tfidf_vec = scipy.sparse.load_npz(f"{UDDIR}/imd/sparse_{col}.npz")
    if tfidf_vec.shape[1] > n_cpt:
        svd_enc = TruncatedSVD(n_components=n_cpt, n_iter=20, random_state=2020)
        mode_svd = svd_enc.fit_transform(tfidf_vec)
    else:
        n_cpt = tfidf_vec.shape[1]
        mode_svd = tfidf_vec.todense()
    mode_svd = pd.DataFrame(mode_svd)
    mode_svd.columns = [f"{prefix}_svd_{col}_{i}" for i in range(n_cpt)]
    mode_svd.index = index
    # save svd pkl
    mode_svd.to_pickle(f"{UFEDIR}/{prefix}_svd_{col}.pkl")

In [None]:
tol_idx_dic = dict()

In [None]:
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    tol_idx_dic[col] = copy.deepcopy(gen_tfidf(col))

In [None]:
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    gen_svd(col, tol_idx_dic[col])

### Pure Sparse

In [None]:
sparse_matrix = None
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    tfidf_vec = scipy.sparse.load_npz(f"{UDDIR}/imd/sparse_{col}.npz")
    sparse_matrix = scipy.sparse.csr_matrix(scipy.sparse.hstack([sparse_matrix, tfidf_vec]))

## Word2Vec

In [None]:
@timeit
def gen_w2v(col, vesize=200, win=20):
    train_seq = pd.read_pickle(f"{UDDIR}/imd/train_{col}_seq.pkl")
    test_seq = pd.read_pickle(f"{UDDIR}/imd/test_{col}_seq.pkl")
    tol_seq = pd.concat([train_seq, test_seq])
    
    train_seq = None
    test_seq = None
    if os.path.exists(f"{UMDIR}/vectors/w2v_{col}.model"):
        model = gensim.models.Word2Vec.load(f"{UMDIR}/vectors/w2v_{col}.model")
    else:
        model = gensim.models.Word2Vec(sentences=tol_seq[col], size=vesize, window=win, workers=32, sg=0, iter=10)
        model.save(f"{UMDIR}/vectors/w2v_{col}.model")
    
    w2v_list = list()
    
    for it in tqdm(tol_seq[col]):
        tmp = np.zeros(vesize)
        cnt = 0
        for wd in it:
            cnt += 1
            if wd in model:
                tmp += model[wd]
        w2v_list.append(list(tmp/cnt))
    
    w2v_avg = pd.DataFrame(w2v_list)
    w2v_avg.index = tol_seq.index
    w2v_avg.columns = [f"w2v_avg_{col}_{i}" for i in range(vesize)]
    w2v_avg.to_pickle(f"{UFEDIR}/w2v_avg_{col}.pkl")

In [None]:
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    gen_w2v(col)

## Doc2Vec

In [None]:
@timeit
def gen_d2v(col, vesize=200, win=20):
    train_seq = pd.read_pickle(f"{UDDIR}/imd/train_{col}_seq.pkl")
    test_seq = pd.read_pickle(f"{UDDIR}/imd/test_{col}_seq.pkl")
    tol_seq = pd.concat([train_seq, test_seq])
    
    train_seq = None
    test_seq = None
    
    if os.path.exists(f"{UMDIR}/vectors/d2v_{col}.model"):
        model = gensim.models.Doc2Vec.load(f"{UMDIR}/vectors/d2v_{col}.model")
    else:
        docs = [gensim.models.doc2vec.TaggedDocument(words=i[1],tags=[str(i[0])]) for i in tol_seq[col].reset_index().values]
        model = gensim.models.Doc2Vec(documents=docs, size=vesize, window=win, workers=32, iter=10)
        model.save(f"{UMDIR}/vectors/d2v_{col}.model")
    
    d2v_list = list()
    for it, cps in tqdm(tol_seq[col].reset_index().values):
#         if it in model.docvecs:
        d2v_list.append(model.docvecs[str(it)])
#         else:
#             d2v_list.append(model.infer_vector(cps))

    d2v_avg = pd.DataFrame(d2v_list)
    d2v_avg.index = tol_seq.index
    d2v_avg.columns = [f"d2v_avg_{col}_{i}" for i in range(vesize)]
    d2v_avg = reduce_mem_usage(d2v_avg)
    d2v_avg.to_pickle(f"{UFEDIR}/d2v_avg_{col}.pkl")

In [None]:
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    gen_d2v(col)

## Stats features

In [None]:
tol_train = pd.merge(train_click_log, train_ad, how="left", on="creative_id")
tol_test = pd.merge(test_click_log, test_ad, how="left", on="creative_id")

In [None]:
del train_click_log, train_ad
del test_click_log, test_ad

### Category map to Label (use GroupFold) - a little overfitting

In [None]:
@timeit
def gen_catemlb(train_data, test_data, tag, col):
    train_use = pd.merge(train_data[[UID, col]], tag, how="left", on=UID)
    test_use = test_data[[UID, col]]
    
    nfolds = 5
    kfold = GroupKFold(n_splits=nfolds)
    
    kf_map_df = pd.DataFrame()

    re_train_use = pd.DataFrame()
    re_test_use = pd.DataFrame()

    for tr_idx, val_idx in kfold.split(train_use, groups=train_use[UID]):
        tr_ucid, val_ucid = train_use.iloc[tr_idx], train_use.iloc[val_idx]
        kf_map = tr_ucid.groupby([col])[["age", "gender"]].agg(["mean"])
        kf_map.columns = bch_rencol(kf_map.columns, prefix=f"{col}_")
        # only use intersect between train and test
        kf_map.drop(np.setdiff1d(kf_map.index.unique(), test_use[col].unique()), inplace=True)
        # get kf_cum
        if kf_map_df.empty:
            kf_map_df = kf_map
        else:
            kf_map_df = (kf_map_df + kf_map).fillna(0)
        
        val_ucid = pd.merge(val_ucid, kf_map, how="left", on=col)
        re_train_use = pd.concat([re_train_use, val_ucid])
    
    kf_map_df = kf_map_df/nfolds
    re_test_use = pd.merge(test_use, kf_map_df, how="left", on=col)

    assert len(train_use) == len(re_train_use)
    assert len(test_use) == len(re_test_use)

    train_use = None
    test_use = None

    tmp = re_train_use.groupby([UID], sort=False)[f"{col}_age_mean", f"{col}_gender_mean"].agg(["sum", "max", "mean", "min", "std"])
    tmp.columns = bch_rencol(tmp.columns)
    tmp.to_pickle(f"{UFEDIR}/train_stats_{col}_catemlb.pkl")
    tmp = None

    tmp = re_test_use.groupby([UID], sort=False)[f"{col}_age_mean", f"{col}_gender_mean"].agg(["sum", "max", "mean", "min", "std"])
    tmp.columns = bch_rencol(tmp.columns)
    tmp.to_pickle(f"{UFEDIR}/test_stats_{col}_catemlb.pkl")
    tmp = None
    

In [None]:
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    gen_catemlb(tol_train, tol_test, train_user, col)

### Group by UID

In [None]:
tm_gid = Timer()

In [None]:
ops_dic = {
        UID: ["count"], 
        "click_times": ["sum", "max", "mean", "std", at_percentile(.75), ],
        "time": ["nunique", at_range, at_lenDnunq, at_lenDrange, ],
}

In [None]:
for col in ops_dic:
    tmp = tol_train.groupby([UID], sort=False)[[col]].agg(ops_dic[col])
    tmp.columns = bch_rencol(tmp.columns)
    tmp.to_pickle(f"{UFEDIR}/train_stats_{col}.pkl")
    tmp = None

In [None]:
tm_gid.check("Group by UID for train")

In [None]:
for col in ops_dic:
    tmp = tol_test.groupby([UID], sort=False)[[col]].agg(ops_dic[col])
    tmp.columns = bch_rencol(tmp.columns)
    tmp.to_pickle(f"{UFEDIR}/test_stats_{col}.pkl")
    tmp = None

In [None]:
gc.collect()

In [None]:
tm_gid.check("Group by UID for test")

### Group by UID and time

In [None]:
tm_gidt = Timer()

In [None]:
tmp = tol_train.groupby([UID, "time"])[["time"]].agg(["count"]).groupby([UID]).agg(["sum", "max", "min", "mean", "std"])
tmp.columns = bch_rencol(tmp.columns)
tmp.to_pickle(f"{UFEDIR}/train_stats_time2{UID}.pkl")
tmp = None

In [None]:
tmp = tol_train.groupby([UID, "time"])[["click_times"]].agg(["sum", "max", "min", "mean", "std"]).groupby([UID]).agg(["sum", "max", "min", "mean", "std"])
tmp.columns = bch_rencol(tmp.columns)
tmp.to_pickle(f"{UFEDIR}/train_stats_click_times2{UID}.pkl")
tmp = None

In [None]:
tm_gidt.check("Group by columns to UID for train")

In [None]:
tmp = tol_test.groupby([UID, "time"])[["time"]].agg(["count"]).groupby([UID]).agg(["sum", "max", "min", "mean", "std"])
tmp.columns = bch_rencol(tmp.columns)
tmp.to_pickle(f"{UFEDIR}/test_stats_time2{UID}.pkl")
tmp = None

In [None]:
tmp = tol_test.groupby([UID, "time"])[["click_times"]].agg(["sum", "max", "min", "mean", "std"]).groupby([UID]).agg(["sum", "max", "min", "mean", "std"])
tmp.columns = bch_rencol(tmp.columns)
tmp.to_pickle(f"{UFEDIR}/test_stats_click_times2{UID}.pkl")
tmp = None

In [None]:
tm_gidt.check("Group by columns to UID for test")

In [None]:
gc.collect()

### One-Hot (TODO)

In [None]:
# train
tmp = tol_train.groupby([UID, "product_category"], sort=False)[["product_category"]].agg(["count"]).unstack().fillna(0)
tmp.columns = bch_rencol(tmp.columns)

In [None]:
tmp.to_pickle(f"{UFEDIR}/train_onehot.pkl")
tmp = None

In [None]:
# test
tmp = tol_test.groupby([UID, "product_category"], sort=False)[["product_category"]].agg(["count"]).unstack().fillna(0)
tmp.columns = bch_rencol(tmp.columns)

In [None]:
tmp.to_pickle(f"{UFEDIR}/test_onehot.pkl")
tmp = None

In [None]:
gc.collect()

## Meta Train 

__weak learner prediction as features__

In [None]:
## Once

In [None]:
tol_test.index = -tol_test.index -1

In [None]:
tol_data = pd.concat([tol_train, tol_test])

In [None]:
del tol_train, tol_test
gc.collect()

In [None]:
tol_data[f"{UID}_count"] = tol_data[[UID]].groupby([UID])[UID].transform("count")

In [None]:
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    tmp = tol_data[[UID, col]]
    tol_data[f"{UID}_{col}_std"] = tmp.groupby([UID])[col].transform("std")
    tol_data[f"{col}_{UID}_count"] = tmp.groupby([col])[UID].transform("count")
    tol_data[f"{col}_{UID}_std"] = tmp.groupby([col])[UID].transform("std")
    tmp = None

In [None]:
for col in ["time", "click_times"]:
    tmp = tol_data[[UID, col]]
    tol_data[f"{UID}_{col}_std"] = tmp.groupby([UID])[col].transform("std")
    tol_data[f"{UID}_{col}_mean"] = tmp.groupby([UID])[col].transform("mean")
    tmp = None

In [None]:
tol_data = reduce_mem_usage(tol_data)

In [None]:
tol_data.to_pickle(f"{UDDIR}/imd/tol_data.pkl")

In [None]:
## if have tol_data.pkl
train_user = pd.read_csv(f"{DDIR}/train_preliminary/user.csv")
test_user = pd.read_csv(f"{DDIR}/test/user.csv")

tol_data = pd.read_pickle(f"{UDDIR}/imd/tol_data.pkl")

In [None]:
def rmse(y_true, y_pred):
    return (mean_squared_error(y_true, y_pred))** .5

### For age

In [None]:
tol_data = pd.merge(tol_data, train_user[[UID, "age"]], how="left", on=UID)

In [None]:
# use df_hist_train df_new_train df_hist_new_train to train 3 models
train_df = tol_data[tol_data["age"].notnull()]
test_df = tol_data[tol_data["age"].isnull()]

In [None]:
drop_features = [UID, "age"]
cat_features = ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]

In [None]:
feats = [f for f in tol_data.columns if f not in drop_features]

In [None]:
n_splits= 3
folds = GroupKFold(n_splits=n_splits)
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])
print ('feats:' + str(len(feats)))

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df["age"],groups=train_df[UID])):
    train_x, train_y = train_df[feats].iloc[train_idx], train_df["age"].iloc[train_idx]
    valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df["age"].iloc[valid_idx] 
    
    print("Train Index:",train_idx,",Val Index:",valid_idx)

    params = {
               "objective" : "regression", 
               "boosting" : "gbdt", 
               "metric" : "rmse",  
               "max_depth": 7, 
               "num_leaves" : 31, 
               "max_bin" : 255, 
               "learning_rate" : 0.1, 
               "subsample" : 0.8,
               "colsample_bytree" : 0.8, 
               "verbosity": -1,
               "num_threads" : -1,
    }
    

    if n_fold >= 0:
        evals_result = {}
        dtrain = lgb.Dataset(
            train_x, label=train_y,categorical_feature=cat_features)
        dval = lgb.Dataset(
            valid_x, label=valid_y, reference=dtrain,categorical_feature=cat_features)
        bst = lgb.train(
            params, dtrain, num_boost_round=30000,
            valid_sets=[dval], early_stopping_rounds=100, verbose_eval=20,)#feval = evalerror
        
        new_list = sorted(zip(feats, bst.feature_importance('gain')),key=lambda x: x[1], reverse=True)[:]
        for item in new_list:
            print (item) 

        oof_preds[valid_idx] = bst.predict(valid_x, num_iteration=bst.best_iteration)

        sub_preds += bst.predict(test_df[feats], num_iteration=bst.best_iteration) / folds.n_splits # test_df_new

In [None]:
cv = rmse(train_df["age"],  oof_preds)
print('Full OOF RMSE %.6f' % cv)  

In [None]:
a = train_df[[UID]]
b = test_df[[UID]]

a["age_pred"] = oof_preds
b["age_pred"] = sub_preds

a1 = a.groupby([UID])["age_pred"].agg(["mean", "std", "min", "max", "median", "skew"])
b1 = b.groupby([UID])["age_pred"].agg(["mean", "std", "min", "max", "median", "skew"])

In [None]:
a1.append(b1).add_prefix("age_gkf_agg_pred_").reset_index().sort_values(by=[UID]).reset_index(drop=True).to_pickle(f"{UFEDIR}/meta_age_group_regeress.pkl")

### For gender

In [None]:
tol_data = pd.merge(tol_data, train_user[[UID, "gender"]], how="left", on=UID)

In [None]:
# use df_hist_train df_new_train df_hist_new_train to train 3 models
train_df = tol_data[tol_data["gender"].notnull()]
test_df = tol_data[tol_data["gender"].isnull()]

In [None]:
drop_features = [UID, "gender"]
cat_features = ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]

In [None]:
feats = [f for f in tol_data.columns if f not in drop_features]

In [None]:
n_splits= 3
folds = GroupKFold(n_splits=n_splits)
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])
print ('feats:' + str(len(feats)))

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df["gender"],groups=train_df[UID])):
    train_x, train_y = train_df[feats].iloc[train_idx], train_df["gender"].iloc[train_idx]
    valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df["gender"].iloc[valid_idx] 
    
    print("Train Index:",train_idx,",Val Index:",valid_idx)

    params = {
               "objective" : "regression", 
               "boosting" : "gbdt", 
               "metric" : "rmse",  
               "max_depth": 7, 
               "num_leaves" : 31, 
               "max_bin" : 255, 
               "learning_rate" : 0.1, 
               "subsample" : 0.8,
               "colsample_bytree" : 0.8, 
               "verbosity": -1,
               "num_threads" : -1,
    }
    

    if n_fold >= 0:
        evals_result = {}
        dtrain = lgb.Dataset(
            train_x, label=train_y,categorical_feature=cat_features)
        dval = lgb.Dataset(
            valid_x, label=valid_y, reference=dtrain,categorical_feature=cat_features)
        bst = lgb.train(
            params, dtrain, num_boost_round=30000,
            valid_sets=[dval], early_stopping_rounds=100, verbose_eval=20,)#feval = evalerror
        
        new_list = sorted(zip(feats, bst.feature_importance('gain')),key=lambda x: x[1], reverse=True)[:]
        for item in new_list:
            print (item) 

        oof_preds[valid_idx] = bst.predict(valid_x, num_iteration=bst.best_iteration)

        sub_preds += bst.predict(test_df[feats], num_iteration=bst.best_iteration) / folds.n_splits # test_df_new

In [None]:
cv = rmse(train_df["gender"],  oof_preds)
print('Full OOF RMSE %.6f' % cv)  

In [None]:
a = train_df[[UID]]
b = test_df[[UID]]

a["gender_pred"] = oof_preds
b["gender_pred"] = sub_preds

a1 = a.groupby([UID])["gender_pred"].agg(["mean", "std", "min", "max", "median", "skew"])
b1 = b.groupby([UID])["gender_pred"].agg(["mean", "std", "min", "max", "median", "skew"])

In [None]:
a1.append(b1).add_prefix("gender_gkf_agg_pred_").reset_index().sort_values(by=[UID]).reset_index(drop=True).to_pickle(f"{UFEDIR}/meta_gender_group_regeress.pkl")

## Concat

In [None]:
# reduce memory (only once)
for fname in feat_fname:
    if fname.startswith("w2v_") or fname.startswith("tfidf_svd_") or fname.startswith("meta_age_"):
        print("current filename: ", fname)
        reduce_mem_usage(pd.read_pickle(f"{UFEDIR}/{fname}")).to_pickle(f"{UFEDIR}/{fname}")

In [15]:
train_user = pd.read_csv(f"{DDIR}/train_preliminary/user.csv")
test_user = pd.read_csv(f"{DDIR}/test/user.csv")

In [16]:
train_feat = pd.DataFrame()
test_feat = pd.DataFrame()
train_feat[UID] = train_user[UID]
test_feat[UID] = test_user[UID]

In [17]:
feat_fname = os.listdir(UFEDIR)

In [18]:
for fname in feat_fname:
    if fname.startswith("w2v_"):
        print("current filename: ", fname)
        cur_w2v = pd.read_pickle(f"{UFEDIR}/{fname}")
        train_feat = pd.merge(train_feat, cur_w2v, how="left", on=UID)
        test_feat = pd.merge(test_feat, cur_w2v, how="left", on=UID)
        cur_w2v = None

current filename:  w2v_avg_advertiser_id.pkl
current filename:  w2v_avg_industry.pkl
current filename:  w2v_avg_ad_id.pkl
current filename:  w2v_avg_product_id.pkl
current filename:  w2v_avg_creative_id.pkl
current filename:  w2v_avg_product_category.pkl


In [None]:
# not use now
for fname in feat_fname:
    if fname.startswith("train_stats_"):
        print("current filename: ", fname)
        train_feat = pd.merge(train_feat, pd.read_pickle(f"{UFEDIR}/{fname}"), how="left", on=UID)
    elif fname.startswith("test_stats_"):
        print("current filename: ", fname)
        test_feat = pd.merge(test_feat, pd.read_pickle(f"{UFEDIR}/{fname}"), how="left", on=UID)

In [19]:
for fname in feat_fname:
    if fname.startswith("tfidf_svd_"):
        print("current filename: ", fname)
        cur_tfidf_svd = pd.read_pickle(f"{UFEDIR}/{fname}")
        train_feat = pd.merge(train_feat, cur_tfidf_svd, how="left", on=UID)
        test_feat = pd.merge(test_feat, cur_tfidf_svd, how="left", on=UID)
        cur_tfidf_svd = None

current filename:  tfidf_svd_industry.pkl
current filename:  tfidf_svd_advertiser_id.pkl
current filename:  tfidf_svd_product_category.pkl
current filename:  tfidf_svd_creative_id.pkl
current filename:  tfidf_svd_product_id.pkl
current filename:  tfidf_svd_ad_id.pkl


In [20]:
for fname in feat_fname:
    if fname.startswith("meta_age_"):
        print("current filename: ", fname)
        age_agg_pred = pd.read_pickle(f"{UFEDIR}/{fname}")
        train_feat = pd.merge(train_feat, age_agg_pred, how="left", on=UID)
        test_feat = pd.merge(test_feat, age_agg_pred, how="left", on=UID)
        age_agg_pred = None

current filename:  meta_age_group_regeress.pkl


In [21]:
for fname in feat_fname:
    if fname.startswith("d2v_"):
        print("current filename: ", fname)
        cur_d2v = pd.read_pickle(f"{UFEDIR}/{fname}")
        train_feat = pd.merge(train_feat, cur_d2v, how="left", on=UID)
        test_feat = pd.merge(test_feat, cur_d2v, how="left", on=UID)
        cur_d2v = None

current filename:  d2v_avg_product_id.pkl
current filename:  d2v_avg_ad_id.pkl
current filename:  d2v_avg_advertiser_id.pkl
current filename:  d2v_avg_industry.pkl
current filename:  d2v_avg_product_category.pkl
current filename:  d2v_avg_creative_id.pkl


In [22]:
# to make sure feat and user(target) have same order
# if true --> sum == 0
np.sum(train_feat[UID] != train_user[UID])

0

In [40]:
train_feat.shape

(900000, 2744)

In [41]:
test_feat.shape

(1000000, 2744)

In [27]:
res = test_feat[[UID]]

In [28]:
res.shape

(1000000, 1)

In [25]:
train_feat.memory_usage().sum() / 1024**2 

4724.12109375

In [26]:
test_feat.memory_usage().sum() / 1024**2 

5249.0234375

In [29]:
gc.collect()

0

In [None]:
# train_feat.to_pickle(f"{UDDIR}/feat_ing/train_feat_tol_v05.pkl")
# test_feat.to_pickle(f"{UDDIR}/feat_ing/test_feat_tol_v05.pkl")

In [None]:
# train_feat = pd.read_pickle(f"{UDDIR}/feat_ing/train_feat_tol_v05.pkl")
# test_feat = pd.read_pickle(f"{UDDIR}/feat_ing/test_feat_tol_v05.pkl")
# train_user = pd.read_csv(f"{DDIR}/train_preliminary/user.csv")

In [None]:
# train_feat.drop([col for col in train_feat.columns if col.find("creative_id_gender_") != -1], axis=1, inplace=True)

In [None]:
# test_feat.drop([col for col in test_feat.columns if col.find("creative_id_gender_") != -1], axis=1, inplace=True)

In [None]:
# list(train_feat.columns)

In [None]:
res.shape

# Training&Prediction

In [30]:
train_feat.drop(UID, axis=1, inplace=True)
test_feat.drop(UID, axis=1, inplace=True)

## LightGBM

In [None]:
# split data
train_feat_tr, train_feat_val, train_tag_tr, train_tag_val = train_test_split(train_feat, train_user, test_size=0.2, random_state=2020)

In [None]:
train_feat_tr.drop(UID, axis=1, inplace=True)

In [None]:
train_feat_val.drop(UID, axis=1, inplace=True)

In [None]:
gc.collect()

## For Age

### Offline

In [None]:
lgbds_train_tr_age = lgb.Dataset(train_feat_tr, train_tag_tr["age"]-1)
lgbds_train_val_age = lgb.Dataset(train_feat_val, train_tag_val["age"]-1)

In [None]:
params_age = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 10,
    "metric": ["multi_logloss", "multi_error"],
    "learning_rate": 0.1,
    "seed": 2020,
    "n_jobs": -1,
    "min_child_weight": 30,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.9,
    "bagging_freq": 5,     
    
}

In [None]:
model_lgb_multi_age_off = lgb.train(params_age, lgbds_train_tr_age, num_boost_round=1000, valid_sets=[lgbds_train_val_age], verbose_eval=50, early_stopping_rounds=100)

In [None]:
train_val_age_prob = model_lgb_multi_age_off.predict(train_feat_val, num_iteration=model_lgb_multi_age_off.best_iteration)
train_val_age_pred = [list(x).index(max(x))+1 for x in train_val_age_prob]
age_acy = accuracy_score(train_val_age_pred, train_tag_val["age"])

### Online

In [None]:
lgbds_train_age = lgb.Dataset(train_feat, train_user["age"]-1)

In [None]:
params_age = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 10,
    "metric": ["multi_logloss", "multi_error"],
    "learning_rate": 0.1,
    "seed": 2020,
    "n_jobs": -1,
    "min_child_weight": 30,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.9,
    "bagging_freq": 5,     
    
}

In [None]:
model_lgb_multi_age = lgb.train(params_age, lgbds_train_age, num_boost_round=1000, verbose_eval=50)

In [None]:
ndt = time.strftime("%Y%m%d%H%M%S", time.localtime(int(time.time())))
model_lgb_multi_age.save_model(f"{UMDIR}/lgb_multi_age_{ndt}.model")

### KFold

In [49]:
tm_mage = Timer()

In [50]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)

In [51]:
params_age = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 10,
    "metric": ["multi_logloss", "multi_error"],
    "learning_rate": 0.1,
    "seed": 2020,
    "n_jobs": -1,
    "min_child_weight": 30,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.9,
    "bagging_freq": 5,
    
}

In [52]:
scores = []
result_proba = []
age_feat_import_df = pd.DataFrame

In [53]:
for cur_fold, (tr_idx, val_idx) in enumerate(kfold.split(train_feat, train_user["age"]-1)):
    tr_x = tr_y = val_x = val_y = None
    tr_x, tr_y, val_x, val_y = train_feat.iloc[tr_idx], train_user["age"].iloc[tr_idx]-1, train_feat.iloc[val_idx], train_user["age"].iloc[val_idx]-1
    train_set = lgb.Dataset(tr_x, tr_y)
    val_set = lgb.Dataset(val_x, val_y)
    lgb_model = lgb.train(params_age, train_set,
                          valid_sets=[val_set], early_stopping_rounds=100, num_boost_round=40000, verbose_eval=50)
    # save feature importance
    tmp = pd.DataFrame(lgb_model.feature_name(), columns=["feature_name"])
    tmp[f"feature_importance_fold{cur_fold}"] = lgb_model.feature_importance()
    if age_feat_import_df.empty:
        age_feat_import_df = tmp
    else:
        age_feat_import_df = pd.merge(age_feat_import_df, tmp, how="left", on="feature_name")
    # get valid pred
    val_pred = np.argmax(lgb_model.predict(
        val_x, num_iteration=lgb_model.best_iteration), axis=1)
    val_score = accuracy_score(val_pred, val_y)
    # get test pred
    result_proba.append(lgb_model.predict(
        test_feat, num_iteration=lgb_model.best_iteration))
    scores.append(val_score)
    print("current validation score: ", val_score)
    gc.collect()
print("accuracy score: ", np.mean(scores))

Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_logloss: 1.5947	valid_0's multi_error: 0.639867
[100]	valid_0's multi_logloss: 1.52649	valid_0's multi_error: 0.621544
[150]	valid_0's multi_logloss: 1.49443	valid_0's multi_error: 0.611422
[200]	valid_0's multi_logloss: 1.47452	valid_0's multi_error: 0.60565
[250]	valid_0's multi_logloss: 1.46076	valid_0's multi_error: 0.600861
[300]	valid_0's multi_logloss: 1.4505	valid_0's multi_error: 0.597217
[350]	valid_0's multi_logloss: 1.44274	valid_0's multi_error: 0.595122
[400]	valid_0's multi_logloss: 1.43658	valid_0's multi_error: 0.593644
[450]	valid_0's multi_logloss: 1.43152	valid_0's multi_error: 0.591378
[500]	valid_0's multi_logloss: 1.42752	valid_0's multi_error: 0.5901
[550]	valid_0's multi_logloss: 1.42403	valid_0's multi_error: 0.588744
[600]	valid_0's multi_logloss: 1.42146	valid_0's multi_error: 0.588372
[650]	valid_0's multi_logloss: 1.41933	valid_0's multi_error: 0.587667
[700]	valid_0's multi

[1000]	valid_0's multi_logloss: 1.41085	valid_0's multi_error: 0.582978
Early stopping, best iteration is:
[936]	valid_0's multi_logloss: 1.41162	valid_0's multi_error: 0.581967
current validation score:  0.4180333333333333
accuracy score:  0.4148588888888889


In [54]:
predicted_age = np.argmax(np.mean(result_proba, axis=0), axis=1) + 1

In [55]:
tm_mage.check("K-Fold train costs")

[K-Fold train costs] spend 24908.36 sec


In [None]:
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.649072
[100]	valid_0's multi_error: 0.632817
[150]	valid_0's multi_error: 0.623339
[200]	valid_0's multi_error: 0.617433
[250]	valid_0's multi_error: 0.613378
[300]	valid_0's multi_error: 0.611011
[350]	valid_0's multi_error: 0.609172
[400]	valid_0's multi_error: 0.606422
[450]	valid_0's multi_error: 0.605522
[500]	valid_0's multi_error: 0.604956
[550]	valid_0's multi_error: 0.603811
[600]	valid_0's multi_error: 0.603089
[650]	valid_0's multi_error: 0.602733
[700]	valid_0's multi_error: 0.601961
[750]	valid_0's multi_error: 0.601689
[800]	valid_0's multi_error: 0.601744
[850]	valid_0's multi_error: 0.601217
[900]	valid_0's multi_error: 0.600606
[950]	valid_0's multi_error: 0.600294
[1000]	valid_0's multi_error: 0.600333
[1050]	valid_0's multi_error: 0.600011
[1100]	valid_0's multi_error: 0.600339
Early stopping, best iteration is:
[1035]	valid_0's multi_error: 0.599733
current validation score:  0.40026666666666666
Training until validation scores don't improve for 100 rounds

## For Gender

### Offline

In [None]:
lgbds_train_tr_gender = lgb.Dataset(train_feat_tr, train_tag_tr["gender"]-1)
lgbds_train_val_gender = lgb.Dataset(train_feat_val, train_tag_val["gender"]-1)

In [None]:
params_gender = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 2,
    "metric": ["multi_logloss", "multi_error"],
    "learning_rate": 0.1,
    "seed": 2020,
    "n_jobs": -1,
    "min_child_weight": 30,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.9,
    "bagging_freq": 5,
}

In [None]:
model_lgb_multi_gender_off = lgb.train(params_gender, lgbds_train_tr_gender, num_boost_round=1000, valid_sets=[lgbds_train_val_gender], verbose_eval=50, early_stopping_rounds=100)

In [None]:
train_val_gender_prob = model_lgb_multi_gender_off.predict(train_feat_val, num_iteration=model_lgb_multi_gender_off.best_iteration)
train_val_gender_pred = [list(x).index(max(x))+1 for x in train_val_gender_prob]
gender_acy = accuracy_score(train_val_gender_pred, train_tag_val["gender"])

### Online

In [None]:
lgbds_train_gender = lgb.Dataset(train_feat, train_user["gender"]-1)

In [None]:
params_gender = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 2,
    "metric": ["multi_logloss", "multi_error"],
    "learning_rate": 0.1,
    "seed": 2020,
    "n_jobs": -1,
    "min_child_weight": 30,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.9,
    "bagging_freq": 5,
}

In [None]:
model_lgb_multi_gender = lgb.train(params_gender, lgbds_train_gender, num_boost_round=1000, verbose_eval=50)

In [None]:
ndt = time.strftime("%Y%m%d%H%M%S", time.localtime(int(time.time())))
model_lgb_multi_gender.save_model(f"{UMDIR}/lgb_multi_gender_{ndt}.model")

### KFold

In [42]:
tm_mgender = Timer()

In [43]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)

In [44]:
params_gender = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 2,
    "metric": ["multi_logloss", "multi_error"],
    "learning_rate": 0.1,
    "seed": 2020,
    "n_jobs": -1,
    "min_child_weight": 30,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.9,
    "bagging_freq": 5,
}

In [45]:
gender_scores = []
gender_result_proba = []
gender_feat_import_df = pd.DataFrame()

In [46]:
for cur_fold, (tr_idx, val_idx) in enumerate(kfold.split(train_feat, train_user["gender"]-1)):
    tr_x = tr_y = val_x = val_y = None
    tr_x, tr_y, val_x, val_y = train_feat.iloc[tr_idx], train_user["gender"].iloc[tr_idx]-1, train_feat.iloc[val_idx], train_user["gender"].iloc[val_idx]-1
    train_set = lgb.Dataset(tr_x, tr_y)
    val_set = lgb.Dataset(val_x, val_y)
    lgb_model = lgb.train(params_gender, train_set,
                          valid_sets=[val_set], early_stopping_rounds=100, num_boost_round=40000, verbose_eval=50)
    # save feature importance
    tmp = pd.DataFrame(lgb_model.feature_name(), columns=["feature_name"])
    tmp[f"feature_importance_fold{cur_fold}"] = lgb_model.feature_importance()
    if gender_feat_import_df.empty:
        gender_feat_import_df = tmp
    else:
        gender_feat_import_df = pd.merge(gender_feat_import_df, tmp, how="left", on="feature_name")
    # get valid pred
    val_pred = np.argmax(lgb_model.predict(
        val_x, num_iteration=lgb_model.best_iteration), axis=1)
    val_score = accuracy_score(val_pred, val_y)
    # get test pred
    gender_result_proba.append(lgb_model.predict(
        test_feat, num_iteration=lgb_model.best_iteration))
    gender_scores.append(val_score)
    log(f"current validation score: {val_score}")
    gc.collect()
log(f"accuracy score: {np.mean(gender_scores)}")

Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_logloss: 0.236098	valid_0's multi_error: 0.0866611
[100]	valid_0's multi_logloss: 0.208996	valid_0's multi_error: 0.0774611
[150]	valid_0's multi_logloss: 0.199907	valid_0's multi_error: 0.0736611
[200]	valid_0's multi_logloss: 0.194856	valid_0's multi_error: 0.0719778
[250]	valid_0's multi_logloss: 0.191612	valid_0's multi_error: 0.0701944
[300]	valid_0's multi_logloss: 0.18943	valid_0's multi_error: 0.0693111
[350]	valid_0's multi_logloss: 0.187819	valid_0's multi_error: 0.0686
[400]	valid_0's multi_logloss: 0.186607	valid_0's multi_error: 0.0680889
[450]	valid_0's multi_logloss: 0.185802	valid_0's multi_error: 0.0675611
[500]	valid_0's multi_logloss: 0.185229	valid_0's multi_error: 0.0675556
[550]	valid_0's multi_logloss: 0.184861	valid_0's multi_error: 0.0673
[600]	valid_0's multi_logloss: 0.184523	valid_0's multi_error: 0.0672722
[650]	valid_0's multi_logloss: 0.184291	valid_0's multi_error: 0.06707

In [47]:
predicted_gender = np.argmax(np.mean(gender_result_proba, axis=0), axis=1) + 1

In [48]:
tm_mgender.check("K-Fold train costs")

[K-Fold train costs] spend 4706.51 sec


In [None]:
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.0912667
[100]	valid_0's multi_error: 0.08375
[150]	valid_0's multi_error: 0.0806111
[200]	valid_0's multi_error: 0.07855
[250]	valid_0's multi_error: 0.0770611
[300]	valid_0's multi_error: 0.07595
[350]	valid_0's multi_error: 0.0752611
[400]	valid_0's multi_error: 0.0747556
[450]	valid_0's multi_error: 0.0741667
[500]	valid_0's multi_error: 0.0737611
[550]	valid_0's multi_error: 0.0736278
[600]	valid_0's multi_error: 0.0735278
[650]	valid_0's multi_error: 0.0733167
[700]	valid_0's multi_error: 0.0730722
[750]	valid_0's multi_error: 0.0729056
[800]	valid_0's multi_error: 0.0729278
[850]	valid_0's multi_error: 0.0728111
[900]	valid_0's multi_error: 0.0726611
[950]	valid_0's multi_error: 0.0726889
[1000]	valid_0's multi_error: 0.0726556
[1050]	valid_0's multi_error: 0.0725389
[1100]	valid_0's multi_error: 0.0724444
[1150]	valid_0's multi_error: 0.0723111
[1200]	valid_0's multi_error: 0.0722722
Early stopping, best iteration is:
[1149]	valid_0's multi_error: 0.07225
current validation score: 0.92775

In [None]:
np.array(gender_scores) + np.array(scores)

## NN (TODO)

# Generate Prediction Result

In [None]:
# model_lgb_multi_age = lgb.Booster(model_file=f"{UMDIR}/lgb_multi_age_20200511045531.model")

In [None]:
# model_lgb_multi_gender = lgb.Booster(model_file=f"{UMDIR}/lgb_multi_gender_20200511034408.model")

In [None]:
lgb.plot_importance(model_lgb_multi_gender, max_num_features=10)

In [None]:
lgb.plot_importance(model_lgb_multi_age, max_num_features=10)

In [56]:
res["predicted_age"] = predicted_age
# res["predicted_age"] = [list(x).index(max(x))+1 for x in model_lgb_multi_age.predict(test_feat, num_iteration=model_lgb_multi_age.best_iteration)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [57]:
res["predicted_gender"] = predicted_gender
# res["predicted_gender"] = [list(x).index(max(x))+1 for x in model_lgb_multi_gender.predict(test_feat, num_iteration=model_lgb_multi_gender.best_iteration)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [60]:
res_suffix = time.strftime("%Y%m%d%H%M%S", time.localtime(int(time.time())))
res.to_csv(f"{RESDIR}/res-{res_suffix}.csv", index=False)

In [58]:
res["predicted_age"].value_counts()

3     303724
2     164119
4     144075
5     142473
6     118908
7      54847
1      23404
8      22225
9      16292
10      9933
Name: predicted_age, dtype: int64

In [59]:
res["predicted_gender"].value_counts()

1    679154
2    320846
Name: predicted_gender, dtype: int64

In [None]:
res.shape

In [61]:
tmp = pd.read_csv(f"{RESDIR}/res-20200516155349.csv")

In [62]:
tmp["predicted_age"].value_counts()

3     305843
2     165322
4     141417
5     140297
6     116020
7      55586
1      24162
8      23163
9      17309
10     10881
Name: predicted_age, dtype: int64

In [63]:
tmp["predicted_gender"].value_counts()

1    678320
2    321680
Name: predicted_gender, dtype: int64

In [None]:
gc.collect()

# Cent result to COS

In [None]:
from ti import session
ti_session = session.Session()

In [None]:
inputs = ti_session.upload_data(path=f"{RESDIR}/res-20200515004850.csv", bucket="etveritas-1252104022", key_prefix=RESDIR)