In [1]:
# # if used pip install package
# !pip install xgboost
# !pip install lightgbm
# !pip install wget

In [2]:
import os
import sys
import gc
import wget
import time
import tarfile
import zipfile
import random

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import xgboost as xgb
import lightgbm as lgb

from joblib import Parallel, delayed

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
DDIR = "data"
UDDIR = "user_data"
UFEDIR = "user_data/feat_data_v01"
RESDIR = "prediction_result"

In [4]:
UID = "user_id"

# Load data (Only once)

In [5]:
# train_fname = wget.download("https://tesla-ap-shanghai-1256322946.cos.ap-shanghai.myqcloud.com/cephfs/tesla_common/deeplearning/dataset/algo_contest/train_preliminary.zip", out=DDIR)
# test_fname = wget.download("https://tesla-ap-shanghai-1256322946.cos.ap-shanghai.myqcloud.com/cephfs/tesla_common/deeplearning/dataset/algo_contest/test.zip", out=DDIR)

In [6]:
# def myunzip(filename):
#     zFile = zipfile.ZipFile(filename, "r")
#     for fileM in zFile.namelist(): 
#         zFile.extract(fileM, DDIR)
#         print(fileM)
#     zFile.close()

In [7]:
# myunzip(train_fname)
# myunzip(test_fname)

# Utils

In [10]:
def bch_rencol(values, prefix="", suffix=""):
    return list(map(lambda x: f"{prefix}"+"_".join(list(map(lambda y: str(y), x)))+f"{suffix}" 
                    if hasattr(x, "__iter__") and not isinstance(x, str) 
                    else f"{prefix}"+str(x)+f"{suffix}", values))

In [11]:
def mynunique(values):
    return values.nunique(dropna=False)

In [12]:
def getidxmax(x):
    return x.idxmax()[1]

In [14]:
OP_SET = ["max", "mean", "std", getidxmax, "nunique"]

# Data Exploration (todo)

In [15]:
# read train data
train_click_log = pd.read_csv(f"{DDIR}/train_preliminary/click_log.csv")
train_ad = pd.read_csv(f"{DDIR}/train_preliminary/ad.csv")
# tag
train_user = pd.read_csv(f"{DDIR}/train_preliminary/user.csv")

In [16]:
# read test data
test_click_log = pd.read_csv(f"{DDIR}/test/click_log.csv")
test_ad = pd.read_csv(f"{DDIR}/test/ad.csv")

In [17]:
# train_ad["product_id"] = train_ad["product_id"].replace({"\\N": np.nan})
# test_ad["product_id"] = test_ad["product_id"].replace({"\\N": np.nan})

In [28]:
train_click_log.shape

(30082771, 4)

In [29]:
train_ad.shape

(2481135, 6)

In [30]:
train_user.shape

(900000, 3)

In [31]:
test_click_log.shape

(33585512, 4)

In [32]:
test_ad.shape

(2618159, 6)

In [20]:
train_ad["product_id"] = train_ad["product_id"].replace("\\N", -1).astype(int)
train_ad["industry"] = train_ad["industry"].replace("\\N", -1).astype(int)

In [21]:
test_ad["product_id"] = test_ad["product_id"].replace("\\N", -1).astype(int)
test_ad["industry"] = test_ad["industry"].replace("\\N", -1).astype(int)

In [26]:
train_ad.count()

creative_id         2481135
ad_id               2481135
product_id          2481135
product_category    2481135
advertiser_id       2481135
industry            2481135
dtype: int64

In [27]:
test_ad.count()

creative_id         2618159
ad_id               2618159
product_id          2618159
product_category    2618159
advertiser_id       2618159
industry            2618159
dtype: int64

# Feature engineering

In [33]:
tol_train = pd.merge(train_click_log, train_ad, how="left", on="creative_id")
tol_test = pd.merge(test_click_log, test_ad, how="left", on="creative_id")

In [38]:
del train_click_log, train_ad
del test_click_log, test_ad

## User Click log (Order 1)

In [16]:
# for train
tmp = tol_train.groupby([UID], sort=False).agg(
    {
        UID: ["count"], 
        "click_times": ["sum", "max", "mean", "std"],
        "time": [mynunique, "mean", "max", "min"],
        "creative_id": [mynunique],
        "ad_id": [mynunique],
        "product_id": [mynunique],
        "product_category": [mynunique],
        "advertiser_id": [mynunique],
        "industry": [mynunique],
    }
)

In [17]:
tmp.columns = bch_rencol(tmp.columns)

In [18]:
tmp.to_pickle(f"{UFEDIR}/train_o1.pkl")
tmp = None

In [19]:
# for test
tmp = tol_test.groupby([UID], sort=False).agg(
    {
        UID: ["count"], 
        "click_times": ["sum", "max", "mean", "std"],
        "time": [mynunique, "mean", "max", "min"],
        "creative_id": [mynunique],
        "ad_id": [mynunique],
        "product_id": [mynunique],
        "product_category": [mynunique],
        "advertiser_id": [mynunique],
        "industry": [mynunique],
    }
)

In [20]:
tmp.columns = bch_rencol(tmp.columns)

In [21]:
tmp.to_pickle(f"{UFEDIR}/test_o1.pkl")
tmp = None

In [22]:
gc.collect()

11

## User Click log (Order 2)

In [23]:
# train
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    tmp = tol_train.groupby([UID, col], sort=False)[[col]].agg(["count"]).groupby([UID]).agg(OP_SET)
    tmp.columns = bch_rencol(tmp.columns)
    tmp.to_pickle(f"{UFEDIR}/train_o2_{col}.pkl")
    tmp = None

In [24]:
# test
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    tmp = tol_test.groupby([UID, col], sort=False)[[col]].agg(["count"]).groupby([UID]).agg(OP_SET)
    tmp.columns = bch_rencol(tmp.columns)
    tmp.to_pickle(f"{UFEDIR}/test_o2_{col}.pkl")
    tmp = None

In [25]:
gc.collect()

0

## One-Hot

In [26]:
# train
tmp = tol_train.groupby([UID, "product_category"], sort=False)[["product_category"]].agg(["count"]).unstack().fillna(0)
tmp.columns = bch_rencol(tmp.columns)

In [27]:
tmp.to_pickle(f"{UFEDIR}/train_onehot.pkl")
tmp = None

In [28]:
# test
tmp = tol_test.groupby([UID, "product_category"], sort=False)[["product_category"]].agg(["count"]).unstack().fillna(0)
tmp.columns = bch_rencol(tmp.columns)

In [29]:
tmp.to_pickle(f"{UFEDIR}/test_onehot.pkl")
tmp = None

In [30]:
gc.collect()

0

## Time Windows (Time Bins)

In [39]:
bins = 9
tol_bins = pd.cut(pd.concat([tol_train["time"], tol_test["time"]]), bins, labels=range(bins))

In [40]:
tol_train[f"bins{bins}"] = tol_bins[:len(tol_train)]
tol_test[f"bins{bins}"] = tol_bins[len(tol_train):]

### Order 1

In [None]:
# train
tmp = tol_train.groupby([UID, f"bins{bins}"], sort=False).agg(
    {
        UID: ["count"], 
        "click_times": ["sum", "max", "mean", "std"],
        "time": [mynunique, "mean", "max", "min"],
        "creative_id": [mynunique],
        "ad_id": [mynunique],
        "product_id": [mynunique],
        "product_category": [mynunique],
        "advertiser_id": [mynunique],
        "industry": [mynunique],
    }
).unstack()

In [None]:
tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_")

In [None]:
tmp.to_pickle(f"{UFEDIR}/train_bins{bins}_o1.pkl")
tmp = None

In [None]:
# test
tmp = tol_test.groupby([UID, f"bins{bins}"], sort=False).agg(
    {
        UID: ["count"], 
        "click_times": ["sum", "max", "mean", "std"],
        "time": [mynunique, "mean", "max", "min"],
        "creative_id": [mynunique],
        "ad_id": [mynunique],
        "product_id": [mynunique],
        "product_category": [mynunique],
        "advertiser_id": [mynunique],
        "industry": [mynunique],
    }
).unstack()

In [None]:
tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_")

In [None]:
tmp.to_pickle(f"{UFEDIR}/test_bins{bins}_o1.pkl")
tmp = None

In [None]:
gc.collect()

### Order 2

In [None]:
# train
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    tmp = tol_train.groupby([UID, f"bins{bins}", col], sort=False)[[col]].agg(["count"]).groupby([UID, f"bins{bins}"]).agg(OP_SET).unstack()
    tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_")
    tmp.to_pickle(f"{UFEDIR}/train_bins{bins}_o2_{col}.pkl")
    tmp = None

In [None]:
# test
for col in ["creative_id", "ad_id", "product_id", "product_category", "advertiser_id", "industry"]:
    tmp = tol_test.groupby([UID, f"bins{bins}", col], sort=False)[[col]].agg(["count"]).groupby([UID, f"bins{bins}"]).agg(OP_SET).unstack()
    tmp.columns = bch_rencol(tmp.columns, prefix=f"bins{bins}_")
    tmp.to_pickle(f"{UFEDIR}/test_bins{bins}_o2_{col}.pkl")
    tmp = None

In [None]:
gc.collect()

## Conact

In [None]:
train_feat = pd.DataFrame()
test_feat = pd.DataFrame()
train_feat[UID] = train_user[UID]
test_feat[UID] = np.sort(test_click_log[UID].unique())

In [None]:
feat_fname = os.listdir(UFEDIR)

In [None]:
for fname in feat_fname:
    if fname.startswith("train_"):
        train_feat = pd.merge(train_feat, pd.read_pickle(f"{UFEDIR}/{fname}"), how="left", on=UID)
    elif fname.startswith("test_"):
        test_feat = pd.merge(test_feat, pd.read_pickle(f"{UFEDIR}/{fname}"), how="left", on=UID)

In [None]:
# to make sure feat and user(target) have same order
train_all = pd.merge(train_feat, train_user, how="left", on=UID)
train_tag = train_all[[UID, "age", "gender"]]
train_feat = train_all.drop(["age", "gender"], axis=1)

# Training&Prediction

In [120]:
# split data
train_feat_tr, train_feat_val, train_tag_tr, train_tag_val = train_test_split(train_feat, train_tag, test_size=0.2)

## For Age

In [121]:
lgbds_train_tr_age = lgb.Dataset(train_feat_tr.drop(UID, axis=1), train_tag_tr["age"]-1)
lgbds_train_val_age = lgb.Dataset(train_feat_val.drop(UID, axis=1), train_tag_val["age"]-1)

In [122]:
params_age = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 10,
    "metric": "multi_error",
    "learning_rate": 0.1,
}

In [123]:
model_lgb_multi_age = lgb.train(params_age, lgbds_train_tr_age, num_boost_round=1000, valid_sets=[lgbds_train_val_age], verbose_eval=50, early_stopping_rounds=100)

Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.757944
[100]	valid_0's multi_error: 0.752339


KeyboardInterrupt: 

In [None]:
# offline

In [None]:
train_val_age_prob = model_lgb_multi_age.predict(train_feat_val, num_iteration=model_lgb_multi_age.best_iteration)

In [None]:
train_val_age_pred = [list(x).index(max(x))+1 for x in train_val_age_prob]

In [None]:
age_acy = accuracy_score(train_val_age_pred, train_tag_val["age"])

In [None]:
age_acy = pd.merge(train_val)

## For Gender

In [124]:
lgbds_train_tr_gender = lgb.Dataset(train_feat_tr.drop(UID, axis=1), train_tag_tr["gender"]-1)
lgbds_train_val_gender = lgb.Dataset(train_feat_val.drop(UID, axis=1), train_tag_val["gender"]-1)

In [125]:
params_gender = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 2,
    "metric": "multi_error",
    "learning_rate": 0.1,
}

In [126]:
model_lgb_multi_gender = lgb.train(params_gender, lgbds_train_tr_gender, num_boost_round=1000, valid_sets=[lgbds_train_val_gender], verbose_eval=50, early_stopping_rounds=100)

Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_error: 0.278417
[100]	valid_0's multi_error: 0.264978
[150]	valid_0's multi_error: 0.259583
[200]	valid_0's multi_error: 0.257
[250]	valid_0's multi_error: 0.255411
[300]	valid_0's multi_error: 0.254822
[350]	valid_0's multi_error: 0.253806
[400]	valid_0's multi_error: 0.253461
[450]	valid_0's multi_error: 0.252733
[500]	valid_0's multi_error: 0.251978
[550]	valid_0's multi_error: 0.251672
[600]	valid_0's multi_error: 0.2517
[650]	valid_0's multi_error: 0.251228
[700]	valid_0's multi_error: 0.250956
[750]	valid_0's multi_error: 0.250778
[800]	valid_0's multi_error: 0.250667
[850]	valid_0's multi_error: 0.25045
[900]	valid_0's multi_error: 0.250167
[950]	valid_0's multi_error: 0.250178
[1000]	valid_0's multi_error: 0.250128
Did not meet early stopping. Best iteration is:
[967]	valid_0's multi_error: 0.250072


In [None]:
# offline

In [129]:
train_val_gender_prob = model_lgb_multi_gender.predict(train_feat_val.drop(UID, axis=1), num_iteration=model_lgb_multi_gender.best_iteration)

In [130]:
train_val_gender_pred = [list(x).index(max(x))+1 for x in train_val_gender_prob]

In [131]:
gender_acy = accuracy_score(train_val_gender_pred, train_tag_val["gender"])

0.7499277777777777

In [None]:
# total accuracy
age_acy + gender_acy

# Generate Prediction Result

In [88]:
res = test_feat[[UID]]

In [90]:
res["predicted_age"] = [list(x).index(max(x))+1 for x in model_lgb_multi_age.predict(test_feat.drop("user_id", axis=1), num_iteration=model_lgb_multi_age.best_iteration)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [91]:
res["predicted_gender"] = [list(x).index(max(x))+1 for x in model_lgb_multi_gender.predict(test_feat.drop("user_id", axis=1), num_iteration=model_lgb_multi_gender.best_iteration)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [92]:
res_suffix = time.strftime("%Y%m%d%H%M%S", time.localtime(int(time.time())))
res.to_csv(f"{RESDIR}/res-{res_suffix}.csv", index=False)

# Cent result to COS

In [93]:
from ti import session
ti_session = session.Session()

In [94]:
inputs = ti_session.upload_data(path=f"{RESDIR}/res-20200509035457.csv", bucket="etveritas-1252104022", key_prefix=RESDIR)