## Prepare and read data

In [1]:
import gc
import itertools
from pathlib import Path
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD

In [2]:
pd.set_option("max_columns", 300)
pd.set_option("max_rows", 500)

In [3]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

In [4]:
def feature_cache(reset=False):
    def _feature_cache(func):
        def wrapper(train_df, test_df, *args):
            func_name = func.__name__
            train_feat_path = Path("../feature") / f"train_{func_name}.pkl"
            test_feat_path = Path("../feature") / f"test_{func_name}.pkl"
            # if feature exists, load feature
            if train_feat_path.exists() and test_feat_path.exists() and not reset:
                train_feats = pd.read_pickle(train_feat_path).reset_index(drop=True)
                test_feats = pd.read_pickle(test_feat_path).reset_index(drop=True)
                train_df = pd.concat([train_df, train_feats], axis=1)
                test_df = pd.concat([test_df, test_feats], axis=1)
            # if not exists, make feature and save as pickle
            else:
                before_cols = train_df.columns.tolist()
                train_df, test_df = func(train_df, test_df, *args)
                after_cols = train_df.columns.tolist()
                new_cols = [c for c in after_cols if c not in before_cols]
                train_feats = train_df[new_cols]
                test_feats = test_df[new_cols]
                train_feats.to_pickle(train_feat_path)
                test_feats.to_pickle(test_feat_path)            
            return train_df, test_df
        return wrapper

    return _feature_cache

## Feature Engineering

In [5]:
def get_multi_cat_cols(train_df):
    tmp = train_df.iloc[:1000]
    multi_cols = []
    for c in train_df.columns:
        sep_num = tmp[c].astype(str).fillna("").str.contains(";").sum()
        if sep_num > 10:
            multi_cols.append(c)
    return multi_cols

In [6]:
original_cols = train_df.columns.tolist()

In [7]:
multi_cat_cols = get_multi_cat_cols(train_df)

In [8]:
nume_cols = [
    c for c in list(np.setdiff1d(original_cols, multi_cat_cols))
    if c not in ["Salary", "No"] and "float" in train_df[c].dtype.name
]

In [9]:
cat_cols = [c for c in train_df.columns if c not in multi_cat_cols + nume_cols + ["Salary", "No"]]

In [10]:
non_basic_nume_cols = []
non_basic_cat_cols = []

In [11]:
len(original_cols), len(cat_cols), len(nume_cols), len(multi_cat_cols)

(128, 65, 40, 21)

In [12]:
def hand_feature(df):
    df["ohe_main_cluster"] = 1
    df.loc[(df.DevType.fillna("").str.contains("Student"))
                | (df.SalaryType != "Yearly")
                | (df["Age"] == "Under 18 years old")
                | (df["CurrencySymbol"] == "INR"),
           "ohe_main_cluster"
    ] = 0
    return df

In [13]:
train_df = hand_feature(train_df)
test_df = hand_feature(test_df)

### rank feature

In [14]:
def add_rank_feature(df):
    rank_prefix_list = [
        "AssessBenefits",
        "AssessJob",
        "JobContactPriorities",
        "JobEmailPriorities",
        "AdsPriorities",
    ]
    for prefix in tqdm(rank_prefix_list):
        rank_cols = [c for c in df.columns if prefix in c]
        col_pairs = itertools.combinations(rank_cols, 2)
        for col_a, col_b in col_pairs:
            df[f"rank_diff_{prefix}_{col_a}_{col_b}"] = (df[col_a] - df[col_b]) / np.log2(df[[col_a, col_b]].max(axis=1))
    return df

In [15]:
train_df = add_rank_feature(train_df)
test_df = add_rank_feature(test_df)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




### Multi-category encoding

In [16]:
for c in tqdm(multi_cat_cols):
    binarizer = MultiLabelBinarizer()
    train_multi_srs = train_df[c].map(lambda x: x.split(";") if x is not np.nan else [])
    test_multi_srs = test_df[c].map(lambda x: x.split(";") if x is not np.nan else [])
    train_arr = binarizer.fit_transform(train_multi_srs)
    test_arr = binarizer.transform(test_multi_srs)
    feat_cols = [f"ohe_{c}_{val}" for val in binarizer.classes_]
    train_feat_df = pd.DataFrame(train_arr, columns=feat_cols, dtype=np.int8)
    test_feat_df = pd.DataFrame(test_arr, columns=feat_cols, dtype=np.int8)
    all_feat_df = pd.concat([train_feat_df, test_feat_df], axis=0, ignore_index=True)
    train_feat_df[f"sum_answer_{c}"] = (train_df[c].str.count(";") + 1).fillna(-1).astype(np.int8)
    test_feat_df[f"sum_answer_{c}"] = (test_df[c].str.count(";") + 1).fillna(-1).astype(np.int8)
    train_df = pd.concat([train_df, train_feat_df], axis=1)
    test_df = pd.concat([test_df, test_feat_df], axis=1)
    # ohe_featureはcategoryとnumerical両方として扱う
    nume_cols += feat_cols
    cat_cols += feat_cols
    # non_basic_nume_cols.append(f"sum_answer_{c}")
    # SVD
    svd = TruncatedSVD(n_components=2, random_state=2020)
    all_svd_feats = pd.DataFrame(svd.fit_transform(all_feat_df), columns=[f"svd_{c}_{ix}" for ix in range(2)])
    train_df = pd.concat([train_df, all_svd_feats.iloc[:len(train_df)]], axis=1)
    test_df = pd.concat([test_df, all_svd_feats.iloc[len(train_df):].reset_index(drop=True)], axis=1)

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))




### Simple-category encoding

In [17]:
for c in cat_cols:
    train_df[c], uniques = pd.factorize(train_df[c], sort=True)
    test_df[c] = uniques.get_indexer(test_df[c])

### Reduce data memory

In [18]:
for c in nume_cols:
    if train_df[c].nunique() > 1000:
        train_df[c] = train_df[c].astype(np.float32)
        test_df[c] = test_df[c].astype(np.float32)
    else:
        train_df[c] = train_df[c].astype(np.float16)
        test_df[c] = test_df[c].astype(np.float16)

In [19]:
for c in cat_cols:
    if train_df[c].max() > 32767:
        train_df[c] = train_df[c].astype(np.int32)
        test_df[c] = test_df[c].astype(np.int32)
    else:
        train_df[c] = train_df[c].astype(np.int16)
        test_df[c] = test_df[c].astype(np.int16)            

In [20]:
train_df.shape, test_df.shape

((33857, 660), (11259, 659))

### Change Category Name

In [21]:
train_df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in train_df.columns]
test_df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in test_df.columns]

### Aggregation

In [22]:
def get_basic_importance_cols(use_num=50):
    # basicなモデルのimportanceを読み込み
    importance_df = pd.read_csv("../data/importance/003_importance.csv")
    imp_feats = importance_df["feature"].iloc[:use_num].tolist()
    return imp_feats

In [23]:
def make_agg_feature(train_df, test_df):
    imp_feats = get_basic_importance_cols(use_num=50)
    imp_cat_cols = [c for c in cat_cols if c in imp_feats] + non_basic_cat_cols
    imp_nume_cols = [c for c in nume_cols if c in imp_feats] + non_basic_nume_cols
    print(f"use cat col: {len(imp_cat_cols)}  nume col: {len(imp_nume_cols)}")
    #imp_nume_cols += [c for c in train_df.columns if c[:8] == "sum_answ"]
    all_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
    for cat_col in tqdm(imp_cat_cols):
        for nume_col in imp_nume_cols:
            # one-hotは同じカテゴリの場合がある
            if cat_col == nume_col:
                continue
            all_df[f"agg_mean_{cat_col}_{nume_col}"] = \
                all_df.groupby(cat_col)[nume_col].transform("mean").astype(np.float32)
            all_df[f"agg_std_{cat_col}_{nume_col}"] = \
                all_df.groupby(cat_col)[nume_col].transform("std").astype(np.float32)
            all_df[f"agg_max_{cat_col}_{nume_col}"] = \
                all_df.groupby(cat_col)[nume_col].transform("max").astype(np.float32)
            all_df[f"agg_min_{cat_col}_{nume_col}"] = \
                all_df.groupby(cat_col)[nume_col].transform("min").astype(np.float32)
            all_df[f"diff_agg_mean_{cat_col}_{nume_col}"] = \
                all_df[nume_col] - all_df[f"agg_mean_{cat_col}_{nume_col}"]
            all_df[f"rel_agg_mean_{cat_col}_{nume_col}"] = \
                all_df[nume_col] / (1 + all_df[f"agg_mean_{cat_col}_{nume_col}"])
    train_df = all_df.iloc[:len(train_df)].reset_index(drop=True)
    test_df = all_df.iloc[len(train_df):].reset_index(drop=True)
    del all_df
    gc.collect()
    return train_df, test_df

In [24]:
train_df, test_df = make_agg_feature(train_df, test_df)

use cat col: 27  nume col: 10


HBox(children=(FloatProgress(value=0.0, max=27.0), HTML(value='')))




### Make fold index(for target encoding)

In [25]:
folds = KFold(n_splits=5, random_state=2020, shuffle=True)
fold_idx_list = [(trn_idx, val_idx) for trn_idx, val_idx in folds.split(train_df, train_df["Salary"])]

### Target Encoding

In [26]:
@feature_cache(reset=False)
def target_encoding(train_df, test_df):
    te_cols = [c for c in train_df.columns if c in cat_cols]
    for c in tqdm(te_cols):
        new_col = "te_" + c
        train_df[new_col] = 0
        test_df[new_col] = 0
        for trn_idx, val_idx in fold_idx_list:
            mean_val = train_df.loc[trn_idx].groupby(c)["Salary"].mean().astype(np.float32)
            train_df.loc[val_idx, new_col] = train_df.loc[val_idx, c].map(mean_val)
            test_df.loc[:, new_col] += test_df.loc[:, c].map(mean_val) / len(fold_idx_list)
        train_df[new_col] = train_df[new_col].astype(np.float32)
        test_df[new_col] = test_df[new_col].astype(np.float32)
    return train_df, test_df

In [27]:
train_df, test_df = target_encoding(train_df, test_df)

In [28]:
@feature_cache(reset=False)
def multiple_target_encoding(train_df, test_df):
     # multiple target encoding
    multi_te_cols = [c for c in train_df.columns if c in cat_cols or c[:4] == "ohe_"]
    imp_feats = get_basic_importance_cols(use_num=30)
    multi_te_cols = [c for c in multi_te_cols if c in imp_feats]
    combi_multi_te_cols = list(itertools.combinations(multi_te_cols, 2))   

    for col_a, col_b in tqdm(combi_multi_te_cols):
        new_col = "te_" + col_a + "__" + col_b
        train_df[new_col] = 0
        test_df[new_col] = 0
        train_df["tmp"] = train_df[col_a].fillna("").astype(str) + train_df[col_b].fillna("").astype(str)
        test_df["tmp"] = test_df[col_a].fillna("").astype(str) + test_df[col_b].fillna("").astype(str)
        for trn_idx, val_idx in fold_idx_list:
            mean_val = train_df.loc[trn_idx].groupby("tmp")["Salary"].mean().astype(np.float32)
            train_df.loc[val_idx, new_col] = train_df.loc[val_idx, "tmp"].map(mean_val)
            test_df.loc[:, new_col] += test_df.loc[:, "tmp"].map(mean_val) / len(fold_idx_list)
        train_df[new_col] = train_df[new_col].astype(np.float32)
        test_df[new_col] = test_df[new_col].astype(np.float32)
        del train_df["tmp"], test_df["tmp"]
        gc.collect()
    return train_df, test_df

In [29]:
train_df, test_df = multiple_target_encoding(train_df, test_df)

In [30]:
use_cols = [c for c in train_df.columns if c not in multi_cat_cols + ["Salary", "No", "ohe_main_cluster"]]
print(len(use_cols))

2495


## Training

In [31]:
lgb_params = {
            'objective': 'poisson',
            "metric": "rmse",
            "verbosity": -1,
            "boosting": "gbdt",
            'learning_rate': 0.05,
            'num_leaves': 64,
            'min_data_in_leaf': 80, 
            'max_depth': 4,
            "bagging_freq": 5,
            "bagging_fraction": 0.8,
            "lambda_l1": 0.5,
            "lambda_l2": 0.5,
            "feature_fraction": 0.1,
            "seed": 2020,
            "num_threads": -1,
            "max_bins": 30
}

In [32]:
def feature_selection(train_df, use_cols, n_features=1000):
    df = train_df.sample(30000, random_state=2020)
    train_dataset = lgb.Dataset(
        df.loc[:, use_cols],
        label = df.loc[:, "Salary"]
    )
    model = lgb.train(
                lgb_params,
                train_dataset,
                2000,
                valid_sets = [train_dataset],
                verbose_eval=200,
                early_stopping_rounds = None,
    )
    imp_df = pd.DataFrame()
    imp_df['feature'] = use_cols
    imp_df['gain'] = model.feature_importance(importance_type="gain")
    select_features = imp_df.sort_values(["gain"], ascending=False).iloc[: n_features]["feature"].tolist()
    return select_features

In [33]:
importances = pd.DataFrame()
oof_preds = np.zeros(len(train_df))
models1 = []

# use_cols = feature_selection(train_df, use_cols, n_features=1000)

for fold_i, (trn_idx, val_idx) in enumerate(fold_idx_list):
    print(f"Fold {fold_i+1}")
    train_dataset = lgb.Dataset(
        train_df.loc[trn_idx, use_cols],
        label = train_df.loc[trn_idx, "Salary"]
    )
    valid_dataset = lgb.Dataset(
        train_df.loc[val_idx, use_cols],
        label = train_df.loc[val_idx, "Salary"]
    )
    model = lgb.train(
                lgb_params,
                train_dataset,
                10000,
                valid_sets = [train_dataset, valid_dataset],
                verbose_eval=100,
                early_stopping_rounds = 300,
                #feval = eval_f1,
                #callbacks = [log_callback],
    )
    imp_df = pd.DataFrame()
    imp_df['feature'] = use_cols
    imp_df['gain'] = model.feature_importance(importance_type="gain")
    importances = pd.concat([importances, imp_df], axis=0, sort=False)
    
    oof_preds[val_idx] = model.predict(train_df.loc[val_idx, use_cols])
    models1.append(model)

Fold 1
Training until validation scores don't improve for 300 rounds
[100]	training's rmse: 22677.8	valid_1's rmse: 23007.5
[200]	training's rmse: 20867.3	valid_1's rmse: 21623.9
[300]	training's rmse: 20103.6	valid_1's rmse: 21190.2
[400]	training's rmse: 19618.8	valid_1's rmse: 20961.4
[500]	training's rmse: 19249.2	valid_1's rmse: 20820.9
[600]	training's rmse: 18941.4	valid_1's rmse: 20714.1
[700]	training's rmse: 18677.8	valid_1's rmse: 20657.4
[800]	training's rmse: 18438.6	valid_1's rmse: 20595.4
[900]	training's rmse: 18223.4	valid_1's rmse: 20539.8
[1000]	training's rmse: 18010.4	valid_1's rmse: 20497
[1100]	training's rmse: 17821.3	valid_1's rmse: 20457.8
[1200]	training's rmse: 17633.1	valid_1's rmse: 20418.4
[1300]	training's rmse: 17416.3	valid_1's rmse: 20393.9
[1400]	training's rmse: 17250.4	valid_1's rmse: 20368.4
[1500]	training's rmse: 17070.6	valid_1's rmse: 20349.8
[1600]	training's rmse: 16914.7	valid_1's rmse: 20334.7
[1700]	training's rmse: 16746.5	valid_1's rmse

In [34]:
oof_score = np.sqrt(mean_squared_error(train_df["Salary"], oof_preds))
oof_score

20186.5261848146

In [35]:
importances.groupby("feature")["gain"].mean().sort_values(ascending=False).reset_index().iloc[:30]

Unnamed: 0,feature,gain
0,te_Country__SalaryType,457375600.0
1,te_Country__Employment,417612500.0
2,te_Country__Age,322568700.0
3,te_Country__YearsCodingProf,312760200.0
4,te_Country__YearsCoding,295880600.0
5,te_Country__ohe_DevType_Student,283877600.0
6,te_YearsCodingProf__CurrencySymbol,203580200.0
7,te_Country__ohe_Methodology_Agile,197836400.0
8,te_Country__FormalEducation,186664800.0
9,te_SalaryType__CurrencySymbol,168539700.0


In [36]:
len(use_cols)

2495

In [37]:
train_df["residual_error"] = oof_preds - train_df["Salary"]

In [51]:
def lgb_train2(train_df, use_cols, fold_idx_list, data_type="main"):

    importances2 = pd.DataFrame()
    oof_preds2 = np.zeros(len(train_df))
    models2 = []
    
    _lgb_params = lgb_params.copy()
    _lgb_params["max_depth"] = -1
    _lgb_params["num_leaves"] = 128
    _lgb_params["learning_rate"] = 0.001
    _lgb_params["min_data_in_leaf"] = 10
    _lgb_params["objective"] = "regression"

    for fold_i, (trn_idx, val_idx) in enumerate(fold_idx_list):
        print(f"Fold {fold_i+1}")
        train_data = train_df.loc[trn_idx]
        valid_data = train_df.loc[val_idx]
        if data_type == "main":
            train_data = train_data[train_data.ohe_main_cluster == 1]
            valid_data = valid_data[valid_data.ohe_main_cluster == 1]
        elif data_type == "not_main":
            train_data = train_data[train_data.ohe_main_cluster == 0]
            valid_data = valid_data[valid_data.ohe_main_cluster == 0]
        train_dataset = lgb.Dataset(
            train_data[use_cols],
            label = train_data["residual_error"]
        )
        valid_dataset = lgb.Dataset(
            valid_data[use_cols],
            label = valid_data["residual_error"]
        )
        model = lgb.train(
                    _lgb_params,
                    train_dataset,
                    3000,
                    valid_sets = [train_dataset, valid_dataset],
                    verbose_eval=100,
                    early_stopping_rounds = 300,
        )
        imp_df = pd.DataFrame()
        imp_df['feature'] = use_cols
        imp_df['gain'] = model.feature_importance(importance_type="gain")
        importances2 = pd.concat([importances2, imp_df], axis=0, sort=False)
        models2.append(model)
    return models2, oof_preds2, importances2

In [52]:
models2, oof_preds2, importances2 = lgb_train2(train_df, use_cols, fold_idx_list, data_type="main")

Fold 1
Training until validation scores don't improve for 300 rounds
[100]	training's rmse: 22243.4	valid_1's rmse: 22304.9
[200]	training's rmse: 21929.1	valid_1's rmse: 22321.2
[300]	training's rmse: 21623.4	valid_1's rmse: 22332.8
Early stopping, best iteration is:
[29]	training's rmse: 22470.1	valid_1's rmse: 22297.5
Fold 2
Training until validation scores don't improve for 300 rounds
[100]	training's rmse: 22217.2	valid_1's rmse: 22432.1
[200]	training's rmse: 21910.5	valid_1's rmse: 22438.5
[300]	training's rmse: 21611.2	valid_1's rmse: 22446.8
Early stopping, best iteration is:
[3]	training's rmse: 22523.2	valid_1's rmse: 22423.1
Fold 3
Training until validation scores don't improve for 300 rounds
[100]	training's rmse: 22094.2	valid_1's rmse: 22930.4
[200]	training's rmse: 21790.9	valid_1's rmse: 22935.8
[300]	training's rmse: 21492.2	valid_1's rmse: 22944.5
Early stopping, best iteration is:
[6]	training's rmse: 22389.8	valid_1's rmse: 22918.8
Fold 4
Training until validation 

In [53]:
models2, oof_preds2, importances2 = lgb_train2(train_df, use_cols, fold_idx_list, data_type="not_main")

Fold 1
Training until validation scores don't improve for 300 rounds
[100]	training's rmse: 18148.7	valid_1's rmse: 18686.7
[200]	training's rmse: 17911.2	valid_1's rmse: 18691.3
[300]	training's rmse: 17682.5	valid_1's rmse: 18694.1
Early stopping, best iteration is:
[1]	training's rmse: 18389.3	valid_1's rmse: 18681.4
Fold 2
Training until validation scores don't improve for 300 rounds
[100]	training's rmse: 18214.8	valid_1's rmse: 18447.2
[200]	training's rmse: 17981.3	valid_1's rmse: 18457.1
[300]	training's rmse: 17756.8	valid_1's rmse: 18468.3
Early stopping, best iteration is:
[1]	training's rmse: 18451.8	valid_1's rmse: 18436.2
Fold 3
Training until validation scores don't improve for 300 rounds
[100]	training's rmse: 18174.5	valid_1's rmse: 18606.6
[200]	training's rmse: 17943.1	valid_1's rmse: 18610.2
[300]	training's rmse: 17721.2	valid_1's rmse: 18612.4
Early stopping, best iteration is:
[7]	training's rmse: 18395.1	valid_1's rmse: 18602.5
Fold 4
Training until validation s

KeyboardInterrupt: 

## Prediction

In [35]:
test_pred = np.zeros(len(test_df))

for model in models:
    test_pred += model.predict(test_df[use_cols]) / len(models)

In [36]:
test_pred

array([ 67772.21736985,  91906.65505641,  89335.72472625, ...,
        87460.2147288 ,  81660.45977467, 117499.75140813])

In [37]:
sub_df = pd.read_csv("../input/submit.csv")

In [38]:
sub_df["Salary"] = test_pred

In [40]:
# sub_df.to_csv("../predict/024_col_shuffle_20186.csv", index=False)