## Prepare and read data

In [1]:
import gc
import itertools
from pathlib import Path
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD

In [2]:
pd.set_option("max_columns", 300)
pd.set_option("max_rows", 500)

In [3]:
def feature_cache(reset=False):
    def _feature_cache(func):
        def wrapper(train_df, test_df, *args):
            func_name = func.__name__
            train_feat_path = Path("../feature") / f"train_{func_name}.pkl"
            test_feat_path = Path("../feature") / f"test_{func_name}.pkl"
            # if feature exists, load feature
            if train_feat_path.exists() and test_feat_path.exists() and not reset:
                train_feats = pd.read_pickle(train_feat_path).reset_index(drop=True)
                test_feats = pd.read_pickle(test_feat_path).reset_index(drop=True)
                train_df = pd.concat([train_df, train_feats], axis=1)
                test_df = pd.concat([test_df, test_feats], axis=1)
            # if not exists, make feature and save as pickle
            else:
                before_cols = train_df.columns.tolist()
                train_df, test_df = func(train_df, test_df, *args)
                after_cols = train_df.columns.tolist()
                new_cols = [c for c in after_cols if c not in before_cols]
                train_feats = train_df[new_cols]
                test_feats = test_df[new_cols]
                train_feats.to_pickle(train_feat_path)
                test_feats.to_pickle(test_feat_path)            
            return train_df, test_df
        return wrapper

    return _feature_cache

## Preprocess

In [4]:
def preprocess(train_df, test_df):
    
    ###########################
    # Functions of preprocess
    ###########################
    def get_multi_cat_cols(train_df):
        tmp = train_df.iloc[:1000]
        multi_cols = []
        for c in train_df.columns:
            sep_num = tmp[c].astype(str).fillna("").str.contains(";").sum()
            if sep_num > 10:
                multi_cols.append(c)
        return multi_cols

    def add_rank_feature(df):
        rank_prefix_list = [
            "AssessBenefits",
            "AssessJob",
            "JobContactPriorities",
            "JobEmailPriorities",
            "AdsPriorities",
        ]
        for prefix in tqdm(rank_prefix_list):
            rank_cols = [c for c in df.columns if prefix in c]
            col_pairs = itertools.combinations(rank_cols, 2)
            for col_a, col_b in col_pairs:
                df[f"rank_diff_{prefix}_{col_a}_{col_b}"] = (df[col_a] - df[col_b]) / np.log2(df[[col_a, col_b]].max(axis=1))
        return df

    def get_basic_importance_cols(use_num=50):
        # basicなモデルのimportanceを読み込み
        importance_df = pd.read_csv("../data/importance/003_importance.csv")
        imp_feats = importance_df["feature"].iloc[:use_num].tolist()
        return imp_feats

    def make_agg_feature(train_df, test_df):
        imp_feats = get_basic_importance_cols(use_num=50)
        imp_cat_cols = [c for c in cat_cols if c in imp_feats] + non_basic_cat_cols
        imp_nume_cols = [c for c in nume_cols if c in imp_feats] + non_basic_nume_cols
        print(f"use cat col: {len(imp_cat_cols)}  nume col: {len(imp_nume_cols)}")
        #imp_nume_cols += [c for c in train_df.columns if c[:8] == "sum_answ"]
        all_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
        for cat_col in tqdm(imp_cat_cols):
            for nume_col in imp_nume_cols:
                # one-hotは同じカテゴリの場合がある
                if cat_col == nume_col:
                    continue
                all_df[f"agg_mean_{cat_col}_{nume_col}"] = \
                    all_df.groupby(cat_col)[nume_col].transform("mean").astype(np.float32)
                all_df[f"agg_std_{cat_col}_{nume_col}"] = \
                    all_df.groupby(cat_col)[nume_col].transform("std").astype(np.float32)
                all_df[f"agg_max_{cat_col}_{nume_col}"] = \
                    all_df.groupby(cat_col)[nume_col].transform("max").astype(np.float32)
                all_df[f"agg_min_{cat_col}_{nume_col}"] = \
                    all_df.groupby(cat_col)[nume_col].transform("min").astype(np.float32)
                all_df[f"diff_agg_mean_{cat_col}_{nume_col}"] = \
                    all_df[nume_col] - all_df[f"agg_mean_{cat_col}_{nume_col}"]
                all_df[f"rel_agg_mean_{cat_col}_{nume_col}"] = \
                    all_df[nume_col] / (1 + all_df[f"agg_mean_{cat_col}_{nume_col}"])
        train_df = all_df.iloc[:len(train_df)].reset_index(drop=True)
        test_df = all_df.iloc[len(train_df):].reset_index(drop=True)
        del all_df
        gc.collect()
        return train_df, test_df

    @feature_cache(reset=False)
    def target_encoding(train_df, test_df):
        te_cols = [c for c in train_df.columns if c in cat_cols]
        for c in tqdm(te_cols):
            new_col = "te_" + c
            train_df[new_col] = 0
            test_df[new_col] = 0
            for trn_idx, val_idx in fold_idx_list:
                mean_val = train_df.loc[trn_idx].groupby(c)["Salary"].mean().astype(np.float32)
                train_df.loc[val_idx, new_col] = train_df.loc[val_idx, c].map(mean_val)
                test_df.loc[:, new_col] += test_df.loc[:, c].map(mean_val) / len(fold_idx_list)
            train_df[new_col] = train_df[new_col].astype(np.float32)
            test_df[new_col] = test_df[new_col].astype(np.float32)
        return train_df, test_df

    @feature_cache(reset=False)
    def multiple_target_encoding(train_df, test_df):
         # multiple target encoding
        multi_te_cols = [c for c in train_df.columns if c in cat_cols or c[:4] == "ohe_"]
        imp_feats = get_basic_importance_cols(use_num=30)
        multi_te_cols = [c for c in multi_te_cols if c in imp_feats]
        combi_multi_te_cols = list(itertools.combinations(multi_te_cols, 2))   

        for col_a, col_b in tqdm(combi_multi_te_cols):
            new_col = "te_" + col_a + "__" + col_b
            train_df[new_col] = 0
            test_df[new_col] = 0
            train_df["tmp"] = train_df[col_a].fillna("").astype(str) + train_df[col_b].fillna("").astype(str)
            test_df["tmp"] = test_df[col_a].fillna("").astype(str) + test_df[col_b].fillna("").astype(str)
            for trn_idx, val_idx in fold_idx_list:
                mean_val = train_df.loc[trn_idx].groupby("tmp")["Salary"].mean().astype(np.float32)
                train_df.loc[val_idx, new_col] = train_df.loc[val_idx, "tmp"].map(mean_val)
                test_df.loc[:, new_col] += test_df.loc[:, "tmp"].map(mean_val) / len(fold_idx_list)
            train_df[new_col] = train_df[new_col].astype(np.float32)
            test_df[new_col] = test_df[new_col].astype(np.float32)
            del train_df["tmp"], test_df["tmp"]
            gc.collect()
        return train_df, test_df   
    
    ################################
    # Columns infomation
    ################################
    original_cols = train_df.columns.tolist()
    multi_cat_cols = get_multi_cat_cols(train_df)

    nume_cols = [
        c for c in list(np.setdiff1d(original_cols, multi_cat_cols))
        if c not in ["Salary", "No"] and "float" in train_df[c].dtype.name
    ]

    cat_cols = [c for c in train_df.columns if c not in multi_cat_cols + nume_cols + ["Salary", "No"]]

    non_basic_nume_cols = []
    non_basic_cat_cols = []

    ################################
    #  Make feature
    ################################    
    
    # rank feature
    train_df = add_rank_feature(train_df)
    test_df = add_rank_feature(test_df)

    # multi -category encoding 
    for c in tqdm(multi_cat_cols):
        binarizer = MultiLabelBinarizer()
        train_multi_srs = train_df[c].map(lambda x: x.split(";") if x is not np.nan else [])
        test_multi_srs = test_df[c].map(lambda x: x.split(";") if x is not np.nan else [])
        train_arr = binarizer.fit_transform(train_multi_srs)
        test_arr = binarizer.transform(test_multi_srs)
        feat_cols = [f"ohe_{c}_{val}" for val in binarizer.classes_]
        train_feat_df = pd.DataFrame(train_arr, columns=feat_cols, dtype=np.int8)
        test_feat_df = pd.DataFrame(test_arr, columns=feat_cols, dtype=np.int8)
        all_feat_df = pd.concat([train_feat_df, test_feat_df], axis=0, ignore_index=True)
        train_feat_df[f"sum_answer_{c}"] = (train_df[c].str.count(";") + 1).fillna(-1).astype(np.int8)
        test_feat_df[f"sum_answer_{c}"] = (test_df[c].str.count(";") + 1).fillna(-1).astype(np.int8)
        train_df = pd.concat([train_df, train_feat_df], axis=1)
        test_df = pd.concat([test_df, test_feat_df], axis=1)
        # ohe_featureはcategoryとnumerical両方として扱う
        nume_cols += feat_cols
        cat_cols += feat_cols
        # non_basic_nume_cols.append(f"sum_answer_{c}")
        # SVD
        svd = TruncatedSVD(n_components=2, random_state=2020)
        all_svd_feats = pd.DataFrame(svd.fit_transform(all_feat_df), columns=[f"svd_{c}_{ix}" for ix in range(2)])
        train_df = pd.concat([train_df, all_svd_feats.iloc[:len(train_df)]], axis=1)
        test_df = pd.concat([test_df, all_svd_feats.iloc[len(train_df):].reset_index(drop=True)], axis=1)
    
    # simple category encoding
    for c in cat_cols:
        train_df[c], uniques = pd.factorize(train_df[c], sort=True)
        test_df[c] = uniques.get_indexer(test_df[c])
    
    # reduce memory
    # numerical cols
    for c in nume_cols:
        if train_df[c].nunique() > 1000:
            train_df[c] = train_df[c].astype(np.float32)
            test_df[c] = test_df[c].astype(np.float32)
        else:
            train_df[c] = train_df[c].astype(np.float16)
            test_df[c] = test_df[c].astype(np.float16)
    # category cols
    for c in cat_cols:
        if train_df[c].max() > 32767:
            train_df[c] = train_df[c].astype(np.int32)
            test_df[c] = test_df[c].astype(np.int32)
        else:
            train_df[c] = train_df[c].astype(np.int16)
            test_df[c] = test_df[c].astype(np.int16)
    
    # change columns name
    train_df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in train_df.columns]
    test_df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in test_df.columns]
   
    # aggregate feature
    train_df, test_df = make_agg_feature(train_df, test_df)

    # make train/validation index list for target encoding
    folds = KFold(n_splits=5, random_state=2020, shuffle=True)
    fold_idx_list = [(trn_idx, val_idx) for trn_idx, val_idx in folds.split(train_df, train_df["Salary"])]
    
    # target encoding
    train_df, test_df = target_encoding(train_df, test_df)

    # multiple category target encoding 
    train_df, test_df = multiple_target_encoding(train_df, test_df)
    
    # make use columns list
    use_cols = [c for c in train_df.columns if c not in multi_cat_cols + ["Salary", "No"]]
    print(len(use_cols))
    
    return train_df, test_df, use_cols, fold_idx_list

## Training

In [5]:
def train(train_df, test_df, use_cols, fold_idx_list):

    lgb_params = {
                'objective': 'poisson',
                "metric": "rmse",
                "verbosity": -1,
                "boosting": "gbdt",
                'learning_rate': 0.05,
                'num_leaves': 64,
                'min_data_in_leaf': 80, 
                'max_depth': 4,
                "bagging_freq": 5,
                "bagging_fraction": 0.8,
                "lambda_l1": 0.5,
                "lambda_l2": 0.5,
                "feature_fraction": 0.1,
                "seed": 2020,
                "num_threads": -1,
                "max_bins": 30
    }
    def feature_selection(train_df, use_cols, n_features=1000):
        df = train_df.sample(30000, random_state=2020)
        train_dataset = lgb.Dataset(
            df.loc[:, use_cols],
            label = df.loc[:, "Salary"]
        )
        model = lgb.train(
                    lgb_params,
                    train_dataset,
                    2000,
                    valid_sets = [train_dataset],
                    verbose_eval=200,
                    early_stopping_rounds = None,
        )
        imp_df = pd.DataFrame()
        imp_df['feature'] = use_cols
        imp_df['gain'] = model.feature_importance(importance_type="gain")
        select_features = imp_df.sort_values(["gain"], ascending=False).iloc[: n_features]["feature"].tolist()
        return select_features
    
    def lgb_train(train_df, fold_idx_list, use_cols, feature_select=None, seed=None, *args, **kwargs):
        importances = pd.DataFrame()
        oof_preds = np.zeros(len(train_df))
        models = []
        if feature_select is not None and isinstance(feature_select, int):
            _use_cols = feature_selection(train_df, use_cols, n_features=feature_select)
        else:
            _use_cols = use_cols.copy()
        _lgb_params = lgb_params.copy()
        if seed is not None:
            _lgb_params["seed"] = seed

        for fold_i, (trn_idx, val_idx) in enumerate(fold_idx_list):
            print(f"Fold {fold_i+1}")
            train_dataset = lgb.Dataset(
                train_df.loc[trn_idx, _use_cols],
                label = train_df.loc[trn_idx, "Salary"]
            )
            valid_dataset = lgb.Dataset(
                train_df.loc[val_idx, _use_cols],
                label = train_df.loc[val_idx, "Salary"]
            )
            model = lgb.train(
                        _lgb_params,
                        train_dataset,
                        30000,
                        valid_sets = [train_dataset, valid_dataset],
                        verbose_eval=500,
                        early_stopping_rounds = 500,
                        #feval = eval_f1,
                        #callbacks = [log_callback],
            )
            imp_df = pd.DataFrame()
            imp_df['feature'] = _use_cols
            imp_df['gain'] = model.feature_importance(importance_type="gain")
            importances = pd.concat([importances, imp_df], axis=0, sort=False)
            # oof predict
            oof_preds[val_idx] = model.predict(train_df.loc[val_idx, _use_cols])
            models.append(model)

        oof_score = np.sqrt(mean_squared_error(train_df["Salary"], oof_preds))
        print(f"OOF Score: {oof_score}")
        return models, oof_preds, importances, oof_score, _use_cols
    
    training_list = [
        {
            "n_features": None,
            "seed": 2020,
            "weight": 0.25
        },
        {
            "n_features": 2000,
            "seed": 2021,
            "weight": 0.25
        },
        {
            "n_features": 1500,
            "seed": 2022,
            "weight": 0.25
        },
        {
            "n_features": 1000,
            "seed": 2023,
            "weight": 0.25
        }
    
    ]
    output_list = []
    for train_config in training_list: 
        _models, _oof_preds, _importances, _oof_score, _use_cols = \
            lgb_train(train_df, fold_idx_list, use_cols, **train_config)
        output_list.append(
            {
                "models": _models,
                "oof_preds": _oof_preds,
                "importances": _importances,
                "oof_score": _oof_score,
                "use_cols": _use_cols,
                "weight": train_config["weight"]
            }
        )
    # calc total oof score
    oof_preds = np.zeros(len(train_df))
    importance_list = []
    for output in output_list:
        oof_preds += output["oof_preds"] * output["weight"]
        importance_list.append(output["importances"])
    
    oof_score = np.sqrt(mean_squared_error(train_df["Salary"], oof_preds))
    print(f"Total OOF Score: {oof_score}")    
    
    display(
        pd.concat(importance_list, axis=0).groupby("feature")["gain"].mean().sort_values(ascending=False).reset_index().iloc[:50]
    )

    return train_df, test_df, output_list, oof_score

## Prediction

In [6]:
def predict(test_df, output_list, oof_score):
    test_pred = np.zeros(len(test_df))
    for output in tqdm(output_list):
        models = output["models"]
        use_cols = output["use_cols"]
        weight = output["weight"]
        for model in models:
            test_pred += (model.predict(test_df[use_cols]) / len(models)) * weight
    
    # make submit file
    sub_df = pd.read_csv("../input/submit.csv")
    sub_df["Salary"] = test_pred
    sub_df.to_csv(f"../predict/{model_title}_{oof_score}.csv", index=False)

## EndPoint

In [7]:
model_title = "039_seed_averaging"

In [8]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

In [9]:
train_df, test_df, use_cols, fold_idx_list = preprocess(train_df, test_df)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))


use cat col: 27  nume col: 10


HBox(children=(FloatProgress(value=0.0, max=27.0), HTML(value='')))


2495


In [10]:
train_df, test_df, output_list, oof_score = train(train_df, test_df, use_cols, fold_idx_list)

Fold 1
Training until validation scores don't improve for 500 rounds
[500]	training's rmse: 19249.2	valid_1's rmse: 20820.9
[1000]	training's rmse: 18010.4	valid_1's rmse: 20497
[1500]	training's rmse: 17070.6	valid_1's rmse: 20349.8
[2000]	training's rmse: 16288.7	valid_1's rmse: 20294.1
[2500]	training's rmse: 15569.3	valid_1's rmse: 20242.6
[3000]	training's rmse: 14908.8	valid_1's rmse: 20219.9
[3500]	training's rmse: 14277.9	valid_1's rmse: 20215
Early stopping, best iteration is:
[3386]	training's rmse: 14415.1	valid_1's rmse: 20206.8
Fold 2
Training until validation scores don't improve for 500 rounds
[500]	training's rmse: 19287.2	valid_1's rmse: 20823.7
[1000]	training's rmse: 18069	valid_1's rmse: 20486.1
[1500]	training's rmse: 17189.8	valid_1's rmse: 20344
[2000]	training's rmse: 16394	valid_1's rmse: 20243.8
[2500]	training's rmse: 15681.9	valid_1's rmse: 20207.5
[3000]	training's rmse: 15009.5	valid_1's rmse: 20194.3
[3500]	training's rmse: 14364.3	valid_1's rmse: 20177.7

Unnamed: 0,feature,gain
0,te_Country__Age,458500600.0
1,te_Country__YearsCodingProf,452014200.0
2,te_Country__Employment,344669000.0
3,te_Country__YearsCoding,344270800.0
4,te_Country__SalaryType,258862700.0
5,te_Country__ohe_DevType_Student,242594600.0
6,te_YearsCodingProf__CurrencySymbol,221436400.0
7,te_Country__FormalEducation,200276900.0
8,te_Country__CareerSatisfaction,181116400.0
9,te_SalaryType__CurrencySymbol,174391200.0


In [11]:
predict(test_df, output_list, oof_score)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


