## Prepare and read data

In [1]:
import gc
import itertools
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error

In [2]:
pd.set_option("max_columns", 300)
pd.set_option("max_rows", 500)

In [3]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

## Feature Engineering

In [4]:
def get_multi_cat_cols(train_df):
    tmp = train_df.iloc[:1000]
    multi_cols = []
    for c in train_df.columns:
        sep_num = tmp[c].astype(str).fillna("").str.contains(";").sum()
        if sep_num > 10:
            multi_cols.append(c)
    return multi_cols

In [5]:
original_cols = train_df.columns.tolist()

In [6]:
multi_cat_cols = get_multi_cat_cols(train_df)

In [7]:
for c in tqdm(multi_cat_cols):
    binarizer = MultiLabelBinarizer()
    train_multi_srs = train_df[c].map(lambda x: x.split(";") if x is not np.nan else [])
    test_multi_srs = test_df[c].map(lambda x: x.split(";") if x is not np.nan else [])
    train_arr = binarizer.fit_transform(train_multi_srs)
    test_arr = binarizer.transform(test_multi_srs)
    feat_cols = [f"ohe_{c}_{val}" for val in binarizer.classes_]
    train_feat_df = pd.DataFrame(train_arr, columns=feat_cols, dtype=np.int8)
    test_feat_df = pd.DataFrame(test_arr, columns=feat_cols, dtype=np.int8)
    train_feat_df[f"sum_answer_{c}"] = (train_df[c].str.count(";") + 1).fillna(-1).astype(np.int8)
    test_feat_df[f"sum_answer_{c}"] = (test_df[c].str.count(";") + 1).fillna(-1).astype(np.int8)
    train_df = pd.concat([train_df, train_feat_df], axis=1)
    test_df = pd.concat([test_df, test_feat_df], axis=1)

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))




In [8]:
other_cols = [
    c for c in list(np.setdiff1d(original_cols, multi_cat_cols)) if c not in ["Salary", "No"]
]

In [9]:
cat_cols = []
for c in tqdm(other_cols):
    if "float" in train_df[c].dtype.name:
        if train_df[c].nunique() > 1000:
            train_df[c] = train_df[c].astype(np.float32)
            test_df[c] = test_df[c].astype(np.float32)
        else:
            train_df[c] = train_df[c].astype(np.float16)
            test_df[c] = test_df[c].astype(np.float16)
    else:
        train_df[c], uniques = pd.factorize(train_df[c], sort=True)
        test_df[c] = uniques.get_indexer(test_df[c])
        if train_df[c].max() > 32767:
            train_df[c] = train_df[c].astype(np.int32)
            test_df[c] = test_df[c].astype(np.int32)
        else:
            train_df[c] = train_df[c].astype(np.int16)
            test_df[c] = test_df[c].astype(np.int16)            
        cat_cols.append(c)

HBox(children=(FloatProgress(value=0.0, max=105.0), HTML(value='')))




In [10]:
train_df.shape, test_df.shape

((33857, 465), (11259, 464))

In [11]:
train_df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in train_df.columns]
test_df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in test_df.columns]

In [12]:
# basicなモデルのimportanceを読み込み
importance_df = pd.read_csv("../data/importance/003_importance.csv")

In [13]:
imp_feats = importance_df["feature"].iloc[:50].tolist()
imp_cat_cols = [c for c in imp_feats if c in cat_cols or c[:4] == "ohe_"]
imp_nume_cols = [c for c in imp_feats if c not in cat_cols and c not in multi_cat_cols]

In [14]:
len(imp_cat_cols), len(imp_nume_cols)

(32, 15)

In [15]:
all_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
for cat_col in tqdm(imp_cat_cols):
    for nume_col in imp_nume_cols:
        all_df[f"agg_mean_{cat_col}_{nume_col}"] = \
            all_df.groupby(cat_col)[nume_col].transform("mean").astype(np.float32)
        all_df[f"agg_std_{cat_col}_{nume_col}"] = \
            all_df.groupby(cat_col)[nume_col].transform("std").astype(np.float32)
        all_df[f"agg_max_{cat_col}_{nume_col}"] = \
            all_df.groupby(cat_col)[nume_col].transform("max").astype(np.float32)
        all_df[f"agg_min_{cat_col}_{nume_col}"] = \
            all_df.groupby(cat_col)[nume_col].transform("min").astype(np.float32)
        all_df[f"diff_agg_mean_{cat_col}_{nume_col}"] = \
            all_df[nume_col] - all_df[f"agg_mean_{cat_col}_{nume_col}"]
        all_df[f"rel_agg_mean_{cat_col}_{nume_col}"] = \
            all_df[nume_col] / (1 + all_df[f"agg_mean_{cat_col}_{nume_col}"])
train_df = all_df.iloc[:len(train_df)].reset_index(drop=True)
test_df = all_df.iloc[len(train_df):].reset_index(drop=True)
del all_df
gc.collect()

HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




61

In [16]:
use_cols = [c for c in train_df.columns if c not in multi_cat_cols + ["Salary", "No"]]

In [17]:
folds = KFold(n_splits=5, random_state=2020, shuffle=True)
fold_idx_list = [(trn_idx, val_idx) for trn_idx, val_idx in folds.split(train_df, train_df["Salary"])]

### Target Encoding

In [18]:
te_cols = [c for c in use_cols if (not "agg_" in c) and train_df[c].nunique() >= 5]

In [19]:
for c in tqdm(te_cols):
    new_col = "te_" + c
    train_df[new_col] = 0
    test_df[new_col] = 0
    for trn_idx, val_idx in fold_idx_list:
        mean_val = train_df.loc[trn_idx].groupby(c)["Salary"].mean().astype(np.float32)
        train_df.loc[val_idx, new_col] = train_df.loc[val_idx, c].map(mean_val)
        test_df.loc[:, new_col] += test_df.loc[:, c].map(mean_val) / len(fold_idx_list)
    train_df[new_col] = train_df[new_col].astype(np.float32)
    test_df[new_col] = test_df[new_col].astype(np.float32)

HBox(children=(FloatProgress(value=0.0, max=109.0), HTML(value='')))




In [20]:
# multiple target encoding
te_imp_feats = importance_df["feature"].iloc[:30].tolist()
multi_te_cols = [c for c in te_cols if c in te_imp_feats]
combi_cols = list(itertools.combinations(multi_te_cols, 2))

In [21]:
for col_a, col_b in tqdm(combi_cols):
    new_col = "te_" + col_a + "__" + col_b
    train_df[new_col] = 0
    test_df[new_col] = 0
    train_df["tmp"] = train_df[col_a].fillna("").astype(str) + train_df[col_b].fillna("").astype(str)
    test_df["tmp"] = test_df[col_a].fillna("").astype(str) + test_df[col_b].fillna("").astype(str)
    for trn_idx, val_idx in fold_idx_list:
        mean_val = train_df.loc[trn_idx].groupby("tmp")["Salary"].mean().astype(np.float32)
        train_df.loc[val_idx, new_col] = train_df.loc[val_idx, "tmp"].map(mean_val)
        test_df.loc[:, new_col] += test_df.loc[:, "tmp"].map(mean_val) / len(fold_idx_list)
    train_df[new_col] = train_df[new_col].astype(np.float32)
    test_df[new_col] = test_df[new_col].astype(np.float32)
    del train_df["tmp"], test_df["tmp"]
    gc.collect()

HBox(children=(FloatProgress(value=0.0, max=66.0), HTML(value='')))




In [22]:
use_cols = [c for c in train_df.columns if c not in multi_cat_cols + ["Salary", "No"]]
print(len(use_cols))

3497


## Training

In [60]:
lgb_params = {
            'objective': 'regression',
            "metric": "rmse",
            "verbosity": -1,
            "boosting": "gbdt",
            'learning_rate': 0.05,
            'num_leaves': 64,
            'min_data_in_leaf': 80, 
            'max_depth': 4,
            "bagging_freq": 5,
            "bagging_fraction": 0.8,
            "lambda_l1": 0.5,
            "lambda_l2": 0.5,
            "feature_fraction": 0.1,
            "seed": 2020,
            "num_threads": -1,
            "max_bins": 30
}

In [61]:
importances = pd.DataFrame()
oof_preds = np.zeros(len(train_df))
models = []

for fold_i, (trn_idx, val_idx) in enumerate(fold_idx_list):
    print(f"Fold {fold_i+1}")
    train_dataset = lgb.Dataset(
        train_df.loc[trn_idx, use_cols],
        label = train_df.loc[trn_idx, "Salary"]
    )
    valid_dataset = lgb.Dataset(
        train_df.loc[val_idx, use_cols],
        label = train_df.loc[val_idx, "Salary"]
    )
    model = lgb.train(
                lgb_params,
                train_dataset,
                3000,
                valid_sets = [train_dataset, valid_dataset],
                verbose_eval=100,
                early_stopping_rounds = 100,
                #feval = eval_f1,
                #callbacks = [log_callback],
    )
    imp_df = pd.DataFrame()
    imp_df['feature'] = use_cols
    imp_df['gain'] = model.feature_importance(importance_type="gain")
    importances = pd.concat([importances, imp_df], axis=0, sort=False)
    
    oof_preds[val_idx] = model.predict(train_df.loc[val_idx, use_cols])
    models.append(model)

Fold 1
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 20982.1	valid_1's rmse: 21698.5
[200]	training's rmse: 19794.4	valid_1's rmse: 21079
[300]	training's rmse: 19166.4	valid_1's rmse: 20868.6
[400]	training's rmse: 18646.3	valid_1's rmse: 20751.4
[500]	training's rmse: 18226.2	valid_1's rmse: 20681.9
[600]	training's rmse: 17856.9	valid_1's rmse: 20635.8
[700]	training's rmse: 17494.1	valid_1's rmse: 20598
[800]	training's rmse: 17174.3	valid_1's rmse: 20586.4
[900]	training's rmse: 16883.3	valid_1's rmse: 20563.7
[1000]	training's rmse: 16598.9	valid_1's rmse: 20557.1
[1100]	training's rmse: 16342.4	valid_1's rmse: 20539.5
[1200]	training's rmse: 16094.7	valid_1's rmse: 20528.4
[1300]	training's rmse: 15847.1	valid_1's rmse: 20520.6
[1400]	training's rmse: 15604.6	valid_1's rmse: 20512.3
[1500]	training's rmse: 15361.8	valid_1's rmse: 20518.8
Early stopping, best iteration is:
[1445]	training's rmse: 15497.3	valid_1's rmse: 20502.6
Fold 2
Traini

In [62]:
oof_score = np.sqrt(mean_squared_error(train_df["Salary"], oof_preds))
oof_score
#20824.817522627924
#20794.69950160029
#20548.806990829173
#20366.916244725206
#20352.221759954566

20325.986687487057

In [63]:
importances.groupby("feature")["gain"].mean().sort_values(ascending=False).reset_index().iloc[:100]

Unnamed: 0,feature,gain
0,te_Country__YearsCoding,47081960000000.0
1,te_Country__CareerSatisfaction,29315800000000.0
2,te_Country__YearsCodingProf,19667220000000.0
3,te_Country__Employment,19093820000000.0
4,te_YearsCodingProf__Currency,16644660000000.0
5,te_Country__FormalEducation,15220140000000.0
6,te_Country__Age,14215780000000.0
7,te_Country__CurrencySymbol,12011100000000.0
8,te_CurrencySymbol__Age,8642652000000.0
9,te_Employment__CurrencySymbol,6998477000000.0


In [27]:
test_pred = np.zeros(len(test_df))

for model in models:
    test_pred += model.predict(test_df[use_cols]) / len(models)

In [28]:
test_pred

array([ 68716.55939292,  95191.38058141,  85564.39388332, ...,
        90746.82732717,  84125.51070933, 114913.58252493])

In [29]:
sub_df = pd.read_csv("../input/submit.csv")

In [30]:
sub_df["Salary"] = test_pred

In [31]:
#sub_df.to_csv("../predict/010_multi_te_20780.csv", index=False)