## Prepare and read data

In [1]:
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error

In [2]:
pd.set_option("max_columns", 300)
pd.set_option("max_rows", 500)

In [3]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

## Feature Engineering

In [4]:
def get_multi_cat_cols(train_df):
    tmp = train_df.iloc[:1000]
    multi_cols = []
    for c in train_df.columns:
        sep_num = tmp[c].astype(str).fillna("").str.contains(";").sum()
        if sep_num > 10:
            multi_cols.append(c)
    return multi_cols

In [5]:
original_cols = train_df.columns.tolist()

In [6]:
multi_cat_cols = get_multi_cat_cols(train_df)

In [7]:
for c in tqdm(multi_cat_cols):
    binarizer = MultiLabelBinarizer()
    train_multi_srs = train_df[c].map(lambda x: x.split(";") if x is not np.nan else [])
    test_multi_srs = test_df[c].map(lambda x: x.split(";") if x is not np.nan else [])
    train_arr = binarizer.fit_transform(train_multi_srs)
    test_arr = binarizer.transform(test_multi_srs)
    feat_cols = [f"ohe_{c}_{val}" for val in binarizer.classes_]
    train_feat_df = pd.DataFrame(train_arr, columns=feat_cols)
    test_feat_df = pd.DataFrame(test_arr, columns=feat_cols)
    train_df = pd.concat([train_df, train_feat_df], axis=1)
    test_df = pd.concat([test_df, test_feat_df], axis=1)

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))




In [8]:
label_cols = [
    c for c in original_cols if c not in ["Salary", "No"]
]

In [9]:
for c in tqdm(label_cols):
    if train_df[c].dtype.name == "float":
        continue
    else:
        train_df[c], uniques = pd.factorize(train_df[c])
        test_df[c] = uniques.get_indexer(test_df[c])

HBox(children=(FloatProgress(value=0.0, max=126.0), HTML(value='')))




In [10]:
train_df.shape

(33857, 444)

In [11]:
train_df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in train_df.columns]

In [12]:
use_cols = [c for c in train_df.columns if c not in ["Salary", "No"]]

## Training

In [13]:
folds = KFold(n_splits=5, random_state=2020, shuffle=True)

In [14]:
lgb_params = {
            'objective': 'regression',
            "metric": "rmse",
            "verbosity": -1,
            "boosting": "gbdt",
            'learning_rate': 0.05,
            'num_leaves': 128,
            'min_data_in_leaf': 30, 
            'max_depth': 6,
            "bagging_freq": 10,
            "bagging_fraction": 0.4,
            "lambda_l1": 0.5,
            "lambda_l2": 0.5,
            "feature_fraction": 0.7,
            "seed": 2020,
            "num_threads": -1,
            "max_bins": 30
}

In [15]:
importances = pd.DataFrame()
oof_preds = np.zeros(len(train_df))

for fold_i, (trn_idx, val_idx) in enumerate(folds.split(train_df, train_df["Salary"])):
    print(f"Fold {fold_i+1}")
    train_dataset = lgb.Dataset(
        train_df.loc[trn_idx, use_cols],
        label = train_df.loc[trn_idx, "Salary"]
    )
    valid_dataset = lgb.Dataset(
        train_df.loc[val_idx, use_cols],
        label = train_df.loc[val_idx, "Salary"]
    )
    model = lgb.train(
                lgb_params,
                train_dataset,
                3000,
                valid_sets = [train_dataset, valid_dataset],
                verbose_eval=100,
                early_stopping_rounds = 100,
                #feval = eval_f1,
                #callbacks = [log_callback],
    )
    imp_df = pd.DataFrame()
    imp_df['feature'] = use_cols
    imp_df['gain'] = model.feature_importance(importance_type="gain")
    importances = pd.concat([importances, imp_df], axis=0, sort=False)
    
    oof_preds[val_idx] = model.predict(train_df.loc[val_idx, use_cols])

Fold 1
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 21416.2	valid_1's rmse: 23249.3
[200]	training's rmse: 19391.7	valid_1's rmse: 22562.1
[300]	training's rmse: 18163	valid_1's rmse: 22312.5
[400]	training's rmse: 17189.1	valid_1's rmse: 22235.9
[500]	training's rmse: 16270.6	valid_1's rmse: 22177.8
[600]	training's rmse: 15467.6	valid_1's rmse: 22144.2
[700]	training's rmse: 14792.9	valid_1's rmse: 22119.2
[800]	training's rmse: 14068	valid_1's rmse: 22095.5
[900]	training's rmse: 13424.7	valid_1's rmse: 22112.6
Early stopping, best iteration is:
[824]	training's rmse: 13904.2	valid_1's rmse: 22089.5
Fold 2
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 21502.7	valid_1's rmse: 23021.3
[200]	training's rmse: 19714.2	valid_1's rmse: 22225.3
[300]	training's rmse: 18476.2	valid_1's rmse: 21942.4
[400]	training's rmse: 17445.7	valid_1's rmse: 21823.5
[500]	training's rmse: 16538.2	valid_1's rmse: 21742.7
[600]	

In [16]:
oof_score = np.sqrt(mean_squared_error(train_df["Salary"], oof_preds))
oof_score

21879.878828725617

In [17]:
importances.groupby("feature")["gain"].mean().sort_values(ascending=False).reset_index()

Unnamed: 0,feature,gain
0,MilitaryUS,39102420000000.0
1,Country,22735330000000.0
2,YearsCodingProf,22357710000000.0
3,CurrencySymbol,16040690000000.0
4,SalaryType,9762817000000.0
5,YearsCoding,8217953000000.0
6,ohe_DevType_Student,6323186000000.0
7,Employment,5856388000000.0
8,Currency,5335072000000.0
9,Age,4921390000000.0


In [20]:
importance_df = \
    importances.groupby("feature")["gain"].mean().sort_values(ascending=False).reset_index()

In [21]:
importance_df.to_csv("../data/importance/003_importance.csv", index=False)