In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

In [None]:
def create_feat_count(df,feat):
    """Return ``df`` with an extra ``<feat>_count`` frequency column.

    For every row, the new column holds how many rows of ``df`` share that
    row's value in column ``feat``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified, a merged copy is returned.
    feat : str
        Name of the column whose value frequencies are counted.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-joined with the per-value counts, so the row order is
        preserved and the count column is appended last.
    """
    count_name = '%s_count' % (feat)
    # pandas' groupby skips missing keys by default, so rows whose `feat`
    # is NaN find no match in the join and end up with a NaN count.
    occurrences = df.groupby([feat]).size().reset_index(name=count_name)
    return df.merge(occurrences, how='left', on=[feat])

In [None]:
# Load the training and test CSVs, then stack them (train rows first) into one
# frame with a fresh 0..n-1 index so feature engineering is applied to both at once.
# NOTE(review): the recorded notebook output looks like the tail of a pandas
# DtypeWarning (mixed-type columns) -- confirm, and consider explicit dtypes.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("group-income-train.csv", "group-income-test.csv")
)
data = pd.concat([train, test], ignore_index=True)

In [2]:
# Converting the additional-income column to ints for easier processing:
# strip the ' EUR' suffix from each string, parse the remainder as a float,
# then cast to int (which drops any fractional part).
extra_income_col = 'Yearly Income in addition to Salary (e.g. Rental Income)'
data[extra_income_col] = (
    data[extra_income_col]
    .map(lambda amount: amount.replace(' EUR',''))
    .astype(float)
    .astype(int)
)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Feature engineering on the combined train+test frame:
#   1. add a '<column>_count' frequency feature for every original feature column,
#   2. label-encode every feature column that is still of object dtype.
cols = data.columns.tolist()
non_features = ('Instance', 'Total Yearly Income [EUR]')  # row id and prediction target

# Step 1: frequency features. The list is fixed up front so the freshly added
# '*_count' columns are not themselves counted.
original_feats = [name for name in data.columns if name not in non_features]
for name in original_feats:
    data = create_feat_count(data, name)

# Everything except the id and the target -- now including the count columns --
# is a model feature.
feat_cols = [name for name in data.columns if name not in non_features]

# Step 2: integer-encode the object-typed features. Values are cast to str first,
# so missing values become the string 'nan' and are encoded like any other category.
feat_dtypes = data[feat_cols].dtypes
obj_col = feat_dtypes[feat_dtypes == 'object'].index.tolist()
for name in obj_col:
    le = LabelEncoder()
    data[name] = le.fit_transform(data[name].astype(str))

In [None]:
# Split the engineered frame back apart: rows that have a target value are
# training rows, rows whose target is missing are the test rows to predict.
has_target = data['Total Yearly Income [EUR]'].notnull()
train = data[has_target]
test = data[~has_target]

In [None]:
# This messy pip install was for convenience when running on Google Colab/AWS Sagemaker:
!pip install lightgbm
import lightgbm as lgb

# Running a k-fold cross validation as in:
# https://machinelearningmastery.com/k-fold-cross-validation/
# Using tweedie distribution with gdbt boosting
params = {
          'max_depth': 30,
          'learning_rate': 0.02,
          "boosting": "gbdt",
          "bagging_seed": 11,
          "metric": 'mae',
          "verbosity": -1,
          'objective':'tweedie',
          'gpu_platform_id': 0,
          'gpu_device_id': 0,
          'num_iterations' : 200000,
         }
# N-folds opted for 5, according to researched material online 5 or 10 can be ideal for this process
folds = 5
seed = 2019
pre_sub = pd.DataFrame()
kf = StratifiedKFold(n_splits=folds,shuffle=True,random_state=seed)
ix = 0
for tr_idx,val_idx in kf.split(train,train['Country']):
    x_train,y_train = train[feat_cols].iloc[tr_idx],train['Total Yearly Income [EUR]'].iloc[tr_idx]
    x_val,y_val = train[feat_cols].iloc[val_idx],train['Total Yearly Income [EUR]'].iloc[val_idx]
    trn_data = lgb.Dataset(x_train, label=y_train)
    val_data = lgb.Dataset(x_val, label=y_val)
    # 15000 Redundant now as overridden with num_iterations
    clf = lgb.train(params, trn_data, 15000, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds=500)
    test_pre = clf.predict(test[feat_cols])
    pre_sub[ix] = test_pre
    ix += 1
'done'

Collecting lightgbm
  Using cached https://files.pythonhosted.org/packages/05/ec/756f13b25258e0aa6ec82d98504e01523814f95fc70718407419b8520e1d/lightgbm-2.3.0-py2.py3-none-manylinux1_x86_64.whl
[31mtyping-extensions 3.7.4.1 has requirement typing>=3.7.4; python_version < "3.5", but you'll have typing 3.6.4 which is incompatible.[0m
Installing collected packages: lightgbm
Successfully installed lightgbm-2.3.0
[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m




Training until validation scores don't improve for 500 rounds
[1000]	training's l1: 8679.88	valid_1's l1: 8853.34
[2000]	training's l1: 8233.09	valid_1's l1: 8493.82
[3000]	training's l1: 7969.92	valid_1's l1: 8319.73
[4000]	training's l1: 7767.84	valid_1's l1: 8210.76
[5000]	training's l1: 7605.47	valid_1's l1: 8141.13
[6000]	training's l1: 7472.62	valid_1's l1: 8090.25
[7000]	training's l1: 7355.37	valid_1's l1: 8051.32
[8000]	training's l1: 7239.3	valid_1's l1: 8022.13
[9000]	training's l1: 7132.37	valid_1's l1: 7998.57
[10000]	training's l1: 7034.02	valid_1's l1: 7981.21
[11000]	training's l1: 6938.42	valid_1's l1: 7961.07
[12000]	training's l1: 6854.89	valid_1's l1: 7949.37
[13000]	training's l1: 6767.37	valid_1's l1: 7937.65
[14000]	training's l1: 6686.68	valid_1's l1: 7928.79
[15000]	training's l1: 6610.69	valid_1's l1: 7919.51
Did not meet early stopping. Best iteration is:
[15000]	training's l1: 6610.69	valid_1's l1: 7919.51
Training until validation scores don't improve for 5

In [None]:
# Average the per-fold test predictions and use the mean as the final answer.
# The fold columns are taken from `pre_sub` itself instead of a hard-coded
# [0,1,2,3,4], so the average still covers every fold if the fold count changes.
# 'sum' is excluded so that re-running this cell never averages a previous
# result back in.
# NOTE: despite its name, the 'sum' column holds the mean; the name is kept
# because the CSV export reads pre_sub['sum'].
fold_cols = [col for col in pre_sub.columns if col != 'sum']
pre_sub['sum'] = pre_sub[fold_cols].mean(axis=1)
pre_sub.head()

In [None]:
# Write the results to CSV: one row per test instance, pairing its 'Instance'
# id with the fold-averaged prediction (stored in pre_sub['sum']).
sub = pd.DataFrame({
    'Instance': test['Instance'].tolist(),
    'Total Yearly Income [EUR]': pre_sub['sum'].values,
})
sub.to_csv("awssubmission.csv", index=False)
'done'