In [36]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
import lightgbm as lgb

import category_encoders as ce
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from sklearn import tree

In [37]:
testa= pd.read_csv('tcd-ml-1920-group-income-train.csv', delimiter = ',')

In [38]:
testb= pd.read_csv('tcd-ml-1920-group-income-test.csv', delimiter = ',')

In [39]:
full_data = pd.concat([testa, testb], sort=False)

In [40]:
full_data['Housing Situation'] = full_data['Housing Situation'].replace('nA', np.nan)
full_data['Housing Situation'] = full_data['Housing Situation'].replace('0', np.nan)

full_data['Work Experience in Current Job [years]'] = full_data['Work Experience in Current Job [years]'].replace('#NUM!', np.nan)
full_data['Yearly Income in addition to Salary (e.g. Rental Income)'] = full_data['Yearly Income in addition to Salary (e.g. Rental Income)'].replace({' EUR':''}, regex = True)

full_data['Work Experience in Current Job [years]'] = full_data['Work Experience in Current Job [years]'].astype(float)
full_data['Yearly Income in addition to Salary (e.g. Rental Income)'] = full_data['Yearly Income in addition to Salary (e.g. Rental Income)'].astype(float)

In [41]:
for col in full_data.columns:
        if full_data[col].dtype == object:
            full_data[col].fillna(full_data[col].mode()[0], inplace = True)
        else:
            full_data[col].fillna(full_data[col].mean(), inplace = True)

In [42]:
labels_t = full_data['Total Yearly Income [EUR]']

In [43]:
encoder = ce.TargetEncoder(cols=['Hair Color', 'Housing Situation','Satisfation with employer' ,'Country', 'Profession', 'University Degree', 'Gender'])
encoder.fit(full_data, labels_t)
full_data = encoder.transform(full_data)

In [44]:
train_features = full_data[:1048574]
test_features = full_data[-(len(full_data)-1048574):]

In [45]:
train_features= train_features.drop('Instance', axis=1)
test_features= test_features.drop('Instance', axis=1)
test_features= test_features.drop('Total Yearly Income [EUR]', axis=1)

test_features= test_features.drop('Hair Color', axis=1)
train_features= train_features.drop('Hair Color', axis=1)

test_features= test_features.drop('Wears Glasses', axis=1)
train_features= train_features.drop('Wears Glasses', axis=1)

In [47]:
labels = train_features['Total Yearly Income [EUR]']
train_features = train_features.drop(['Total Yearly Income [EUR]'], axis = 1)

In [48]:
labels = np.log(labels)
X_test, X_val, y_test, y_val = train_test_split(train_features, labels, test_size = 0.3, random_state = 42)

In [50]:
gbm = lgb.LGBMRegressor(num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=10000)
gbm.fit(X_test, y_test,
        eval_set=[(X_val, y_val)],
        eval_metric='l1',
        early_stopping_rounds=10)

[1]	valid_0's l1: 1.40734	valid_0's l2: 2.8316
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l1: 1.34165	valid_0's l2: 2.58006
[3]	valid_0's l1: 1.27943	valid_0's l2: 2.35252
[4]	valid_0's l1: 1.22043	valid_0's l2: 2.14678
[5]	valid_0's l1: 1.16435	valid_0's l2: 1.96036
[6]	valid_0's l1: 1.11131	valid_0's l2: 1.79194
[7]	valid_0's l1: 1.06111	valid_0's l2: 1.6388
[8]	valid_0's l1: 1.01382	valid_0's l2: 1.50107
[9]	valid_0's l1: 0.968694	valid_0's l2: 1.37551
[10]	valid_0's l1: 0.926047	valid_0's l2: 1.26255
[11]	valid_0's l1: 0.885695	valid_0's l2: 1.15987
[12]	valid_0's l1: 0.847516	valid_0's l2: 1.06627
[13]	valid_0's l1: 0.811282	valid_0's l2: 0.981901
[14]	valid_0's l1: 0.776987	valid_0's l2: 0.905004
[15]	valid_0's l1: 0.744729	valid_0's l2: 0.835736
[16]	valid_0's l1: 0.714202	valid_0's l2: 0.772079
[17]	valid_0's l1: 0.685065	valid_0's l2: 0.714406
[18]	valid_0's l1: 0.657671	valid_0's l2: 0.661991
[19]	valid_0's l1: 0.631953	valid_0's l2: 0.614615
[

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.05, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=10000, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [51]:
income_pred = gbm.predict(test_features)

In [52]:
income_pred = np.exp(income_pred)

In [53]:
df = pd.read_csv('tcd-ml-1920-group-income-submission.csv')
df['Total Yearly Income [EUR]'] = income_pred
df.to_csv('mean.csv', encoding='utf-8', index = False)