# TCD ML Comp. 2019/20 - Income Pred. (Group)

## Team4

#### import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
import sys
import lightgbm as lgb
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer, MinMaxScaler, LabelEncoder
from sklearn.pipeline import Pipeline

#### functions for reading in and processing data

In [2]:
def input_data(fileName, printInfo = False):
    """Load a CSV file into a DataFrame, optionally printing a summary.

    Parameters
    ----------
    fileName : path of the CSV file, handed to ``pd.read_csv``.
    printInfo : when true, print the row/column counts followed by
        ``DataFrame.info()``.

    Returns
    -------
    The DataFrame read from ``fileName``.
    """
    frame = pd.read_csv(fileName, low_memory=False)

    # Guard clause: nothing to report, hand the frame straight back.
    if not printInfo:
        return frame

    rows, cols = frame.shape
    print('Number of instances: %d\nNumber of freatues: %d\n' % (rows, cols))
    # info() prints its own report and returns None, so this also prints "None".
    print(frame.info())
    return frame

In [3]:
def rename_columns(dataFrame):
    """Rename the raw dataset columns to short names, in place.

    Columns that are not listed in the mapping keep their name. The frame
    passed in is modified directly and nothing is returned.
    """
    # (raw header, short name) pairs; the raw spellings must match the CSV
    # headers exactly, including 'Employement' and 'Satisfation'.
    short_names = dict([
        ('Instance', 'instance'),
        ('Year of Record', 'year'),
        ('Housing Situation', 'housing'),
        ('Crime Level in the City of Employement', 'crime'),
        ('Work Experience in Current Job [years]', 'experience'),
        ('Satisfation with employer', 'satisfaction'),
        ('Gender', 'gender'),
        ('Age', 'age'),
        ('Country', 'country'),
        ('Size of City', 'city'),
        ('Profession', 'profession'),
        ('University Degree', 'education'),
        ('Wears Glasses', 'glasses'),
        ('Hair Color', 'hair'),
        ('Body Height [cm]', 'height'),
        ('Yearly Income in addition to Salary (e.g. Rental Income)', 'additional'),
        ('Total Yearly Income [EUR]', 'Y'),
    ])

    dataFrame.rename(columns=short_names, inplace=True)

In [4]:
def split_data(dataFrame, test_size = 0.05, random_state = 42):
    """Split a DataFrame into a training part and a held-out test part.

    Parameters
    ----------
    dataFrame : the frame to split.
    test_size : forwarded to ``train_test_split`` (default 0.05).
    random_state : seed forwarded to ``train_test_split`` so the split is
        reproducible.

    Returns
    -------
    ``(train_set, test_set)``; the size of each part is printed as well.
    """
    train_set, test_set = train_test_split(
        dataFrame,
        test_size=test_size,
        random_state=random_state,
    )

    summary = 'Number of instances in training set: %d\nNumber of instances in test set: %d\n'
    print(summary % (train_set.shape[0], test_set.shape[0]))

    return train_set, test_set

In [5]:
def preprocess(data, training = False, labelled = False, printDetails = False):
    """Clean a renamed income DataFrame and split off the additional income.

    Works on a copy, so the caller's frame is left untouched.

    Parameters
    ----------
    data : DataFrame with the short column names; must contain 'instance',
        'experience', 'additional', 'hair' and 'glasses' (plus 'Y' when
        ``labelled`` is true).
    training : when true, duplicate rows (ignoring 'instance') are dropped
        and the number of dropped rows is printed.
    labelled : when true, the additional income is subtracted from 'Y' so
        that the model only has to learn the remaining income.
    printDetails : accepted for backward compatibility; currently unused.

    Returns
    -------
    ``(data, additional_income)`` where ``data`` is the cleaned frame
    (without 'instance', 'additional', 'hair' and 'glasses', with a new
    0/1 'add_indicator' column) and ``additional_income`` is a float Series,
    index-aligned with ``data``, to be added back to the predictions.
    """
    data = data.copy()
    mOriginal = data.shape[0]
    # 'instance' is a row id; dropping it first lets rows that differ only by
    # id be recognised as duplicates below.
    data.drop(columns=["instance"], inplace=True)

    if training: #drop duplicated from training data
        data.drop_duplicates(keep = 'first', inplace = True)
        m1 = data.shape[0]
        print("Number of duplicates dropped: %d\nNumber of instances after dropping dupicated: %d\n" %(mOriginal - m1, m1))

    #experience - convert to float
    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: the chained in-place form does not update the frame
    # when pandas Copy-on-Write is active. '#NUM!' is mapped to zero.
    data['experience'] = data['experience'].replace('#NUM!', '0').astype(np.float64)

    #additional income - remove it and save a copy so as to add to predictions.
    additional_income = data['additional']
    if not pd.api.types.is_numeric_dtype(additional_income):
        # Remove the literal ' EUR' suffix; rstrip(' EUR') would strip any
        # trailing run of the characters ' ', 'E', 'U', 'R' instead.
        additional_income = additional_income.str.replace(r'\s*EUR\s*$', '', regex=True)
    additional_income = additional_income.astype(np.float64)

    if labelled:
        data['Y'] = data['Y'] - additional_income
    data['add_indicator'] = (additional_income > 0).astype(np.int64)

    #hair - originally combined several hair categories but ultimately dropped hair as a feature.
    #glasses - dropped as a feature as well.
    data.drop(columns=['additional', 'hair', 'glasses'], inplace=True)

    return data, additional_income

#### construct pipelines - one pipeline per feature, then combine them at the end to form the complete pipeline

In [6]:
# year: fill missing values with the median, then min-max scale.
year_pipeline = Pipeline(steps=[
    ("imputerYear1", SimpleImputer(strategy="median")),
    ("ScalerYear", MinMaxScaler()),
])

In [7]:
# housing: normalise the category labels with a chain of constant imputers,
# then one-hot encode. Missing entries and 'nA' both become 'missing'; the
# string '0' and the integer 0 both become 'zero'. Categories not seen during
# fit are ignored by the encoder.
housing_pipeline = Pipeline(steps=[
    ("imputerH1", SimpleImputer(strategy="constant", fill_value="missing")),
    ("imputerH2", SimpleImputer(missing_values="nA", strategy="constant", fill_value="missing")),
    ("imputerH3", SimpleImputer(missing_values="0", strategy="constant", fill_value="zero")),
    ("imputerH4", SimpleImputer(missing_values=0, strategy="constant", fill_value="zero")),
    ("oneHotEncoderH", OneHotEncoder(handle_unknown="ignore")),
])

In [8]:
# crime: fill missing values with the median, then min-max scale.
crime_pipeline = Pipeline(steps=[
    ("imputerCrime", SimpleImputer(strategy="median")),
    ("ScalerCrime", MinMaxScaler()),
])

In [9]:
# experience: fill missing values with the median, then min-max scale.
experience_pipeline = Pipeline(steps=[
    ("imputerExperience1", SimpleImputer(strategy="median")),
    ("ScalerExperience", MinMaxScaler()),
])

In [10]:
# satisfaction: label missing entries as 'missing', then one-hot encode
# (categories not seen during fit are ignored).
satisfaction_pipeline = Pipeline(steps=[
    ("inputerS1", SimpleImputer(strategy="constant", fill_value="missing")),
    ("oneHotEncoderS", OneHotEncoder(handle_unknown="ignore")),
])

In [11]:
# gender: normalise the labels with a chain of constant imputers, then one-hot
# encode. Missing -> 'missing', '0' -> 'zero', 'unknown' -> 'other',
# 'f' -> 'female', 'm' -> 'male'.
# TODO: check whether mapping 'unknown' to 'other' actually helps.
gender_pipeline = Pipeline(steps=[
    ("imputerGender1", SimpleImputer(strategy="constant", fill_value="missing")),
    ("imputerGender2", SimpleImputer(missing_values="0", strategy="constant", fill_value="zero")),
    ("imputerGender3", SimpleImputer(missing_values="unknown", strategy="constant", fill_value="other")),
    ("imputerGender4", SimpleImputer(missing_values="f", strategy="constant", fill_value="female")),
    ("imputerGender5", SimpleImputer(missing_values="m", strategy="constant", fill_value="male")),
    ("oneHotEncoderGender", OneHotEncoder(handle_unknown="ignore")),
])

In [12]:
# age: fill missing values with the median, then min-max scale.
age_pipeline = Pipeline(steps=[
    ("imputerAge1", SimpleImputer(strategy="median")),
    ("ScalerAge", MinMaxScaler()),
])

In [13]:
# country: label missing entries as 'unknown', then one-hot encode
# (categories not seen during fit are ignored).
country_pipeline = Pipeline(steps=[
    ("imputerCountry1", SimpleImputer(strategy="constant", fill_value="unknown")),
    ("oneHotEncoderCountry", OneHotEncoder(handle_unknown="ignore")),
])

In [14]:
# city size: fill missing values with the median, then standardise
# (StandardScaler here, unlike the min-max scaling of the other numeric features).
city_pipeline = Pipeline(steps=[
    ("imputerCity1", SimpleImputer(strategy="median")),
    ("ScalerCity", StandardScaler()),
])

In [15]:
# profession: label missing entries as 'unknown', target-encode the category,
# then standardise the encoded value.
# NOTE(review): 'Somewhat Happy' looks like a satisfaction label rather than a
# profession - confirm that the inputerP2 step is intended for this column.
profession_pipeline = Pipeline(steps=[
    ("inputerP1", SimpleImputer(strategy="constant", fill_value="unknown")),
    ("inputerP2", SimpleImputer(missing_values="Somewhat Happy", strategy="constant", fill_value="Happy")),
    ("targetEncoderP", TargetEncoder()),
    ("ScalerProfession", StandardScaler()),
])

In [16]:
# education: missing entries -> 'missing', the string '0' -> 'Zero', then
# one-hot encode (categories not seen during fit are ignored).
education_pipeline = Pipeline(steps=[
    ("imputerEducation1", SimpleImputer(strategy="constant", fill_value="missing")),
    ("imputerEducation2", SimpleImputer(missing_values="0", strategy="constant", fill_value="Zero")),
    ("oneHotEncoderEducation", OneHotEncoder(handle_unknown="ignore")),
])

In [17]:
# hair: fill missing entries with 0.
# NOTE: preprocess() drops the 'hair' column and full_pipeline does not
# include this pipeline, so it is currently unused.
hair_pipeline = Pipeline(steps=[
    ("imputerHair1", SimpleImputer(strategy="constant", fill_value=0)),
])

In [18]:
# height: fill missing values with the median, then min-max scale.
# The scaler step is named 'ScalerAge' (same name as in age_pipeline); the
# name is kept because it is the key used to address the step.
height_pipeline = Pipeline(steps=[
    ("imputerHeight1", SimpleImputer(strategy="median")),
    ("ScalerAge", MinMaxScaler()),
])

In [19]:
# add_indicator: the 0/1 flag created by preprocess(); any missing entry is
# filled with 0 and the value is otherwise passed through unchanged.
add_indicator_pipeline = Pipeline(steps=[
    ("imputerAdd_ind1", SimpleImputer(strategy="constant", fill_value=0)),
])

In [20]:
# Combine the per-feature pipelines into the final transformer. Each entry is
# (name, pipeline, columns it is applied to); the order of the entries fixes
# the order of the output columns.
full_pipeline = ColumnTransformer(transformers=[
    ("year_pipeline", year_pipeline, ["year"]),
    ("housing_pipeline", housing_pipeline, ["housing"]),
    ("crime_pipeline", crime_pipeline, ["crime"]),
    ("experience_pipeline", experience_pipeline, ["experience"]),
    ("satisfaction_pipeline", satisfaction_pipeline, ["satisfaction"]),
    ("gender_pipeline", gender_pipeline, ["gender"]),
    ("age_pipeline", age_pipeline, ["age"]),
    ("country_pipeline", country_pipeline, ["country"]),
    ("city_pipeline", city_pipeline, ["city"]),
    ("profession_pipeline", profession_pipeline, ["profession"]),
    ("education_pipeline", education_pipeline, ["education"]),
    ("height_pipeline", height_pipeline, ["height"]),
    ("add_indicator_pipeline", add_indicator_pipeline, ["add_indicator"]),
])

#### set up algorithm - lightgbm

In [21]:
# Base learner: LightGBM regressor with 1000 trees of up to 511 leaves each
# and a learning rate of 0.05.
lgbr = lgb.LGBMRegressor(
    n_estimators=1000,
    num_leaves=511,
    learning_rate=0.05,
)

In [22]:
def func(x):
    """Forward target transform: natural log of the absolute value of ``x``.

    The sign of ``x`` is discarded, so the transform is not invertible for
    negative targets, and a target of exactly 0 maps to ``-inf``.
    """
    magnitude = np.absolute(x)
    return np.log(magnitude)
def inv_func(x):
    """Inverse target transform: ``e ** x``, mapping log-space values back.

    The result is always positive, so it undoes ``func`` only up to sign.
    """
    restored = np.exp(x)
    return restored

In [23]:
# Wrap the LightGBM regressor so that it is trained on func(y) (log of the
# absolute target) and its predictions are mapped back through inv_func.
regr = TransformedTargetRegressor(
    regressor=lgbr,
    func=func,
    inverse_func=inv_func,
)

#### read in and split training data

In [24]:
# Load the labelled training CSV (printing its summary), shorten the column
# names in place, then hold out a small local test set (split_data defaults:
# test_size = 0.05, random_state = 42).
raw_data = input_data("tcd-ml-1920-group-income-train.csv", printInfo = True)
rename_columns(raw_data)
train_set, test_set = split_data(raw_data)

Number of instances: 1048574
Number of freatues: 17

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048574 entries, 0 to 1048573
Data columns (total 17 columns):
Instance                                                    1048574 non-null int64
Year of Record                                              1044561 non-null float64
Housing Situation                                           1048574 non-null object
Crime Level in the City of Employement                      1048574 non-null int64
Work Experience in Current Job [years]                      1048574 non-null object
Satisfation with employer                                   1010487 non-null object
Gender                                                      974447 non-null object
Age                                                         1048574 non-null int64
Country                                                     1048574 non-null object
Size of City                                                1048574 non-null int6

#### fit the model to training data, make predictions on training set and check error

In [25]:
# Clean the training split (duplicates are dropped because training = True),
# then separate the target 'Y' - already net of additional income - from the
# feature columns.
Xtrain, Xtrain_additional = preprocess(train_set, labelled=True, training=True)
ytrain = Xtrain['Y'].copy()
Xtrain = Xtrain.drop(columns=['Y'])

Number of duplicates dropped: 134612
Number of instances after dropping dupicated: 861533



In [26]:
# Fit every per-feature pipeline on the training features and transform them.
# ytrain is passed along for the steps that use the target (e.g. the
# TargetEncoder in profession_pipeline).
Xtrain_prepared = full_pipeline.fit_transform(Xtrain, ytrain)

In [27]:
# Train the target-transformed LightGBM regressor on the prepared features.
regr.fit(Xtrain_prepared, ytrain)

TransformedTargetRegressor(check_inverse=True,
                           func=<function func at 0x000001D585B90708>,
                           inverse_func=<function inv_func at 0x000001D585B905E8>,
                           regressor=LGBMRegressor(boosting_type='gbdt',
                                                   class_weight=None,
                                                   colsample_bytree=1.0,
                                                   importance_type='split',
                                                   learning_rate=0.05,
                                                   max_depth=-1,
                                                   min_child_samples=20,
                                                   min_child_weight=0.001,
                                                   min_split_gain=0.0,
                                                   n_estimators=1000, n_jobs=-1,
                                                   num_leaves=511,
    

In [28]:
# The model predicts income net of additional income (preprocess subtracted
# it from Y), so add the additional income back to get total income.
train_predictions = regr.predict(Xtrain_prepared) + Xtrain_additional

In [29]:
# Training MAE on total income: ytrain has the additional income removed, so
# it is added back to recover the original labels before comparing.
train_mae= mean_absolute_error(ytrain + Xtrain_additional, train_predictions)
print(train_mae)

7842.359039263848


#### check performance on test set

In [30]:
# Clean the held-out split the same way as the training data, but without
# dropping duplicates (training = False), then separate the target 'Y' from
# the feature columns.
Xtest, Xtest_additional = preprocess(test_set, labelled=True, training=False)
ytest = Xtest['Y'].copy()
Xtest = Xtest.drop(columns=['Y'])

In [31]:
# Apply the already-fitted pipeline (transform only, no re-fitting) to the
# held-out features.
Xtest_prepared = full_pipeline.transform(Xtest)

In [32]:
# Predict net income for the held-out rows and add the additional income back.
test_predictions = regr.predict(Xtest_prepared) + Xtest_additional

In [33]:
# Held-out MAE on total income (additional income added back to ytest, which
# had it subtracted by preprocess).
test_mae = mean_absolute_error(ytest + Xtest_additional, test_predictions)
print(test_mae)

8867.022473992298


#### make competition predictions

In [None]:
# Load the competition test CSV (printing its summary) and shorten its column
# names in place.
Xcomp = input_data("tcd-ml-1920-group-income-test.csv", printInfo = True)
rename_columns(Xcomp)

In [None]:
# preprocess() runs with its defaults (training = False, labelled = False):
# no rows are dropped and 'Y' is not modified. The cleaned frame is then
# passed through the already-fitted pipeline.
Xcomp_prepared, Xcomp_additional = preprocess(Xcomp)
Xcomp_prepared = full_pipeline.transform(Xcomp_prepared)

In [None]:
# Predict net income for the competition rows and add the additional income back.
comp_predictions = regr.predict(Xcomp_prepared) + Xcomp_additional

In [None]:
# Store the predictions in the 'Y' column of the (renamed) competition frame
# and write the whole frame, index included, to predictions.csv.
Xcomp['Y'] = comp_predictions
Xcomp.to_csv(r"predictions.csv")