In [4]:
import numpy as np
import pandas as pd
import pickle as pk
# Labelled training set that process_data() reads.
datapath = 'data/tcd ml 2019-20 income prediction training (with labels).csv'

# World Bank GDP indicator download (NY.GDP.MKTP.CD).
# Source: https://data.worldbank.org/indicator/NY.GDP.MKTP.CD
# The first four lines of the file are skipped at read time so that
# parsing starts at the row pandas should treat as the column header.
gdp = pd.read_csv(
    'data/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_180634.csv',
    skiprows=list(range(4)),
)

# Prepare data for the ML algorithm

In [103]:
# Some helper functions
from sklearn.preprocessing import LabelBinarizer

# Survey spelling -> spelling used in the 'Country Name' column of the GDP table.
# Built once at import time instead of on every clean_country() call.
_COUNTRY_ALIASES = {
    'Laos': 'Lao PDR',
    'Kyrgyzstan': 'Kyrgyz Republic',
    'Slovakia': 'Slovak Republic',
    'Congo': 'Congo, Rep.',
    'DR Congo': 'Congo, Dem. Rep.',
    # Nothing political intended here, trying to be realistic about income
    'State of Palestine': 'Jordan',
    'Syria': 'Syrian Arab Republic',
    'Gambia': 'Gambia, The',
    'North Korea': 'Korea, Dem. People’s Rep.',
    'South Korea': 'Korea, Rep.',
    'Côte d\'Ivoire': 'Cote d\'Ivoire',
    'Venezuela': 'Venezuela, RB',
    'Yemen': 'Yemen, Rep.',
    'Brunei': 'Brunei Darussalam',
    'Micronesia': 'Micronesia, Fed. Sts.',
    'Bahamas': 'Bahamas, The',
    'Saint Lucia': 'St. Lucia',
    'Czechia': 'Czech Republic',
    'Sao Tome & Principe': 'Sao Tome and Principe',
}


def clean_country(country):
    """Translate a country name from the survey data into the GDP table's spelling.

    Parameters
    ----------
    country : hashable
        Country label as it appears in the survey data.

    Returns
    -------
    The mapped spelling when ``country`` has an alias, otherwise ``country``
    itself, unchanged (this includes non-string placeholders such as ``0``).
    """
    return _COUNTRY_ALIASES.get(country, country)

def get_index_equiv(year, first_year=1960, leading_columns=4):
    """Convert a calendar year into a positional column index of the GDP table.

    The defaults reproduce the original hard-coded ``year - 1956``
    (1956 == 1960 - 4).
    NOTE(review): this presumes four non-year columns precede the 1960 column
    in the GDP CSV -- confirm against the file header if the download changes.

    Parameters
    ----------
    year : int
        Calendar year to look up.
    first_year : int, optional
        Year held by the first year column.
    leading_columns : int, optional
        Number of columns that come before the first year column.

    Returns
    -------
    int
        Zero-based column position suitable for ``DataFrame.iloc``.
    """
    return year - first_year + leading_columns

def increment_year(year, subtract=True):
    """Move ``year`` one step: back when ``subtract`` equals True, forward otherwise."""
    step = -1 if subtract == True else 1
    return year + step

def _nearest_gdp(country, start_year):
    """Return the GDP figure for ``country`` closest in time to ``start_year``.

    The search first walks backwards from ``start_year`` to 1960 and, if every
    cell on the way is NaN, restarts at ``start_year`` and walks forwards to
    2017. The first non-NaN cell wins.

    Raises
    ------
    ValueError
        From ``.item()`` when the country does not match exactly one row of
        the GDP table (or from ``int()`` when the year is not a number).
    IndexError
        When no GDP value exists in either direction.
    """
    # The country's row does not change while the year moves, so filter once.
    country_rows = gdp.loc[gdp['Country Name'] == clean_country(country)]

    year = start_year
    subtract = True
    while True:
        gdpval = country_rows.iloc[:, get_index_equiv(int(year))].item()
        if not np.isnan(gdpval):
            # Return before the bounds checks below: previously a value found in
            # the 2017 column on the forward pass was thrown away by the
            # IndexError raised right after the lookup.
            return gdpval

        year = increment_year(year, subtract)
        if year < 1960:
            if country == 'North Korea':
                # There isn't World Bank data for NK, let's assume it's USD32 Billion
                return float(32000000000)
            # Nothing at or before the record year: retry going forwards.
            subtract = False
            year = start_year
        elif (year > 2017) and not subtract:
            raise IndexError('Could not find GDP: ', country, start_year)


def country_to_gdp(data):
    """Look up a GDP figure for every row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Country' and 'Year of Record'.

    Returns
    -------
    pandas.DataFrame
        One column (labelled ``0``) holding the GDP value per row, indexed like
        ``data`` so it can be assigned straight back onto a column of ``data``.

    Raises
    ------
    ValueError
        Re-raised after printing a hint when a country cannot be matched in
        the GDP table; extend ``clean_country()`` to fix the spelling.
    IndexError
        When a country has no GDP value in any searched year.
    """
    countries = data['Country'].tolist()
    years = data['Year of Record'].tolist()

    # Many rows share the same (country, year) pair; search each pair once.
    resolved = {}
    gdp_values = []
    for country, year in zip(countries, years):
        pair = (country, year)
        if pair not in resolved:
            try:
                resolved[pair] = _nearest_gdp(country, year)
            except ValueError:
                print('Country Error, check : clean_country() -> ', country, year)
                raise
        gdp_values.append(resolved[pair])

    # Positional iteration plus data.index keeps the result aligned with data
    # even when its index is not the default 0..n-1.
    return pd.DataFrame({0: gdp_values}, index=data.index)

def shorten_jobs(jobs, length=8, thresh=80):
    """Build a mapping from each job title to a truncated label.

    Parameters
    ----------
    jobs : dict
        Job title -> number of occurrences.
    length : int, optional
        Number of leading characters kept from a title.
    thresh : int, optional
        Titles must occur strictly more often than this to keep a label of
        their own.

    Returns
    -------
    dict
        Job title -> label. The label is ``'unkn'`` when the title is too
        rare, cannot be sliced (e.g. a numeric placeholder), or when its
        truncated form is itself one of the original titles.
    """
    shortened = {}
    for job in jobs:
        label = 'unkn'
        if jobs[job] > thresh:
            try:
                label = job[:length]
            except TypeError:
                # Non-sliceable key: fall back to the catch-all label.
                label = 'unkn'
        shortened[job] = label

    # A label that is also an original title would make the mapping's values
    # overlap its keys, so any such entry is collapsed into 'unkn'.
    for job in shortened:
        if shortened[job] in shortened:
            shortened[job] = 'unkn'

    return shortened

def one_hot_encode(data, column):
    """Replace ``column`` of ``data`` with LabelBinarizer indicator columns.

    The fitted encoder is pickled to ``<column with spaces as underscores>.pkl``
    in the working directory.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        ``data`` without ``column`` and with the indicator columns appended.
        The new columns carry pandas' default integer labels (0, 1, ...), so
        encoding several columns in turn yields repeated labels.
    """
    encoder = LabelBinarizer()
    encoder.fit(data[column])

    # The with-block closes the file; the bare open() used before never did.
    with open(column.replace(' ', '_') + '.pkl', mode='wb') as encoder_file:
        pk.dump(encoder, encoder_file)

    # Reuse data's index so concat pairs every indicator row with its source
    # row even when data is not indexed 0..n-1.
    ohe_df = pd.DataFrame(encoder.transform(data[column]), index=data.index)
    return pd.concat([data, ohe_df], axis=1).drop([column], axis=1)

In [106]:
def process_data(path, 
                 Instance_drop, 
                 Year_of_Record_drop, 
                 Gender_drop, 
                 Age_drop, 
                 Country_drop, 
                 Size_of_City_drop, 
                 Profession_drop, 
                 University_Degree_drop, 
                 Wears_Glasses_drop, 
                 Hair_Color_drop, 
                 Body_Height_cm_drop,
                 prof_len=4,
                 prof_cutoff=80,
):
    """Read the CSV at ``path`` and prepare its columns for model training.

    Every missing value is first replaced by 0. Each ``*_drop`` flag is a
    boolean: True removes the column, False keeps it and applies the
    column-specific treatment listed below.

    * 'Year of Record'    -- the 0 placeholder becomes 2018.0.
    * 'Gender'            -- 'male' -> 1, 'female' -> -1, 'other' / 'unknown' /
                             '0' -> 0; other values are left as they are.
    * 'Country'           -- replaced by the result of ``country_to_gdp``, which
                             reads 'Year of Record'; that column must therefore
                             be kept whenever 'Country' is kept.
    * 'Profession'        -- titles shortened with ``shorten_jobs`` (mapping
                             pickled to 'jobs.pkl'), then one-hot encoded.
    * 'University Degree' -- 0 / '0' become 'No', then one-hot encoded.
    * 'Hair Color'        -- 0 / '0' become 'Unknown', then one-hot encoded.
    * 'Instance', 'Age', 'Size of City', 'Wears Glasses', 'Body Height [cm]'
                          -- kept unchanged.
    * 'Income in EUR'     -- never touched.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    prof_len : int, optional
        Characters kept per profession label (passed to ``shorten_jobs``).
    prof_cutoff : int, optional
        Occurrence threshold for professions (passed to ``shorten_jobs``).

    Returns
    -------
    pandas.DataFrame
        The processed table.
    """
    data = pd.read_csv(path)
    # From here on a missing value is represented by 0, so the np.nan keys in
    # the mappings below are purely defensive.
    data = data.fillna(value=0)

    ## 'Instance'
    if Instance_drop:
        data = data.drop(labels='Instance', axis=1)

    ## 'Year of Record'
    if Year_of_Record_drop:
        data = data.drop(labels='Year of Record', axis=1)
    else:
        data = data.replace(to_replace={'Year of Record': {
                0: 2018.0,
                np.nan: 2018.0,
        }})

    ## 'Gender'
    if Gender_drop:
        data = data.drop(labels='Gender', axis=1)
    else:
        data = data.replace(to_replace={'Gender': {
                'male': 1,
                'female': -1,
                'other': 0,
                'unknown': 0,
                '0': 0,
                np.nan: 0,
        }})

    ## 'Age'
    if Age_drop:
        data = data.drop(labels='Age', axis=1)

    ## 'Country'
    # Replace country with its GDP
    if Country_drop:
        data = data.drop(labels='Country', axis=1)
    else:
        data['Country'] = country_to_gdp(data)

    ## 'Size of City'
    if Size_of_City_drop:
        data = data.drop(labels='Size of City', axis=1)

    ## 'Profession'
    if Profession_drop:
        data = data.drop(labels='Profession', axis=1)
    else:
        # shorten labels
        new_jobs = {'Profession': 
                    shorten_jobs(
                        pd.Series.to_dict(data['Profession'].value_counts()), 
                        prof_len,
                        prof_cutoff
                    )}
        # Keep the mapping on disk; the with-block closes the file handle
        # that the bare open() call used to leave open.
        with open('jobs.pkl', mode='wb') as jobs_file:
            pk.dump(new_jobs, jobs_file)
        data = data.replace(to_replace=new_jobs)

        # one-hot encode 
        data = one_hot_encode(data, 'Profession')

    ## 'University Degree'
    if University_Degree_drop:
        data = data.drop(labels='University Degree', axis=1)
    else:
        data = data.replace(to_replace={'University Degree': {
                '0': 'No',
                0: 'No',
                np.nan: 'No',
        }})

        data = one_hot_encode(data, 'University Degree')

    ## 'Wears Glasses'
    if Wears_Glasses_drop:
        data = data.drop(labels='Wears Glasses', axis=1)

    ## 'Hair Color'
    if Hair_Color_drop:
        data = data.drop(labels='Hair Color', axis=1)
    else:
        data = data.replace(to_replace={'Hair Color': {
                np.nan: 'Unknown',
                '0': 'Unknown',
                0: 'Unknown',
        }})
        data = one_hot_encode(data, 'Hair Color')

    ## 'Body Height [cm]'
    if Body_Height_cm_drop:
        data = data.drop(labels='Body Height [cm]', axis=1)

    ## 'Income in EUR'
    # nothing to do here

    return data

def split__scale_data(data):
    """Split ``data`` into train/test parts and standardise the features.

    The target is the 'Income in EUR' column; every other column is a feature.
    20% of the rows are held out (``random_state=12120``). The scaler is fitted
    on the training rows only, so hold-out statistics no longer leak into the
    features the model is trained on, and is pickled to 'scaler.pkl'.

    Parameters
    ----------
    data : pandas.DataFrame
        Processed table containing an 'Income in EUR' column.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` -- the X entries are the scaled
        arrays returned by ``StandardScaler.transform``; the y entries are the
        matching slices of the 'Income in EUR' column.
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split

    X = data.drop(columns=['Income in EUR'])
    y = data.loc[:, 'Income in EUR']

    # Split first: the same random_state selects the same rows as before,
    # whether or not X has been scaled.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=12120
    )

    scaler = StandardScaler()
    scaler.fit(X_train)
    # The with-block closes the file handle once the scaler is written.
    with open('scaler.pkl', mode='wb') as scaler_file:
        pk.dump(scaler, scaler_file)

    return [scaler.transform(X_train), scaler.transform(X_test), y_train, y_test]

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Build the feature table. A *_drop flag set to True removes that column.
data = process_data(
    datapath,

    # columns kept as features
    Year_of_Record_drop=False,
    Gender_drop=False,
    Age_drop=False,
    Country_drop=False,
    University_Degree_drop=False,  # potentially drop
    Body_Height_cm_drop=False,

    # columns removed
    Instance_drop=True,
    Size_of_City_drop=True,
    Profession_drop=True,
    Wears_Glasses_drop=True,
    Hair_Color_drop=True,

    # only used when 'Profession' is kept
    prof_len=8,
    prof_cutoff=80,
)
X_train, X_test, y_train, y_test = split__scale_data(data)

# random_state is fixed for consistency between runs
func = RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=2645, verbose=0)
func.fit(X_train, y_train)
print('Score: ', func.score(X_test, y_test))




# Train algorithm

In [5]:
# from sklearn.linear_model import LinearRegression # N/A
# from sklearn.tree import DecisionTreeRegressor # 0.5144117272297519
# from sklearn.ensemble import RandomForestRegressor # 0.6767255620150188
# from sklearn.neighbors import KNeighborsRegressor # -0.15649302917615013
# from sklearn.neural_network import MLPRegressor # 0.009245706360739738
# from sklearn.gaussian_process import GaussianProcessRegressor
# from sklearn.svm import SVR



In [6]:
# Persist the trained model. The with-block closes the file as soon as the
# pickle is written instead of leaving the handle open.
with open('predictor.pkl', mode='wb') as predictor_file:
    pk.dump(func, predictor_file)

In [None]:
# pk.dump(func, open('predictor.pkl', mode='wb'))