In [1]:
import numpy as np
import pandas as pd
import pickle as pk
# Labelled training set for the income-prediction task.
data = pd.read_csv(
    'data/tcd ml 2019-20 income prediction training (with labels).csv',
)

# GDP table downloaded from the World Bank.
# source: https://data.worldbank.org/indicator/NY.GDP.MKTP.CD
# The first four lines of the file are skipped to aid in parsing.
gdp = pd.read_csv(
    'data/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_180634.csv',
    skiprows=list(range(4)),
)

# Prepare data for the ML algorithm

In [9]:
def clean_country(country):
    """Map a country name from the training data to the name used in the GDP table.

    Names that have no entry in the alias table below are returned unchanged.
    """
    gdp_table_names = {
        'Laos': 'Lao PDR',
        'Kyrgyzstan': 'Kyrgyz Republic',
        'Slovakia': 'Slovak Republic',
        'Congo': 'Congo, Rep.',
        'DR Congo': 'Congo, Dem. Rep.',
        # Nothing political intended here, trying to be realistic about income
        'State of Palestine': 'Jordan',
        'Syria': 'Syrian Arab Republic',
        'Gambia': 'Gambia, The',
        'North Korea': 'Korea, Dem. People’s Rep.',
        'South Korea': 'Korea, Rep.',
        'Côte d\'Ivoire': 'Cote d\'Ivoire',
        'Venezuela': 'Venezuela, RB',
        'Yemen': 'Yemen, Rep.',
        'Brunei': 'Brunei Darussalam',
        'Micronesia': 'Micronesia, Fed. Sts.',
        'Bahamas': 'Bahamas, The',
        'Saint Lucia': 'St. Lucia',
        'Czechia': 'Czech Republic',
        'Sao Tome & Principe': 'Sao Tome and Principe',
    }
    # Fall back to the input itself when there is no alias for it.
    return gdp_table_names.get(country, country)

def get_index_equiv(year):
    """Return the positional column index that corresponds to `year`.

    The result is `year` shifted by a fixed offset, so 1960 maps to 4 and
    2018 maps to 62.
    """
    # presumably the GDP table has four leading non-year columns before 1960
    # -- TODO confirm against the CSV header
    year_to_column_shift = 1956
    return year - year_to_column_shift

In [10]:
from sklearn.preprocessing import LabelBinarizer

# Reload the raw training CSV and replace every missing cell, in every column, with 0.
data = pd.read_csv(
    'data/tcd ml 2019-20 income prediction training (with labels).csv'
).fillna(value=0)

## 'Instance'
# The instance column is removed from the frame.
data = data.drop(columns=['Instance'])

## 'Year of Record'
# nothing to do here

## 'Gender'
# Encode gender as a signed indicator: male -> 1, female -> -1, all other listed values -> 0.
gender_codes = {
    'male': 1,
    'female': -1,
    'other': 0,
    'unknown': 0,
    '0': 0,
    np.nan: 0,
}
data = data.replace(to_replace={'Gender': gender_codes})

## 'Age'
# nothing to do here

## 'Country'
# Replace country with its GDP
#
# For each country present in the training data, take the most recent
# non-missing value between LAST_GDP_YEAR and FIRST_GDP_YEAR from the `gdp`
# table and substitute it for the country name.
FIRST_GDP_YEAR, LAST_GDP_YEAR = 1960, 2018
# There isn't World Bank data for North Korea, assume it's USD32 Billion
NORTH_KOREA_GDP = float(32000000000)

countries = pd.Series.to_dict(data['Country'].value_counts())
country_gdp = {'Country': {}}

for country in countries:
    # The matching row does not depend on the year, so look it up once.
    gdp_row = gdp.loc[gdp['Country Name'] == clean_country(country)]
    if len(gdp_row) != 1:
        # No match (or an ambiguous one): report it and leave the name as is.
        print('Country Error, check : clean_country() -> ', country)
        continue

    # Walk backwards through the years and stop at the first real value.
    # The out-of-range handling lives in the `else` branch so that it only
    # runs when *no* year had data; a value found in FIRST_GDP_YEAR itself
    # is therefore accepted instead of being treated as a failure.
    for year in range(LAST_GDP_YEAR, FIRST_GDP_YEAR - 1, -1):
        gdpval = gdp_row.iloc[:, get_index_equiv(year)].item()
        if not np.isnan(gdpval):
            break
    else:
        if country == 'North Korea':
            gdpval = NORTH_KOREA_GDP
        else:
            raise IndexError('Could not find GDP for ' + repr(country))
    country_gdp['Country'][country] = gdpval

data = data.replace(to_replace=country_gdp)

## 'Size of City'
# nothing to do here

## 'Profession'
# shorten labels
#
# Professions that occur more than MIN_PROFESSION_COUNT times are relabelled
# with their first four characters; all others are pooled into 'unkn'.
MIN_PROFESSION_COUNT = 80
jobs = pd.Series.to_dict(data['Profession'].value_counts())
new_jobs = {'Profession': {}}

for key, count in jobs.items():
    # Non-string keys (e.g. the 0 written by fillna) cannot be sliced, so
    # they are pooled into 'unkn' as well.
    if count > MIN_PROFESSION_COUNT and isinstance(key, str):
        new_jobs['Profession'][key] = key[:4]
    else:
        new_jobs['Profession'][key] = 'unkn'

# Any short label that is itself one of the original profession names is
# also pooled into 'unkn'.
# NOTE(review): presumably this keeps the replacement values from
# overlapping the keys handed to DataFrame.replace -- confirm.
# Only values are reassigned here, never keys, so iterating is safe.
for key, short_label in new_jobs['Profession'].items():
    if short_label in new_jobs['Profession']:
        new_jobs['Profession'][key] = 'unkn'

# Persist the mapping; the context manager closes the file handle.
with open('jobs.pkl', mode='wb') as jobs_file:
    pk.dump(new_jobs, jobs_file)
data = data.replace(to_replace=new_jobs)

# one-hot encode

jobs_encoder = LabelBinarizer()
jobs_encoder.fit(data['Profession'])
# Persist the fitted encoder; the context manager closes the file handle.
with open('jobs_encoder.pkl', mode='wb') as jobs_encoder_file:
    pk.dump(jobs_encoder, jobs_encoder_file)

transformed = jobs_encoder.transform(data['Profession'])
# Give the indicator frame the same index as `data` so the column-wise concat
# pairs rows explicitly rather than relying on both being numbered 0..n-1.
# Column labels are left as the default integers.
ohe_df = pd.DataFrame(transformed, index=data.index)
data = pd.concat([data, ohe_df], axis=1).drop(['Profession'], axis=1)

## 'University Degree'
# Treat both the string '0' and the numeric 0 as "no degree".
data = data.replace(to_replace={'University Degree': {
        '0': 'No',
        0: 'No',
}})

degree_encoder = LabelBinarizer()
degree_encoder.fit(data['University Degree'])
# Persist the fitted encoder; the context manager closes the file handle.
with open('degree_encoder.pkl', mode='wb') as degree_encoder_file:
    pk.dump(degree_encoder, degree_encoder_file)

transformed = degree_encoder.transform(data['University Degree'])
# Give the indicator frame the same index as `data` so the column-wise concat
# pairs rows explicitly rather than relying on both being numbered 0..n-1.
# Column labels are left as the default integers.
ohe_df = pd.DataFrame(transformed, index=data.index)
data = pd.concat([data, ohe_df], axis=1).drop(['University Degree'], axis=1)

## 'Wears Glasses'
## 'Hair Color'
# Both columns are removed from the frame in a single call.
data = data.drop(columns=['Wears Glasses', 'Hair Color'])

## 'Body Height [cm]'
## 'Income in EUR'

Intel(R) Data Analytics Acceleration Library (Intel(R) DAAL) solvers for sklearn enabled: https://intelpython.github.io/daal4py/sklearn.html


In [11]:
# corr_matrix = data.corr()
# corr_matrix['Country']
# data.head()
# print(country_gdp)

In [12]:
# Feature matrix: every column except the target, as a plain float ndarray.
# After the one-hot concats `data` has both string and integer column labels;
# scikit-learn >= 1.2 raises TypeError when an estimator is fitted on a
# DataFrame whose column labels mix types, so the labels are stripped here.
X = data.drop(columns=['Income in EUR']).to_numpy(dtype=float)
# Target: the income column, kept as a pandas Series.
y = data.loc[:, 'Income in EUR']

In [13]:
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fitted on all rows, before the train/test split
# in a later cell, so the held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
scaler.fit(X)
# Persist the fitted scaler; the context manager closes the file handle.
with open('scaler.pkl', mode='wb') as scaler_file:
    pk.dump(scaler, scaler_file)

# X is overwritten with its scaled version, so re-running this cell without
# rebuilding X first would fit a new scaler on already-scaled data.
X = scaler.transform(X)

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12120,
)

# Train algorithm

In [20]:
from sklearn.linear_model import LinearRegression # N/A
from sklearn.tree import DecisionTreeRegressor # 0.5144117272297519
from sklearn.ensemble import RandomForestRegressor # 0.6767255620150188
from sklearn.neighbors import KNeighborsRegressor # -0.15649302917615013
from sklearn.neural_network import MLPRegressor # 0.009245706360739738
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR

# Random forest settings; random_state is fixed for consistency between runs.
forest_settings = dict(n_estimators=50, n_jobs=-1, random_state=2645, verbose=1)
# fit() returns the fitted estimator, so construction and training are chained.
func = RandomForestRegressor(**forest_settings).fit(X_train, y_train)
# Score on the held-out split; as the last expression it is shown as the cell output.
func.score(X_test, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   32.4s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done  50 out of  50 | elapsed:    0.1s finished


0.6907547769724358

In [None]:
# Persist the trained model; the context manager closes the file handle
# instead of leaving it to the garbage collector.
with open('predictor.pkl', mode='wb') as predictor_file:
    pk.dump(func, predictor_file)

In [None]:
# pk.dump(func, open('predictor.pkl', mode='wb'))