In [8]:
import numpy as np
import pandas as pd
import pickle as pk
data = pd.read_csv('data/tcd ml 2019-20 income prediction test (without labels).csv')

# source: https://data.worldbank.org/indicator/NY.GDP.MKTP.CD
# The top rows were removed to aid in parsing
gdp = pd.read_csv('data/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_180634.csv', skiprows=[0, 1, 2, 3])

# Preview and visualise data

In [10]:
data.head(10)

Unnamed: 0,Instance,Year of Record,Gender,Age,Country,Size of City,Profession,University Degree,Wears Glasses,Hair Color,Body Height [cm],Income
0,111994,1992.0,other,21.0,Honduras,391652,senior project analyst,Master,1,Brown,153,
1,111995,1986.0,other,34.0,Kyrgyzstan,33653,greeter,Bachelor,0,Black,163,
2,111996,1994.0,unknown,53.0,Portugal,34765,liaison,Bachelor,1,Blond,153,
3,111997,1984.0,0,29.0,Uruguay,1494132,occupational therapist,No,0,Black,154,
4,111998,2007.0,other,17.0,Serbia,120661,portfolio manager,No,0,Red,191,
5,111999,2013.0,female,56.0,United Arab Emirates,2223918,sales representative,Master,0,Black,175,
6,112000,2002.0,male,49.0,Slovakia,1753551,senior resiliency planner,No,1,Black,173,
7,112001,2012.0,female,66.0,Kyrgyzstan,467678,interviewer,Bachelor,1,Black,149,
8,112002,2008.0,male,35.0,Togo,817914,quality assurance auditor,,1,Black,205,
9,112003,2003.0,male,49.0,Liberia,290455,office coordinator,No,0,Blond,136,


# Prepare data for ml algorithm

In [5]:
def clean_country(country):
    try:
        return {
            'Laos': 'Lao PDR',
            'Kyrgyzstan': 'Kyrgyz Republic',
            'Slovakia': 'Slovak Republic',
            'Congo': 'Congo, Rep.',
            'DR Congo': 'Congo, Dem. Rep.',
            # Nothing political intended here, trying to be realistic about income
            'State of Palestine': 'Jordan',
            'Syria': 'Syrian Arab Republic',
            'Gambia': 'Gambia, The',
            'North Korea': 'Korea, Dem. People’s Rep.',
            'South Korea': 'Korea, Rep.',
            'Côte d\'Ivoire': 'Cote d\'Ivoire',
            'Venezuela': 'Venezuela, RB',
            'Yemen': 'Yemen, Rep.',
            'Brunei': 'Brunei Darussalam',
            'Micronesia': 'Micronesia, Fed. Sts.',
            'Bahamas': 'Bahamas, The',
            'Saint Lucia': 'St. Lucia',
            'Czechia': 'Czech Republic',
            'Sao Tome & Principe': 'Sao Tome and Principe',
        }[country]
    except KeyError:
        return country

def get_index_equiv(year):
    return year - 1956

In [11]:
data = pd.read_csv('data/tcd ml 2019-20 income prediction test (without labels).csv')
data = data.fillna(value=0)

## 'Instance'
data = data.drop(labels='Instance', axis=1)

## 'Year of Record'
# nothing to do here

## 'Gender'
data = data.replace(to_replace={'Gender': {
        'male': 1,
        'female': -1,
        'other': 0,
        'unknown': 0,
        '0': 0,
        np.nan: 0,
}})
# data['Gender'] = data['Gender'].astype(float)

## 'Age'
# nothing to do here

## 'Country'
# Replace gdp
countries = pd.Series.to_dict(data['Country'].value_counts())
country_gdp = {'Country': {}}

for country in countries:
#     print(countries[country], '->', country)
    try:
        gdpval = np.nan
        year = 2018
        while np.isnan(gdpval):
            gdpval = gdp.loc[gdp['Country Name'] == clean_country(country)].iloc[:, get_index_equiv(year)].item()
            year -= 1
            if year < 1960:
                if country == 'North Korea':
                    gdpval = float(30000000000)
                else:
                    raise IndexError('Could not find GDP')
        country_gdp['Country'][country] = gdpval
    except ValueError:
        print('Country Error, check : clean_country() -> ', country)

data = data.replace(to_replace=country_gdp)

## 'Size of City'
## 'Profession'
jobs = pd.Series.to_dict(data['Profession'].value_counts())
new_jobs = {'Profession': {}}

for key in jobs:
    if jobs[key] > 80:
        try:
            new_key = key[:4]
        except TypeError:
            new_key = 'unkn'
    else:
        new_key = 'unkn'
    new_jobs['Profession'][key] = new_key
    
for key in new_jobs['Profession']:
    try:
        temp = new_jobs['Profession'][new_jobs['Profession'][key]]
        new_jobs['Profession'][key] = 'unkn'
    except KeyError:
        continue

# for key in new_jobs['Profession']:
#     print(key, '->', new_jobs['Profession'][key], ' * ', type(key))

new_jobs = pk.load(open('jobs.pkl', mode='rb'))
data = data.replace(to_replace=new_jobs)
# data.head()

## 'University Degree'
data = data.replace(to_replace={'University Degree': {
        '0': 'No',
        0: 'No',
}})

## 'Wears Glasses'
data = data.drop(labels='Wears Glasses', axis=1)

## 'Hair Color'
data = data.drop(labels='Hair Color', axis=1)

## 'Body Height [cm]'
## 'Income in EUR'



In [12]:
data.head(15)

Unnamed: 0,Year of Record,Gender,Age,Country,Size of City,Profession,University Degree,Body Height [cm],Income
0,1992.0,0,21.0,23803230000.0,391652,seni,Master,153,0.0
1,1986.0,0,34.0,8092837000.0,33653,unkn,Bachelor,163,0.0
2,1994.0,0,53.0,237978900000.0,34765,liai,Bachelor,153,0.0
3,1984.0,0,29.0,59596890000.0,1494132,occu,No,154,0.0
4,2007.0,0,17.0,50508370000.0,120661,port,No,191,0.0
5,2013.0,-1,56.0,414178900000.0,2223918,sale,Master,175,0.0
6,2002.0,1,49.0,106472200000.0,1753551,seni,No,173,0.0
7,2012.0,-1,66.0,8092837000.0,467678,inte,Bachelor,149,0.0
8,2008.0,1,35.0,5300214000.0,817914,qual,No,205,0.0
9,2003.0,1,49.0,3249000000.0,290455,offi,No,136,0.0


In [16]:
data = pd.get_dummies(data, columns=['Profession', 'University Degree'], sparse=True)
data = data.astype(float)

In [17]:
# data.info()
# corr_matrix = data.corr()
# corr_matrix['Income in EUR']

In [18]:
from sklearn.preprocessing import scale
X = data.drop(columns=['Income in EUR']).to_numpy()
y = data.loc[:, 'Income in EUR'].to_numpy()

Intel(R) Data Analytics Acceleration Library (Intel(R) DAAL) solvers for sklearn enabled: https://intelpython.github.io/daal4py/sklearn.html


In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12120)

# Train algorithm

In [20]:
from sklearn.linear_model import LinearRegression # N/A
from sklearn.tree import DecisionTreeRegressor # 0.5144117272297519
from sklearn.ensemble import RandomForestRegressor # 0.6767255620150188
from sklearn.neighbors import KNeighborsRegressor # -0.15649302917615013
from sklearn.neural_network import MLPRegressor # 0.009245706360739738
from sklearn.gaussian_process import GaussianProcessRegressor

func = RandomForestRegressor()#verbose=True)
func.fit(X_train, y_train)
func.score(X_test, y_test)



0.6605956887688516

In [None]:
func.predict(X[:5])

In [None]:
pk.dump(func, open('dump.pkl', mode='wb'))