In [1]:
import numpy as np
import pandas as pd
import pickle as pk
# Unlabelled test set for which incomes are to be predicted.
data = pd.read_csv('data/tcd ml 2019-20 income prediction test (without labels).csv')

# World Bank GDP table, source: https://data.worldbank.org/indicator/NY.GDP.MKTP.CD
# The first four lines of the file are skipped so that parsing starts at the
# line that follows them.
gdp = pd.read_csv(
    'data/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_180634.csv',
    skiprows=4,
)

# Prepare the data for the ML algorithm

In [17]:
def clean_country(country):
    """Translate a country name from the income data into the GDP table's spelling.

    Args:
        country: Country label as it appears in the income data set.

    Returns:
        The replacement name when one is listed below; otherwise ``country``
        itself, unchanged.
    """
    gdp_table_names = {
        'Laos': 'Lao PDR',
        'Kyrgyzstan': 'Kyrgyz Republic',
        'Slovakia': 'Slovak Republic',
        'Congo': 'Congo, Rep.',
        'DR Congo': 'Congo, Dem. Rep.',
        # No political statement intended: Jordan is used as a realistic
        # stand-in for income purposes.
        'State of Palestine': 'Jordan',
        'Syria': 'Syrian Arab Republic',
        'Gambia': 'Gambia, The',
        'North Korea': 'Korea, Dem. People’s Rep.',
        'South Korea': 'Korea, Rep.',
        'Côte d\'Ivoire': 'Cote d\'Ivoire',
        'Venezuela': 'Venezuela, RB',
        'Yemen': 'Yemen, Rep.',
        'Brunei': 'Brunei Darussalam',
        'Micronesia': 'Micronesia, Fed. Sts.',
        'Bahamas': 'Bahamas, The',
        'Saint Lucia': 'St. Lucia',
        'Czechia': 'Czech Republic',
        'Sao Tome & Principe': 'Sao Tome and Principe',
    }
    # Fall back to the input itself when no alias is registered.
    return gdp_table_names.get(country, country)

def get_index_equiv(year, first_year=1960, first_year_column=4):
    """Convert a calendar year into a positional column index of the GDP table.

    With the default arguments the result is ``year - 1956``, i.e. the fixed
    offset this function has always used.
    NOTE(review): the defaults assume the column for 1960 sits at position 4
    of the GDP table -- confirm against the CSV header.

    Args:
        year: Calendar year to locate.
        first_year: Year held by the first yearly column.
        first_year_column: Positional index of that first yearly column.

    Returns:
        The positional index of the column for ``year``.
    """
    return year - first_year + first_year_column

In [18]:
from sklearn.preprocessing import LabelBinarizer
# Read both tables again so that this cell always starts from the raw files.
data = pd.read_csv('data/tcd ml 2019-20 income prediction test (without labels).csv')
gdp = pd.read_csv(
    'data/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_180634.csv',
    skiprows=4,
)

# Every missing value, in every column, becomes 0.
data = data.fillna(0)

## 'Instance'
# Set the row identifiers aside and remove them from the feature table.
instances = data['Instance']
data = data.drop(columns='Instance')

## 'Year of Record'
# nothing to do here

## 'Gender'
# Encode gender numerically: male -> 1, female -> -1, everything else listed -> 0.
gender_codes = {'male': 1, 'female': -1}
for neutral_label in ('other', 'unknown', '0', np.nan):
    gender_codes[neutral_label] = 0

data = data.replace(to_replace={'Gender': gender_codes})

## 'Age'
# nothing to do here

## 'Country'
# Replace each country name with the most recent GDP value available for it.
# Years are scanned from LATEST_GDP_YEAR down to EARLIEST_GDP_YEAR, inclusive,
# and the first non-NaN value wins.
LATEST_GDP_YEAR = 2018
EARLIEST_GDP_YEAR = 1960
# There is no World Bank figure for North Korea; assume USD 32 billion.
NORTH_KOREA_GDP = float(32000000000)

countries = data['Country'].value_counts().to_dict()
country_gdp = {'Country': {}}

for country in countries:
    # The matching row does not depend on the year, so look it up once.
    gdp_rows = gdp.loc[gdp['Country Name'] == clean_country(country)]
    gdpval = np.nan
    try:
        for year in range(LATEST_GDP_YEAR, EARLIEST_GDP_YEAR - 1, -1):
            # .item() raises ValueError unless exactly one row matched.
            candidate = gdp_rows.iloc[:, get_index_equiv(year)].item()
            if not np.isnan(candidate):
                gdpval = candidate
                break
    except ValueError:
        # The name has no unique match in the GDP table; the country is left
        # as-is in the data so the problem stays visible.
        print('Country Error, check : clean_country() -> ', country)
        continue

    # Only fall back (or fail) when every scanned year was empty. A value
    # found in the earliest year is a valid result and must be kept.
    if np.isnan(gdpval):
        if country == 'North Korea':
            gdpval = NORTH_KOREA_GDP
        else:
            raise IndexError('Could not find GDP')
    country_gdp['Country'][country] = gdpval

data = data.replace(to_replace=country_gdp)

## 'Size of City'
# nothing to do here

## 'Profession'
# Shorten the profession labels with the mapping stored in jobs.pkl.
# The mapping is loaded rather than recomputed from this data set, so the
# labels are exactly the ones saved in the pickle.
# NOTE(review): presumably the pickle was written by the training notebook --
# confirm.
# pickle.load can execute arbitrary code: only load files you created yourself.
with open('jobs.pkl', mode='rb') as jobs_file:
    new_jobs = pk.load(jobs_file)
data = data.replace(to_replace=new_jobs)

# one-hot encode with the saved encoder
with open('jobs_encoder.pkl', mode='rb') as encoder_file:
    jobs_encoder = pk.load(encoder_file)

transformed = jobs_encoder.transform(data['Profession'])
ohe_df = pd.DataFrame(transformed)
# Append the encoded columns and drop the original text column.
data = pd.concat([data, ohe_df], axis=1).drop(['Profession'], axis=1)

## 'University Degree'
# Both the string '0' and the number 0 are treated as 'No'.
data = data.replace(to_replace={'University Degree': {
        '0': 'No',
        0: 'No',
}})

# Use the encoder stored in degree_encoder.pkl. No encoder is fitted on this
# data: a freshly fitted one would be discarded as soon as the pickle is loaded.
# pickle.load can execute arbitrary code: only load files you created yourself.
with open('degree_encoder.pkl', mode='rb') as encoder_file:
    degree_encoder = pk.load(encoder_file)

transformed = degree_encoder.transform(data['University Degree'])
ohe_df = pd.DataFrame(transformed)
# Append the encoded columns and drop the original text column.
data = pd.concat([data, ohe_df], axis=1).drop(['University Degree'], axis=1)

## 'Wears Glasses' and 'Hair Color'
# Both columns are removed from the feature table.
data = data.drop(columns=['Wears Glasses', 'Hair Color'])

## 'Body Height [cm]'
## 'Income in EUR'



In [19]:
from sklearn.preprocessing import scale
# Feature matrix: every remaining column except the 'Income' target column.
X = data.drop(labels='Income', axis=1)

# Load the trained model

In [20]:
# Load the pickled predictor; the file is closed as soon as it has been read.
# pickle.load can execute arbitrary code: only load files you created yourself.
with open('predictor.pkl', mode='rb') as predictor_file:
    func = pk.load(predictor_file)
# func.score(X, y)

In [32]:
# Predict an income for every row and write "Instance,Income" pairs to output.csv.
ans = func.predict(X)
ins = instances.to_numpy()
# One prediction per instance id is required before pairing them up.
assert len(ans) == len(ins)
output = [[instance_id, income] for instance_id, income in zip(ins, ans)]
np.savetxt(
    'output.csv',
    output,
    fmt='%d,%.2f',
    delimiter=',',
    header='Instance,Income',
    comments='',
)

In [None]:
# Print the installed pandas version.
print(pd.__version__)