In [None]:
#library imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
#open, combine and clean csv data
def Get_Clean_Combined_Dataset(
    train_path="tcd-ml-1920-group-income-train.csv",
    test_path="tcd-ml-1920-group-income-test.csv",
    instance_offset=991709,
):
    """Load the training and test CSVs and stack them into a single frame.

    Training rows are de-duplicated on "Instance" (first occurrence kept).
    Test rows get ``instance_offset`` added to their "Instance" so the two
    sources can be told apart after concatenation.

    Parameters
    ----------
    train_path : str
        CSV file holding the training rows.
    test_path : str
        CSV file holding the test rows.
    instance_offset : int
        Value added to every test "Instance".

    Returns
    -------
    pandas.DataFrame
        Training rows followed by test rows, with a fresh 0..n-1 index.
    """
    import warnings

    train = pd.read_csv(train_path).drop_duplicates(subset="Instance", keep="first")

    test = pd.read_csv(test_path)
    test["Instance"] = test["Instance"] + instance_offset

    # The notebook later separates the two sources with `Instance > offset`;
    # a training id above the offset would silently land in the test split.
    if train["Instance"].max() > instance_offset:
        warnings.warn(
            "Some training 'Instance' values exceed instance_offset (%s); "
            "splitting on 'Instance' will misclassify those rows." % instance_offset
        )

    combined = pd.concat([train, test], axis=0)
    return combined.reset_index(drop=True)

In [None]:
# Load the combined train + test frame. Test rows carry an offset 'Instance'
# id (applied inside Get_Clean_Combined_Dataset) so they can be split out later.
data_set = Get_Clean_Combined_Dataset()

In [None]:
# Mean income per 'Housing Situation' value, printed to eyeball any trend
means = (
    data_set
    .groupby("Housing Situation")["Total Yearly Income [EUR]"]
    .mean()
)
print(means)

In [None]:
#target encoder
#categorical data is replaced with the mean target value of its category
#Eg all 'Phd's in University Degree become 93,000
def Target_Encode(df, feature, y_col="Total Yearly Income [EUR]"):
    """Replace a categorical column with the per-category mean of ``y_col``.

    Missing values in ``feature`` are first grouped under the label
    "Unknown". Rows whose ``y_col`` is NaN do not contribute to the means.
    A category that has no usable ``y_col`` value at all would otherwise be
    encoded as NaN; it falls back to the overall mean of ``y_col`` instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``feature`` and ``y_col``. Modified in place.
    feature : str
        Name of the categorical column to encode.
    y_col : str
        Name of the numeric target column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``feature`` now numeric.
    """
    df[feature] = df[feature].fillna("Unknown")

    category_means = df.groupby(feature)[y_col].mean()
    overall_mean = df[y_col].mean()

    # Categories without any labelled row map to NaN; use the global mean so
    # downstream estimators never receive missing feature values.
    df[feature] = df[feature].map(category_means).fillna(overall_mean)

    return df

In [None]:
#Target encode all the categorical features, one after another
for categorical_feature in (
    "Profession",
    "Satisfation with employer",
    # "Country",  # left out: Country data needs more cleaning
    "Gender",
    "University Degree",
    "Housing Situation",
):
    data_set = Target_Encode(data_set, categorical_feature)

#Special case for hair colour - from observation it only matters if it is '0'
#Turn it into a 0/1 flag
data_set["Hair Color"] = np.where(
    data_set["Hair Color"] == '0',
    1,
    0,
)

#Select features ('Instance' is kept for now as the key used to split the sets)
X = data_set[
    [
        "Instance",
        "Profession",
        "Satisfation with employer",
        "University Degree",
        "Hair Color",
        "Housing Situation",
    ]
]

#Setup target array
y = data_set[["Instance", "Total Yearly Income [EUR]"]]
X.head(30)


In [None]:
#Split the data back out to separate sets on key 'Instance'
# Rows from the test file had this offset added to 'Instance' when the two
# files were combined, so every id above it belongs to the unlabelled set.
INSTANCE_OFFSET = 991709

df2_X = X.loc[X["Instance"] > INSTANCE_OFFSET]
X = X.loc[X["Instance"] <= INSTANCE_OFFSET]
#df2_y = y.loc[y["Instance"] > INSTANCE_OFFSET] #not needed
y = y.loc[y["Instance"] <= INSTANCE_OFFSET]

#Drop 'Instance' from the datasets now that it is no longer needed as a key
# (`columns=` is used because a positional axis argument to DataFrame.drop
# raises TypeError in pandas >= 2.0)
df2_X = df2_X.drop(columns="Instance")
X = X.drop(columns="Instance")
y = y.drop(columns="Instance")

In [None]:
# Split the test and training data
# NOTE(review): the target-encoded columns were computed from the targets of
# all labelled rows before this split, so the hold-out scores below may be
# optimistic - confirm whether that is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

#Setup BayesianRidge regressor
regr = BayesianRidge()

# Train the model using the training sets.
# y is a single-column frame; flatten it to 1-D, which is the shape the
# regressor expects (a column vector triggers a DataConversionWarning).
regr.fit(X_train, np.ravel(y_train))

# Make predictions using the testing set
y_pred = regr.predict(X_test)

# Print results
print("Mean absolute error: %.2f"
      % mean_absolute_error(y_test, y_pred))

# R^2 (coefficient of determination) from r2_score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))

In [None]:
# Predict incomes for the unlabelled rows with the fitted regressor
df2_y_pred = regr.predict(df2_X)
# Write the predictions out as a CSV file
pd.DataFrame(data=df2_y_pred).to_csv(path_or_buf="Predictions.csv")