In [None]:
#importing libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder, KBinsDiscretizer


In [None]:

def Get_Clean_Combined_Dataset(train_path="tcd-ml-1920-group-income-train.csv",
                               test_path="tcd-ml-1920-group-income-test.csv",
                               instance_offset=991709):
    """Load the labelled and unlabelled CSV files and stack them into one frame.

    Parameters
    ----------
    train_path : str
        CSV file holding the labelled rows.
    test_path : str
        CSV file holding the unlabelled rows.
    instance_offset : int
        Value added to every ``Instance`` of the unlabelled rows so the two
        sources can still be told apart after stacking.
        NOTE(review): the default presumably is at least the largest training
        ``Instance`` -- confirm against the data.

    Returns
    -------
    pandas.DataFrame
        Labelled rows followed by the shifted unlabelled rows, re-indexed
        0..n-1.
    """
    train_df = pd.read_csv(train_path)
    # keep=False discards *every* row whose Instance occurs more than once,
    # not only the repeated copies.
    train_df = train_df.drop_duplicates(subset="Instance", keep=False)

    test_df = pd.read_csv(test_path)
    test_df["Instance"] = test_df["Instance"] + instance_offset

    combined = pd.concat([train_df, test_df], axis=0)
    return combined.reset_index(drop=True)

In [None]:

# Load both CSV files as one stacked frame; the cells below clean and encode
# the labelled and unlabelled rows together.
data_set = Get_Clean_Combined_Dataset()

In [None]:

# Target column, kept together with 'Instance' so the unlabelled rows can be
# separated out again later.
y = data_set[["Instance", "Total Yearly Income [EUR]"]].copy()

# Feature columns, again keeping 'Instance' as the key for the later split.
X = data_set[
    [
        "Instance",
        "Year of Record",
        "Age",
        "Size of City",
        "Profession",
        "University Degree",
        "Country",
        "Gender",
    ]
].copy()

# Preview the combined frame (last expression of the cell, so it is rendered).
data_set.head()

In [None]:
# Fill missing entries of a numeric feature with that feature's mean value
def ReplaceNan_Numeric(df, F_Name):
    """Replace NaN in column ``F_Name`` with the mean of its non-missing values.

    The column is overwritten on ``df`` itself and the same frame is returned.
    """
    column = df[F_Name]
    # Series.mean() skips NaN by default, so only the present values count.
    fill_value = column.mean()
    df[F_Name] = column.replace(np.nan, fill_value)
    return df

In [None]:
# One-hot encode a categorical feature
def oneHotEncode_Feature(df, F_Name):
    """Replace column ``F_Name`` with one indicator column per distinct value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature; it is not modified.
    F_Name : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``F_Name``, followed by the indicator columns. The new
        columns are labelled 0..k-1 (default integer labels).
    """
    # Map each distinct value to an integer code first.
    integer_encoded = LabelEncoder().fit_transform(df[F_Name])

    # Use the encoder's default sparse output and densify it afterwards: the
    # keyword that requests dense output was renamed between scikit-learn
    # releases (sparse -> sparse_output), so neither spelling is portable.
    onehot_encoder = OneHotEncoder()
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded.reshape(-1, 1)).toarray()

    # Reuse df's index so the column-wise concat lines rows up positionally
    # even when df does not carry a default 0..n-1 index.
    feature_onehot = pd.DataFrame(onehot_encoded, index=df.index)
    remaining = df.drop(columns=F_Name)
    return pd.concat([remaining, feature_onehot], axis=1)

In [None]:
# Label-encode a categorical feature (one integer code per distinct value)
def LabelEncode_Feature(df, F_Name):
    """Replace column ``F_Name`` with a single column of integer codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature; it is not modified.
    F_Name : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``F_Name``, followed by one column (labelled 0) holding
        the codes produced by ``LabelEncoder``.
    """
    # NOTE(review): integer codes impose an ordering on the categories, which a
    # linear model will treat as meaningful -- confirm this is intended.
    integer_encoded = LabelEncoder().fit_transform(df[F_Name])

    # Reuse df's index so the column-wise concat lines rows up positionally
    # even when df does not carry a default 0..n-1 index.
    feature_e = pd.DataFrame(integer_encoded, index=df.index)
    remaining = df.drop(columns=F_Name)
    return pd.concat([remaining, feature_e], axis=1)

In [None]:
# Convert a numeric feature into quantile 'bins' of nearby values, represented with one-hot encoding
def KBins_Feature(df, F_Name, n_bins=8):
    """Replace numeric column ``F_Name`` with one-hot encoded quantile bins.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature; it is not modified.
    F_Name : str
        Name of the numeric column to discretise.
    n_bins : int, default 8
        Number of bins requested from ``KBinsDiscretizer``.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``F_Name``, followed by the dense one-hot bin columns
        (default integer labels starting at 0).
    """
    # The discretizer expects a 2-D array: one column, one row per sample.
    feature_array = df[F_Name].to_numpy().reshape(-1, 1)
    est = KBinsDiscretizer(n_bins=n_bins, encode='onehot-dense', strategy='quantile')
    onehot_encoded = est.fit_transform(feature_array)

    # Reuse df's index so the column-wise concat lines rows up positionally
    # even when df does not carry a default 0..n-1 index.
    onehot_feature_df = pd.DataFrame(onehot_encoded, index=df.index)
    remaining = df.drop(columns=F_Name)
    return pd.concat([remaining, onehot_feature_df], axis=1)

In [None]:

def MinMaxScale_Feature(df, F_Name):
    """Replace numeric column ``F_Name`` with its ``MinMaxScaler``-scaled values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature; it is not modified.
    F_Name : str
        Name of the numeric column to scale.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``F_Name``, followed by one column (labelled 0) holding
        the scaled values.
    """
    # The scaler expects a 2-D array: one column, one row per sample.
    feature_array = df[F_Name].to_numpy().reshape(-1, 1)
    scaled_array = MinMaxScaler().fit_transform(feature_array)

    # Reuse df's index so the column-wise concat lines rows up positionally
    # even when df does not carry a default 0..n-1 index.
    scaled_df = pd.DataFrame(scaled_array, index=df.index)
    remaining = df.drop(columns=F_Name)
    return pd.concat([remaining, scaled_df], axis=1)

In [None]:
# Fill the gaps in every numeric feature with that feature's mean
for numeric_feature in ("Year of Record", "Age", "Size of City"):
    X = ReplaceNan_Numeric(X, numeric_feature)

In [None]:
# Min-max scale the numeric features, one after the other
for scaled_feature in ("Year of Record", "Age"):
    X = MinMaxScale_Feature(X, scaled_feature)

In [None]:
# Discretise city size into one-hot encoded bins
X = KBins_Feature(df=X, F_Name="Size of City")

In [None]:
## Fill missing categorical values with a placeholder, then encode each feature.
## Rows are processed top to bottom, which fixes the order in which the encoded
## columns are appended to X. Only Country is one-hot encoded; the others are
## label encoded.
for column, placeholder, encode in (
    ("Profession", "Unknown", LabelEncode_Feature),
    ("Gender", "unknown", LabelEncode_Feature),
    ("Country", "Unknown", oneHotEncode_Feature),
    ("University Degree", "Unknown", LabelEncode_Feature),
):
    X[column] = X[column].replace(np.nan, placeholder)
    X = encode(X, column)

In [None]:
# Split the combined data back into labelled and unlabelled rows on the key 'Instance'.
# The unlabelled rows had 991709 added to their Instance when the two files were stacked.
# NOTE(review): this assumes no labelled row has an Instance above 991709 -- confirm.
df2_X = X.loc[X["Instance"] > 991709]
X = X.loc[X["Instance"] <= 991709]
y = y.loc[y["Instance"] <= 991709]

# 'Instance' was only needed as a key; drop it so it is not used as a feature.
# The axis can no longer be passed positionally to DataFrame.drop (pandas >= 2.0),
# so the column is named via the `columns` keyword.
df2_X = df2_X.drop(columns="Instance")
X = X.drop(columns="Instance")
y = y.drop(columns="Instance")

In [None]:
# Hold out 10% of the labelled rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Bayesian ridge regression with scikit-learn's default settings
regr = BayesianRidge()

# y_train is a single-column frame; flatten it to the 1-D target the estimator
# expects so fitting does not emit a DataConversionWarning.
regr.fit(X_train, y_train.to_numpy().ravel())

# Make predictions on the held-out rows
y_pred = regr.predict(X_test)

# Print results
print("Mean absolute error: %.2f"
      % mean_absolute_error(y_test, y_pred))

# r2_score is the coefficient of determination: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))

In [None]:
# Predict the target for the unlabelled rows
df2_y_pred = regr.predict(df2_X)
# Write the predictions to disk; to_csv defaults apply (row index is written too)
pd.DataFrame(data=df2_y_pred).to_csv(path_or_buf="Predictions.csv")