In [48]:
import time
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:,.3f}'.format

# %matplotlib notebook
import seaborn as sns
sns.set(style="ticks", color_codes=True)
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA, NMF
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

class Data:
    #storage class for data
    def __init__(self, filename, subset_count=None):
        x = pd.read_csv(filename, index_col='id')
        if subset_count is not None: # Nice to use for testing
            permut = np.random.permutation(x.shape[0])[:subset_count]
            x = x.iloc[permut, :]
        
        self.df_y = x['loss']
        y = x['loss'].values
        del x['loss']

        convertedX = pd.get_dummies(x, drop_first=True)
        X = convertedX.values
        self.X = X
        self.y = y
        
        self.df_X = convertedX
        self.df_x_test = None
    
    def get_split(self, test_size=0.20, pca_components=None, nmf_components=None):
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y,
                                                            test_size=test_size)
        if pca_components is not None:
            pca = PCA(n_components = pca_components)
            pca.fit(X_train)
            X_train = pca.transform(X_train)
            X_test = pca.transform(X_test)
        elif nmf_components:
            nmf = NMF(n_components=nmf_components)
            X_train = nmf.fit_transform(X)
            X_test = nmf.transform(X_test)
        return  X_train, X_test, y_train, y_test
    
    def remove_columns(self, columns):
        # TODO:
        pass
    
    def read_test_data(self, filename):
        X_test = pd.read_csv(filename, index_col='id')
        X_test = pd.get_dummies(X_test, drop_first=True)

        additional_columns = set(X_test.columns) - set(self.df_X.columns)
        X_test = X_test.drop(columns=additional_columns)

        missing_columns = set(self.df_X.columns) - set(X_test.columns)
        for col in missing_columns:
            X_test[col] = 0
            
        self.df_X_test = X_test
        return X_test.values
    
def evaluate(name, estimator, X_train, X_test, y_train, y_test):
    t_0 = time.time()
    print(f'{name}:')
    estimator.fit(X_train, y_train)
    
    t_1 = time.time()
    print(f'\tTime elapsed for model construction {t_1 - t_0:.3f} sec')
    y_test_predict = estimator.predict(X_test)
    error_test = mean_absolute_error(y_test, y_test_predict)
    error_train = mean_absolute_error(y_train, estimator.predict(X_train))
    print(f'\tTime elapsed for prediction {time.time() - t_1:.3f} sec')
    print(f'\tTest error: {error_test:.3f}')
    print(f'\tTrain error: {error_train:.3f}')
    return error_test

In [55]:
# Reading in the final test data
data = Data("train.csv")
X_train, y_train = data.df_X, data.df_y
X_test = data.read_test_data("test.csv")
X_test = data.df_X_test

X_train.info()
print()
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 188318 entries, 1 to 587633
Columns: 1037 entries, cont1 to cat116_Y
dtypes: float64(14), uint8(1023)
memory usage: 205.3 MB

<class 'pandas.core.frame.DataFrame'>
Int64Index: 125546 entries, 4 to 587634
Columns: 1037 entries, cont1 to cat113_T
dtypes: float64(14), int64(74), uint8(949)
memory usage: 198.9 MB


In [37]:
data = Data("train.csv", 500)
data.remove_columns([])
X_train, X_test, y_train, y_test = data.get_split()

ESTIMATORS = {
    "LinearRegression": LinearRegression(n_jobs=-1),
#     "Ridge": Ridge(),
#     "Lasso": Lasso(),
#     "ElasticNet": ElasticNet(),
#     # non-linear
#     "SVR": SVR(),
#     "BaggingRegressor": BaggingRegressor(n_jobs=-1),
#     "ExtraTreesRegressor": ExtraTreesRegressor(n_jobs=-1),
#     "RandomForestRegressor_50": RandomForestRegressor(n_jobs=-1),
#     "RandomForestRegressor_500": RandomForestRegressor(n_jobs=-1),
#     "GradientBoostingRegressor": GradientBoostingRegressor(loss='huber'),
#     "KNeighborsRegressor": KNeighborsRegressor(n_jobs=-1),
#     "MLP":  MLPRegressor(hidden_layer_sizes=(150,)),
}

for name, estimator in ESTIMATORS.items():
    evaluate(name, estimator, X_train, X_test, y_train, y_test)

LinearRegression:
	Time elapsed for model construction 0.031 sec
	Time elapsed for prediction 0.000 sec
	Test error: 13717.217
	Train error: 0.000
Ridge:
	Time elapsed for model construction 0.016 sec
	Time elapsed for prediction 0.000 sec
	Test error: 1517.111
	Train error: 626.490
Lasso:
	Time elapsed for model construction 0.125 sec
	Time elapsed for prediction 0.000 sec
	Test error: 1741.317
	Train error: 484.812
ElasticNet:
	Time elapsed for model construction 0.000 sec
	Time elapsed for prediction 0.000 sec
	Test error: 1326.086
	Train error: 1335.727
SVR:
	Time elapsed for model construction 0.103 sec
	Time elapsed for prediction 0.114 sec
	Test error: 1493.393
	Train error: 1790.380
BaggingRegressor:
	Time elapsed for model construction 4.706 sec
	Time elapsed for prediction 3.408 sec
	Test error: 1337.031
	Train error: 561.096
ExtraTreesRegressor:
	Time elapsed for model construction 0.141 sec
	Time elapsed for prediction 0.223 sec
	Test error: 1513.540
	Train error: 0.000
Ran

