In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sc
from sklearn import feature_selection
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [2]:
def print_result(model, x_val, y_val):
    y_pred_val = model.predict(x_val)
    print('---Test---')
    print('MAE: ', metrics.mean_absolute_error(y_val, y_pred_val))
    print('MSE: ', metrics.mean_squared_error(y_val, y_pred_val))
    print('R2: ', metrics.r2_score(y_val, y_pred_val))


def Linear(x_train, y_train):
    model = LinearRegression()
    model.fit(x_train, y_train)
    return model

In [3]:
x_train = pd.read_csv('./data/data_cleaned_train_X.csv')
y_train = pd.read_csv('./data/data_cleaned_train_y.csv')

x_val = pd.read_csv('./data/data_cleaned_val_X.csv')
y_val = pd.read_csv('./data/data_cleaned_val_y.csv')

x_test = pd.read_csv('./data/data_cleaned_test_X.csv')
y_test = pd.read_csv('./data/data_cleaned_test_y.csv')

In [4]:
coeffs = np.load('./selected_coefs.npy')
col_set = set()

In [5]:
# Linear Regression model
linear_model = Linear(x_train, y_train)
print_result(linear_model, x_train, y_train)
print_result(linear_model, x_val, y_val)
print_result(linear_model, x_test, y_test)

---Test---
MAE:  56.75002496117151
MSE:  31974.49539844848
R2:  0.32969296452055774
---Test---
MAE:  627919062.4057397
MSE:  7.898411318659448e+20
R2:  -1.094185646360482e+16
---Test---
MAE:  319822139.67199886
MSE:  2.5576201443614946e+20
R2:  -7761525684062291.0


In [6]:
for i in range(len(coeffs)):
    if (coeffs[i]):
        col_set.add(x_train.columns[i])
        
x_train = x_train[list(col_set)]
x_val = x_val[list(col_set)]
x_test = x_test[list(col_set)]

x_concat = pd.concat([x_train, x_val], ignore_index=True)
y_concat = pd.concat([y_train, y_val], ignore_index=True)

In [7]:
linear_model_lasso = Linear(x_concat, y_concat)
print_result(linear_model_lasso, x_concat, y_concat)
print_result(linear_model_lasso, x_val, y_val)

---Test---
MAE:  62.86412266232709
MSE:  44235.3351863953
R2:  0.12269465152288905
---Test---
MAE:  70.86688141981416
MSE:  65104.974278115165
R2:  0.09808535555131559


In [5]:
x_selected_train = pd.read_csv('./data/data_selected_train_X.csv')
y_selected_train = pd.read_csv('./data/data_selected_train_y.csv')

x_selected_val = pd.read_csv('./data/data_selected_val_X.csv')
y_selected_val = pd.read_csv('./data/data_selected_val_y.csv')

x_selected_test = pd.read_csv('./data/data_selected_test_X.csv')
y_selected_test = pd.read_csv('./data/data_selected_test_y.csv')

In [6]:
linear_model_forest = Linear(x_selected_train, y_selected_train)
print_result(linear_model_forest, x_selected_train, y_selected_train)
print_result(linear_model_forest, x_selected_val, y_selected_val)

---Test---
MAE:  58.71252601909456
MSE:  37892.32024206508
R2:  0.20563284791888292
---Test---
MAE:  219641.11237827103
MSE:  5452864952403.099
R2:  -75539830.00837365
