In [146]:
# Import necessary libraries
import pandas as pd

In [147]:
# Load the training features, training target and test features from CSV,
# using each file's `id` column as the row index.
X_train, y_train, X_test = (
    pd.read_csv(csv_name, index_col='id')
    for csv_name in ('X_train.csv', 'y_train.csv', 'X_test.csv')
)

In [148]:
# Report the (rows, columns) shape of both feature matrices.
for caption, dims in [
    ('Shape of X_train:', X_train.shape),
    ('Shape of X_test:', X_test.shape),
]:
    print(caption, dims)

Shape of X_train: (1212, 832)
Shape of X_test: (776, 832)


# Preprocessing

## Standardization

In [149]:
from sklearn.preprocessing import RobustScaler

# Scale features using statistics that are robust to outliers.
# The scaler is fitted on the training features only and then applied
# unchanged to the test features, so no test information leaks into the fit.
scaler = RobustScaler()

# Wrap the scaled arrays back into DataFrames in a single step so each name is
# bound exactly once (re-running the cell is idempotent), and carry over the
# `id` index as well as the column labels -- without `index=` the frames would
# fall back to a 0..n-1 RangeIndex and lose the link to the sample ids.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

## Imputation of missing values

In [150]:
from sklearn.impute import SimpleImputer

# Fill missing values with the per-column median. The medians are learned from
# the (scaled) training features only and reused for the test features.
imputer = SimpleImputer(strategy='median')

# Rows come out of the imputer in the same order as X_train / X_test, so the
# original `id` index is re-attached here together with the column labels;
# omitting `index=` would silently replace the ids with a 0..n-1 RangeIndex.
# Each name is bound once, so re-running the cell gives the same result.
X_train_imp = pd.DataFrame(
    imputer.fit_transform(X_train_scaled),
    index=X_train.index,
    columns=X_train.columns,
)
X_test_imp = pd.DataFrame(
    imputer.transform(X_test_scaled),
    index=X_test.index,
    columns=X_test.columns,
)

## Outlier detection

In [156]:
from sklearn.decomposition import PCA

# Project the imputed features onto the first `n_com` principal components.
# The projection is fitted on the training features only and reused for test.
n_com = 200
# Fixed seed: PCA may pick a randomized SVD solver for this problem size, in
# which case the components would otherwise differ slightly between runs.
pca = PCA(n_components=n_com, random_state=0)

# Principal components are linear combinations of all input features, so they
# are labelled PC1..PC<n_com> rather than borrowing the names of the first
# `n_com` original columns (which would wrongly suggest a feature subset).
pc_names = [f'PC{i + 1}' for i in range(n_com)]

# Keep the row index of the input frames so rows stay traceable.
X_train_pca = pd.DataFrame(
    pca.fit_transform(X_train_imp),
    index=X_train_imp.index,
    columns=pc_names,
)
X_test_pca = pd.DataFrame(
    pca.transform(X_test_imp),
    index=X_test_imp.index,
    columns=pc_names,
)

In [166]:
from sklearn.covariance import EllipticEnvelope

# NOTE(review): outlier detection is currently disabled. The EllipticEnvelope
# fit/predict steps below are commented out, so this cell only performs the
# import above and no training rows are flagged or removed.
# TODO: either enable the steps below and act on `X_train_out`, or delete
# this cell.
#cov = EllipticEnvelope()
#cov.fit(X_train_pca)
#X_train_out = cov.predict(X_train_pca)

## Feature selection

# Regression models

## Linear Regression

In [133]:
# Linear Regression
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the PCA-reduced training features, then
# predict on those same training rows (in-sample predictions).
lin = LinearRegression().fit(X_train_pca, y_train)
y_pred = lin.predict(X_train_pca)

In [134]:
# Evaluation of Linear Regression
from sklearn.metrics import r2_score

# R^2 of whatever `y_pred` currently holds against the training target.
# `y_pred` is shared by every model cell, so run this directly after the
# Linear Regression fit cell.
score = r2_score(y_true=y_train, y_pred=y_pred)
print('Score Linear Regression:', score)

Score Linear Regression: 0.5919262649851431


## Ridge Regression

In [168]:
# Ridge Regression
from sklearn.linear_model import Ridge

# NOTE(review): this model is fitted on the full imputed feature matrix
# (`X_train_imp`), whereas the other regressors in this notebook use the
# PCA-reduced `X_train_pca` -- confirm this difference is intended.
ridge = Ridge(alpha=0.5).fit(X_train_imp, y_train)
# In-sample predictions on the same rows used for fitting.
y_pred = ridge.predict(X_train_imp)

In [159]:
# Evaluation of Ridge Regression
from sklearn.metrics import r2_score

# R^2 of whatever `y_pred` currently holds against the training target.
# NOTE(review): this cell's recorded execution count is lower than that of the
# Ridge fit cell, so the saved output may predate the current fit -- re-run
# after fitting to confirm the number.
score = r2_score(y_true=y_train, y_pred=y_pred)
print('Score Ridge Regression:', score)

Score Ridge Regression: 0.4678228725825109


## Lasso Regression

In [137]:
# Lasso Regression
from sklearn.linear_model import Lasso

# L1-regularised linear model (alpha=0.1) on the PCA-reduced training
# features; predictions are made on the same training rows.
lasso = Lasso(alpha=0.1).fit(X_train_pca, y_train)
y_pred = lasso.predict(X_train_pca)

In [138]:
# Evaluation of Lasso Regression
from sklearn.metrics import r2_score

# R^2 of whatever `y_pred` currently holds against the training target;
# run directly after the Lasso fit cell.
score = r2_score(y_true=y_train, y_pred=y_pred)
print('Score Lasso Regression:', score)

Score Lasso Regression: 0.528365437251739


## Bayesian Ridge Regression

In [139]:
# Bayesian Ridge Regression
from sklearn.linear_model import BayesianRidge

br = BayesianRidge()
# `y_train` is loaded as a DataFrame, i.e. a column vector. BayesianRidge
# expects a 1-D target and otherwise emits a conversion warning
# (`column_or_1d(y, warn=True)`), so pass the flattened values explicitly.
# `y_train` itself is left untouched for the other cells.
br.fit(X_train_pca, y_train.values.ravel())
# In-sample predictions on the same rows used for fitting.
y_pred = br.predict(X_train_pca)

  y = column_or_1d(y, warn=True)


In [140]:
# Evaluation of Bayesian Ridge Regression
from sklearn.metrics import r2_score

# R^2 of whatever `y_pred` currently holds against the training target;
# run directly after the Bayesian Ridge fit cell.
score = r2_score(y_true=y_train, y_pred=y_pred)
print('Score Bayesian Ridge Regression:', score)

Score Bayesian Ridge Regression: 0.484788449250907


## Gaussian Process

In [142]:
# Gaussian process
from sklearn.gaussian_process import GaussianProcessRegressor

# Gaussian process regressor with library-default settings, fitted on the
# PCA-reduced training features and then queried on those same rows.
gpr = GaussianProcessRegressor().fit(X_train_pca, y_train)
y_pred = gpr.predict(X_train_pca)

In [143]:
# Evaluation of Gaussian Process
from sklearn.metrics import r2_score

# R^2 of whatever `y_pred` currently holds against the training target.
# The Gaussian Process cell predicts on the rows it was fitted on, so this is
# a training-set score, not an estimate of performance on unseen data.
score = r2_score(y_true=y_train, y_pred=y_pred)
print('Score Gaussian Process:', score)

Score Gaussian Process: 1.0


# Prediction