In [567]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [568]:
# Load dataset
X = pd.read_csv('X_train.csv', index_col='id')
y = pd.read_csv('y_train.csv', index_col='id')
X_test_sub = pd.read_csv('X_test.csv', index_col='id')

In [569]:
print('Shape of training set:', X.shape)
print('Shape of test set:', X_test_sub.shape)

Shape of training set: (1212, 832)
Shape of test set: (776, 832)


# Preprocessing

### Imputation of missing values

In [570]:
# Simple imputer
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
X_train_imp = imputer.fit_transform(X)
X_test_imp = imputer.transform(X_test_sub)

# Iterative imputer
#from sklearn.experimental import enable_iterative_imputer
#from sklearn.impute import IterativeImputer
#imputer = IterativeImputer(initial_strategy='median')
#X_train_imp = imputer.fit_transform(X_train)
#X_test_imp = imputer.transform(X_test)

X_train_imp = pd.DataFrame(X_train_imp, columns=X.columns)
X_test_imp = pd.DataFrame(X_test_imp, columns=X_test_sub.columns)

### Standardization

In [571]:
# Standard scaler
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#X_train_scaled = scaler.fit_transform(X_train_imp)
#X_test_scaled = scaler.transform(X_test_imp)

# Scaler robust to outliers
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_imp)
X_test_scaled = scaler.transform(X_test_imp)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test_sub.columns)

### Feature selection

In [572]:
# PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=100)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

X_train_pca = pd.DataFrame(X_train_pca)
X_test_pca = pd.DataFrame(X_test_pca)

# Regression models

In [573]:
# Train-Test-Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_train_pca, y, test_size=0.2) 

In [574]:
print('Shape of training set:', X_train.shape)
print('Shape of test set:', X_test.shape)

Shape of training set: (969, 100)
Shape of test set: (243, 100)


### Linear regression

In [575]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [576]:
# Evaluation of Linear Regression
from sklearn.metrics import r2_score
score = r2_score(y_test, y_pred)

print('Score Linear Regression:', score)

Score Linear Regression: 0.2620866557934147


### Ridge Regression

In [577]:
# Ridge regression
from sklearn.linear_model import RidgeCV
rr = RidgeCV(cv=10).fit(X_train, y_train)
y_pred = rr.predict(X_test)

In [578]:
# Evaluation of Ridge Regression
from sklearn.metrics import r2_score
score = r2_score(y_test, y_pred)

print('Score Ridge Regression:', score)

Score Ridge Regression: 0.26279574347516144


### Gaussian process

In [579]:
# Gaussian process
from sklearn.gaussian_process import GaussianProcessRegressor

gpr = GaussianProcessRegressor()
gpr.fit(X_train, y_train)
y_pred = gpr.predict(X_test)

In [580]:
# Evaluation of Gaussian Process
from sklearn.metrics import r2_score
score = r2_score(y_test, y_pred)

print('Score Gaussian Process:', score)

Score Gaussian Process: -49.67533264318197


# Submission