In [161]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [162]:
# Load dataset: read the three CSV inputs, each indexed by its 'id' column
X, y, X_test_sub = (
    pd.read_csv(csv_name, index_col='id')
    for csv_name in ('X_train.csv', 'y_train.csv', 'X_test.csv')
)

In [163]:
# Report the dimensions of the loaded training and submission feature frames
for caption, frame in (('Shape of training set:', X), ('Shape of test set:', X_test_sub)):
    print(caption, frame.shape)

Shape of training set: (1212, 832)
Shape of test set: (776, 832)


# Preprocessing

### Imputation of missing values

In [164]:
# Simple imputer: fill missing values with the per-feature median learned on
# the training features, then apply the same medians to the submission set.
# Alternative left disabled in the original notebook:
#   IterativeImputer(initial_strategy='median', n_nearest_features=100)
#   (needs `from sklearn.experimental import enable_iterative_imputer`).
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
X_train_imp = imputer.fit_transform(X)
X_test_imp = imputer.transform(X_test_sub)

# With default settings SimpleImputer discards features whose fitted statistic
# is NaN (columns with no observed value in the training data), so label the
# output with the columns that were actually kept instead of assuming all of
# X.columns survived.
kept_features = ~np.isnan(imputer.statistics_)

# Re-wrap the arrays as DataFrames, keeping the original 'id' row index so the
# rows stay traceable to y / the submission ids.
X_train_imp = pd.DataFrame(X_train_imp, index=X.index, columns=X.columns[kept_features])
X_test_imp = pd.DataFrame(X_test_imp, index=X_test_sub.index, columns=X_test_sub.columns[kept_features])

### Standardization

In [165]:
# Scaler robust to outliers: fit on the imputed training features and apply
# the same scaling to the imputed submission features.
# (StandardScaler is a drop-in alternative that was left disabled in the
# original notebook.)
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_imp)
X_test_scaled = scaler.transform(X_test_imp)

# Re-wrap the arrays as DataFrames. Row and column labels are taken from the
# frames that were actually scaled (not from the raw X), so they stay correct
# whatever the imputation step did to the index or the column set.
X_train_scaled = pd.DataFrame(X_train_scaled, index=X_train_imp.index, columns=X_train_imp.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, index=X_test_imp.index, columns=X_test_imp.columns)

### Feature selection

In [212]:
# Select k best: keep the 50 features with the highest univariate
# f_regression score against the target.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
select = SelectKBest(score_func=f_regression, k=50)
# f_regression expects a 1-D target, hence the ravel of the single-column y
X_train_select = select.fit_transform(X_train_scaled, np.array(y).ravel())
X_test_select = select.transform(X_test_scaled)

# Keep the names of the selected features and the row index instead of
# falling back to anonymous 0..k-1 labels, so the chosen features can be
# inspected later. Both frames get identical column labels.
selected_columns = X_train_scaled.columns[select.get_support()]
X_train_select = pd.DataFrame(X_train_select, index=X_train_scaled.index, columns=selected_columns)
X_test_select = pd.DataFrame(X_test_select, index=X_test_scaled.index, columns=selected_columns)

# Regression models

In [251]:
# Train-Test-Split: hold out 20% of the labelled rows for model evaluation
from sklearn.model_selection import train_test_split

# A fixed random_state makes the split, and therefore every score computed on
# the hold-out part, reproducible across kernel restarts.
# NOTE(review): the imputer, scaler and feature selector were fit on all
# labelled rows before this split, so hold-out scores are likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_select, y, test_size=0.2, random_state=42
)

In [252]:
# Report the dimensions of the two parts produced by the train/test split
for caption, frame in (('Shape of training set:', X_train), ('Shape of test set:', X_test)):
    print(caption, frame.shape)

Shape of training set: (969, 50)
Shape of test set: (243, 50)


### Ridge Regression

In [253]:
# Ridge regression with 10-fold cross-validation, fitted on the training part
# of the split and used to predict the held-out part
from sklearn.linear_model import RidgeCV
rr = RidgeCV(cv=10)
rr.fit(X_train, y_train)
y_pred = rr.predict(X_test)

In [254]:
# Evaluation of Ridge Regression: R^2 between the held-out targets and the
# predictions currently stored in y_pred
from sklearn.metrics import r2_score
score = r2_score(y_true=y_test, y_pred=y_pred)

print('Score Ridge Regression:', score)

Score Ridge Regression: 0.3378107626542979


### Gaussian process

In [255]:
# Gaussian process regression with a RationalQuadratic kernel, fitted on the
# training part of the split and used to predict the held-out part
from sklearn.gaussian_process import GaussianProcessRegressor
# Import only the kernel that is used; a wildcard import hides where names
# come from and can silently shadow notebook variables.
from sklearn.gaussian_process.kernels import RationalQuadratic

kernel = RationalQuadratic()
gpr = GaussianProcessRegressor(
    kernel=kernel,
    n_restarts_optimizer=5,  # extra optimizer runs from random starting points
    normalize_y=True,
    random_state=0,  # seeds those random restarts so the fit is reproducible
)
gpr.fit(X_train, y_train)
y_pred = gpr.predict(X_test)

In [230]:
# Evaluation of Gaussian Process: R^2 between the held-out targets and the
# predictions currently stored in y_pred
from sklearn.metrics import r2_score
score = r2_score(y_true=y_test, y_pred=y_pred)

print('Score Gaussian Process:', score)

Score Gaussian Process: 0.6032690681855764


# Submission

In [207]:
# Prediction on submission test set
# The model was fitted on a single-column DataFrame target, so depending on
# the scikit-learn version predict() may return a column vector of shape
# (n, 1). Flatten it: pandas requires 1-D data for each DataFrame column.
# ravel is a no-op when the prediction is already 1-D.
submission = np.ravel(gpr.predict(X_test_select))
# Pair every prediction with the 'id' of the corresponding submission row
df_submission = pd.DataFrame({'id': X_test_sub.index, 'y': submission})

In [208]:
# Save into csv file, leaving out the DataFrame's own row index
df_submission.to_csv(path_or_buf='submission.csv', index=False)