# Project 2

#### Importing libraries

In [1]:
import numpy as np
import pandas as pd
# Add any other imports you need here
from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.model_selection import KFold
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, RBF, Matern, RationalQuadratic

#### Data Loading

In [2]:
"""
This loads the training and test data, preprocesses it, removes the NaN
values and interpolates the missing data using imputation

Parameters
----------
None (reads "train.csv" and "test.csv" from the working directory)

Compute
----------
X_train: matrix of floats, training input with features
y_train: array of floats, training output with labels
X_test: matrix of floats: dim = (100, ?), test input with features
"""
# Load training data
train_df = pd.read_csv("train.csv")

print("Training data:")
print("Shape:", train_df.shape)
print(train_df.head(2))
print('\n')

# Load test data
test_df = pd.read_csv("test.csv")

print("Test data:")
print(test_df.shape)
print(test_df.head(2))

###################################################################################

TARGET_COLUMN = "price_CHF"
CATEGORICAL_COLUMN = "season"

# Numeric part of the training frame (price features plus the target). The
# categorical season column is handled separately by the one-hot encoder below.
train_numeric_df = train_df.drop(columns=[CATEGORICAL_COLUMN])

# Locate the target by name instead of a hard-coded position, so a change in
# the CSV column order cannot silently select the wrong column.
target_idx = train_numeric_df.columns.get_loc(TARGET_COLUMN)
feature_columns = [col for col in train_numeric_df.columns if col != TARGET_COLUMN]

# KNN imputation of the training data. Features and target are imputed together,
# so rows with a missing target are filled in rather than dropped.
imputer = KNNImputer(n_neighbors=5, weights="uniform")
data_train = imputer.fit_transform(train_numeric_df)
X_train = np.delete(data_train, target_idx, axis=1)
y_train = data_train[:, target_idx]

# The test set has no target column, so it needs an imputer fitted on the
# feature columns only. Fit it on the *training* features and only transform
# the test set: test rows are then imputed from training neighbours and the
# result for one test row does not depend on which other test rows are present.
# Selecting test columns by name also guarantees the same column order as X_train.
test_imputer = KNNImputer(n_neighbors=5, weights="uniform")
test_imputer.fit(train_numeric_df[feature_columns])
X_test = test_imputer.transform(test_df[feature_columns])

# One-Hot Encoding of season feature (categories are learned from the training set)
enc = preprocessing.OneHotEncoder(sparse_output=False)
seasons_train = train_df[CATEGORICAL_COLUMN].to_numpy()
seasons_test = test_df[CATEGORICAL_COLUMN].to_numpy()
enc.fit(seasons_train.reshape(-1, 1))

# Final layout of both matrices: [one-hot season columns | imputed price features]
X_train = np.hstack((enc.transform(seasons_train.reshape(-1, 1)), X_train))
X_test = np.hstack((enc.transform(seasons_test.reshape(-1, 1)), X_test))

###################################################################################

assert (X_train.shape[1] == X_test.shape[1]) and (X_train.shape[0] == y_train.shape[0]) and (X_test.shape[0] == 100), "Invalid data shape"

Training data:
Shape: (900, 11)
   season  price_AUS  price_CHF  price_CZE  price_GER  price_ESP  price_FRA  \
0  spring        NaN   9.644028  -1.686248  -1.748076  -3.666005        NaN   
1  summer        NaN   7.246061  -2.132377  -2.054363  -3.295697  -4.104759   

   price_UK  price_ITA  price_POL  price_SVK  
0 -1.822720  -3.931031        NaN  -3.238197  
1 -1.826021        NaN        NaN  -3.212894  


Test data:
(100, 10)
   season  price_AUS  price_CZE  price_GER  price_ESP  price_FRA  price_UK  \
0  spring        NaN   0.472985   0.707957        NaN  -1.136441 -0.596703   
1  summer  -1.184837   0.358019        NaN  -3.199028  -1.069695       NaN   

   price_ITA  price_POL  price_SVK  
0        NaN   3.298693   1.921886  
1  -1.420091   3.238307        NaN  


#### Modeling and Prediction

In [3]:
"""
This defines the model, fits training data and then does the prediction
with the test data 

Parameters
----------
X_train: matrix of floats, training input (one-hot encoded season columns followed by the imputed price features)
y_train: array of floats, training output
X_test: matrix of floats: dim = (100, ?), test input with the same feature columns as X_train

Compute
----------
y_pred: array of floats: dim = (100,), predictions on test set
"""

y_pred = np.zeros(X_test.shape[0])

###################################################################################

# Choose kernel using k-fold CV: every candidate kernel is fitted on each
# training fold and scored (R^2) on the matching validation fold.
kernels = [DotProduct(), RBF(), Matern(), RationalQuadratic()]
n_folds = 5
R2_mat = np.zeros((n_folds, len(kernels)))  # rows: folds, columns: kernels
kf = KFold(n_splits=n_folds)
for j, kernel in enumerate(kernels):
    for i, (train, test) in enumerate(kf.split(X_train)):
        x_cv, x_valid, y_cv, y_valid = X_train[train], X_train[test], y_train[train], y_train[test]
        # The regressor optimises a copy of `kernel` during fit (see the
        # scikit-learn docs for `kernel_`), so the prototype objects in
        # `kernels` can be reused across folds and for the final model.
        gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=3, random_state=42)
        gpr.fit(x_cv, y_cv)
        R2_mat[i, j] = gpr.score(x_valid, y_valid)
avg_R2 = np.mean(R2_mat, axis=0)
print(avg_R2)

# Pick the kernel with the highest mean validation R^2 programmatically, so the
# choice stays in sync with the CV results if the data or kernel list changes.
best_kernel = kernels[int(np.argmax(avg_R2))]
print("Selected kernel:", best_kernel)

# Refit the selected kernel on the full training set and predict the test set.
gpr = GaussianProcessRegressor(kernel=best_kernel, n_restarts_optimizer=3, random_state=42)
gpr.fit(X_train, y_train)
# NOTE: this is the score on the training data itself, not an estimate of
# generalisation performance; use avg_R2 above for that.
print("R^2:", gpr.score(X_train, y_train))
y_pred = gpr.predict(X_test)

###################################################################################

assert y_pred.shape == (100,), "Invalid data shape"

[0.86986092 0.91561853 0.96141301 0.96532645]
R^2: 1.0


#### Saving Results

In [4]:
# Wrap the predictions in a single-column frame named after the target
# and write it to disk without the index column.
dt = pd.DataFrame(y_pred, columns=['price_CHF'])
dt.to_csv('results.csv', index=False)
print("\nResults file successfully generated!")


Results file successfully generated!
