# Lab2 - K-fold cross validation 

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import max_error, mean_absolute_error, mean_squared_error, r2_score

# Set to True when running on Google Colab (the dataset is then read from a
# mounted Google Drive); set to False to read the CSV from the working directory.
colab_environment = True

## Introduction

**Goal** : Decide which features to use in the regression model

### Processing flow
1. Read raw data
2. Data preprocess
3. Build model and validation
4. Evaluation


## Read raw data
1. read data
1. basic observation

In [None]:
# Resolve the dataset location: a local file by default, Google Drive on Colab
if not colab_environment:
    csv_path = 'CarPrice_Assignment.csv'
else:
    # Colab only: mount Google Drive so the CSV stored there can be read
    from google.colab import drive
    drive.mount('/content/drive')
    csv_path = '/content/drive/MyDrive/ML_workshop/3_K-fold/CarPrice_Assignment.csv'

In [None]:
# Load the car price CSV located at csv_path into a DataFrame
df_raw_data = pd.read_csv(csv_path)

In [None]:
# list the column labels of the raw dataset
df_raw_data.columns

## Data preprocess
### Select the features used for prediction and the prediction target (ground truth)

1. features combination 1
   - **symboling** : Its assigned insurance risk rating, A value of +3 indicates that the auto is risky, -3 that it is probably pretty safe.(Categorical)
   - **fueltype** : Car fuel type i.e gas or diesel (Categorical)
   - **fuelsystem** : Fuel system of car (Categorical)		
   - **horsepower** : Horsepower (Numeric)
   - **price** : Price of car (Numeric)  
&nbsp;
2. features combination 2
   - **symboling** : Its assigned insurance risk rating, A value of +3 indicates that the auto is risky, -3 that it is probably pretty safe.(Categorical)
   - **CarName** : Name of car company (Categorical)		
   - **fueltype** : Car fuel type i.e gas or diesel (Categorical)
   - **fuelsystem** : Fuel system of car (Categorical)		
   - **horsepower** : Horsepower (Numeric)
   - **price** : Price of car (Numeric)  
&nbsp;
3. features combination 3
   - **symboling** : Its assigned insurance risk rating, A value of +3 indicates that the auto is risky, -3 that it is probably pretty safe.(Categorical)
   - **fueltype** : Car fuel type i.e gas or diesel (Categorical)
   - **horsepower** : Horsepower (Numeric)
   - **price** : Price of car (Numeric)  

In [None]:
# Feature combination 1: keep only its predictor columns plus the target (price)
df_selected_data_1 = df_raw_data[
    ['symboling', 'fueltype', 'fuelsystem', 'horsepower', 'price']
]
print(f'There are {len(df_selected_data_1)} pieces of data.\n')

df_selected_data_1.head()

In [None]:
# Feature combination 2: combination 1 plus the CarName column
df_selected_data_2 = df_raw_data[
    ['symboling', 'CarName', 'fueltype', 'fuelsystem', 'horsepower', 'price']
]
print(f'There are {len(df_selected_data_2)} pieces of data.\n')

df_selected_data_2.head()

In [None]:
# Feature combination 3: combination 1 without the fuelsystem column
df_selected_data_3 = df_raw_data[
    ['symboling', 'fueltype', 'horsepower', 'price']
]
print(f'There are {len(df_selected_data_3)} pieces of data.\n')

df_selected_data_3.head()

### process missing data
1. check nan value
2. fill a cell manually
3. fillna by column
4. dropna

In [None]:
# count the missing values in each column of feature combination 1
df_selected_data_1.isna().sum()

In [None]:
# count the missing values in each column of feature combination 2
df_selected_data_2.isna().sum()

In [None]:
# count the missing values in each column of feature combination 3
df_selected_data_3.isna().sum()

### Other preprocess

In [None]:
# Distinct values of each categorical column, taken from the index of
# value_counts(); the preprocess functions below use them to encode the columns.
CarName_values = df_raw_data['CarName'].value_counts().index
fueltype_values = df_raw_data['fueltype'].value_counts().index
fuelsystem_values = df_raw_data['fuelsystem'].value_counts().index

In [None]:
def preprocess_1(df_selected_data):
    """Encode feature combination 1 as a 2-D array.

    Columns, in order: symboling, a 0/1 flag that is 1 where fueltype equals
    ``fueltype_values[0]``, one indicator column per entry of
    ``fuelsystem_values``, horsepower, and price (last column).

    Relies on the module-level ``fueltype_values`` and ``fuelsystem_values``.

    Args:
        df_selected_data: DataFrame with the columns 'symboling', 'fueltype',
            'fuelsystem', 'horsepower' and 'price'.

    Returns:
        Array with one row per record and one column per encoded feature.
    """
    def column(name):
        return df_selected_data[name].to_numpy()

    features = [column('symboling')]

    # fueltype: a single 0/1 column marking the first listed fuel type
    features.append(1 * (column('fueltype') == fueltype_values[0]))

    # fuelsystem: one indicator column per known fuel system
    fuel_system = column('fuelsystem')
    features.extend(fuel_system == category for category in fuelsystem_values)

    # numeric predictor, then the target as the last column
    features += [column('horsepower'), column('price')]

    return np.stack(features, axis=1)

In [None]:
def preprocess_2(df_selected_data):
    """Encode feature combination 2 as a 2-D array.

    Columns, in order: symboling, one indicator column per entry of
    ``CarName_values``, a 0/1 flag that is 1 where fueltype equals
    ``fueltype_values[0]``, one indicator column per entry of
    ``fuelsystem_values``, horsepower, and price (last column).

    Relies on the module-level ``CarName_values``, ``fueltype_values`` and
    ``fuelsystem_values``.

    Args:
        df_selected_data: DataFrame with the columns 'symboling', 'CarName',
            'fueltype', 'fuelsystem', 'horsepower' and 'price'.

    Returns:
        Array with one row per record and one column per encoded feature.
    """
    def column(name):
        return df_selected_data[name].to_numpy()

    features = [column('symboling')]

    # CarName: one indicator column per known car name
    car_name = column('CarName')
    features.extend(car_name == category for category in CarName_values)

    # fueltype: a single 0/1 column marking the first listed fuel type
    features.append(1 * (column('fueltype') == fueltype_values[0]))

    # fuelsystem: one indicator column per known fuel system
    fuel_system = column('fuelsystem')
    features.extend(fuel_system == category for category in fuelsystem_values)

    # numeric predictor, then the target as the last column
    features += [column('horsepower'), column('price')]

    return np.stack(features, axis=1)

In [None]:
def preprocess_3(df_selected_data):
    """Encode feature combination 3 as a 2-D array.

    Columns, in order: symboling, a 0/1 flag that is 1 where fueltype equals
    ``fueltype_values[0]``, horsepower, and price (last column).

    Relies on the module-level ``fueltype_values``.

    Args:
        df_selected_data: DataFrame with the columns 'symboling', 'fueltype',
            'horsepower' and 'price'.

    Returns:
        Array with one row per record and one column per encoded feature.
    """
    def column(name):
        return df_selected_data[name].to_numpy()

    features = [
        column('symboling'),
        # fueltype: a single 0/1 column marking the first listed fuel type
        1 * (column('fueltype') == fueltype_values[0]),
        column('horsepower'),
        # target goes last so callers can split it off with [:, -1]
        column('price'),
    ]

    return np.stack(features, axis=1)

In [None]:
# Encode each feature combination with its matching preprocess function
preprocessed_data_1 = preprocess_1(df_selected_data=df_selected_data_1)
preprocessed_data_2 = preprocess_2(df_selected_data=df_selected_data_2)
preprocessed_data_3 = preprocess_3(df_selected_data=df_selected_data_3)

## Build model and validation

1. build model and validation - feature combination 1
2. build model and validation - feature combination 2
3. build model and validation - feature combination 3
4. comparing performance - use mean squared error as performance index

### build model and validation - feature combination 1

In [None]:
# Separate the predictors (every column but the last) from the target (last column)
X_1 = preprocessed_data_1[:, :-1]
Y_1 = preprocessed_data_1[:, -1]

# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable
X_non_test_1, X_test_1, Y_non_test_1, Y_test_1 = train_test_split(
    X_1, Y_1, test_size=0.2, random_state=42
)

In [None]:
# 5-fold cross-validation on the non-test split of combination 1;
# validation_1 collects one mean squared error per fold
kf_1 = KFold(n_splits=5, shuffle=True, random_state=42)
validation_1 = []

for train_index_1, validation_index_1 in kf_1.split(X_non_test_1):
    # rows used to fit the model in this fold
    X_train_1 = X_non_test_1[train_index_1]
    Y_train_1 = Y_non_test_1[train_index_1]
    # rows held back to score the model in this fold
    X_validation_1 = X_non_test_1[validation_index_1]
    Y_validation_1 = Y_non_test_1[validation_index_1]

    # fit a fresh model so nothing carries over between folds
    model_1 = LinearRegression()
    model_1.fit(X_train_1, Y_train_1)

    # score on the held-back rows
    Y_pred_1 = model_1.predict(X_validation_1)
    validation_1.append(mean_squared_error(Y_validation_1, Y_pred_1))

### build model and validation - feature combination 2

In [None]:
# Separate the predictors (every column but the last) from the target (last column)
X_2 = preprocessed_data_2[:, :-1]
Y_2 = preprocessed_data_2[:, -1]

# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable
X_non_test_2, X_test_2, Y_non_test_2, Y_test_2 = train_test_split(
    X_2, Y_2, test_size=0.2, random_state=42
)

In [None]:
# 5-fold cross-validation on the non-test split of combination 2;
# validation_2 collects one mean squared error per fold
kf_2 = KFold(n_splits=5, shuffle=True, random_state=42)
validation_2 = []

for train_index_2, validation_index_2 in kf_2.split(X_non_test_2):
    # rows used to fit the model in this fold
    X_train_2 = X_non_test_2[train_index_2]
    Y_train_2 = Y_non_test_2[train_index_2]
    # rows held back to score the model in this fold
    X_validation_2 = X_non_test_2[validation_index_2]
    Y_validation_2 = Y_non_test_2[validation_index_2]

    # fit a fresh model so nothing carries over between folds
    model_2 = LinearRegression()
    model_2.fit(X_train_2, Y_train_2)

    # score on the held-back rows
    Y_pred_2 = model_2.predict(X_validation_2)
    validation_2.append(mean_squared_error(Y_validation_2, Y_pred_2))

### build model and validation - feature combination 3

In [None]:
# Separate the predictors (every column but the last) from the target (last column)
X_3 = preprocessed_data_3[:, :-1]
Y_3 = preprocessed_data_3[:, -1]

# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable
X_non_test_3, X_test_3, Y_non_test_3, Y_test_3 = train_test_split(
    X_3, Y_3, test_size=0.2, random_state=42
)

In [None]:
# 5-fold cross-validation on the non-test split of combination 3;
# validation_3 collects one mean squared error per fold
kf_3 = KFold(n_splits=5, shuffle=True, random_state=42)
validation_3 = []

for train_index_3, validation_index_3 in kf_3.split(X_non_test_3):
    # rows used to fit the model in this fold
    X_train_3 = X_non_test_3[train_index_3]
    Y_train_3 = Y_non_test_3[train_index_3]
    # rows held back to score the model in this fold
    X_validation_3 = X_non_test_3[validation_index_3]
    Y_validation_3 = Y_non_test_3[validation_index_3]

    # fit a fresh model so nothing carries over between folds
    model_3 = LinearRegression()
    model_3.fit(X_train_3, Y_train_3)

    # score on the held-back rows
    Y_pred_3 = model_3.predict(X_validation_3)
    validation_3.append(mean_squared_error(Y_validation_3, Y_pred_3))

### comparing performance

In [None]:
# Mean validation MSE over the folds for each feature combination (lower is better).
# All three lines use the same fixed-point format so the values can be compared directly.
print(f'Average performance of combination 1 : {sum(validation_1) / len(validation_1):f}')
print(f'Average performance of combination 2 : {sum(validation_2) / len(validation_2):f}')
print(f'Average performance of combination 3 : {sum(validation_3) / len(validation_3):f}')

## Evaluation
1. train a new model with non-test dataset
2. evaluate the new model on the test dataset
   - max error
   - mean absolute error
   - root mean squared error
   - r2 score (coefficient of determination)
   - [Other metrics](https://scikit-learn.org/stable/modules/model_evaluation.html)

### train a new model with non-test dataset

In [None]:
# Refit on the whole non-test split of feature combination 1
model = LinearRegression()
model.fit(X=X_non_test_1, y=Y_non_test_1)

### evaluate the new model on the test dataset

In [None]:
# make prediction on the test set
Y_pred_1 = model.predict(X_test_1)

# max error
print(f'Max error : {max_error(Y_test_1, Y_pred_1)}')

# mean absolute error
print(f'Mean absolute error : {mean_absolute_error(Y_test_1, Y_pred_1)}')

# root mean squared error: take the square root of the MSE explicitly, because the
# `squared=False` argument of mean_squared_error was deprecated in scikit-learn 1.4
# and removed in 1.6, where passing it raises a TypeError
print(f'Root mean squared error : {np.sqrt(mean_squared_error(Y_test_1, Y_pred_1))}')

# r2 score
print(f'R2 score (Coefficient of determination) : {r2_score(Y_test_1, Y_pred_1)}')