In [1]:
import os, time, pickle

import numpy as np
np.random.seed(42)

import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Reading the data
folderName = '../data/' 
fileName   = 'yacht_hydrodynamics.data'

#Attribute Information:

# Variations concern hull geometry coefficients and the Froude number: 

# 1. Longitudinal position of the center of buoyancy, adimensional. 
# 2. Prismatic coefficient, adimensional. 
# 3. Length-displacement ratio, adimensional. 
# 4. Beam-draught ratio, adimensional. 
# 5. Length-beam ratio, adimensional. 
# 6. Froude number, adimensional. 

# The measured variable is the residuary resistance per unit weight of displacement: 

# 7. Residuary resistance per unit weight of displacement, adimensional. 

header = ['V_{}'.format(i) for i in range(7)]
df = pd.read_csv(folderName+fileName,sep='\s+',names=header,index_col=None)#.reset_index()

# Printing some info about the data
print("[INFO]\nNrow: {}\nNcol: {}".format(df.shape[0],df.shape[1]))
df.head(3)

[INFO]
Nrow: 308
Ncol: 7


Unnamed: 0,V_0,V_1,V_2,V_3,V_4,V_5,V_6
0,-2.3,0.568,4.78,3.99,3.17,0.125,0.11
1,-2.3,0.568,4.78,3.99,3.17,0.15,0.27
2,-2.3,0.568,4.78,3.99,3.17,0.175,0.47


In [3]:
# Setting the target column
target = df.loc[:,df.columns[-1]].values

# If necessary, dropping cols
cols = df.columns
df.drop(labels=cols[-1],axis=1,inplace=True)
df.head(3)

Unnamed: 0,V_0,V_1,V_2,V_3,V_4,V_5
0,-2.3,0.568,4.78,3.99,3.17,0.125
1,-2.3,0.568,4.78,3.99,3.17,0.15
2,-2.3,0.568,4.78,3.99,3.17,0.175


In [5]:
from sklearn.linear_model import ElasticNet
from sklearn import model_selection
from sklearn import metrics

In [6]:
clf = ElasticNet(alpha=0.3, 
                 l1_ratio=0.5)
#             fit_intercept=True, 
#             normalize=False, 
#             precompute=False, 
#             max_iter=1000, 
#             copy_X=True, 
#             tol=0.0001, 
#             warm_start=False, 
#             positive=False, 
#             selection='cyclic')

## Leave-One-Out Cross Validation

In [7]:
mae_train=[]
mse_train=[]
rmse_train=[]
r2_train=[]

mae_test=[]
mse_test=[]
rmse_test=[]
mse_sum = 0
r2_test=[]
# ----------- CROSS VALIDATION ----------- #
# -------------- LeaveOneOut ----------- #
cv = model_selection.LeaveOneOut()
for train_index, test_index in cv.split(df.values):
    X_train, X_test = df.values[train_index,:], df.values[test_index,:]
    y_train, y_test = target[train_index],target[test_index]
    
    # Fitting the data into the model
    clf.fit(X_train, y_train)
    
    # FOR TRAINING
    predicted_train = clf.predict(X_train)
    mae_train.append(metrics.mean_absolute_error(y_train, predicted_train))
    mse_train.append(metrics.mean_squared_error(y_train, predicted_train))
    rmse_train.append(np.sqrt(metrics.mean_squared_error(y_train, predicted_train)))
    r2_train.append(metrics.r2_score(y_train, predicted_train))
    
    # FOR TESTING
    predicted_test = clf.predict(X_test)
    mae_test.append(metrics.mean_absolute_error(y_test, predicted_test))
    mse_test.append(metrics.mean_squared_error(y_test, predicted_test))
    mse = metrics.mean_squared_error(y_test, predicted_test)
    mse_sum = mse_sum + mse
    r2_test.append(metrics.r2_score(y_test, predicted_test))


rmse_test = np.sqrt(mse_sum/len(rmse_train))


$$MAE = \sum_{i=0}^{n} (y_i - \hat{y_i})$$

$$RMSE = \sqrt{\frac{\sum_{i=0}^{n} (y_i - \hat{y_i})^2}{n}}$$

$$RMSE = \sqrt{\frac{\sum_{i=0}^{n} (MAE_i)^2}{n}}$$

In [8]:
# Calculating the mean values for train and test scores
mae_train = np.array(mae_train).mean()
mse_train = np.array(mse_train).mean()
rmse_train = np.array(rmse_train).mean()
r2_train = np.array(r2_train).mean()

mae_test = np.array(mae_test).mean()
mse_test = np.array(mse_test).mean()
r2_test = np.array(r2_test).mean()

# Displaying the results
print('MAE Train = {}'.format(mae_train))
print('MSE Train = {}'.format(mse_train))
print('RMSE Train = {}'.format(rmse_train))
print('R2 Train = {}'.format(r2_train))
print()
print('MAE Test = {}'.format(mae_test))
print('MSE Test = {}'.format(mse_test))
print('RMSE Test = {}'.format(rmse_test))
print('R2 Test = {}'.format(r2_test))

MAE Train = 10.980027010209332
MSE Train = 212.72910220083665
RMSE Train = 14.585161709093894
R2 Train = 0.07142295145957663

MAE Test = 11.054240034655841
MSE Test = 215.7621396438249
RMSE Test = 14.688844054037231
R2 Test = 0.0


## K-Fold Cross Validation

In [9]:
mae_train=[]
mse_train=[]
rmse_train=[]
r2_train=[]

mae_test=[]
mse_test=[]
rmse_test=[]
r2_test=[]
# ----------- CROSS VALIDATION ----------- #
# -------------- LeaveOneOut ----------- #
cv = model_selection.KFold(n_splits=5,shuffle=False)
for train_index, test_index in cv.split(df.values):
    X_train, X_test = df.values[train_index,:], df.values[test_index,:]
    y_train, y_test = target[train_index],target[test_index]
    
    # Fitting the data into the model
    clf.fit(X_train, y_train)
    
    # FOR TRAINING
    predicted_train = clf.predict(X_train)
    mae_train.append(metrics.mean_absolute_error(y_train, predicted_train))
    mse_train.append(metrics.mean_squared_error(y_train, predicted_train))
    rmse_train.append(np.sqrt(metrics.mean_squared_error(y_train, predicted_train)))
    r2_train.append(metrics.r2_score(y_train, predicted_train))
    
    # FOR TESTING
    predicted_test = clf.predict(X_test)
    mae_test.append(metrics.mean_absolute_error(y_test, predicted_test))
    mse_test.append(metrics.mean_squared_error(y_test, predicted_test))
    rmse_test.append(np.sqrt(metrics.mean_squared_error(y_test, predicted_test)))
    r2_test.append(metrics.r2_score(y_test, predicted_test))

In [10]:
# Calculating the mean values for train and test scores
mae_train = np.array(mae_train).mean()
mse_train = np.array(mse_train).mean()
rmse_train = np.array(rmse_train).mean()
r2_train = np.array(r2_train).mean()

mae_test = np.array(mae_test).mean()
mse_test = np.array(mse_test).mean()
rmse_test = np.array(rmse_test).mean()
r2_test = np.array(r2_test).mean()

# Displaying the results
print('MAE   = {}'.format(mae_train))
print('MSE Train = {}'.format(mse_train))
print('RMSE Train = {}'.format(rmse_train))
print('R2 Train = {}'.format(r2_train))
print()
print('MAE Test = {}'.format(mae_test))
print('MSE Test = {}'.format(mse_test))
print('RMSE Test = {}'.format(rmse_test))
print('R2 Test = {}'.format(r2_test))

MAE   = 10.972467982999563
MSE Train = 212.57718157411736
RMSE Train = 14.578093497712004
R2 Train = 0.07190917637466616

MAE Test = 11.005140945750645
MSE Test = 214.1770222511947
RMSE Test = 14.597742384807967
R2 Test = 0.06315165107158802


## Hold-out Cross Validation

In [11]:
mae_train=[]
mse_train=[]
rmse_train=[]
r2_train=[]

mae_test=[]
mse_test=[]
rmse_test=[]
r2_test=[]
# ----------- CROSS VALIDATION ----------- #
# -------------- Hold-Out -------------- # When I have too many data
X_train, X_test, y_train, y_test  = model_selection.train_test_split(df.values, 
                                                                     target, 
                                                                     test_size=0.5,
                                                                     random_state=None,
                                                                     shuffle=False)
    
# Fitting the data into the model
clf.fit(X_train, y_train)

# FOR TRAINING
predicted_train = clf.predict(X_train)
mae_train.append(metrics.mean_absolute_error(y_train, predicted_train))
mse_train.append(metrics.mean_squared_error(y_train, predicted_train))
rmse_train.append(np.sqrt(metrics.mean_squared_error(y_train, predicted_train)))
r2_train.append(metrics.r2_score(y_train, predicted_train))

# FOR TESTING
predicted_test = clf.predict(X_test)
mae_test.append(metrics.mean_absolute_error(y_test, predicted_test))
mse_test.append(metrics.mean_squared_error(y_test, predicted_test))
rmse_test.append(np.sqrt(metrics.mean_squared_error(y_test, predicted_test)))
r2_test.append(metrics.r2_score(y_test, predicted_test))

In [12]:
# Calculating the mean values for train and test scores
mae_train = np.array(mae_train).mean()
mse_train = np.array(mse_train).mean()
rmse_train = np.array(rmse_train).mean()
r2_train = np.array(r2_train).mean()

mae_test = np.array(mae_test).mean()
mse_test = np.array(mse_test).mean()
rmse_test = np.array(rmse_test).mean()
r2_test = np.array(r2_test).mean()

# Displaying the results
print('MAE   = {}'.format(mae_train))
print('MSE Train = {}'.format(mse_train))
print('RMSE Train = {}'.format(rmse_train))
print('R2 Train = {}'.format(r2_train))
print()
print('MAE Test = {}'.format(mae_test))
print('MSE Test = {}'.format(mse_test))
print('RMSE Test = {}'.format(rmse_test))
print('R2 Test = {}'.format(r2_test))

MAE   = 10.63072238562831
MSE Train = 199.63146756318014
RMSE Train = 14.129100026653507
R2 Train = 0.07103591520822061

MAE Test = 11.141949558074572
MSE Test = 227.12796858139342
RMSE Test = 15.070765361500172
R2 Test = 0.06593090823364078
