In [547]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [548]:
# Field names for the abalone file. Because `names` is supplied, pandas
# treats the first line of the file as data rather than as a header row.
col_names = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Whole weight',
    'Shucked weight',
    'Viscera weight',
    'Shell weight',
    'Rings',
]
df = pd.read_csv(
    'Data/Abalone/abalone.data',
    names=col_names,
)

In [549]:
# Display the loaded frame (the rendered output elides the middle rows).
df

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [550]:
# Summary statistics of the numeric columns; the string column 'Sex' is not included.
df.describe()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [551]:
# One-hot encode the non-ordinal categorical variable 'Sex' into Sex_<value>
# indicator columns, appended after the remaining columns.
#
# The membership guard makes the cell safe to re-run: once 'Sex' has been
# replaced there is nothing left to encode, whereas an unguarded second run
# would fail with KeyError: 'Sex'.
# dtype=float keeps every feature numeric no matter which dummy dtype
# (uint8 or bool) the installed pandas version would pick by default.
if 'Sex' in df.columns:
    one_hot_s = pd.get_dummies(df['Sex'], prefix='Sex', dtype=float)
    # drop() returns a new frame, so the originally loaded object is not mutated.
    df = df.drop(columns='Sex').join(one_hot_s)

In [552]:
# Name of the regression target column.
out_col = 'Rings'
# Every other column is a model input. The list is derived from df.columns in
# order (not from a set difference) so the feature order is identical on every
# run; set iteration order for strings changes between interpreter sessions.
feature_cols = [col for col in df.columns if col != out_col]

In [553]:
def split(df: pd.DataFrame, folds=5, random_state=7):
    """Shuffle ``df`` and cut it into ``folds`` consecutive, near-equal parts.

    Parameters
    ----------
    df : pd.DataFrame
        Data to partition. It is not modified.
    folds : int, default 5
        Number of parts to produce; must be at least 1.
    random_state : default 7
        Seed forwarded to ``DataFrame.sample`` for the shuffle. The default
        reproduces the shuffle this function has always used.

    Returns
    -------
    list of pd.DataFrame
        ``folds`` independent copies, each with a fresh 0..n-1 index. When
        ``len(df)`` is not divisible by ``folds`` the first
        ``len(df) % folds`` parts hold one extra row.

    Raises
    ------
    ValueError
        If ``folds`` is smaller than 1.
    """
    if folds < 1:
        raise ValueError(f"folds must be a positive integer, got {folds!r}")

    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    base_size, remainder = divmod(len(shuffled), folds)

    df_folds = []
    start = 0
    for fold_no in range(folds):
        # The leading `remainder` folds absorb the rows left over by the division.
        stop = start + base_size + (1 if fold_no < remainder else 0)
        df_folds.append(shuffled.iloc[start:stop].copy().reset_index(drop=True))
        start = stop
    return df_folds

def form_train_val(folds, val_fold):
    """Build one train/validation split from a list of folds.

    The fold at position ``val_fold`` is returned (as a copy) for validation;
    all remaining folds are stacked, in their original order, into a single
    training frame with a fresh 0..n-1 index.

    Returns
    -------
    (train_df, val_df) : tuple of pd.DataFrame
    """
    held_out = folds[val_fold].copy()
    remaining = [fold for position, fold in enumerate(folds) if position != val_fold]
    stacked = pd.concat(remaining).reset_index(drop=True)
    return stacked, held_out

def MSE(y_actual, y_pred, inbuilt=False):
    """Mean squared error between targets and predictions.

    Parameters
    ----------
    y_actual, y_pred : array-like
        Observed and predicted values, compared position by position.
    inbuilt : bool, default False
        When True, delegate to ``sklearn.metrics.mean_squared_error``;
        otherwise use the implementation below.

    Returns
    -------
    float
        Sum of squared differences divided by the number of samples.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape (manual branch).
    """
    if inbuilt:
        return metrics.mean_squared_error(y_actual, y_pred)

    # Work on plain arrays: subtracting pandas objects aligns on index labels,
    # so two Series with different indexes would turn into NaNs that .sum()
    # then silently skips. Arrays also make list inputs work.
    actual = np.asarray(y_actual, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_actual and y_pred must have the same shape, "
            f"got {actual.shape} and {predicted.shape}"
        )
    squared_error = (actual - predicted) ** 2
    return squared_error.sum() / len(actual)

In [554]:
# Shuffle the data and cut it into folds (split's default is 5); the
# cross-validation loops in later cells all iterate over this same list.
folds = split(df)

In [555]:
class Regression:
    """
    Linear Regression

    Wraps sklearn's ``LinearRegression`` and optionally standardises the
    features with statistics learned in ``fit``.

    Attributes set by ``fit``:
      feature_names_ : column names, in the order the coefficients refer to
      mean, std      : per-column statistics (only when ``normalise`` is True)
      w, b           : fitted coefficient array and intercept
    """
    def __init__(self, normalise):
        # normalise: when truthy, features are standardised before use.
        self.normalise = normalise

    def _prepare(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return X restricted to the fitted columns, in fitted order, scaled if enabled."""
        X = X[self.feature_names_].astype(float)
        if self.normalise:
            X = (X - self.mean) / self.std
            # Label-aligned arithmetic may reorder columns; restore the fitted
            # order because the coefficients are applied by position.
            X = X[self.feature_names_]
        return X

    def fit(self, X: pd.DataFrame, y: pd.Series):
        """Learn the scaling statistics (if enabled) and the regression weights."""
        self.feature_names_ = list(X.columns)
        if self.normalise:
            features = X.astype(float)
            self.mean = features.mean()
            spread = features.std()
            # A constant column has zero spread (NaN for a single row); dividing
            # by it would fill the column with NaN. Scale such columns by 1 so
            # they simply become all-zero after centring.
            self.std = spread.where(spread > 0, 1.0)
        lr = LinearRegression()
        lr.fit(self._prepare(X), y)
        self.w = lr.coef_
        self.b = lr.intercept_

    def predict(self, X_test: pd.DataFrame):
        """Return predictions for ``X_test`` as a NumPy array.

        Columns are matched to the fitted features by name, so their order in
        ``X_test`` does not matter; a missing feature raises ``KeyError``.
        A NaN feature yields a NaN prediction rather than being skipped.
        """
        prepared = self._prepare(X_test)
        return np.asarray(prepared.to_numpy() @ self.w + self.b)

In [556]:
# Cross-validate the sklearn-backed Regression wrapper. Every split is scored
# twice - with the hand-written MSE and with sklearn's mean_squared_error -
# so the two implementations can be compared table against table.
mse_columns = ['Validation fold', 'Train MSE', 'Validation MSE']
mse_rows = []
mse_inbuilt_rows = []
for val_fold in range(len(folds)):
    train_df, val_df = form_train_val(folds, val_fold)
    lr = Regression(normalise=True)
    lr.fit(train_df[feature_cols], train_df[out_col])

    # Predict once per split and reuse the result for both metric variants;
    # the fitted model is deterministic, so a second predict() only costs time.
    train_pred = lr.predict(train_df[feature_cols])
    val_pred = lr.predict(val_df[feature_cols])

    train_mse = MSE(train_df[out_col], train_pred)
    val_mse = MSE(val_df[out_col], val_pred)
    train_mse_inbuilt = MSE(train_df[out_col], train_pred, inbuilt=True)
    val_mse_inbuilt = MSE(val_df[out_col], val_pred, inbuilt=True)

    mse_rows.append(dict(zip(mse_columns, (val_fold, train_mse, val_mse))))
    mse_inbuilt_rows.append(
        dict(zip(mse_columns, (val_fold, train_mse_inbuilt, val_mse_inbuilt)))
    )
# One row per validation fold; `columns` pins the layout even with no rows.
mse_df = pd.DataFrame(mse_rows, columns=mse_columns)
mse_inbuilt_df = pd.DataFrame(mse_inbuilt_rows, columns=mse_columns)

In [557]:
# Per-fold train/validation MSE of the Regression wrapper, scored with the hand-written MSE.
mse_df

Unnamed: 0,Validation fold,Train MSE,Validation MSE
0,0,4.733024,5.127119
1,1,4.795933,4.84473
2,2,4.712582,5.505303
3,3,4.870471,4.549547
4,4,4.84351,4.684619


In [558]:
# The same splits scored with sklearn's mean_squared_error (MSE(..., inbuilt=True)).
mse_inbuilt_df

Unnamed: 0,Validation fold,Train MSE,Validation MSE
0,0,4.733024,5.127119
1,1,4.795933,4.84473
2,2,4.712582,5.505303
3,3,4.870471,4.549547
4,4,4.84351,4.684619


In [559]:
# Average the two error columns over all validation folds.
mse_df[['Train MSE', 'Validation MSE']].mean()

Train MSE         4.791104
Validation MSE    4.942263
dtype: float64

In [560]:
class NormalEq:
    """
    Linear Regression using normal equations

    Solves ``w = pinv(X^T X) X^T y`` on a design matrix made of the
    (optionally standardised) features followed by a column of ones.

    Attributes set by ``fit``:
      feature_names_ : column names, in the order the weights refer to
      mean, std      : per-column statistics (only when ``normalise`` is True)
      w              : weight vector; the last entry is the bias term
    """
    def __init__(self, normalise):
        # normalise: when truthy, features are standardised before use.
        self.normalise = normalise

    def _design_matrix(self, X_df: pd.DataFrame) -> np.ndarray:
        """Build the float design matrix: fitted columns in fitted order, then a ones column."""
        # Cast first: mixed bool/float columns would otherwise come out of
        # to_numpy() as an object array, which np.linalg.pinv cannot handle.
        features = X_df[self.feature_names_].astype(float)
        if self.normalise:
            features = (features - self.mean) / self.std
            # Label-aligned arithmetic may reorder columns; restore the fitted
            # order because the weights are applied by position.
            features = features[self.feature_names_]
        values = features.to_numpy()
        return np.column_stack([values, np.ones(len(values))])

    def fit(self, X_df: pd.DataFrame, y_df: pd.Series):
        """Learn the scaling statistics (if enabled) and the weight vector.

        The caller's ``X_df`` is not modified.
        """
        self.feature_names_ = list(X_df.columns)
        if self.normalise:
            features = X_df.astype(float)
            self.mean = features.mean()
            spread = features.std()
            # A constant column has zero spread (NaN for a single row); dividing
            # by it would fill the column with NaN. Scale such columns by 1 so
            # they simply become all-zero after centring.
            self.std = spread.where(spread > 0, 1.0)
        X = self._design_matrix(X_df)
        y = np.asarray(y_df, dtype=float)
        # The pseudo-inverse keeps the solve defined when X^T X is singular,
        # e.g. with collinear or all-zero columns.
        self.w = np.linalg.pinv(X.T @ X) @ (X.T @ y)

    def predict(self, X_test_df: pd.DataFrame):
        """Return predictions for ``X_test_df`` as a NumPy array.

        Columns are matched to the fitted features by name, so their order in
        ``X_test_df`` does not matter; a missing feature raises ``KeyError``.
        The caller's frame is not modified.
        """
        return self._design_matrix(X_test_df) @ self.w

In [561]:
# Cross-validate the normal-equation solver on the same folds, collecting one
# result row per validation fold.
mse_ne_columns = ['Validation fold', 'Train MSE', 'Validation MSE']
mse_ne_rows = []
for val_fold in range(len(folds)):
    train_df, val_df = form_train_val(folds, val_fold)
    lr_ne = NormalEq(normalise=True)
    lr_ne.fit(train_df[feature_cols], train_df[out_col])

    train_mse = MSE(train_df[out_col], lr_ne.predict(train_df[feature_cols]))
    val_mse = MSE(val_df[out_col], lr_ne.predict(val_df[feature_cols]))

    mse_ne_rows.append(dict(zip(mse_ne_columns, (val_fold, train_mse, val_mse))))
# `columns` pins the table layout even when there are no rows.
mse_ne_df = pd.DataFrame(mse_ne_rows, columns=mse_ne_columns)

In [562]:
# Per-fold train/validation MSE of the normal-equation solver (NormalEq).
mse_ne_df

Unnamed: 0,Validation fold,Train MSE,Validation MSE
0,0,4.733024,5.127119
1,1,4.795933,4.84473
2,2,4.712582,5.505303
3,3,4.870471,4.549547
4,4,4.84351,4.684619


In [563]:
# Average the two error columns over all validation folds.
mse_ne_df[['Train MSE', 'Validation MSE']].mean()

Train MSE         4.791104
Validation MSE    4.942263
dtype: float64

In [564]:
# Baseline: plain sklearn LinearRegression on features standardised by hand,
# scored with sklearn's own mean_squared_error.
mse_sklearn_columns = ['Validation fold', 'Train MSE', 'Validation MSE']
mse_sklearn_rows = []
for val_fold in range(len(folds)):
    train_df, val_df = form_train_val(folds, val_fold)

    # explicit normalisation: the statistics come from the training part only
    # and are then applied to both the training and the validation part
    mean_train_df = train_df[feature_cols].mean()
    std_train_df = train_df[feature_cols].std()
    for part in (train_df, val_df):
        part[feature_cols] = (part[feature_cols] - mean_train_df) / std_train_df

    lr_sklearn = LinearRegression()
    lr_sklearn.fit(train_df[feature_cols], train_df[out_col])

    train_mse = MSE(train_df[out_col], lr_sklearn.predict(train_df[feature_cols]), inbuilt=True)
    val_mse = MSE(val_df[out_col], lr_sklearn.predict(val_df[feature_cols]), inbuilt=True)

    mse_sklearn_rows.append(
        dict(zip(mse_sklearn_columns, (val_fold, train_mse, val_mse)))
    )
# `columns` pins the table layout even when there are no rows.
mse_sklearn_df = pd.DataFrame(mse_sklearn_rows, columns=mse_sklearn_columns)

In [565]:
# Per-fold train/validation MSE of sklearn's LinearRegression on manually standardised features.
mse_sklearn_df

Unnamed: 0,Validation fold,Train MSE,Validation MSE
0,0,4.733024,5.127119
1,1,4.795933,4.84473
2,2,4.712582,5.505303
3,3,4.870471,4.549547
4,4,4.84351,4.684619


In [566]:
# Average the two error columns over all validation folds.
mse_sklearn_df[['Train MSE', 'Validation MSE']].mean()

Train MSE         4.791104
Validation MSE    4.942263
dtype: float64