In [1]:
# Load libraries
%matplotlib notebook
import numpy as np
import pandas as pd
from pandas import read_csv
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from linear_regression_gd import LinearRegressionWithGD
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [2]:
# Load dataset
filename = 'fish.csv'
df  =  read_csv(filename)
df.rename({'Length1': 'Vert_len', 'Length2': 'Diag_len', 'Length3': 'Cross_len'}, axis='columns', inplace=True)

In [3]:
df.columns

Index(['Species', 'Weight', 'Vert_len', 'Diag_len', 'Cross_len', 'Height',
       'Width'],
      dtype='object')

In [4]:
# Peek at dataset
df.sample(5)

Unnamed: 0,Species,Weight,Vert_len,Diag_len,Cross_len,Height,Width
80,Perch,85.0,17.8,19.6,20.8,5.1376,3.0368
35,Roach,40.0,12.9,14.1,16.2,4.1472,2.268
152,Smelt,9.9,11.3,11.8,13.1,2.2139,1.1659
157,Smelt,19.7,13.2,14.3,15.2,2.8728,2.0672
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02


In [5]:
# Let's delete the outliers
df.drop(index=[142, 143, 144], inplace=True)
df = df.reset_index(drop=True)

# Let's delete column 'Species'
df.drop(columns=['Species'], inplace=True)

In [6]:
# Split-out validation dataset and Test option and evaluation metric
X = df.values[:, 1:6]
Y = df['Weight'].values
seed = 7;  validation_size = 0.20;  scoring = 'r2'

X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [7]:
# Standardize data
scaler_train = StandardScaler().fit(X_train);    scaler_validation = StandardScaler().fit(X_validation)
rescaledX = scaler_train.transform(X_train);    rescaledX_val = scaler_validation.transform(X_validation)

In [8]:
# Linear Regression
reg = LinearRegression()
reg.fit(X_train, Y_train)
predicts = reg.predict(X_validation)
print('LR : Score = ', r2_score(predicts, Y_validation))

LR : Score =  0.886615870459262


In [9]:
# Linear Regression with scale
reg = LinearRegression()
reg.fit(rescaledX, Y_train)
predicts = reg.predict(rescaledX_val)
r2_score(predicts, Y_validation)
print('LR : Score with Scale = ', r2_score(predicts, Y_validation))

LR : Score with Scale =  0.7833533815244007


In [10]:
# Linear Regression with gradient descent
lr_gd = LinearRegressionWithGD(eta=0.05)
w, cost, w_history = lr_gd.fit(X_train, Y_train, loss='r2')
predicts_gd = lr_gd.predict(X_validation)
print('GD : Score = ', r2_score(predicts_gd, Y_validation))
print('Min = {0} et Max = {1}' .format(cost.min(), cost.max()))
print('Poids = {0}' .format(w))

GD : Score =  -8.846670511267233
Min = -126.16897961567575 et Max = -1.577533528273578
Poids = [ -0.97993735 -29.74499889 -32.18678625 -35.40116072 -10.51146494
  -5.08860983]


In [12]:
def cost_sgd(x, y, w):
    cost = np.sum(np.square(predict)) / predict.shape[0]

    return   cost


class LinearRegressionSGD:
    
    def __init__(self, eta=0.1, n_iterations=700):
        self.eta = eta
        self.n_iterations = n_iterations

    def fit(self, x, y, scoring):
        
        self.cost_ = []
        self.w_ = np.zeros((x.shape[1]+1))
        x = np.hstack((np.ones((x.shape[0], 1)), x))
        n = x.shape[0]
        w_history  = np.zeros((self.n_iterations, x.shape[1]))

        for i in range(self.n_iterations):
            data = np.column_stack( (x, y) )
            np.random.shuffle(data)
            x = data[:, 0:x.shape[1]]
            y = (data[:, x.shape[1]:x.shape[1]+1]).reshape(x.shape[0],)
            
            gradient = np.zeros(shape = (x.shape[1]))
            cost_iter = np.zeros( (x.shape[0] * self.n_iterations) )
            resi = 0
            
            for _ in range(10):
                y_pred = np.dot(x[_], self.w_)
                residual = y_pred - y[_]
                gradient += ((2 / x.shape[0]) * (x[_] * residual))

            self.w_ = self.w_ - (self.eta * gradient)
            w_history[i] = self.w_.T
            
            predict = np.dot(x, self.w_)
            cost = np.sum((predict - y)**2) / x.shape[0]
            self.cost_.append(cost)
            
            
        return self.w_, np.array(self.cost_), w_history
                                       
    def predict(self, x):  
        x = np.hstack( (np.ones((x.shape[0], 1)), x) )
        return np.dot(x, self.w_)

In [13]:
s = np.array( [[0.72, 0.32],
               [0.75, 0.12],
               [0.53, 0.65],
               [0.27, 0.82],
               [0.49, 0.15],
               [0.02, 0.19],
               [0.35, 0.87],
               [0.99, 0.71],
               [0.98, 0.92],
               [0.73, 0.19]] )

t = np.array( [6.93, 5.99, 1.46, 1.44, 4.51, 1.25, 2.53, 6.88, 6.25, 6.36] )

a = np.array( [[2, 3],
               [4, 5],
               [6, 7]] )
b = np.array( [5, 9, 13] )

In [14]:
# Linear Regression with gradient descent
lr_gd = LinearRegressionWithGD(eta=0.005, n_iterations=3000)
w_, c, w_hist = lr_gd.fit(rescaledX, Y_train, loss='mean_squared_error')

In [15]:
c.min(), c.max()

(10178.150227572038102, 257411.83685483870968)

In [16]:
# Linear Regression with stochastic gradient descent
lr_sgd = LinearRegressionSGD(eta=0.00005, n_iterations=3000)
w1_, c1, w_hist1 = lr_sgd.fit(X_train, Y_train, scoring='neg_mean_squared_error')

In [21]:
plt.plot(np.arange(len(c1)), c1, color='red')
plt.plot(np.arange(len(c)), c, color = 'blue')
plt.xlabel('Number Iterations')
plt.ylabel('Cost')
plt.show()

<IPython.core.display.Javascript object>