# Sprint 3

## Machine Learning - Scratch Linear Regression

### [Problem 1] Hypothesis function

In [3]:
import numpy as np
import pandas as pd

In [69]:
class ScratchLinearRegression():
    """
    Scratch implementation of linear regression trained with batch
    gradient descent.

    Parameters
    ----------
    num_iter : int
      Number of gradient-descent iterations
    lr : float
      Learning rate
    no_bias : bool
      True to fit without a bias (intercept) term. When False, a constant
      column of ones is prepended to the features so that theta[0] acts as
      the intercept.
    verbose : bool
      True to print the recorded losses once training has finished

    Attributes
    ----------
    self.theta : ndarray, shape (n_features,) or (n_features + 1,) with bias
      Learned parameters (bias first when a bias term is used)
    self.coef_ : ndarray
      Same array as self.theta, set at the end of fit()
    self.loss : ndarray, shape (self.iter,)
      Loss on the training data, one entry per iteration
    self.val_loss : ndarray, shape (self.iter,)
      Loss on the validation data, one entry per iteration (stays zero when
      no validation data is passed to fit())
    """

    def __init__(self, num_iter, lr, no_bias, verbose):
        # Record hyperparameters as attributes
        self.iter = num_iter
        self.lr = lr
        self.no_bias = no_bias
        self.verbose = verbose
        # Prepare arrays to record the loss of every iteration
        self.loss = np.zeros(self.iter)
        self.val_loss = np.zeros(self.iter)

    def fit(self, X, y, X_val=None, y_val=None):
        """
        Learn linear regression. If validation data is given, the validation
        loss is also recorded for each iteration.

        Both losses of iteration i are evaluated with the same parameters
        (the ones in effect before the i-th update), so the two curves are
        directly comparable.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Features of training data
        y : ndarray, shape (n_samples,)
            Correct answer value of training data
        X_val : ndarray, shape (n_samples, n_features)
            Features of validation data
        y_val : ndarray, shape (n_samples,)
            Correct answer value of validation data
        """
        X = self._add_bias(X)
        has_val = X_val is not None and y_val is not None
        if has_val:
            X_val = self._add_bias(X_val)

        # One parameter per (possibly bias-augmented) feature column
        self.theta = np.random.rand(X.shape[1])
        for i in range(self.iter):
            y_hyp = self._linear_hypothesis(X)
            self.loss[i] = self._loss_function(y_hyp, y)
            if has_val:
                y_val_hyp = self._linear_hypothesis(X_val)
                self.val_loss[i] = self._loss_function(y_val_hyp, y_val)
            self.theta = self._gradient_descent(X, y, y_hyp)

        # Expose the learned parameters under the documented name as well
        self.coef_ = self.theta

        if self.verbose:
            # Output learning process when verbose is set to True
            print("Loss: {}".format(self.loss))
            print("Val_loss: {}".format(self.val_loss))

    def predict(self, X):
        """
        Estimate using linear regression.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Samples (without a bias column; it is added here when needed)

        Returns
        -------
        ndarray, shape (n_samples,)
            Estimated result by linear regression
        """
        return self._add_bias(X) @ self.theta

    def _add_bias(self, X):
        """
        Return X unchanged when no_bias is True; otherwise return a copy of X
        with a leading column of ones for the bias term.
        """
        if self.no_bias:
            return X
        X = np.asarray(X)
        return np.hstack([np.ones((X.shape[0], 1)), X])

    def _linear_hypothesis(self, X):
        """
        Compute the linear hypothesis function X . theta.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_params)
          Design matrix whose column count matches self.theta (i.e. already
          passed through _add_bias)

        Returns
        -------
        ndarray, shape (n_samples,)
          Estimated result by the linear hypothesis function
        """
        y_hyp = np.dot(X, self.theta.T)
        return y_hyp

    def _gradient_descent(self, X, y, y_hyp):
        """
        Compute the parameters after one steepest-descent step.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_params)
            Design matrix used to produce y_hyp
        y : ndarray, shape (n_samples,)
            Correct answer value of training data
        y_hyp : ndarray, shape (n_samples,)
            Current predictions for X

        Returns
        -------
        ndarray, shape (n_params,)
            Updated parameters (self.theta itself is not modified here)
        """
        n_samples = X.shape[0]
        update_theta = self.theta - (self.lr * np.dot((y_hyp - y), X) / n_samples)
        return update_theta

    def _loss_function(self, y_pred, y):
        """
        Calculation of objective function (half the mean squared error).

        Parameters
        ----------
        y_pred : ndarray, shape (n_samples,)
          Estimated value
        y : ndarray, shape (n_samples,)
          Correct answer value

        Returns
        ----------
        loss : numpy.float
          Result of the loss function
        """
        loss = np.mean((y_pred - y) ** 2) / 2
        return loss
        

# Outside the class
def MSE(y_pred, y):
    """
    Calculation of mean square error

    Parameters
    ----------
    y_pred : ndarray, shape (n_samples,)
      Estimated value
    y : ndarray (or anything that broadcasts against y_pred), shape (n_samples,)
      Correct answer value

    Returns
    ----------
    mse : numpy.float
      Mean squared error
    """
    # The sample count is not needed explicitly: np.mean already divides by it.
    mse = np.mean((y_pred - y) ** 2)
    return mse

In [70]:
# Toy feature matrix: seven samples, two features (x1, x2) per row
X = np.array([[1, 2], [3, 5], [5, 1], [7, 4], [0, 2], [2, 0], [4, 3]])
X.shape

(7, 2)

In [71]:
# Targets follow y = 2*x1 + 1*x2 exactly (one entry per row of X)
y = np.array([
    4,   # 2*1 + 2
    11,  # 2*3 + 5
    11,  # 2*5 + 1
    18,  # 2*7 + 4
    2,   # 2*0 + 2
    4,   # 2*2 + 0
    11,  # 2*4 + 3
])
y.shape

(7,)

In [72]:
# 100 gradient-descent steps at learning rate 0.01, no bias term, print losses
slr = ScratchLinearRegression(
    num_iter=100,
    lr=0.01,
    no_bias=True,
    verbose=True,
)

In [73]:
# fit() trains the model in place and returns None, so the learned
# parameters have to be read from the model afterwards.
slr.fit(X, y)
theta = slr.theta

Loss: [2.08364014e+01 1.30597286e+01 8.21092561e+00 5.18653992e+00
 3.29903665e+00 2.12002702e+00 1.38258878e+00 9.20404920e-01
 6.29842786e-01 4.46326747e-01 3.29616655e-01 2.54635297e-01
 2.05753245e-01 1.73227904e-01 1.50984779e-01 1.35235045e-01
 1.23614185e-01 1.14645291e-01 1.07404658e-01 1.01313422e-01
 9.60077215e-02 9.12577987e-02 8.69175852e-02 8.28932847e-02
 7.91237928e-02 7.55684910e-02 7.21996370e-02 6.89976182e-02
 6.59479894e-02 6.30396230e-02 6.02635517e-02 5.76122437e-02
 5.50791469e-02 5.26584014e-02 5.03446566e-02 4.81329539e-02
 4.60186500e-02 4.39973669e-02 4.20649566e-02 4.02174768e-02
 3.84511734e-02 3.67624660e-02 3.51479375e-02 3.36043243e-02
 3.21285082e-02 3.07175095e-02 2.93684802e-02 2.80786979e-02
 2.68455601e-02 2.56665787e-02 2.45393752e-02 2.34616755e-02
 2.24313054e-02 2.14461863e-02 2.05043309e-02 1.96038391e-02
 1.87428944e-02 1.79197599e-02 1.71327751e-02 1.63803525e-02
 1.56609742e-02 1.49731888e-02 1.43156091e-02 1.36869084e-02
 1.30858184e-02 1.

In [74]:
# Predict on the training inputs with the fitted model; the bare name on the
# last line makes the notebook display the resulting array.
slr_predict = slr.predict(X)
slr_predict

array([ 4.05145989, 11.11476742, 10.90078909, 17.96409661,  2.07922453,
        3.94447073, 11.00777825])

In [75]:
def MSE(y_pred, y):
    """
    Calculation of mean square error

    Parameters
    ----------
    y_pred : ndarray, shape (n_samples,)
      Estimated value
    y : ndarray (or anything that broadcasts against y_pred), shape (n_samples,)
      Correct answer value

    Returns
    ----------
    mse : numpy.float
      Mean squared error
    """
    # np.mean divides by the number of samples, so no explicit count is needed.
    squared_error = (y_pred - y) ** 2
    return np.mean(squared_error)

In [76]:
# Pass the arguments in the order of the MSE(y_pred, y) signature:
# predictions first, ground truth second.
print(MSE(slr_predict, y))

0.005196009467132918


In [77]:
# Load the training data from a file in the current working directory and
# preview its first rows.
# NOTE(review): the recorded run of this cell raised FileNotFoundError, so
# 'train.csv' has to be placed in the working directory (or the path
# corrected) before this cell can succeed.
df = pd.read_csv('train.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'