# Loading the data

First, we load the data that we will feed to our algorithm so that it can learn. We have three .csv files to import: <br> *x_train.csv*, *x_test.csv* and *y_train.csv*.

In [17]:
import numpy as np
import matplotlib.pyplot as plt

# Read the three CSV files, skipping the header row of each one.
x_train = np.genfromtxt("x_train.csv", dtype=float, delimiter=',', skip_header=1)
x_test = np.genfromtxt("x_test.csv", dtype=float, delimiter=',', skip_header=1)
y_train = np.genfromtxt("y_train.csv", dtype=int, delimiter=',', skip_header=1)

# Weights learned on x_train are later applied to x_test, so both matrices
# must keep exactly the same feature columns.
if x_train.shape[1] != x_test.shape[1]:
    raise ValueError(
        "x_train.csv and x_test.csv have a different number of columns: "
        f"{x_train.shape[1]} vs {x_test.shape[1]}"
    )

# We have to remove all columns containing nan elements. A single mask is
# used for both sets: filtering each set with its own mask could drop
# different columns from each and leave the features misaligned.
nan_columns = np.isnan(x_train).any(axis=0) | np.isnan(x_test).any(axis=0)
x_train = x_train[:, ~nan_columns]
x_test = x_test[:, ~nan_columns]

In [23]:
# We should standardize the data to prevent the gradient from exploding
feature_mean = np.mean(x_train, axis=0)
feature_std = np.std(x_train, axis=0)
# A constant column has zero standard deviation; dividing by it would turn the
# whole column into NaN (0/0) and propagate into every later dot product.
# Dividing by 1 instead leaves such a column at zero after centering.
feature_std[feature_std == 0] = 1.0
x_train = (x_train - feature_mean) / feature_std

# Implementing the ML methods

## 1.1 Linear regression using gradient descent 


In [24]:
# MSE loss of the linear model, needed by every training loop below
def compute_mse_loss(y, tx, w):
    """Return the halved mean squared error of the linear prediction.

    Args:
        y: target values.
        tx: feature matrix; its product with ``w`` is the prediction.
        w: weight vector.

    Returns:
        ``1 / (2 * N) * e.T . e`` with ``e = y - tx . w`` and ``N`` the length
        of the first axis of ``e``.
    """
    residual = y - np.dot(tx, w)
    # The extra factor 1/2 cancels the 2 that appears when differentiating.
    scale = 1 / (2 * residual.shape[0])

    return scale * np.dot(residual.T, residual)

# Gradient of the MSE loss with respect to the weights
def compute_gradient(y, tx, w):
    """Return the gradient of the halved MSE loss at ``w``.

    Args:
        y: target values.
        tx: feature matrix; its product with ``w`` is the prediction.
        w: weight vector at which the gradient is evaluated.

    Returns:
        ``(-1 / N) * tx.T . e`` with ``e = y - tx . w`` and ``N`` the length
        of the first axis of ``e``.
    """
    residual = y - np.dot(tx, w)
    n_rows = residual.shape[0]

    return np.dot(tx.T, residual) * (-1 / n_rows)

In [25]:
# Linear regression trained with full-batch gradient descent
def mean_squared_error_gd(y, tx, initial_w,  max_iters, gamma):
    """Minimise the MSE loss with ``max_iters`` gradient descent steps.

    Args:
        y: target values.
        tx: feature matrix.
        initial_w: starting weight vector.
        max_iters: number of update steps to perform.
        gamma: step size.

    Returns:
        A pair ``(losses, ws)``. ``ws`` starts with ``initial_w`` and gains
        one entry per step; ``losses[i]`` is the loss evaluated at ``ws[i]``,
        i.e. before the i-th update is applied.
    """
    weight_history = [initial_w]
    loss_history = []
    current_w = initial_w

    for _ in range(max_iters):
        # Loss and gradient are both evaluated at the weights before the step.
        loss_history.append(compute_mse_loss(y, tx, current_w))
        current_w = current_w - gamma * compute_gradient(y, tx, current_w)
        weight_history.append(current_w)

    # NOTE: this differs from what our version should do, which is to return
    # only the last values.
    return loss_history, weight_history

In [34]:
# Train the model with gradient descent.
# TODO: the choice of the initial values still needs some thought.
max_iters, gamma = 50, 0.1
# Small random starting weights, one per feature column of x_train
initial_w = np.random.randn(x_train.shape[1]) * 0.01

losses, ws = mean_squared_error_gd(y_train, x_train, initial_w, max_iters, gamma)
print(losses)

[0.046366715974486265, 0.04368181148825297, 0.042432446271849444, 0.04168683032030336, 0.04118947772886081, 0.0408386452302426, 0.040582363561013426, 0.040390388411048236, 0.04024377689983333, 0.04013006438565057, 0.040040737540258865, 0.03996980639676781, 0.039912952782801894, 0.0398670004547073, 0.03982957357769344, 0.03979886987873761, 0.0397735059688606, 0.03975240941082926, 0.039734741848084815, 0.03971984325463323, 0.039707190851649046, 0.03969636840592636, 0.03968704300614278, 0.039678947310315224, 0.039671865852655364, 0.039665624399756856, 0.039660081622269965, 0.03965512254141479, 0.039650653347011816, 0.03964659728279238, 0.039642891367248585, 0.03963948377199263, 0.03963633171984127, 0.03963339979530774, 0.039630658583446846, 0.039628083570904166, 0.039625654256897916, 0.03962335343267506, 0.039621166596458104, 0.03961908147756956, 0.03961708764868714, 0.03961517620936133, 0.03961333952724815, 0.03961157102615587, 0.03960986501212131, 0.039608216530424435, 0.039606621247808

## 1.2 Linear regression using stochastic gradient descent 

In [30]:
# For stochastic gradient we only need one additional function
def compute_stoch_gradient(y, tx, w):
    """Return the MSE gradient for a mini-batch or for one single sample.

    Args:
        y: target values of the batch, or a scalar target for one sample.
        tx: feature matrix of the batch, or a 1-D feature vector for one
            sample.
        w: weight vector at which the gradient is evaluated.

    Returns:
        ``(-1 / N) * tx.T . e`` with ``e = y - tx . w``, where ``N`` is the
        length of the first axis of ``e`` (1 for a single sample).
    """
    error = y - np.dot(tx, w)
    # A single sample (scalar y, 1-D tx) produces a 0-d error that has no
    # first axis to read the sample count from; it counts as one sample.
    n_samples = np.shape(error)[0] if np.ndim(error) else 1
    grad = (-1 / n_samples) * np.dot(tx.T, error)

    return grad

In [31]:
# Linear regression using stochastic gradient descent - batch_size = 1
def stochastic_gradient_descent(y, tx, initial_w, max_iters, gamma):
    #Define parameters to store w and loss
    ws = [initial_w]
    losses = []
    w = initial_w
    for n_iter in range(max_iters):
        # We take one random sample for SGD
        index = np.random.randint(np.shape(tx)[0])
        x_sample = tx[index, :]
        y_sample = y[index]
        loss = compute_mse_loss(y, tx, w)
        grad = compute_stoch_gradient(y_sample, x_sample, w)
        w = w - gamma*grad
        
        ws.append(w)
        losses.append(loss)
    
    return losses, ws # NOTE: Razlikuje se u tome sto ovo nase treba da vrati samo poslednje vrednosti

In [32]:
# We define all hyperparameters - TREBA DA PRODISKUTUJEMO
max_iters = 50
gamma = 0.1
initial_w = 0.01 * np.random.randn(np.shape(x_train)[1])

losses, ws = mean_squared_error_gd(y_train, x_train, initial_w, max_iters, gamma)
print(losses)

[0.04970046680477558, 0.04516988650738086, 0.04346491143118687, 0.04248589148259589, 0.04181818158619397, 0.041334357901294916, 0.04097345896285917, 0.040698995113775976, 0.04048713699523808, 0.040321641698876094, 0.04019109884945199, 0.040087290747503994, 0.04000417258150845, 0.03993721832203877, 0.03988298918315943, 0.03983884111787981, 0.03980272142763743, 0.039773023997118864, 0.03974848410500233, 0.0397281006188899, 0.03971107757559965, 0.03969677976723816, 0.03968469862760519, 0.039674425809237926, 0.03966563257586407, 0.03965805363880997, 0.03965147441900715, 0.039645720968644986, 0.03964065197011899, 0.039636152365521586, 0.03963212827137046, 0.03962850291001281, 0.039625213347731225, 0.03962220787465692, 0.039619443896507484, 0.039616886235353005, 0.03961450575788494, 0.03961227826637036, 0.03961018360064399, 0.03960820490990409, 0.03960632806132991, 0.039604541159099726, 0.039602834152607215, 0.03960119851684186, 0.03959962699122612, 0.03959811336586703, 0.03959665230631481, 