In [1]:
import numpy as np 
import pandas as pd
import time
import os

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

import plotly.express as px

# Logistic regression class

In [40]:
class LogisticRegression(object):
    def __init__(self,
                 data_path='Homework-3/data/',  # WRITE THE PATH TO YOUR DATA HERE
                 optimizer='gd',
                 gd_lr=0.001,
                 eps=1e-8):
        # Save the optimizer
        self.optimizer = optimizer

        # Save hyperparameter settings
        self.gd_lr = gd_lr  # gd learning rate
        self.eps = eps  # epsilon, for numerical stability

        # Load the data
        # self.df = pd.read_csv(os.path.join(data_path, 'bank/bank-full.csv'), sep=";")
        self.df = pd.read_csv('data/bank/bank-full.csv', sep=";")

        # Encoding data
        cols = self.df.columns
        num_cols = self.df._get_numeric_data().columns
        cat_cols = list(set(cols) - set(num_cols))

        ordinal_encoder = OrdinalEncoder()
        self.df[cat_cols] = ordinal_encoder.fit_transform(self.df[cat_cols])

        # split data
        self.train_data, self.test_data = train_test_split(self.df, test_size=0.2, random_state=1)
        self.train_data, self.val_data = train_test_split(self.train_data, test_size=0.25, random_state=1)

        # labels
        self.df_labels = self.df.y
        self.train_labels = self.train_data.y
        self.test_labels = self.test_data.y
        self.val_labels = self.val_data.y

        # drop the target value
        self.train_data = self.train_data.drop(['y'], axis=1)
        self.test_data = self.test_data.drop(['y'], axis=1)
        self.val_data = self.val_data.drop(['y'], axis=1)

        # transform back to numpy array
        self.train_data = self.train_data.to_numpy()
        self.test_data = self.test_data.to_numpy()
        self.val_data = self.val_data.to_numpy()
        self.train_labels = self.train_labels.to_numpy()
        self.test_labels = self.test_labels.to_numpy()
        self.val_labels = self.val_labels.to_numpy()

        # Prepend a vector of all ones to each of the dataset
        self.df["ones"] = 1
        self.train_data = np.hstack([np.ones_like(self.train_data[:, 0])[:, np.newaxis], self.train_data])
        self.val_data = np.hstack([np.ones_like(self.val_data[:, 0])[:, np.newaxis], self.val_data])
        self.test_data = np.hstack([np.ones_like(self.test_data[:, 0])[:, np.newaxis], self.test_data])

        # Initialize the weight matrix
        np.random.seed(seed=42)
        self.start_w = np.random.rand(self.train_data.shape[1])

        # Initialize the train logs
        self.train_logs = {'train_accuracy': [], 'validation_accuracy': [], 'train_loss': [], 'validation_loss': []}

    def sigmoid(self, a):
        """
        inputs:
            v: float

        returns:
            the logistic sigmoid evaluated at a
        """
        # WRITE CODE HERE
        return 1 / (1 + np.exp(-a))

    def forward(self, w, X):
        """
        inputs: w: an array of the current weights, of shape (d,)
                X: an array of n datapoints, of shape (n, d)

        outputs: an array of the output of the logistic regression (not 0s and 1s yet)
        """
        # WRITE CODE HERE
        return self.sigmoid(np.dot(X, w))

    def loss(self, w, X, y):
        """
        inputs: w: an array of the current weights, of shape (d,)
                X: an array of n datapoints, of shape (n, d)

        outputs: the loss. This is exactly the negative log likelihood
        """
        # WRITE CODE HERE
        # Note: add self.eps to a value before taking its log
        E = 10 ** (-8)
        y_pred = self.sigmoid(np.dot(X, w))
        cost = -np.sum(y * np.log(y_pred + E) + (1 - y) * np.log(1 - y_pred + E))
        return cost

    def gradient(self, w, X, y):
        """
        inputs:
            w: an array of the current weights

        returns:
            an array representing the gradient of the loss
        """
        # WRITE CODE HERE
        grad = np.dot(X.T, (self.sigmoid(np.dot(X, w)) - y))
        print(np.linalg.norm(grad))
        return grad

    def gd_step(self, w, X, y):
        """
        inputs:
            w: an array of the current weights

        returns:
            a vector of weights updated according to a step of gradient descent
            on the whole train dataset, using the learning rate self.gd_lr
        """
        # WRITE CODE HERE
        grad = self.gradient(w, X, y)
        wnext = w - self.gd_lr * grad
        return wnext

    def compute_average_loss_and_accuracy(self, w, X, y):
        outputs = self.forward(w, X)
        predictions = np.array(np.round(outputs), dtype=int)
        accuracy = np.mean(y == predictions)
        loss = self.loss(w, X, y) / X.shape[0]
        return loss, accuracy, predictions

    def predict(self, X, w):
        y_predict = self.forward(X, w)
        return y_predict
    
    
    def train_loop(self, n_epochs):
        y_p = 1
        N = 10
        err = 0.1

        # update constants
        c = 1.1
        c1 = 0.9
        
        y_sol = [y_p]
        y_bar = [y_p]

        # starting weight
        w = np.array(np.copy(self.start_w), dtype=np.float128)

        for k in range(n_epochs):

            df = self.df.sample(n=N, random_state=1)
            X = df.drop(['y'], axis=1).to_numpy()
            y = df.y.to_numpy()


            # solve
            # if np.linalg.norm(self.gradient(w, X, y)) <= err:
            if 1==1:
#                 y = self.predict(w, X)
#                 y_sol.append(y)

#                 y_p = np.dot(y_sol, w) / np.sum(w)
#                 y_bar.append(y_p)

                df_loss, df_accuracy, _ = self.compute_average_loss_and_accuracy(w, X, y_p)
                # print("accuracy:", df_accuracy, '   loss: ', df_loss)

                # updates
                w = self.gd_step(w, X, y)
            
            # print(np.linalg.norm(self.gradient(w, X, y)))
        
            N *= c
            N = int(N)
            err *= c1
        
        return w

In [41]:
# def train_loop(self, n_epochs):
#         w = np.array(np.copy(self.start_w), dtype=np.float128)
        
#         c = 1.1
#         N = 10
#         subAlpha = 0.999
#         ErrorThreshold = 0.1
        
#         # Choose an optimizer
#         opt_step = self.gd_step

#         for epoch in range(n_epochs):
#             print(f"epoch: {epoch},     sample size: {N}")
            
#             df = self.df.sample(n=N, random_state=1)
#             X = df.drop(['y'], axis=1).to_numpy()
#             y = df.y.to_numpy()
            
#             ErrorPercent = 1.0
            
#             while(ErrorPercent >  ErrorThreshold):
#                 for i in range(N):
#                     batchX = X[i][:]
#                     batchy = y[i]
                    
#                     cache = self.forward(w, batchX)
#                     grads = self.gradient(w, batchX, batch)
            
#                 # GD
#                 w = opt_step(w, X, y)

#                 df_loss, df_accuracy, _ = self.compute_average_loss_and_accuracy(w, X, y)

#                 # Changer l'erreur selon la sous-itération avec le sous échantions de cette itération.
#                 ErrorPercent = 1.0 - df_accuracy 
#                 # Modifier le taux d'apprentissage.
#                 if(ErrorPercent  >  ErrorThreshold):
#                     subAlpha *= .98888888
            
#             if train_loss > previousLoss:
#                 alpha  = alpha  *  .923
#                 previousLoss = train_lossAll
        
#             # Augmenter le sampleSize et diminuer l'erreur voulu
#             N += 100 
        
#             #  J'ai réaliser que ça prennait trop de temps a converger lorsque le seuil d'erreur devenait trop petit. 
#             if ErrorThreshold > 0.04:
#                 ErrorThreshold  *= 95/100
            
            
#             print("accuracy:", df_accuracy, '   loss: ', df_loss)
            
#             # update sample
#             # N = min(int(c * N), self.df.shape[0])
        
#         return w

In [None]:
mean squared error: sqrt(|y_predictinon^2 - y^2|)
log likelihood: grad(cost)

# main

In [42]:
if __name__ == "__main__":
    # WRITE CODE HERE
    # Instantiate, train, and evaluate your classifiers in the space below
    LR = LogisticRegression()
    start = time.time()
    LR.train_loop(100)
    end = time.time()
    print('Execution Time: ', end - start)

11423.501520987336919
450.01147080725475963
450.00888880109932996
450.0088888010991278
450.0088888010991278
450.0088888010991278
702.85987223628011017
702.85987223628011017
702.85987223628011017
1379.3367971601424655
1379.3362117815227411
1352.1338691120787098
1352.1338691120787098
2278.5124533344117757
25356.465861787600467
2727.7177273317706692
2727.7177273317706692
2727.7177260796980618
5606.634819568686694
5640.796131753034735
5812.4277199806966716
12823.917147268221432
69704.12636279121485
16208.098377045963624
16179.4036045832047845
17806.113276063364413
17806.113276063364413
102670.01410825634958
25438.861609749757038
26783.255067299045532
26951.381652894903118
31293.155881118797764
149198.89081024426274
47073.228378771728345
50139.620730516101503
49789.87294621266655
185309.59712060247205
76576.9981848335447
76549.1102952869074
76843.22709883031882
328286.29833881279427
82077.644428918645715
89206.97450872325151
97464.27099198967486
319235.89294601149936
110432.90575276917887
1

  return 1 / (1 + np.exp(-a))


675580.9442272628222
720924.24711754563094
788226.11943908582964
885101.5916084916152
4812624.3878519569535
1052478.8159450051373
1135269.6856769320078
1226828.4821326899729
1308533.852571569493
7625887.9072688971696
1609779.6842540285953
1783178.5795516948167
1936262.0892782051266
2121522.098072277917
11546238.0500584991205
2528162.9160811610955
2769716.1577912636353
3037236.4196835253938
3289914.764911243107
19622109.36942735976
4172191.6605259159642
4568443.9948076412857
5118207.658731423531
5628188.234729919363
3712727.7170039777586
6373154.2566610609224
38904009.266994256668
8594568.746923838528
9561285.87368524572
10611243.398676942987


ValueError: Cannot take a larger sample than population when 'replace=False'

In [None]:
 def train_loop(self):
        w = np.array(np.copy(self.start_w), dtype=np.float128)
        
        c = 1.1
        N = 20
        
        # Choose an optimizer
        opt_step = self.gd_step
        
        # Error
        ErrorTreshold = 0.1
        Error = 1
        epoch = 0
        
        acc = []
        err = []
        ep = []
        while Error > ErrorTreshold and N < self.df.shape[0]:
        # for epoch in range(n_epochs):
        
            print(f"epoch: {epoch},     sample size: {N}")
            epoch += 1
            df = self.df.sample(n=N, random_state=1)
            X = df.drop(['y'], axis=1).to_numpy()
            y = df.y.to_numpy()
            
            # GD
            w = opt_step(w, X, y)
            
            df_loss, df_accuracy, _ = self.compute_average_loss_and_accuracy(w, X, y)
            Error = 1 - df_accuracy
            print(f"Error: {Error}")
            
            err.append(df_loss)
            acc.append(df_accuracy)
            ep.append(epoch)
            
            print("accuracy:", df_accuracy, '   loss: ', df_loss)
            
            # update sample
            N = min(int(c * N), self.df.shape[0])
            
            if ErrorTreshold > 0.04:
                ErrorTreshold  *= 95/100
                
        print(f"itteration number: {epoch}")
        
        
        fig = px.scatter(x=ep, y=err)
        fig.show()
        
        return w