<a href="https://colab.research.google.com/github/chaitanyaprsd1/public/blob/main/Model_Log_Regression_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing all necessary packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Splitting the dataset into training and validation sets in the ratio of 4:1

In [2]:
def split_dataset(X_data, Y_data, divisor=5):
    """Deterministically split features and labels into training and validation sets.

    Rows are assigned by position: a row with index ``i`` goes to the
    validation set when ``i % divisor == divisor - 1`` and to the training
    set otherwise. With the default ``divisor=5`` this gives a 4:1
    train/validation ratio (every fifth row is held out).

    Parameters
    ----------
    X_data : array-like
        Feature matrix; rows are samples.
    Y_data : array-like
        Labels, indexed by the same row positions as ``X_data``.
    divisor : int, optional
        Period of the hold-out pattern. Must be a positive integer.

    Returns
    -------
    tuple of numpy.ndarray
        ``(trainset_X, trainset_Y, validset_X, validset_Y)``.

    Raises
    ------
    ValueError
        If ``divisor`` is smaller than 1.
    """
    if divisor < 1:
        raise ValueError("divisor must be a positive integer, got {}".format(divisor))

    features = np.asarray(X_data)
    labels = np.asarray(Y_data)

    # Boolean mask selecting the last row of every group of `divisor` rows.
    row_positions = np.arange(features.shape[0])
    is_validation = (row_positions % divisor) == (divisor - 1)
    is_training = ~is_validation

    return (features[is_training], labels[is_training],
            features[is_validation], labels[is_validation])

Functions implementing multinomial (softmax) logistic regression trained with batch gradient descent

In [3]:
def softmax(z):
    """Row-wise softmax of a 2-D array of logits.

    Parameters
    ----------
    z : array-like of shape (n_samples, n_classes)
        Raw scores (logits), one row per sample.

    Returns
    -------
    numpy.ndarray
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    z = np.asarray(z, dtype=float)
    if z.size == 0:
        return z
    # Subtracting the per-row maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (and then NaN).
    shifted = z - np.max(z, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

def pred_softmax(z):
    """Softmax over all elements of a single score vector.

    Parameters
    ----------
    z : array-like
        Raw scores (logits) for one sample.

    Returns
    -------
    numpy.ndarray
        Array of the same shape whose elements are non-negative and sum to 1.
    """
    z = np.asarray(z, dtype=float)
    if z.size == 0:
        return z
    # Shift by the maximum so np.exp cannot overflow; the ratio is unchanged.
    exp_scores = np.exp(z - np.max(z))
    return exp_scores / np.sum(exp_scores)
 
def cost_function(targets, predictions):
    """Average cross-entropy between one-hot targets and predicted probabilities.

    Predictions are clipped to ``[1e-5, 1 - 1e-5]`` before taking the log so
    that a probability of exactly 0 never produces ``-inf``.

    Parameters
    ----------
    targets : numpy.ndarray
        Target matrix with the same shape as ``predictions``.
    predictions : numpy.ndarray
        Predicted probabilities, one row per sample.

    Returns
    -------
    float
        Summed cross-entropy divided by the number of rows.
    """
    eps = 0.00001
    clipped = np.clip(predictions, eps, 1. - eps)
    n_rows = clipped.shape[0]
    weighted_log_probs = targets * np.log(clipped)
    return -np.sum(weighted_log_probs) / n_rows

def gradient_descent(X, prob, Y):
    """Return the un-normalised cross-entropy gradient ``X^T (prob - Y)``.

    The result is a sum over samples (not an average); any division by the
    number of samples is left to the caller.
    """
    residual = prob - Y
    return X.T @ residual


def logistic_regression(X, Y, alpha, max_iterations=10000000, tol=0.00001):
    """Fit a multinomial logistic-regression model with batch gradient descent.

    Weights start at all ones and are updated with the average gradient of
    the cross-entropy loss over the whole dataset on every iteration.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_features)
        Feature matrix.
    Y : numpy.ndarray of shape (n_samples, n_classes)
        One-hot encoded targets.
    alpha : float
        Learning rate.
    max_iterations : int, optional
        Upper bound on the number of gradient steps.
    tol : float, optional
        Convergence tolerance on the change in loss.

    Returns
    -------
    weights : numpy.ndarray of shape (n_classes, n_features)
        Fitted weights. These are returned even when the tolerance was not
        reached within ``max_iterations``, so callers can always unpack the
        result.
    Lvals : list of float
        Average loss recorded at every iteration.
    """
    n_samples, n_features = X.shape
    n_classes = Y.shape[1]

    weights = np.ones((n_classes, n_features))
    Lvals = []

    for step in range(max_iterations):
        sigma = softmax(np.matmul(X, weights.T))
        # Average loss over the whole dataset, computed before this step's update.
        Lvals.append(cost_function(Y, sigma))

        dw = gradient_descent(X, sigma, Y)
        weights = weights - alpha * dw.T / n_samples

        # Stopping criterion: needs at least four recorded losses.
        if len(Lvals) > 3:
            if step % 1000 == 0:
                print('running iteration {} at a loss of {}'.format(step, Lvals[-1]))
            # Checked on even iterations only: the latest loss must be within
            # `tol` of both of the two preceding losses.
            converged = (
                step % 2 == 0
                and abs(Lvals[-1] - Lvals[-2]) < tol
                and abs(Lvals[-1] - Lvals[-3]) < tol
            )
            if converged:
                print("Successfully fitted to training data after {} iterations".format(step))
                return weights, Lvals

    # Iteration budget exhausted: still hand back the best-effort result
    # instead of implicitly returning None.
    print('Stopped after {} iterations without reaching the convergence tolerance'.format(max_iterations))
    return weights, Lvals
    
def predict_class(X, weights):
    """Predict the most likely class index for every row of ``X``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    weights : array-like of shape (n_classes, n_features)
        Weight matrix, one row per class.

    Returns
    -------
    list
        Predicted class index for each row, in row order.
    """
    logits = np.matmul(np.asarray(X), np.asarray(weights).T)
    # Softmax is strictly increasing, so the arg-max of the logits is the
    # arg-max of the class probabilities; no normalisation is needed here.
    # The full prediction list is deliberately not printed: with tens of
    # thousands of rows it floods the notebook output.
    return list(np.argmax(logits, axis=1))

def error_count(pred_list, Y, alpha, label='Training'):
    """Print and return the fraction of predictions that match the one-hot labels.

    Despite its name, this function reports accuracy (the share of correct
    predictions), not an error count.

    Parameters
    ----------
    pred_list : sequence of int
        Predicted class index per sample.
    Y : array-like of shape (n_samples, n_classes)
        One-hot encoded true labels; only the first ``len(pred_list)`` rows
        are used.
    alpha : float
        Learning rate, used only in the printed message.
    label : str, optional
        Name of the evaluated split shown in the message (e.g. 'Validation').

    Returns
    -------
    float
        Accuracy in ``[0, 1]``, or ``nan`` when ``pred_list`` is empty.

    Raises
    ------
    ValueError
        If ``Y`` has fewer rows than ``pred_list`` has entries.
    """
    predicted = np.asarray(pred_list)
    total = predicted.shape[0]

    if total == 0:
        # Nothing to score; avoid a division by zero.
        accuracy = float('nan')
    else:
        # Decode each one-hot row to its class index in one vectorized pass.
        true_classes = np.argmax(np.asarray(Y)[:total], axis=1)
        if true_classes.shape[0] != total:
            raise ValueError(
                "Y has {} rows but {} predictions were given".format(true_classes.shape[0], total))
        accuracy = int(np.count_nonzero(predicted == true_classes)) / total

    print('{} Accuracy with learning rate of {} -> {}'.format(label, alpha, accuracy))
    return accuracy

Mounting Google Drive and loading the training data

In [4]:
# Colab-only: mount Google Drive at /content/drive, the root of the CSV paths read below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
######################################################################################################################
# Load training features X from the mounted Drive; the last CSV column is dropped
df_train_X = pd.read_csv(r'/content/drive/MyDrive/Data_Kaggle1/train.csv')
df_train_X = df_train_X.iloc[:, :-1]
bias = np.ones((df_train_X.shape[0], 1))
df_train_X = np.concatenate((bias, df_train_X), axis=1) # Dummy column of 1s added to feature matrix for bias

# Load training labels Y (last column of the results file)
df_Y = pd.read_csv(r'/content/drive/MyDrive/Data_Kaggle1/train_result.csv')
df_Y = df_Y.iloc[:, -1]
# get_dummies takes the Series directly, so no separate array conversion is needed
Y_onehot = pd.get_dummies(df_Y).values # One hot encoding of training labels

Data split to estimate training and validation error rate

In [6]:
# Run the split once and unpack all four parts, rather than recomputing it for every piece.
X_train, y_train, X_valid, y_valid = split_dataset(df_train_X, Y_onehot)

In [7]:
# Sanity check: show the shape of each of the four splits on one line.
print(*(part.shape for part in (X_train, y_train, X_valid, y_valid)))

(40000, 1569) (40000, 19) (10000, 1569) (10000, 19)


In [8]:

############################################## EXECUTION ############################################################
# Candidate learning rates to try: 0.001, 0.01, 0.025, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5
# The learning rate is a scalar; a one-element list only worked through NumPy broadcasting.
alpha = 0.2

weights, Lvals = logistic_regression(X_train, y_train, alpha)

# Training accuracy
pred_train = predict_class(X_train, weights)
error_count(pred_train, y_train, alpha)

# Validation accuracy
# NOTE: error_count's message reads 'Training Accuracy' for this call as well.
pred_valid = predict_class(X_valid, weights)
error_count(pred_valid, y_valid, alpha)

running iteration 10 at a loss of 2.677244643875199
running iteration 20 at a loss of 2.5980043315521146
running iteration 30 at a loss of 2.543982728863315
running iteration 40 at a loss of 2.504185384516139


KeyboardInterrupt: ignored

Kaggle Submission

In [None]:
# Load the test features; the last CSV column is dropped, as is done for the training features.
df_test_X = pd.read_csv(r'/content/drive/MyDrive/Data_Kaggle1/test.csv').iloc[:, :-1]
test_set_size = df_test_X.shape[0]
list_index = list(df_test_X.index.values)

# Prepend a dummy column of 1s to the feature matrix for the bias term.
bias_test = np.ones((test_set_size, 1))
df_test_X = np.concatenate((bias_test, df_test_X), axis=1)

pred_test = predict_class(df_test_X, weights)

In [None]:
# out = pd.DataFrame({'Index': list_index,'Class':pred_test})
# filename = 'LGR_LR0.2.csv'
# out.to_csv(filename, index=False)
# print('Saved file: ' + filename)