In [1]:
import pandas as pd
import numpy as np

### Load Data

In [2]:
# open the file
df = pd.read_csv('data/titanicdata.csv', header=0)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age
0,0,3,2,22.0
1,1,1,1,38.0
2,1,3,1,26.0
3,1,1,1,35.0
4,0,3,2,35.0


## normalize data

In [3]:
# normalize the data using the formula (x - min(x)) / (max(x) - min(x))
def normalize(df):
    result = df.copy()
    for feature_name in df.columns:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result

In [4]:
df_standardized = normalize(df)
df_standardized.head()

Unnamed: 0,Survived,Pclass,Sex,Age
0,0.0,1.0,1.0,0.271174
1,1.0,0.0,0.0,0.472229
2,1.0,1.0,0.0,0.321438
3,1.0,0.0,0.0,0.434531
4,0.0,1.0,1.0,0.434531


In [5]:
#to numpy array
df_standardized_array = df_standardized.to_numpy()

# Classifier

In [6]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [19]:
from tqdm import tqdm

class LogisticRegression:
    def __init__(self, learning_rate=0.01, max_iter=1000):
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.weights = None
        self.bias = None
    
    def fit(self, X, y):
        # Initialize weights and bias
        self.weights = np.zeros(X.shape[1])
        self.bias = 0
        
        
        # Iterate over the number of maximum iterations
        for _ in range(self.max_iter):
            # Compute the dot product of the input features and the weights
            z = np.dot(X, self.weights) + self.bias

            # for numeric stability
            #z -= np.max(z)

            # Compute the conditional log-likelihood of the data
            log_likelihood = np.sum(y * (z)) - np.sum(np.log(1 + np.exp(z)))

            # compute loss
            loss = -np.sum(y * np.log(sigmoid(z)) + (1 - y) * np.log(1 - sigmoid(z)))

            # print loss
            #print("loss is", loss)
            
            # Compute the gradient of the weights and bias
            gradient_bias    = np.sum(y - ((np.exp(z)) / (1 + np.exp(z))))
            gradient_weights = ((X.T * (y - ( (np.exp(z)) / (1 + np.exp(z)) ) )).T).sum(axis=0)
       
            # Update the weights and bias
            self.weights += self.learning_rate * gradient_weights
            self.bias += self.learning_rate * gradient_bias

    def predict(self, X):
        # Compute the dot product of the input features and the weights
        z = np.dot(X, self.weights) + self.bias
        
        # Apply the sigmoid function to the dot product to get the predictions
        y_pred = sigmoid(z)
        
        # Return the predictions as a class label (0 or 1)
        return np.round(y_pred).astype(int)


## Split data into training, validation, and test sets

In [8]:
# Split the data into three random sets, 60% as train, 20% validation and 20% as test
train, validate, test = np.split(df_standardized_array, [int(.6*len(df_standardized_array)), int(.8*len(df_standardized_array))])

In [9]:
# first feature is the label
train_X, train_y = train[:, 1:], train[:, 0]
validate_X, validate_y = validate[:, 1:], validate[:, 0]
test_X, test_y = test[:, 1:], test[:, 0]

In [10]:
# Train the model
model = LogisticRegression(learning_rate=0.001, max_iter=50)
model.fit(train_X, train_y)
# get accuracy

100%|██████████| 50/50 [00:00<00:00, 8371.53it/s]

loss is 370.14059441901077
loss is 351.5877922304335
loss is 340.51969689325733
loss is 333.37775603299906
loss is 328.33642101621876
loss is 324.4581046433018
loss is 321.2528215167764
loss is 318.45876881837285
loss is 315.93279451420983
loss is 313.59473685182337
loss is 311.39851744877626
loss is 309.3168111087536
loss is 307.33276462639225
loss is 305.4354533724686
loss is 303.6173564443444
loss is 301.8729387628625
loss is 300.1978478720862
loss is 298.58845556866106
loss is 297.041594536164
loss is 295.55440595608667
loss is 294.124250582497
loss is 292.74865624084487
loss is 291.4252862858493
loss is 290.1519201405564
loss is 288.92644080729065
loss is 287.74682640615083
loss is 286.61114404416287
loss is 285.517545038475
loss is 284.46426093329643
loss is 283.4495999908884
loss is 282.47194397577186
loss is 281.5297451311914
loss is 280.6215232925557
loss is 279.7458631084575
loss is 278.9014113543209
loss is 278.0868743315873
loss is 277.3010153494655
loss is 276.542652288264




In [11]:
# get train accuracy
train_pred = model.predict(train_X)
train_accuracy = np.sum(train_pred == train_y) / len(train_y)
train_accuracy

0.799625468164794

# Hyperparameters Tuning

In [24]:
learning_rates = [0.0001, 0.001, 0.01, 0.1]
max_iters = [10, 50, 100, 1000]

for learning_rate in learning_rates:
    for max_iter in max_iters:
        model = LogisticRegression(learning_rate=learning_rate, max_iter=max_iter)
        model.fit(train_X, train_y)
        validate_pred = model.predict(validate_X)
        validate_accuracy = np.sum(validate_pred == validate_y) / len(validate_y)
        print("learning rate is", learning_rate, "max iter is", max_iter, "validation accuracy is", validate_accuracy)

learning rate is 0.0001 max iter is 10 validation accuracy is 0.601123595505618
learning rate is 0.0001 max iter is 50 validation accuracy is 0.601123595505618
learning rate is 0.0001 max iter is 100 validation accuracy is 0.601123595505618
learning rate is 0.0001 max iter is 1000 validation accuracy is 0.7359550561797753
learning rate is 0.001 max iter is 10 validation accuracy is 0.601123595505618
learning rate is 0.001 max iter is 50 validation accuracy is 0.7359550561797753
learning rate is 0.001 max iter is 100 validation accuracy is 0.7359550561797753
learning rate is 0.001 max iter is 1000 validation accuracy is 0.7359550561797753
learning rate is 0.01 max iter is 10 validation accuracy is 0.7359550561797753
learning rate is 0.01 max iter is 50 validation accuracy is 0.7359550561797753
learning rate is 0.01 max iter is 100 validation accuracy is 0.7359550561797753
learning rate is 0.01 max iter is 1000 validation accuracy is 0.7359550561797753
learning rate is 0.1 max iter is 10

In [29]:
# combine train and validation data
train_valid_X = np.concatenate((train_X, validate_X), axis=0)
train_valid_y = np.concatenate((train_y, validate_y), axis=0)

# train the model with train and validation data
model = LogisticRegression(learning_rate=0.001, max_iter=1000)
model.fit(train_valid_X, train_valid_y)

# predict on test data
test_pred = model.predict(test_X)
test_accuracy = np.sum(test_pred == test_y) / len(test_y)
test_accuracy


0.8044692737430168