# Lab-03: Logistic Regression

### Imports

In [1]:
from IPython.display import clear_output
!pip3 install ../Libs/plotting_funcs-0.0.1-py3-none-any.whl  --force-reinstall
clear_output()

In [2]:
import numpy as np
import pandas as pd
from math import ceil, floor
import plotly.graph_objects as go
from dataclasses import dataclass
from typing import Optional, Dict, Any
from plotting_funcs.graphs import show, list_plot, line_plot, show_multi, confusion_graph

In [3]:
import torch
from torch import nn
from tqdm import tqdm, trange
from functools import partial
from torch.utils.data import Dataset, DataLoader

In [4]:
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

### Data Exploration and Cleaning

In [5]:
titanic_data = pd.read_csv("data/titanic.csv")
titanic_data.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,211.3375,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,151.55,S
2,0,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,151.55,S
3,0,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,151.55,S
4,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,151.55,S


In [6]:
# Check types to be sure they can be used in model
titanic_data.dtypes.to_frame().T

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked
0,int64,int64,object,object,float64,int64,int64,float64,object


In [7]:
# Check for empty cells count
def check_nan(data: pd.DataFrame):
    count = data.where(titanic_data.isna(), 0).where(data.notna(), 1).to_numpy().sum()
    print(f"NaN values present: {count}")
    return count


check_nan(titanic_data)

NaN values present: 266.0


266.0

In [8]:
# Fill NaN types with most frequent
titanic_data.replace(np.nan, titanic_data.mode(numeric_only=False).iloc[0], inplace=True)

In [9]:
assert check_nan(titanic_data) == 0, "Data contains NaN values"

NaN values present: 0.0


In [10]:
titanic_data.drop(["name"], axis=1, inplace=True)  # Name does not affect survival probability

# one-hot-encoding for the columns [sex, embarked]
titanic_data = pd.get_dummies(titanic_data, columns=["sex", "embarked"])

In [11]:
titanic_data.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,1,1,29.0,0,0,211.3375,1,0,0,0,1
1,1,1,0.92,1,2,151.55,0,1,0,0,1
2,0,1,2.0,1,2,151.55,1,0,0,0,1
3,0,1,30.0,1,2,151.55,0,1,0,0,1
4,0,1,25.0,1,2,151.55,1,0,0,0,1


In [12]:
# Confirm types
titanic_data.dtypes.to_frame().T

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,int64,int64,float64,int64,int64,float64,uint8,uint8,uint8,uint8,uint8


In [13]:
train_data = titanic_data.sample(frac=0.8)
test_data = titanic_data.drop(train_data.index)

In [50]:
class TitanicDataset(Dataset):
    def __init__(self, data: pd.DataFrame):
        self.data = data

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        input_ = torch.from_numpy(self.data.iloc[idx, 1:].to_numpy()).float()
        output_ = torch.tensor(self.data.iloc[idx, 0]).unsqueeze(0).float()
        return input_, output_

In [51]:
train_dataset = TitanicDataset(train_data)
test_dataset = TitanicDataset(test_data)

In [52]:
train_subset, val_subset = torch.utils.data.random_split(
    train_dataset,
    [ceil(len(train_dataset) * 0.8), floor(train_dataset.__len__() * 0.2)]
)

In [53]:
train_dataloader = DataLoader(train_dataset, batch_size=100, shuffle=True, num_workers=0)
test_dataloader = DataLoader(test_dataset, batch_size=100, shuffle=True, num_workers=0)

In [103]:
def precision(prediction: torch.Tensor, actual: torch.Tensor, threshold=0.5):
    with torch.no_grad():
        n_samples = prediction.size(dim=0)  # total number of samples
        prediction = torch.where(prediction > threshold, torch.tensor(1), torch.tensor(0))
        n_true_pos = torch.where((actual - prediction) == 0)[0].size(dim=0)  # number of true positives
        return n_true_pos / n_samples


def recall(prediction: torch.Tensor, actual: torch.Tensor, threshold=0.5):
    with torch.no_grad():
        prediction = torch.where(prediction > threshold, torch.tensor(1), torch.tensor(0))
        n_true_pos = torch.where((actual - prediction) == 0)[0].size(dim=0)  # number of true positives
        n_false_neg = torch.where((actual - prediction) > 0)[0].size(dim=0)  # number of false negatives
        return n_true_pos / (n_true_pos + n_false_neg)


def f_score(prediction: torch.Tensor, actual: torch.Tensor, beta=0.1, threshold=0.5):
    p = precision(prediction, actual, threshold)
    r = recall(prediction, actual, threshold)
    beta_2 = beta ** 2
    return (beta_2 + 1) * ((p * r) / ((beta_2 * p) + r))

In [209]:
class TorchLogisticRegression(nn.Module):
    def __init__(self, n_=1):
        super(TorchLogisticRegression, self).__init__()
        self.threshold = 0.5
        self.node = nn.Sequential(
            nn.Linear(n_, 1),
            nn.Sigmoid()
        )

    def forward(self, input_):
        return self.node(input_)

    def train_model(self, config: Optional[Dict[str, Any]], dataset: DataLoader, optimizer=None, loss_func=nn.BCEWithLogitsLoss(), n=2, device = torch.device("cpu")):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001) if optimizer is None else optimizer
        losses = []
        accuracy = 0

        for epoch in range(n):
            predictions, actual = [], []
            losses.append([])
            for (j, batch) in enumerate(dataset):
                inputs, outputs = batch
                inputs, outputs = inputs.to(device), outputs.to(device)
                actual.append(outputs)
                pred = self(inputs)
                loss = loss_func(pred, outputs)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                losses[epoch].append(loss)
                predictions.append(pred)

            acc = f_score(
                torch.vstack(predictions).to(torch.device("cpu")),
                torch.vstack(actual).to(torch.device("cpu")),
                threshold=config["threshold"]
            )
            accuracy = acc if (acc > accuracy) else accuracy

        # with tqdm(total=n, desc="epoch", postfix={"loss": torch.inf}) as tqdm_control:
        #
        #
        #         tqdm_control.update()
        #         if epoch % 10 == 0:
        #             tqdm_control.set_postfix({"loss": torch.tensor(losses).min()})

        tune.report(loss=torch.tensor(losses).min(), accuracy=accuracy)

    def test_model(self, dataset: DataLoader, threshold: float = 0.5):
        predictions, actual = [], []
        with torch.no_grad():
            for (j, batch) in enumerate(dataset):
                inputs, truth = batch
                actual.append(truth)
                pred = torch.where(self(inputs) > threshold, torch.tensor(1), torch.tensor(0))
                predictions.append(pred)

        return actual, predictions

In [236]:
device = torch.device("cpu")
config = dict(threshold=tune.uniform(0, 1))
predictor = TorchLogisticRegression(10).to(device = device)

In [237]:
predictor.train_model(dataset=train_dataloader, config=dict(threshold=0.5), n=1, device=device)

In [None]:
#GPU: 3:06
#CPU: 1:15

In [244]:
result = tune.run(
    partial(predictor.train_model, dataset=train_dataloader, n=10_000, device=device),
    resources_per_trial={"cpu": 10, "gpu": 0},
    metric="accuracy",
    mode="max",
    config=config,
    num_samples=10,
    progress_reporter=CLIReporter(metric_columns=["loss", "accuracy", "training_iteration"]),
    checkpoint_at_end=False,
    local_dir="ray_results/",
    verbose=1
)
best_trial = result.get_best_trial(metric="accuracy", mode="max", scope="all")
print("Best trial config: {}".format(best_trial.config))

== Status ==
Current time: 2022-08-16 21:56:40 (running for 00:00:00.17)
Memory usage on this node: 11.8/16.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 10.0/10 CPUs, 0/0 GPUs, 0.0/7.38 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/cisimon7/Desktop/Research/MachineLearningFromScratch/Lab03:LogisticRegression/ray_results/train_model_2022-08-16_21-56-40
Number of trials: 10/10 (9 PENDING, 1 RUNNING)


== Status ==
Current time: 2022-08-16 21:56:46 (running for 00:00:06.58)
Memory usage on this node: 11.9/16.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 10.0/10 CPUs, 0/0 GPUs, 0.0/7.38 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/cisimon7/Desktop/Research/MachineLearningFromScratch/Lab03:LogisticRegression/ray_results/train_model_2022-08-16_21-56-40
Number of trials: 10/10 (9 PENDING, 1 RUNNING)


== Status ==
Current time: 2022-08-16 21:56:51 (running for 00:00:11.59)
Memory usage on this node: 11.9/16.0 GiB
Using FIFO scheduling algorithm.
Resou

2022-08-17 00:24:50,955	INFO tune.py:747 -- Total run time: 8890.80 seconds (8890.68 seconds for the tuning loop).


== Status ==
Current time: 2022-08-17 00:24:50 (running for 02:28:10.69)
Memory usage on this node: 11.9/16.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/10 CPUs, 0/0 GPUs, 0.0/7.38 GiB heap, 0.0/2.0 GiB objects
Current best trial: 297b2_00008 with accuracy=0.8044687452701681 and parameters={'threshold': 0.30233866974809265}
Result logdir: /Users/cisimon7/Desktop/Research/MachineLearningFromScratch/Lab03:LogisticRegression/ray_results/train_model_2022-08-16_21-56-40
Number of trials: 10/10 (10 TERMINATED)


Best trial config: {'threshold': 0.30233866974809265}


In [245]:
actual, predictions = predictor.test_model(test_dataloader, threshold=0.3)

In [246]:
((torch.vstack(actual) - torch.vstack(predictions)) == 0).sum()

tensor(172)

In [247]:
show(confusion_graph(
    labels=["True", "False"],
    enc_labels=[0, 1],
    actual=torch.vstack(actual).numpy().ravel(),
    prediction=torch.vstack(predictions).numpy().ravel()
))

In [248]:
from sklearn.metrics import confusion_matrix
confusion_matrix(torch.vstack(actual).numpy().ravel(), torch.vstack(predictions).numpy().ravel())

array([[172,   0],
       [ 90,   0]])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

clf = LogisticRegression(random_state=0).fit(x_train, y_train)
y_test_pred = clf.predict(x_test)