In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import csv
import os
import pickle

import warnings
warnings.filterwarnings('ignore')

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import torch.optim as optim

from tqdm.notebook import tqdm


# Data 

In [3]:
def create_dir(dir):
    if os.path.exists(dir):
        pass
    else:
        os.makedirs(dir)

def load_data(type, dir='modified_training_data'):
    search_dir = os.path.join(dir, type)

    if os.path.exists(search_dir):
        x_data_path = os.path.join(search_dir, 'x_train.csv')
        y_data_path = os.path.join(search_dir, 'y_train.csv')

        x_data = pd.read_csv(x_data_path)
        y_data = pd.read_csv(y_data_path)

        if 'id' in x_data.columns:
            x_data.set_index('id') 
        if 'id' in y_data.columns:
            y_data.set_index('id')

        return x_data, y_data

In [4]:
# loading data 
X_base, y_base = load_data('base')
X_test, y_test = load_data('test')
X_over, y_over = load_data('oversampling')
X_under, y_under = load_data('undersampling')
X_smote, y_smote = load_data('smote')

In [5]:
# Shape of each of them 
train_list = [X_base, y_base ,X_test, y_test, X_over, y_over, X_under, y_under, X_smote, y_smote]
list_name = ['X_base', 'y_base', 'X_test', 'y_test', 'X_over', 'y_over', 'X_under', 'y_under', 'X_smote', 'y_smote']

for name, data in zip(list_name,train_list):
    print(f"{name} : {data.shape}")

X_base : (675000, 16)
y_base : (675000, 1)
X_test : (75000, 16)
y_test : (75000, 1)
X_over : (1319024, 16)
y_over : (1319024, 1)
X_under : (180976, 16)
y_under : (180976, 1)
X_smote : (1187122, 16)
y_smote : (1187122, 1)


Experimenting with X_train (over) for starters

In [6]:
X_train = torch.tensor(X_over.values, dtype=torch.float32)
y_train = torch.tensor(y_over.values, dtype=torch.float32).reshape(-1, 1)

In [7]:
X_test_exp = torch.tensor(X_test.values, dtype=torch.float32)
y_test_exp = torch.tensor(y_test.values, dtype=torch.float32).reshape(-1, 1)

In [8]:
X_train.shape, y_train.shape

(torch.Size([1319024, 16]), torch.Size([1319024, 1]))

# Neural Network

In [9]:
device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

Using mps device


In [10]:
# Model break down
# l0 = nn.Flatten()
l1 = nn.Linear(16, 32)
l2 = nn.Linear(32, 64)
l3 = nn.Linear(64, 1)

# Layer list
layer_list = [l1, l2, l3]

input_tensor = X_train

for l_num, layer in enumerate(layer_list):
    print(f"Layer {l_num + 1}: ")
    print(f'    input: {input_tensor.shape}')
    input_tensor = layer(input_tensor)
    print(f'    output: {input_tensor.shape}')

Layer 1: 
    input: torch.Size([1319024, 16])
    output: torch.Size([1319024, 32])
Layer 2: 
    input: torch.Size([1319024, 32])
    output: torch.Size([1319024, 64])
Layer 3: 
    input: torch.Size([1319024, 64])
    output: torch.Size([1319024, 1])


In [40]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [41]:
X = torch.rand(675000, 16)
model = NeuralNetwork()
logits = model(X)
# pred_probab = nn.Sigmoid()(logits)
y_pred = logits.argmax(1)
print(f"Predicted class: {y_pred}")

Predicted class: tensor([0, 0, 0,  ..., 0, 0, 0])


In [42]:
model

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=16, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

In [43]:
y_pred.shape, y_train.shape

(torch.Size([675000]), torch.Size([1319024, 1]))

In [44]:
def save_epoch_data(epoch_data, csv_file_path="training-log/nn_base_2.csv"):
    if os.path.exists(csv_file_path):
        with open("training-log/nn_base_2.csv", 'a') as csv_file:
            csvwriter = csv.writer(csv_file)   
            csvwriter.writerow(epoch_data)
    else:
        with open("training-log/nn_base_2.csv", 'w') as csv_file:
            csvwriter = csv.writer(csv_file)
            header_row = ['Epoch', 'train-loss']
            csvwriter.writerow(header_row)
            csvwriter.writerow(epoch_data)


In [45]:
loss_fn = nn.BCELoss()  
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [47]:
n_epochs = 100
batch_size = 32

X_val = X_test_exp
y_val = y_test_exp

history = []
 
for epoch in range(n_epochs):
    for i in range(0, len(X), batch_size):
        Xbatch = X_train[i:i+batch_size]
        y_pred = model(Xbatch)
        ybatch = y_train[i:i+batch_size]

        loss = loss_fn(y_pred, ybatch)

        epoch_data = [
            epoch+1, loss.item()
        ]
        save_epoch_data(epoch_data)
        history.append(epoch_data)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch+1}, train-loss: {loss:.4f}')

Epoch: 1, train-loss: 0.0856
Epoch: 2, train-loss: 0.0826
Epoch: 3, train-loss: 0.0817
Epoch: 4, train-loss: 0.0788
Epoch: 5, train-loss: 0.0816
Epoch: 6, train-loss: 0.0772


KeyboardInterrupt: 

In [None]:
# compute accuracy (no_grad is optional)
with torch.no_grad():
    y_pred = model(X_test_exp)
 
accuracy = (y_pred.round() == y_test_exp).float().mean()
print(f"Accuracy {accuracy}")

Accuracy 0.915453314781189


In [None]:
y_pred

tensor([[0.8310],
        [0.0017],
        [0.4578],
        ...,
        [0.0775],
        [0.1478],
        [0.1387]])

In [None]:
torch.tensor([0.500]).round()

tensor([0.])

In [None]:
def save_result_data(result_data, csv_file_path="training-log/model_nn_result.csv"):
    if os.path.exists(csv_file_path):
        with open(csv_file_path, 'a') as csvfile:
            csvwriter = csv.writer(csvfile)   
            csvwriter.writerow(result_data)

    else:
        with open(csv_file_path, 'w') as csvfile:
            csvwriter = csv.writer(csvfile)   
            csvwriter.writerow([
                'Name', 'Epochs','batch_size', 'Loss_fn','Optimizer', 'LR', 
                'Train Data Type', 'Accuracy', 'Precision', 'Recall', 'F1 score', 'ROC_AUC Score'
            ])
            csvwriter.writerow(result_data)

    csvfile.close()

In [None]:
result = [   
    'nn_base',
    50,
    32,
    'BCE Loss',
    'Adam',
    0.001,
    'OverSampling',
    accuracy_score(y_test_exp, y_pred.round()),
    precision_score(y_test_exp, y_pred.round()),
    recall_score(y_test_exp, y_pred.round()),
    f1_score(y_test_exp, y_pred.round()),
    roc_auc_score(y_test_exp, y_pred.round())
]

save_result_data(result_data=result)
print(result)


['nn_base', 50, 32, 'BCE Loss', 'Adam', 0.001, 'OverSampling', 0, 0, 0, 0, 0]


In [None]:
model_name = 'nn_base'
torch.save(model.state_dict(), f'model/{model_name}')