In [45]:
# import tools

import os
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

In [46]:
# dataset class

class JaccardDataset(Dataset):
    def __init__(self, csvpath):
        df = pd.read_csv(csvpath)
        df = df[["jaccard_coeff", "1hop_jaccard_coeff", "bm_edge_probability", "y_true"]].values
        self.inpt = df[:,0:3]   
        self.oupt = df[:,[3]]
    def __len__(self):
        return (len(self.oupt))
    def __getitem__(self, idx):
        x = self.inpt[idx]  
        y = self.oupt[idx]
        return {'data': x,
                'target': y}

In [47]:
# simple, linear, feed-foward nn with some hidden layers and relu activation

class Network(nn.Module):

    def __init__(self):
        super().__init__()

        # increases then decreases dimensionality
        self.fc1 = nn.Linear(3, 5)
        self.fc2 = nn.Linear(5, 8)
        self.fc3 = nn.Linear(8, 6)
        self.fc4 = nn.Linear(6, 4)
        self.fc5 = nn.Linear(4,1)

    def forward(self,x):

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = self.fc5(x)

        return x

In [48]:
# function to train model

def train(model, train_loader, criterion, optimizer, dataset_size, train_loss=0, correct=0, train_accuracy=0):
    with tqdm(total=len(train_loader),
                desc='Train Epoch     #{}'.format(epoch + 1),
                disable=False) as t:
        for batch_idx, batch in enumerate(train_loader):
            x_train, y_train = batch['data'].float(), batch['target'].float()
            
            # for debugging purposes
            #print('x_train:', x_train)
            #print('y_train:', y_train)
            #print('x_train size:', len(x_train))
            #print('y_train size:', len(y_train))
            
            optimizer.zero_grad()
            output = model(x_train)
            loss = criterion(output, y_train)
            train_loss += loss
            loss.backward()
            optimizer.step()
            for i in range(len(y_train)):
                out, y = output[i], y_train[i]
                out = torch.round(out)
                if (out == y):
                    correct += 1
            train_accuracy += correct/dataset_size
            correct = 0
            t.set_postfix({'loss': train_loss, 'accuracy': 100. * train_accuracy})
            t.update(1)
    log_writer.add_scalar('train_loss', train_loss, epoch)
    log_writer.add_scalar('train_accuracy', train_accuracy, epoch)
            

In [49]:
# function to validate  model 

def val(final_validation_input, final_validation_output, final_validation_target, epoch, EPOCHS, model, test_loader, 
        criterion, dataset_size, val_loss=0, correct=0, val_accuracy=0):
    with tqdm(total=len(test_loader),
              desc='Validate Epoch  #{}'.format(epoch + 1),
              disable=False) as t:
        with torch.no_grad():
            for batch_idx, batch in enumerate(test_loader):
                x_test, y_test = batch['data'].float(), batch['target'].float()
                output = model(x_test)
                loss = criterion(output, y_test)
                val_loss += loss
                for i in range(len(y_test)):
                    out, y = output[i], y_test[i]
                    out = torch.round(out)
                    if (out == y):
                        correct += 1
                    if ((epoch+1) == EPOCHS):
                        x = x_test[i]
                        final_validation_input.append(x)
                        final_validation_output.append(out)
                        final_validation_target.append(y)
                        #log_writer.add_scalar('val_output', out, count)
                        #count += 1
                val_accuracy += correct/dataset_size
                correct = 0
                t.set_postfix({'loss': val_loss, 'accuracy': 100. * val_accuracy})
                t.update(1)
    log_writer.add_scalar('val_loss', val_loss, epoch)
    log_writer.add_scalar('val_accuracy', val_accuracy, epoch)

In [50]:
# main function

# parameters
EPOCHS = 100
BATCH_SIZE = 64
learning_rate = 0.01
shuffle_data = True
seed = 42

# tensorboard setup
log_writer = SummaryWriter('100epochrun')

# create and load data
dataset = JaccardDataset('/gpfs/alpine/proj-shared/gen150/marie/smc2021/data/neural_network_training_data.csv')

train_size = int(0.8 * len(dataset))
print('dataset size:', len(dataset))
print('train_size:', train_size)
test_size = len(dataset) - train_size
print('test_size', test_size)
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size], 
                                                            generator=torch.Generator().manual_seed(seed))

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=shuffle_data)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=shuffle_data)

# build model
model = Network()

# loss function and optimizer function
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

final_validation_input = []
final_validation_output = []
final_validation_target = []

# train and test model
for epoch in range (EPOCHS):
    train(model, train_loader, criterion, optimizer, train_size)
    val(final_validation_input, final_validation_output, final_validation_target, epoch, EPOCHS, model, test_loader, 
        criterion, test_size)

# close tensorboard
log_writer.close()

dataset size: 124296
train_size: 99436
test_size 24860


Train Epoch     #1: 100%|██████████| 1554/1554 [00:05<00:00, 293.26it/s, loss=tensor(180.8346, grad_fn=<AddBackward0>), accuracy=84.3]
Validate Epoch  #1: 100%|██████████| 389/389 [00:00<00:00, 431.81it/s, loss=tensor(39.8038), accuracy=86.4]
Train Epoch     #2: 100%|██████████| 1554/1554 [00:05<00:00, 293.00it/s, loss=tensor(154.3279, grad_fn=<AddBackward0>), accuracy=87.2]
Validate Epoch  #2: 100%|██████████| 389/389 [00:00<00:00, 431.17it/s, loss=tensor(36.6335), accuracy=88.1]
Train Epoch     #3: 100%|██████████| 1554/1554 [00:05<00:00, 293.66it/s, loss=tensor(147.9381, grad_fn=<AddBackward0>), accuracy=88.1]
Validate Epoch  #3: 100%|██████████| 389/389 [00:00<00:00, 433.66it/s, loss=tensor(39.7456), accuracy=88]  
Train Epoch     #4: 100%|██████████| 1554/1554 [00:05<00:00, 295.28it/s, loss=tensor(144.9745, grad_fn=<AddBackward0>), accuracy=88.3]
Validate Epoch  #4: 100%|██████████| 389/389 [00:00<00:00, 437.10it/s, loss=tensor(36.9042), accuracy=88.4]
Train Epoch     #5: 100%|███

In [51]:
print(len(final_validation_input))
print(len(final_validation_output))
print(len(final_validation_target))

24860
24860
24860


In [52]:
file = open("10epoch_output.csv", "w")
file.write("count \t data \t target \t output \n")
count = 0
for item in range(len(final_validation_input)):
    file.write(str(count))
    file.write("\t")
    file.write(str(final_validation_input[item]))
    file.write("\t")
    file.write(str(final_validation_target[item]))
    file.write("\t")
    file.write(str(final_validation_output[item]))
    file.write("\n")
    count += 1
file.close()