In [1]:
import torch
from torch import nn

from torch.optim import lr_scheduler

from torch.utils.data import random_split,Dataset,DataLoader

import torch.nn.functional as F
import torch.nn.init as init

from momentumnet import MomentumNet
from momentumnet import transform_to_momentumnet

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from joblib import Parallel, delayed
from joblib.externals.loky.backend.context import get_context

import time
import copy
import random
import pickle
import tarfile

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
class MyDataset(Dataset):
    def __init__(self,gene_matrix,cell_type):
        
        self.gene_matrix = torch.from_numpy(gene_matrix).float()
        self.cell_type = torch.from_numpy(cell_type).squeeze(1)
    

    def __len__(self):
        
        return self.gene_matrix.shape[0]

    def __getitem__(self,idx):
        
        data = (self.gene_matrix[idx],self.cell_type[idx])
        
        return data

In [3]:
def create_pbmc1_loader(pbmc_path,cell_type_path,batch_size=128):

    pbmc = pd.read_csv(pbmc_path,header=None)
    cell_type = pd.read_csv(cell_type_path,index_col = 0)
    
    full_dataset = MyDataset(pbmc.values,cell_type.values)

    #Random split(0.7,0.3)
    train_size = int(0.7 * len(full_dataset))
    test_size = len(full_dataset) - train_size
    train_dataset, test_dataset = random_split(full_dataset, [train_size, test_size])

    # Define DataLoaders
    # use 'loky' to work with joblib
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,   batch_size=batch_size, shuffle=False)
    
    return train_loader, test_loader


In [4]:
def _weights_init(m):
    classname = m.__class__.__name__
    #print(classname)
    if isinstance(m, nn.Linear):
        init.normal(m.weight)

In [5]:
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        
        self.lr1 = nn.Linear(3500,1000)
        self.lr2 = nn.Linear(1000,128)
        self.lr3 = nn.Linear(128,64)
        
        self.lr = nn.Linear(64,9)
        
        self.apply(_weights_init)
        
    def forward(self,x):
        x = F.relu(self.lr1(x))
        x = self.lr2(x)
        x = F.relu(self.lr3(x))
        
        output = self.lr(x)
        
        return output

In [6]:
def SGD_training(network, SGD_steps, lr, momentum, data_loader):

    network.to(device)
    network.train()

    # Create optimizer and criterion
    criterion = nn.CrossEntropyLoss(reduction='mean')
    
    optimizer = torch.optim.SGD(network.parameters(), lr=lr, momentum=momentum, nesterov=True, weight_decay=0.0001)
    
    total_step = len(data_loader)
    
    for s in range(0, SGD_steps):
        total_loss = 0
        for i, (genes, types) in enumerate(data_loader): 
            
            genes = genes.to(device)
            types = types.to(device)
            
            # Forward pass
            outputs = network(genes) 
            
            loss = criterion(outputs, types)
            total_loss += loss.item()
            # Backward and optimize
            optimizer.zero_grad()
            
            loss.backward()
            
            #Gradient Value Clipping
            nn.utils.clip_grad_value_(network.parameters(), clip_value=1.0)
            
            optimizer.step()
            
            del loss, outputs
            
            #if (i+1) % 100 == 0:
        total_loss = total_loss / len(data_loader.sampler)
        print ("Epoch [{}/{}], Loss: {}".format(s+1, SGD_steps, total_loss))
        
            
            
    
    # move network back to cpu and return
    network.cpu()
    
    return network

In [7]:
# Hyperparameters
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
batch_size = 128
SGD_steps = 100
learning_rate = 1e-5
pbmc_1 = "data/pbmc_1_pca.csv"
cell_type_pbmc1 = "data/cell_type_pbmc1.csv"
pbmc_2 = "data/pbmc_2_pca.csv"
cell_type_pbmc2 = "data/cell_type_pbmc2.csv"

In [8]:
train_loader, test_loader = create_pbmc1_loader(pbmc_1,cell_type_pbmc1,batch_size = batch_size)

In [9]:
network = MLP()

In [10]:
SGD_start = time.time()
network = SGD_training(network, SGD_steps, learning_rate, 0.9, train_loader)
SGD_end = time.time()
print(f"All Time{(SGD_start-SGD_end)*1000}ms")

Epoch [1/100], Loss: 1969.9704974489796
Epoch [2/100], Loss: 1611.7593112244897
Epoch [3/100], Loss: 1332.4163105867347
Epoch [4/100], Loss: 1022.3392665816326
Epoch [5/100], Loss: 820.0230420918367
Epoch [6/100], Loss: 671.408977997449
Epoch [7/100], Loss: 585.602558992347
Epoch [8/100], Loss: 517.3997114158163
Epoch [9/100], Loss: 476.0442681760204
Epoch [10/100], Loss: 421.7506560905612
Epoch [11/100], Loss: 399.5551961096939
Epoch [12/100], Loss: 366.1506138392857
Epoch [13/100], Loss: 358.73410235969385
Epoch [14/100], Loss: 332.04303491709186
Epoch [15/100], Loss: 308.64644451530614
Epoch [16/100], Loss: 285.6639453125
Epoch [17/100], Loss: 269.14242426658166
Epoch [18/100], Loss: 259.9831473214286
Epoch [19/100], Loss: 260.7489166135204
Epoch [20/100], Loss: 244.63893654336735
Epoch [21/100], Loss: 223.9440983737245
Epoch [22/100], Loss: 212.1482401945153
Epoch [23/100], Loss: 199.76894690688775
Epoch [24/100], Loss: 190.43463966836734
Epoch [25/100], Loss: 183.2241932397959
Epo

In [11]:
# Test the model
def test_function(network, data_loader):
    # init accuracy
    accuracy = 0.0
    
    network.to(device)
    network.eval()
    
    with torch.no_grad():
        correct = 0
        total = 0
        for genes, types in data_loader:
            genes = genes.to(device)
            types = types.to(device)
            outputs = network(genes)
            _, predicted = torch.max(outputs.data, 1)
            
            total += genes.size(0)
            
            correct += (predicted == types).sum().item()

        accuracy =  correct / total
        #print('Accuracy of the model on the test images: {} %'.format(accuracy))
        
    # send network back to cpu
    network.cpu()
    
    return accuracy

In [12]:
test_accuracy = test_function(network,test_loader)
test_accuracy

0.6752380952380952