In [61]:
import os
import builtins
import argparse
import torch.distributed as dist
import numpy as np
import math
import time
from datetime import datetime
import torch
import torch.nn as nn
import torchvision.transforms as transform
import random
from sklearn import metrics
import matplotlib.pylab as plt
import pandas as pd
import pickle
import glob
from torch.utils.data import Dataset, DataLoader
%matplotlib inline



# Some utility functions
#*************************************
def time_taken(elapsed):
    """To format time taken in hh:mm:ss. Use with time.monotic()"""
    m, s = divmod(elapsed, 60)
    h, m = divmod(m, 60)
    return "%d:%02d:%02d" % (h, m, s)

def mydate() :
    return (datetime.now().strftime("%Y-%m-%d %H:%M:%S"))


# Read/write directory parameters
#*************************************
datadir = 'training_data'
savemodeldir = 'new_model'
loadmodelpath = 'model/2018-10-30_03-12-21_model_epoch30.pth'

# Pytorch parameters
#*************************************
device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')
savemodel = True
savemodel_interval = 1  #if 0 (and savemodel=True) will only save model at the end of entire training
loadmodel = False

# Training parameters
#*************************************
batch_size = 2
num_epochs = 200
lr = 1e-4
log_interval = 10
random.seed(1234) #for dataset splitting set to None of leave blank if do not need to preserve random order

# Preprocessing parameters
#*************************************
bins = 48
hrange = 24

class CNN(nn.Module):
    # input size - the number of "classes"
    def __init__(self):
        super(CNN, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv3d(94, 32, kernel_size=5, stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv3d(32, 64, kernel_size=5, stride=1, padding=0),
            #nn.BatchNorm3d(32),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=2, stride=2))
        self.layer3 = nn.Sequential(
            nn.Conv3d(64, 128, kernel_size=5, stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=8, stride=2))
        self.fc0 = nn.Linear(746496,1024)
        self.fc1 = nn.Sequential(
            nn.Linear(1024, 100),
            nn.ReLU(),
            nn.Dropout(0.5))
        self.fc2 = nn.Linear(100, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        #print("in",x.shape)
        out = self.layer1(x)
        #print(out.shape)
        out = self.layer2(out)
        #print(out.shape)
        out = self.layer3(out)
        #print(out.shape)
        
        out = out.reshape(out.size(0), -1)
        
        out = self.fc0(out)
        #print(out.shape)
        out = self.fc1(out)
        #print(out.shape)
        out = self.fc2(out)
        out = self.sigmoid(out)
        #print(out.type())
        return out
    
CNN()

class CNNDataLoader(Dataset):
    
    def __init__(self, data):
        """
        Args:
            input_pickle (string): Directory with to pickle file processed tensor data
            master_file (string): Path to the master csv file with annotations. Column 'kd\ki' has labels.
        """

        self.data = data
    
  
                
    def __len__(self):
           return len(self.data)
    
    def __getitem__(self, idx):
        
        grids_path, label_path = self.data[idx]

        with open(label_path,'rb') as f: 
            label = pickle.load(f)
            
        with open(grids_path,'rb') as f: 
            grid = pickle.load(f)
        
        #torch.unsqueeze(grid, dim=0)
        a_grid = grid[0].to_dense() if grid.shape==(1, 200, 200, 200, 94) else grid.to_dense()
        try: a_label = torch.tensor(label[0])
        except: a_label = torch.tensor(label)
        
        return a_grid, a_label


# def collate_fn(data):
#     """
#        data: is a list of tuples with (example, label, length)
#              where 'example' is a tensor of arbitrary shape
#              and label/length are scalars
#     """
#     _, labels, lengths = zip(*data)
#     max_len = max(lengths)
#     n_ftrs = data[0][0].size(1)
#     features = torch.zeros((len(data), max_len, n_ftrs))
#     labels = torch.tensor(labels)
#     lengths = torch.tensor(lengths)

#     for i in range(len(data)):
#         j, k = data[i][0].size(0), data[i][0].size(1)
#         features[i] = torch.cat([data[i][0], torch.zeros((max_len - j, k))])

#     return features.float(), labels.long(), lengths.long()

    
    
    #Make calls to the dataloader
#     for tensor_batch, label_batch in collected_batch:
#         print("Batch of tensors has shape: ", tensor_batch.shape)
#         print("Batch of labels has shape: ", label_batch)

# Define the training cycle (100% teacher forcing for now)
#*************************************
def train(model,epoch, train_loader):
    model.train() #put in training mode
    
    for step, (inp,target) in enumerate(training_loader):
        target = target.float()
        inp, target = inp.to(device), target.to(device)
        inp,target = inp.cuda(), target.cuda()
        inp = inp.view(inp.shape[0],-1,200,200,200)
        
        # Forward + Backward + Optimize
        outputs = model(inp)
        #print(outputs,target)
        loss = criterion(outputs, target)
        #print(loss.item())
            
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
                
    print ('{:%Y-%m-%d %H:%M:%S} Epoch [{}/{}], Step [{}/{}] Loss: {:.6f}'.format( 
        datetime.now(), epoch+1, num_epochs, step+1, len(train_data)//batch_size, loss.item()))
    
    list_of_losses.append(loss.item())
    
    if args.rank == 0:
        evaluate_mse(model)
                   
    if savemodel_interval != 0 and savemodel:
        if (epoch+1) % savemodel_interval == 0:
            torch.save(model.state_dict(),
                       '{}/{:%Y-%m-%d_%H-%M-%S}_model_epoch{}_step{}.pth'.format(savemodeldir,datetime.now(),epoch+1,step+1))
            print('model saved at epoch{} step{}'.format(epoch+1,step+1))

# Initialize the network, optimizer and objective func
#*************************************
cnn = CNN()
if loadmodel: # load checkpoint if needed
    print("Loading existing checkpoint...")
    cnn.load_state_dict(torch.load(loadmodelpath))
optimizer = torch.optim.Adam(cnn.parameters(), lr=lr)
#criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(299,dtype=torch.float,device=device))  #
criterion = nn.MSELoss() #nn.BCEWithLogitsLoss()  ##nn.MSELoss()




def evaluate(model):
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for inp, target in validation_loader:
            inp, target = inp.to(device), target.to(device)
            inp = inp.view(inp.shape[0],-1,200,200,200)

            outputs = model(inp)
            _, predicted = torch.max(outputs.data, 1)
            total += target.size(0)
            correct += (predicted == torch.max(target, 1)[1]).sum().item()

        print('Accuracy of the model on the validation set: {} %'.format(100 * correct / total))
        
        
def evaluate_mse(model):
    model.eval()
    with torch.no_grad():
        out = []
        targets = []
        for step, (inp, target) in enumerate(validation_loader):
            inp = inp.to(device)
            inp = inp.view(inp.shape[0],-1,200,200,200)
            outputs = model(inp)
            outputs_numpy = outputs.detach().cpu().numpy()
            targets_numpy = target.numpy()
            for i in range(outputs_numpy.shape[0]):
                out.append(outputs_numpy.item(i))
                targets.append(targets_numpy.item(i))
        print(out)
        print(targets)
        auc = auc_curve(out,targets)
        list_of_auc.append(auc)
            
def auc_curve(output,target):
    """Plot a ROC curve"""
    fpr, tpr, _ = metrics.roc_curve(target,  output)
    auc = metrics.roc_auc_score(target, output)
    plt.figure() 
    plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
    plt.legend(loc=4)
    plt.show()
    plt.savefig('auc.png')
    return(auc)

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--net', default='cnn', type=str)
    parser.add_argument('--lr', default=1e-4, type=float, help='learning rate')
    parser.add_argument('--batch_size', default=2, type=int, help='batch size per GPU')
    parser.add_argument('--gpu', default=None, type=int)
    parser.add_argument('--start_epoch', default=0, type=int, 
                        help='start epoch number (useful on restarts)')
    parser.add_argument('--epochs', default=200, type=int, help='number of total epochs to run')
    # DDP configs:
    parser.add_argument('--world-size', default=-1, type=int, 
                        help='number of nodes for distributed training')
    parser.add_argument('--rank', default=-1, type=int, 
                        help='node rank for distributed training')
    parser.add_argument('--dist-url', default='env://', type=str, 
                        help='url used to set up distributed training')
    parser.add_argument('--dist-backend', default='nccl', type=str, 
                        help='distributed backend')
    parser.add_argument('--local_rank', default=-1, type=int, 
                        help='local rank for distributed training')
    args = parser.parse_args()
    return args


def main(args):
    # DDP setting
    if "WORLD_SIZE" in os.environ:
        args.world_size = int(os.environ["WORLD_SIZE"])
    args.distributed = args.world_size > 1
    ngpus_per_node = torch.cuda.device_count()

    if args.distributed:
        if args.local_rank != -1: # for torch.distributed.launch
            args.rank = args.local_rank
            args.gpu = args.local_rank
        elif 'SLURM_PROCID' in os.environ: # for slurm scheduler
            args.rank = int(os.environ['SLURM_PROCID'])
            args.gpu = args.rank % torch.cuda.device_count()
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # suppress printing if not on master gpu
    if args.rank!=0:
        def print_pass(*args):
            pass
        builtins.print = print_pass
       
    ### model ###
    model = cnn
    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
            model_without_ddp = model.module
        else:
            model.cuda()
            model = torch.nn.parallel.DistributedDataParallel(model)
            model_without_ddp = model.module
    else:
        raise NotImplementedError("Only DistributedDataParallel is supported.")
        
    
    ### resume training if necessary ###
    if args.resume:
        pass
    
    
    torch.backends.cudnn.benchmark = True
    
    os.chdir("/groups/cherkasvgrp/share/progressive_docking/hmslati/plif_cnn/")
    with open("train_data.pkl",'rb') as f: 
        train_data=pickle.load(f)
    with open("validate_data.pkl",'rb') as f: 
        validate_data=pickle.load(f)
    with open("test_data.pkl",'rb') as f: 
        test_data=pickle.load(f)
        
    train_data, validate_data =train_data+validate_data[:5000], validate_data[5000:]     

    train_dataset = CNNDataLoader(train_data)
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=True)
    
    validate_dataset = CNNDataLoader(validate_data)
    val_sampler = None
    
    test_dataset = CNNDataLoader(test_data)
    
    #Initiate the dataloader
    training_loader = DataLoader(train_dataset,num_workers=10, pin_memory=True, batch_size=2, 
                                 shuffle=(train_sampler is None), sampler=train_sampler)
    validation_loader = DataLoader(validate_dataset,num_workers=10, pin_memory=True, batch_size=2, 
                                   shuffle=(val_sampler is None),sampler=val_sampler)
    testing_loader = DataLoader(test_dataset,num_workers=10, pin_memory=True, batch_size=2, shuffle=True)
    
    
    ### main loop ###
       # Train!
    #*************************************
    train_losses = []
    valid_losses = []
    list_of_auc = []

    print('{:%Y-%m-%d %H:%M:%S} Starting training...'.format(datetime.now()))
    start_time = time.monotonic()
    for epoch in range(num_epochs):

        np.random.seed(epoch)
        random.seed(epoch)
        if args.distributed: 
            train_loader.sampler.set_epoch(epoch)
        train(cnn,epoch, train_loader)
    elapsed_time = time.monotonic() - start_time
    print('Training time taken:',time_taken(elapsed_time))

    if savemodel_interval == 0 and savemodel:
        torch.save(cnn.state_dict(), 
           '{}/{:%Y-%m-%d_%H-%M-%S}_model_epoch{}.pth'.format(savemodeldir,datetime.now(),num_epochs))
        print('model saved at epoch{}'.format(num_epochs))


if __name__ == '__main__':

    
    args = parse_args()
    main(args)

usage: ipykernel_launcher.py [-h] [--net NET] [--lr LR]
                             [--batch_size BATCH_SIZE] [--gpu GPU]
                             [--start_epoch START_EPOCH] [--epochs EPOCHS]
                             [--world-size WORLD_SIZE] [--rank RANK]
                             [--dist-url DIST_URL]
                             [--dist-backend DIST_BACKEND]
                             [--local_rank LOCAL_RANK]
ipykernel_launcher.py: error: unrecognized arguments: -f /home/hmslati/.local/share/jupyter/runtime/kernel-ca943c27-0360-4497-98a8-eeed14edf067.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [2]:
import os
import pickle
import math 


os.chdir('/groups/cherkasvgrp/share/progressive_docking/hmslati/plif_cnn')

with open("zero_train_zfeats.pkl",'rb') as f:    #train_data_2.pkl
    train_data=pickle.load(f)

all_labels=[]
for i in train_data:
    with open(i[1],'rb') as f2: all_labels.append(round(-1 * math.log(pickle.load(f2)),1))


In [5]:

import pandas as pd 

df = pd.DataFrame({'target': all_labels})
df['target'] = df['target'].astype('category')
class_count_df = df.groupby(all_labels).count()

In [19]:
class_count_df.iloc[:,0].sum()

9013

In [21]:
    df = pd.DataFrame({'target': all_labels})
    df['target'] = df['target'].astype('category')
    class_count_df = df.groupby(all_labels).count()
    class_weight=[]
    def calc_weight(idx,n_1):
        n_0 = class_count_df.iloc[idx, 0]

        return (n_1) / (272 * n_0)

    for x in range(272):
        class_weight.append(calc_weight(x,class_count_df.iloc[:,0].sum()))

    class_weights=torch.FloatTensor(class_weight)

In [23]:
class_weight[100]

0.35251095118898623

In [25]:
import torch
# Important: Convert Weights To Float Tensor
class_weights=torch.FloatTensor(class_weight)

In [26]:
class_weights

tensor([33.1360, 33.1360, 33.1360, 16.5680, 33.1360, 33.1360, 33.1360, 33.1360,
        33.1360, 16.5680, 16.5680, 11.0453, 16.5680, 16.5680, 16.5680, 16.5680,
        11.0453, 11.0453, 11.0453,  5.5227,  5.5227,  8.2840, 11.0453, 16.5680,
        11.0453, 11.0453,  3.3136,  6.6272,  6.6272,  4.7337,  2.5489,  2.2091,
         6.6272,  1.6568,  6.6272,  3.0124,  8.2840,  1.9492,  3.6818,  8.2840,
         2.7613,  6.6272,  1.8409,  3.6818,  2.0710,  2.0710,  1.2273,  1.5779,
         2.7613,  1.3807,  1.5062,  1.6568,  1.9492,  0.8956,  1.1834,  1.7440,
         0.8720,  1.3254,  1.1426,  1.8409,  1.1426,  0.8720,  1.1426,  0.6762,
         1.0041,  0.7050,  1.1045,  0.8082,  1.5062,  0.7364,  1.0689,  0.8720,
         0.6136,  0.7050,  0.7531,  0.8496,  0.6372,  0.5917,  0.8496,  0.4734,
         0.8720,  0.7364,  0.5813,  0.5021,  0.4303,  0.6497,  0.4303,  0.5523,
         0.3853,  0.5523,  0.5260,  0.6627,  0.5178,  0.4091,  0.6025,  0.4360,
         0.4478,  0.3809,  0.5260,  0.49