In [1]:
import numpy as np 
import os 
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import itertools
from time import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import *
from torch.optim.lr_scheduler import StepLR

from scipy.stats import spearmanr
from scipy import sparse

from sklearn.metrics import confusion_matrix, plot_confusion_matrix, explained_variance_score, r2_score, roc_auc_score, precision_score, f1_score

from modules import *
from models import *

import matplotlib.pyplot as plt
import seaborn as sns

import random
import argparse


  import pandas.util.testing as tm


In [2]:
# --- Input files -------------------------------------------------------------
X_path = "CSC_repat.csv"         # feature table
y_path = "screen_stats_adj.csv"  # label table

# --- Training schedule -------------------------------------------------------
n_epochs = 50
batch_size = 32
len_dset = 83539  # dataset length handed to train()

# --- Layer sizes handed to the model constructor -----------------------------
dim1 = 149321
dim2 = 512
dim3 = 128
dim4 = 12

# Per-class weights used by the cross-entropy criterion.
class_weights = [1.5, 5.6, 6.4]


In [5]:
# NOTE(review): FeedforwardBin_notebook is not defined in the visible cells of
# this notebook; presumably it is supplied by one of the star imports — confirm.
model = FeedforwardBin_notebook(dim1, dim2, dim3, dim4, batch_size)

# Plain SGD; StepLR multiplies the learning rate by `gamma` every `step_size`
# calls to scheduler.step().
optimizer = optim.SGD(model.parameters(), lr=0.001)
scheduler = StepLR(optimizer, step_size=6, gamma=0.05)

# Class-weighted cross-entropy; the weight tensor is moved to the GPU.
criterion = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).cuda())

# res_name = result_name(args.res_dir, args.num_epochs, args.model_class, dim1, dim2, dim3)


In [6]:
# Run training. Arguments stay positional, in the order:
# data paths, dataset length, batch size, model, optimizer, criterion,
# number of epochs, LR scheduler.
train_loss_, val_loss_ = train(
    X_path, y_path,
    len_dset, batch_size,
    model, optimizer, criterion,
    n_epochs, scheduler,
    debug=False,
)

>>>>>>>Epoch 1
>>>>>>> Training
Epoch 1 iteration 0: train loss: 5.786500930786133
Epoch 1 iteration 2: train loss: 5.734978675842285
Epoch 1 iteration 4: train loss: 5.604947090148926
Epoch 1 iteration 6: train loss: 5.614956378936768
Epoch 1 iteration 8: train loss: 5.154323101043701
Epoch 1 iteration 10: train loss: 5.425844192504883
Epoch 1 iteration 12: train loss: 5.437110900878906
Epoch 1 iteration 14: train loss: 5.526227951049805
Epoch 1 iteration 16: train loss: 5.48551082611084
Epoch 1 iteration 18: train loss: 4.7744460105896
Epoch 1 iteration 20: train loss: 5.272607326507568
Epoch 1 iteration 22: train loss: 4.708881378173828
Epoch 1 iteration 24: train loss: 5.768329620361328
Epoch 1 iteration 26: train loss: 4.865133762359619
Epoch 1 iteration 28: train loss: 5.172720909118652


KeyboardInterrupt: 

In [119]:
# Load the feature table and the label table from disk.
CSC_data = pd.read_csv("CSC_repat.csv")
y_train = pd.read_csv("screen_stats_adj.csv")
# Recode every -1 in the label table as 2 (presumably so labels are
# non-negative class indices — confirm against the label definition).
y_train = y_train.replace(-1, 2)

# Inverse-frequency weights computed from the second column of the label table.
# sort_index() orders the counts by class label; value_counts() on its own
# orders by descending frequency, so position i would not necessarily
# correspond to class i.
# NOTE(review): the name is spelled `class_weighs`; it is a different variable
# from the `class_weights` list in the config cell and is left unrenamed so it
# does not overwrite that list.
class_weighs = (len(y_train) / y_train.iloc[:, 1].value_counts().sort_index()).values

# Drop the first column of the feature table.
CSC_data = CSC_data.iloc[:, 1:]
# y_train = y_train.iloc[:, 1:].values


In [160]:
class Dset(Dataset):
    """Batch-level dataset over a sparse feature CSV and a label CSV.

    Each item is one whole batch of ``chunksize`` consecutive samples rather
    than a single sample, so it is meant to be wrapped in a ``DataLoader``
    with ``batch_size=1``.
    """

    def __init__(self, X_path, y_path, dset_len, chunksize, n_features=149321):
        """
        Args:
            X_path (string): Path to the feature CSV. Columns 0, 1 and 2 are
                read as (sample row, feature column, value) triplets.
            y_path (string): Path to the label CSV. Its first column is
                dropped; the remaining columns are the labels, with -1
                recoded as 2.
            dset_len (int): Total number of samples to serve.
            chunksize (int): Number of samples per batch.
            n_features (int, optional): Width of the dense feature matrix
                built for each batch.
        """
        self.chunksize = chunksize
        self.X_path = X_path
        self.y_path = y_path
        self.dset_len = dset_len
        self.n_features = n_features
        # NOTE(review): assumes column 0 of the feature CSV is the sample row
        # id; another cell in this notebook drops the first column of the same
        # file before using it — confirm the on-disk layout.
        self.reader_X = pd.read_csv(self.X_path)
        # Labels are loaded in full so a batch can be looked up by index.
        # A sequential chunk reader returns "the next chunk" whatever index
        # was requested (misaligning X and y under a shuffled sampler) and is
        # exhausted after a single pass over the file.
        self.reader_y = pd.read_csv(self.y_path)
        self.labels = self.reader_y.iloc[:, 1:].replace(-1, 2).values

    def __len__(self):
        """Number of complete batches; a trailing partial batch is dropped."""
        return self.dset_len // self.chunksize

    def __getitem__(self, idx):
        """Return ``(X, y)`` tensors for batch ``idx``.

        ``X`` has shape ``(chunksize, n_features)``; ``y`` holds the label rows
        of the same samples.

        Raises:
            IndexError: if ``idx`` is outside ``[0, len(self))``.
        """
        if not 0 <= idx < len(self):
            raise IndexError("batch index {} out of range for {} batches".format(idx, len(self)))
        start = self.chunksize * idx
        # The last batch served stays available as attributes, as before.
        self.X = self.get_batch(self.reader_X, self.chunksize, idx)
        self.y = self.labels[start:start + self.chunksize]

        return torch.tensor(self.X), torch.tensor(self.y)

    def get_batch(self, reader_X, chunksize, idx):
        """Densify the feature triplets whose sample row falls in batch ``idx``.

        Returns a ``(chunksize, n_features)`` numpy array; samples with no
        triplets are all-zero rows.
        """
        first = chunksize * idx
        sample_ids = reader_X.iloc[:, 0]
        X_batch = reader_X[(sample_ids >= first) & (sample_ids < first + chunksize)]
        rows = X_batch.iloc[:, 0].values - first  # row index within the batch
        cols = X_batch.iloc[:, 1].values
        vals = X_batch.iloc[:, 2].values
        mtx = sparse.csc_matrix((vals, (rows, cols)), shape=(chunksize, self.n_features)).toarray()
        return mtx


In [111]:
# chunksize = 32
# for idx in [0, 1, 2, 3]:
#     X_batch = reader_X[(reader_X.iloc[:, 0] >= chunksize*idx) & (reader_X.iloc[:, 0] < chunksize*(idx + 1))]
#     indices = X_batch.iloc[:, 0].values - chunksize*idx
#     indptr = X_batch.iloc[:, 1].values
#     data = X_batch.iloc[:, 2].values
    
#     mtx = sparse.csc_matrix((data, (indices, indptr)), shape=(chunksize, 149321)).toarray()

#     print (mtx)

[0 1]
[0 1]
[0 1]
[0 1]


In [200]:
# Build the dataset from the shared config values instead of repeating the
# file names and dataset length as literals, so the two cannot drift apart.
custom_dset = Dset(X_path, y_path, len_dset, batch_size)

In [203]:
def get_train_val_loader(X_path, y_path, len_dset, batch_size, val_fraction=0.2, seed=None):
    """Build train and validation ``DataLoader``s over a single ``Dset``.

    The dataset indices are shuffled once and split; each subset is then
    sampled in random order by its own ``SubsetRandomSampler``.

    Args:
        X_path: Path to the feature CSV, forwarded to ``Dset``.
        y_path: Path to the label CSV, forwarded to ``Dset``.
        len_dset: Dataset length, forwarded to ``Dset``.
        batch_size: Forwarded to ``Dset`` as its chunk size.
        val_fraction: Share of dataset indices held out for validation.
        seed: Optional seed for the train/validation split. ``None`` keeps
            using numpy's global random state.

    Returns:
        ``(train_loader, val_loader)``.

    Raises:
        ValueError: if ``val_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= val_fraction <= 1.0:
        raise ValueError("val_fraction must be in [0, 1], got {}".format(val_fraction))

    custom_dset = Dset(X_path, y_path, len_dset, batch_size)
    dataset_indices = list(range(len(custom_dset)))
    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(dataset_indices)
    val_split_index = int(np.floor(val_fraction * len(dataset_indices)))

    train_idx, val_idx = dataset_indices[val_split_index:], dataset_indices[:val_split_index]
    train_sampler = SubsetRandomSampler(train_idx)
    val_sampler = SubsetRandomSampler(val_idx)

    # batch_size=1 here: the dataset is constructed with `batch_size` as its
    # chunk size, and the DataLoader only adds a leading dimension of 1.
    train_loader = DataLoader(dataset=custom_dset, batch_size=1, shuffle=False, sampler=train_sampler)
    val_loader = DataLoader(dataset=custom_dset, batch_size=1, shuffle=False, sampler=val_sampler)
    return train_loader, val_loader

In [4]:
class FeedforwardBin(torch.nn.Module):
    """Three-layer feed-forward network whose output is reshaped to (batch, 4, 3).

    Pipeline: fc1 -> batch norm -> tanh -> dropout -> fc2 -> tanh -> dropout
    -> fc3 -> reshape. The raw fc3 values are returned; no softmax is applied.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, hidden_size3, batch_size):
        """
        Args:
            input_size: Number of input features.
            hidden_size1: Width of the first hidden layer.
            hidden_size2: Width of the second hidden layer.
            hidden_size3: Width of the output layer; must be 12 for the
                (batch, 4, 3) reshape in ``forward`` to succeed.
            batch_size: Stored on the instance; ``forward`` takes the batch
                dimension from its input instead.
        """
        # Zero-argument super(): the explicit form named a different class
        # (FeedforwardBin_notebook), which made construction fail.
        super().__init__()
        self.input_size = input_size
        self.hidden_size1  = hidden_size1
        self.hidden_size2  = hidden_size2
        self.hidden_size3 = hidden_size3
        self.batch_size = batch_size

        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size1)
        self.tahn1 = torch.nn.Tanh()
        self.fc2 = torch.nn.Linear(self.hidden_size1, self.hidden_size2)
        self.tahn2 = torch.nn.Tanh()
        self.fc3 = torch.nn.Linear(self.hidden_size2, self.hidden_size3)
        # tahn3 and softmax are not used by forward(); kept so the module's
        # attributes stay the same for any outside code that touches them.
        self.tahn3 = torch.nn.Tanh()
        self.batchnorm1 = nn.BatchNorm1d(self.hidden_size1)
        self.dropout1 = nn.Dropout(p=0.25)
        self.dropout2 = nn.Dropout(p=0.25)
        self.softmax = nn.LogSoftmax(dim=2)

        # Xavier-uniform weights and zero biases for all three linear layers.
        for layer in (self.fc1, self.fc2, self.fc3):
            torch.nn.init.xavier_uniform_(layer.weight)
            torch.nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Map ``x`` of shape (batch, input_size) to a (batch, 4, 3) tensor."""
        hidden1 = self.fc1(x)
        hidden1 = self.batchnorm1(hidden1)
        tahn1 = self.tahn1(hidden1)
        tahn1 = self.dropout1(tahn1)
        hidden2 = self.fc2(tahn1)
        tahn2 = self.tahn2(hidden2)
        tahn2 = self.dropout2(tahn2)
        fc3 = self.fc3(tahn2)
        # -1 lets the batch dimension follow the input, so a batch whose size
        # differs from self.batch_size no longer breaks the reshape.
        fc3 = fc3.view(-1, 4, 3)
        return fc3



In [204]:
def do_eval(val_loader, model, criterion, epoch):
    """Evaluate ``model`` on ``val_loader`` and return the per-batch losses.

    Each loader item is an ``(X, y)`` pair carrying a leading dimension of 1,
    which is removed before the forward pass. The loss of a batch is the sum
    of ``criterion`` over the second dimension of the model output, each
    slice paired with the matching column of ``y``.

    Args:
        val_loader: Iterable of ``(X, y)`` tensor pairs.
        model: Network to evaluate. Its train/eval mode is restored on exit.
        criterion: Loss applied to each output slice.
        epoch: Epoch number, used only in the summary line.

    Returns:
        numpy array with one summed loss per batch (empty if the loader is).
    """
    monitor_step = 10
    started = time()

    # Run on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Remember the mode so the caller's training loop is not left in eval mode.
    was_training = model.training
    model.eval()
    losses = []
    try:
        # No autograd graph is needed for evaluation.
        with torch.no_grad():
            for i, (X, y) in enumerate(val_loader):
                X = X.reshape(X.shape[1], X.shape[-1]).float().to(device)
                y = y.reshape(y.shape[1], y.shape[-1]).long().to(device)

                y_pred = model(X)
                # `k`, not `i`: reusing the batch counter here made the monitor
                # condition below see a constant value.
                loss = sum(criterion(y_pred[:, k, :], y[:, k]) for k in range(y_pred.shape[1]))
                losses.append(loss.item())

                if i % monitor_step == 0:
                    print('Iteration {}: val loss: {}'.format(i, loss.item()))
    finally:
        model.train(was_training)

    loss_ = np.array(losses)
    mean_loss = loss_.mean() if len(loss_) else float('nan')
    s = '%d: Val loss:%f, MSE: N samples: %d in %f min'%(epoch, mean_loss, len(loss_), (time() - started)/60.)
    print(s)
    return loss_


In [206]:
def train(X_path, y_path, len_dset, batch_size, model, optimizer, criterion, n_epochs, scheduler, debug=False):
    """Train ``model`` for ``n_epochs`` epochs, validating after each one.

    Args:
        X_path: Path to the feature CSV.
        y_path: Path to the label CSV.
        len_dset: Dataset length forwarded to the loader factory.
        batch_size: Batch size forwarded to the loader factory.
        model: Network to train; moved to the GPU when one is available.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss applied to each slice along the second dimension of
            the model output; the slice losses are summed.
        n_epochs: Number of passes over the training loader.
        scheduler: LR scheduler stepped once per epoch, or ``None``.
        debug: When true, print tensor shapes at every iteration.

    Returns:
        ``(train_loss_, val_loss_)``: a list with one training loss per
        iteration, and a list with one ``do_eval`` result per epoch.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    a = time()
    epoch = 0
    n_iter = 0  # optimizer steps taken; reported in the final summary line
    monitor_step = 2
    val_loss_, train_loss_ = [], []
    train_loader, val_loader = get_train_val_loader(X_path, y_path, len_dset, batch_size)
    for e in range(n_epochs):
        epoch = e+1
        # Re-enter training mode every epoch: evaluation at the end of the
        # previous epoch may have left the model in eval mode.
        model.train()
        print('>>>>>>>Epoch %d'%(epoch))
        print(">>>>>>> Training")
        for i, data in enumerate(train_loader):
            X_tr, y_tr = data
            # Drop the leading dimension of 1 added by the DataLoader.
            X_tr = X_tr.reshape(X_tr.shape[1], X_tr.shape[-1]).float().to(device)
            y_tr = y_tr.reshape(y_tr.shape[1], y_tr.shape[-1]).long().to(device)

            # Clear gradients left over from the previous step; without this
            # they accumulate across iterations.
            optimizer.zero_grad()
            y_pred = model(X_tr)

            if debug:
                print ('X', X_tr.shape)
                print ('y_tr', y_tr.shape)
                print ('y_pred', y_pred.shape)

            loss = sum(criterion(y_pred[:, j, :], y_tr[:, j]) for j in range(y_pred.shape[1]))

            train_loss_.append(loss.item())
            if i % monitor_step == 0:
                print('Epoch {} iteration {}: train loss: {}'.format(epoch, i, loss.item()))

            loss.backward()
            optimizer.step()
            n_iter += 1

        val_loss = do_eval(val_loader, model, criterion, epoch)
        val_loss_.append(val_loss)
        if scheduler is not None:
            scheduler.step()

    print('%d: Train time:%.2f min in %d steps'%(epoch, (time() - a)/60, n_iter))
    return train_loss_, val_loss_
