In [1]:
# Mount Google Drive inside the Colab VM at /content/drive so the config and
# datasets stored on Drive are reachable through the regular file system.
# The call is interactive: it asks for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
!pip install bpemb

Collecting bpemb
  Downloading https://files.pythonhosted.org/packages/bc/70/468a9652095b370f797ed37ff77e742b11565c6fd79eaeca5f2e50b164a7/bpemb-0.3.0-py3-none-any.whl
Collecting sentencepiece (from bpemb)
[?25l  Downloading https://files.pythonhosted.org/packages/14/3d/efb655a670b98f62ec32d66954e1109f403db4d937c50d779a75b9763a29/sentencepiece-0.1.83-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 10.9MB/s 
Installing collected packages: sentencepiece, bpemb
Successfully installed bpemb-0.3.0 sentencepiece-0.1.83


In [1]:
__author__ = 'Gohur Ali'
import numpy as np
import os               # FileSystem Access
import yaml             # Config File Access
from tqdm import tqdm   # Progress Visualization
import time
import argparse
import json
import sys
import pickle
import re
import codecs
from bpemb import BPEmb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
import torch
import torch.utils.data
import torch.nn.functional as F
import pandas as pd
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# Display the installed PyTorch version (the recorded run shows '1.1.0').
torch.__version__

'1.1.0'

In [3]:
# CUDA for PyTorch
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device state:\t", device)
if device.type == "cuda":
    # These queries raise on a machine without CUDA, so only run them on GPU.
    print("Device index:\t",torch.cuda.current_device())
    print("Current device:\t", torch.cuda.get_device_name(device))
else:
    print("Device index:\t", None)
    print("Current device:\t", "CPU")

Device state:	 cuda
Device index:	 0
Current device:	 Tesla K80


In [4]:
# Load the experiment configuration from Drive and echo every key/value pair.
# The file is opened in a `with` block so the handle is closed after parsing.
with open('/content/drive/My Drive/College/Undergraduate Research/SkillEvaluation/bpe_config.yaml') as config_file:
    cfg = yaml.safe_load(config_file)
for k,v in cfg.items():
    print(k,':\t',v)

csv_data_loc :	 dataset/datajobposts.csv
col_to_parse :	 JobDescription
records_loc :	 outputs/records/
use_pretrained :	 True
pretrained_model_loc :	 models/wiki.en/wiki.en.bin
gen_model :	 True
pad_limit :	 150
model_type :	 shallow
embedding_dim :	 300
epochs :	 2
optimizer :	 adam
learning_rate :	 0.0001
beta1 :	 0.9
beta2 :	 0.999
epsilon :	 10e-8
batch_size :	 8
dataset :	 subj
train_data_location :	 /content/drive/My Drive/College/Undergraduate Research/SkillEvaluation/datasets/
keras_tokenizer_loc :	 models/
if_softmax :	 False
num_classes :	 6
train_test_split_ratio :	 0.2
validation_split_ratio :	 0.0
early_stopping :	 True
using_cv :	 False
cross_validation_k :	 10
create_logs :	 False
logs_loc :	 /outputs/tensorboard_logs/
create_checkpoints :	 False
checkpoint_loc :	 models/skillnet_checkpoints/
model_loc :	 models/skillnet_checkpoints/initial_architecture/
use_file_input :	 yaml


In [0]:
class DataPrepper():
    """Load a text-classification corpus and expose train/test splits.

    Constructed with ``dataset='trec'`` or ``dataset='subj'`` the instance
    ends up with ``x_train``, ``y_train``, ``x_test`` and ``y_test`` (NumPy
    arrays of cleaned sentences and integer labels). Any other ``dataset``
    value leaves those four attributes unset.

    Args:
        config - dict read from the YAML config; ``train_data_location`` and
                 ``dataset`` are joined to build the data directory, and
                 ``train_test_split_ratio`` (default 0.2) sizes the subj
                 test split
        dataset - name of the corpus to load ('trec' or 'subj')
    """

    # Every character outside this set is replaced by a single space.
    _CLEAN_PATTERN = "[^a-zA-Z.' ]"
    _ENCODING = "ISO-8859-1"
    # Subjectivity corpus: file name -> label (0 = subjective, 1 = objective).
    _SUBJ_FILES = {'quote.tok.gt9.5000': 0, 'plot.tok.gt9.5000': 1}
    _TREC_TRAIN_FILE = 'trec_5000_train.txt'
    _TREC_TEST_FILE = 'trec_test.txt'
    # Coarse TREC question class -> integer label.
    _TREC_LABELS = {'NUM': 0, 'DESC': 1, 'ENTY': 2, 'HUM': 3, 'ABBR': 4, 'LOC': 5}

    def __init__(self,config=None,dataset=None):
        self.config = {} if config is None else config
        self.dataset_type = dataset

        if(self.dataset_type == 'trec'):
            self.x_train, self.y_train, self.x_test, self.y_test = self.read_trec_dataset(
                train_data_location=self._data_dir(),
                use_default_split=True
            )
        elif(self.dataset_type == 'subj'):
            dataset,labels = self.read_subj_dataset(train_data_location=self._data_dir())
            self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
                dataset,
                labels,
                # Honour the configured ratio; 0.2 was the previous fixed value.
                test_size=self.config.get('train_test_split_ratio', 0.2),
                random_state=1000
            )

    def _data_dir(self):
        """Directory holding the corpus files, built from the config."""
        return self.config['train_data_location']+self.config['dataset']+'/'

    def _read_trec_file(self, path):
        """Parse one TREC file into parallel lists of cleaned questions and labels.

        Each line is read as ``COARSE:fine question text``: the coarse class is
        the text before the first colon and the question is everything after
        the first space. Blank lines and lines whose coarse class is unknown
        are skipped so that texts and labels always stay aligned.
        """
        texts, labels = [], []
        with open(path, encoding=self._ENCODING) as trec_file:
            for line in trec_file:
                parts = line.split(' ', 1)
                if len(parts) < 2:
                    continue
                label = self._TREC_LABELS.get(line.split(':', 1)[0])
                if label is None:
                    continue
                texts.append(re.sub(self._CLEAN_PATTERN, ' ', parts[1]).lower())
                labels.append(label)
        return texts, labels

    def read_subj_dataset(self, train_data_location):
        """Open and prepare the subjectivity dataset. Using
        Regular expressions to clean the sentences.

        Every file name found in the directory is printed; only the two
        corpus files are read.

        Args:
            train_data_location - location of the data, specified in config
        Return:
            dataset - ndarray of each example
            labels - array of binary labels
        """
        dataset = []
        labels = []
        for f in os.listdir(train_data_location):
            print(f)
            label = self._SUBJ_FILES.get(f)
            if label is None:
                continue
            with open(os.path.join(train_data_location, f), encoding=self._ENCODING) as data_file:
                for line in data_file:
                    dataset.append(re.sub(self._CLEAN_PATTERN, ' ', line))
                    labels.append(label)
        return np.array(dataset), np.array(labels)

    def read_trec_dataset(self, train_data_location, use_default_split=False):
        """Open and prepare the TREC question-classification dataset. Using
        Regular expressions to clean and lower-case the questions.

        Both the train and the test file are parsed the same way regardless of
        ``use_default_split``. Every file name found in the directory is
        printed.

        Args:
            train_data_location - location of the data, specified in config
            use_default_split - keep the train/test files as separate splits
        Return:
            use_default_split falsy: (dataset, labels) with both files merged
                in directory-listing order
            use_default_split truthy: (x_train, y_train, x_test, y_test)
            All values are ndarrays; labels are integers 0-5.
        """
        merged_x, merged_y = [], []
        x_train, y_train, x_test, y_test = [], [], [], []
        for f in os.listdir(train_data_location):
            print(f)
            if(f == self._TREC_TRAIN_FILE):
                split_x, split_y = x_train, y_train
            elif(f == self._TREC_TEST_FILE):
                split_x, split_y = x_test, y_test
            else:
                continue
            texts, labels = self._read_trec_file(os.path.join(train_data_location, f))
            split_x.extend(texts)
            split_y.extend(labels)
            merged_x.extend(texts)
            merged_y.extend(labels)

        if use_default_split:
            return np.array(x_train), np.array(y_train), np.array(x_test), np.array(y_test)
        return np.array(merged_x), np.array(merged_y)

In [0]:
class ShallowCNN(torch.nn.Module):
    """Single-convolution text classifier on top of frozen pretrained embeddings.

    Layers: embedding lookup -> Conv1d -> MaxPool1d -> Linear(128) -> output.
    The convolution uses the ``pad_limit`` sequence positions as input
    channels and slides along the embedding dimension.

    Args:
        config - dict with ``num_classes``, ``pad_limit`` and ``if_softmax``
        pretrained_embeddings - 2-D float tensor (vocabulary x embedding dim)
    """
    def __init__(self,config=None,pretrained_embeddings=None):
        super(ShallowCNN,self).__init__()
        self.cfg = {} if config is None else config
        self.num_classes = self.cfg['num_classes']

        # -- Build Embedding Table --
        self.pretrained_embedding_table = torch.nn.Embedding.from_pretrained(pretrained_embeddings)

        # -- Define Architecture --
        conv_channels = 400
        conv_kernel = 4
        pool_kernel = 2
        self.conv1 = torch.nn.Conv1d(in_channels=self.cfg['pad_limit'],
                                     out_channels=conv_channels,
                                     kernel_size=(conv_kernel,),
                                     stride=1,
                                     padding=0,
                                     bias=True
                                    )
        self.mp1 = torch.nn.MaxPool1d(kernel_size=pool_kernel,
                                      stride=1,
                                      padding=0
                                     )
        # Flattened feature count after conv + pool (both stride 1, no padding).
        # With 100-dimensional embeddings this is 400 * 96 = 38400, the value
        # that used to be hard-coded.
        embedding_dim = pretrained_embeddings.shape[1]
        pooled_len = (embedding_dim - conv_kernel + 1) - pool_kernel + 1
        self.fc1 = torch.nn.Linear(in_features=conv_channels * pooled_len,
                                   out_features=128,
                                   bias=True
                                  )
        # Softmax head emits one score per class; otherwise a single sigmoid unit.
        self.fc2 = torch.nn.Linear(in_features=128,
                                   out_features=self.num_classes if self.cfg['if_softmax'] else 1,
                                   bias=True
                                  )

    def forward(self, inputs):
        """Forward pass definition
        Args:
            inputs - integer tensor of embedding indices
                     (assumed shape: batch x pad_limit)
        Returns:
            log-probabilities of shape (batch, num_classes) when
            ``if_softmax`` is set, otherwise sigmoid outputs of shape (batch, 1)
        """
        emb = self.pretrained_embedding_table(inputs)
        x = F.leaky_relu(self.conv1(emb))
        x = self.mp1(x)
        x = x.view(x.shape[0],-1)

        # Dropout must follow the module's train/eval mode; without
        # `training=` it would stay active during inference as well.
        x = F.dropout(F.leaky_relu(self.fc1(x)), training=self.training)
        x = self.fc2(x)
        if(self.cfg['if_softmax']):
            return F.log_softmax(x,dim=1,dtype=torch.float)
        return torch.sigmoid(x)


In [0]:
class Trainer:
    """Encode a prepared dataset with BPEmb and train/evaluate a ShallowCNN.

    Args:
        config - dict read from the YAML config (``pad_limit``,
                 ``if_softmax``, ``num_classes``, ``batch_size``, ...)
        DataPrepper - object exposing ``x_train``, ``y_train``, ``x_test``
                      and ``y_test``

    Construction loads the BPEmb model, converts the sentences to padded id
    sequences and builds the train/test DataLoaders.
    """

    # Coarse TREC question class -> integer label.
    _TREC_LABELS = {'NUM': 0, 'DESC': 1, 'ENTY': 2, 'HUM': 3, 'ABBR': 4, 'LOC': 5}

    def __init__(self,config=None,DataPrepper=None):
        self.cfg = {} if config is None else config
        self.dataprepper = DataPrepper
        self.bpe_model, self.embeddings = self.open_bpe_vectors()

        pad_limit = self.cfg['pad_limit']
        self.x_train = pad_sequences(
            sequences=self.bpe_model.encode_ids(self.dataprepper.x_train),
            maxlen=pad_limit
        )
        self.x_test = pad_sequences(
            sequences=self.bpe_model.encode_ids(self.dataprepper.x_test),
            maxlen=pad_limit
        )
        self.y_train = self.dataprepper.y_train.reshape((self.dataprepper.y_train.shape[0],1))
        self.y_test = self.dataprepper.y_test.reshape((self.dataprepper.y_test.shape[0],1))

        # Integer class labels, kept separately from any one-hot encoding.
        self.train_idx_labels = self.y_train
        self.test_idx_labels = self.y_test

        if(self.cfg['if_softmax']):
            # One-hot encode from the flat label vector so the result is
            # (N, num_classes) rather than (N, 1, num_classes).
            self.y_train = self.to_categorical(self.y_train.reshape(-1), self.cfg['num_classes'])
            self.y_test = self.to_categorical(self.y_test.reshape(-1), self.cfg['num_classes'])

        print('Train data size: x_train = {',self.x_train.shape,'} -- y_train = {',self.y_train.shape,'}')
        print('Test data size: x_test = {',self.x_test.shape,'} -- y_test = {',self.y_test.shape,'}')

        self.train_dataloader,self.test_dataloader = self.create_dataloaders(
            train_data=(self.x_train,self.y_train),
            test_data=(self.x_test,self.y_test)
            )

    def sequence_examples(self, dataset):
        """Map the whitespace-separated words of each example to vocabulary ids.

        Reads ``self.w2e``, a mapping ``word -> (index, vector)`` with an
        ``'_unk'`` entry for out-of-vocabulary words (the shape of the mapping
        returned by ``open_pretrained``). ``__init__`` does not set it, so it
        has to be assigned before this method is called.

        Args:
            dataset - iterable of sentences (strings)
        Return:
            list with one list of indices per sentence
        Raises:
            AttributeError - when ``self.w2e`` has not been assigned
        """
        w2e = getattr(self, 'w2e', None)
        if w2e is None:
            raise AttributeError(
                "Trainer.w2e is not set; assign a word -> (index, vector) mapping "
                "(e.g. the first value returned by open_pretrained()) first"
            )
        sequenced_dataset = []
        for example in tqdm(dataset):
            sequenced_dataset.append([
                w2e[word][0] if word in w2e else w2e['_unk'][0]
                for word in example.split()
            ])
        return sequenced_dataset

    def create_dataloader(self, features, labels):
        """Wrap feature/label tensors in a shuffling DataLoader of ``batch_size``."""
        print('-- Batch size ',self.cfg['batch_size'],'--')
        dataset = torch.utils.data.TensorDataset(features, labels)
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=self.cfg['batch_size'], shuffle=True)
        return data_loader

    def to_categorical(self, y, num_classes):
        """ 1-hot encodes an integer array; the class axis is appended last """
        return np.eye(num_classes, dtype='uint8')[y]

    def _parse_trec_file(self, path):
        """Parse one TREC file into parallel lists of cleaned questions and labels.

        Each line is read as ``COARSE:fine question text``. Blank lines and
        lines with an unknown coarse class are skipped so texts and labels
        stay aligned.
        """
        texts, labels = [], []
        with open(path, encoding="ISO-8859-1") as trec_file:
            for line in trec_file:
                parts = line.split(' ', 1)
                if len(parts) < 2:
                    continue
                label = self._TREC_LABELS.get(line.split(':', 1)[0])
                if label is None:
                    continue
                texts.append(re.sub("[^a-zA-Z.' ]", ' ', parts[1]).lower())
                labels.append(label)
        return texts, labels

    def get_trec_dataset(self, train_data_location, use_default_split=False):
        """Open and prepare the TREC question-classification dataset. Using
        Regular expressions to clean and lower-case the questions.

        Every file name found in the directory is printed.

        Args:
            train_data_location - location of the data, specified in config
            use_default_split - keep the train/test files as separate splits
        Return:
            use_default_split falsy: (dataset, labels) ndarrays with both
                files merged in directory-listing order
            use_default_split truthy: (x_train, y_train, x_test, y_test) lists
        """
        merged_x, merged_y = [], []
        x_train, y_train, x_test, y_test = [], [], [], []
        for f in os.listdir(train_data_location):
            print(f)
            if(f == 'trec_5000_train.txt'):
                split_x, split_y = x_train, y_train
            elif(f == 'trec_test.txt'):
                split_x, split_y = x_test, y_test
            else:
                continue
            texts, labels = self._parse_trec_file(os.path.join(train_data_location, f))
            split_x.extend(texts)
            split_y.extend(labels)
            merged_x.extend(texts)
            merged_y.extend(labels)

        if use_default_split:
            return x_train, y_train, x_test, y_test
        return np.array(merged_x), np.array(merged_y)

    def open_pretrained(self):
        """Read GloVe vectors of dimension ``cfg['embedding_dim']`` from Drive.

        Index 0 is reserved for padding and index 1 for out-of-vocabulary
        words (a random vector); GloVe words start at index 2.

        Returns
            glove_w2emb - dict ``word -> (index, vector)``; the ``'_pad'``
                          entry carries ``None`` as its vector
            embeddings - list of vectors positioned so that
                         ``embeddings[index]`` matches the mapping (a zero
                         vector stands in for padding at position 0)
        """
        dim = self.cfg['embedding_dim']
        glove_path = os.path.join('/content/drive/My Drive/College/Undergraduate Research/SkillEvaluation/','glove.6B.'+str(dim)+'d.txt')

        # -- Padding and OOV Words --
        unk_words = np.random.rand(dim,)
        glove_w2emb = {'_pad': (0, None), '_unk': (1, unk_words)}
        # The padding row keeps list positions aligned with mapping indices.
        embeddings = [np.zeros(dim, dtype='float32'), unk_words]

        with open(glove_path) as glove_embeddings_file:
            for idx, line in enumerate(tqdm(glove_embeddings_file), start=2):
                values = line.split()
                coefs = np.asarray(values[1:], dtype='float32')
                glove_w2emb[values[0]] = (idx , coefs)
                embeddings.append(coefs)
        return glove_w2emb, embeddings

    def open_bpe_vectors(self):
        """Load the English BPEmb model (200000 subwords, 100 dims) and its vectors."""
        en_model = BPEmb(lang='en',vs=200000,dim=100)
        return en_model, en_model.vectors

    def build_embedding_table(self, mapping):
        """Build a (len(mapping), embedding_dim) table from ``word -> (index, vector)``.

        Entries whose vector is ``None`` keep an all-zero row.
        """
        table = np.zeros((len(mapping), self.cfg['embedding_dim']))
        for index, vector in mapping.values():
            if(vector is not None):
                table[index] = vector
        return table

    def split_data(self,examples,labels):
        """Split ``examples``/``labels`` using ``cfg['train_test_split_ratio']``.

        Returns ``(x_train, x_test, y_train, y_test)``, or ``None`` when the
        instance has a truthy ``use_default_split`` attribute.
        """
        if getattr(self, 'use_default_split', False):
            return None
        x_train, x_test, y_train, y_test = train_test_split(
            examples, labels,
            test_size=self.cfg['train_test_split_ratio'],
            random_state=1000
        )
        return x_train,x_test,y_train,y_test

    def create_dataloaders(self,train_data,test_data):
        """Turn ``(features, labels)`` array pairs into train/test DataLoaders.

        Tensors are placed on the notebook-level ``device``; ``.to(device)``
        is a no-op on CPU, so no branching is needed.
        """
        x_train = torch.tensor(train_data[0]).to(device)
        y_train = torch.tensor(train_data[1],dtype=torch.long).to(device)
        x_test = torch.tensor(test_data[0]).to(device)
        y_test = torch.tensor(test_data[1],dtype=torch.long).to(device)

        train_dataloader = self.create_dataloader(features=x_train, labels=y_train)
        test_dataloader = self.create_dataloader(features=x_test, labels=y_test)
        return train_dataloader,test_dataloader

    def build_model(self, embeddings):
        """Instantiate the ShallowCNN for this config and embedding tensor."""
        return ShallowCNN(self.cfg,embeddings)

    def _make_loss_function(self):
        """NLLLoss for the log-softmax head, BCELoss for the sigmoid head."""
        if(self.cfg['if_softmax']):
            return torch.nn.NLLLoss()
        return torch.nn.BCELoss()

    @staticmethod
    def _model_device(model):
        """Device of the model's first parameter (CPU when it has none)."""
        first_param = next(model.parameters(), None)
        return first_param.device if first_param is not None else torch.device('cpu')

    @staticmethod
    def _percentage(correct, total):
        """``correct / total * 100`` as a float32 tensor (0 when total is 0)."""
        return torch.tensor(float(correct)) / max(total, 1) * 100

    def _score_batch(self, loss_fn, outputs, labels):
        """Compute loss, hard predictions and comparable targets for one batch.

        Softmax head: one-hot labels are reduced to class indices and the
        prediction is the arg-max class. Sigmoid head: labels are cast to
        float and the prediction is the rounded probability.
        """
        if(self.cfg['if_softmax']):
            targets = labels.reshape(labels.shape[0], -1).argmax(dim=1)
            loss = loss_fn(outputs, targets)
            predicted = outputs.detach().argmax(dim=1)
        else:
            targets = labels.float()
            loss = loss_fn(outputs.float(), targets)
            predicted = torch.round(outputs.detach().float())
        return loss, predicted, targets

    def train(self,train_data,epochs=75,learning_rate=0.0001):
        """Build a fresh model and train it with Adam + ReduceLROnPlateau.

        Args:
            train_data - iterable of ``(examples, labels)`` batches
            epochs - number of passes over ``train_data``
            learning_rate - initial Adam learning rate
        Return:
            (model, loss_function, losses) where ``losses`` holds the summed
            batch loss of every epoch
        """
        # -- Create Model --
        self.model = self.build_model(torch.tensor(self.embeddings))
        print(self.model)

        # -- Model to CUDA GPU --
        if(str(device).startswith('cuda')):
            print('Sending model to',torch.cuda.get_device_name(device),' GPU')
        self.model.to(device)
        self.model.train()
        model_device = self._model_device(self.model)

        optimizer = torch.optim.Adam(self.model.parameters(),lr=learning_rate)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.1,patience=5,
            verbose=True,threshold=0.0001, threshold_mode='rel',
            cooldown=0,min_lr=0,eps=1e-08
            )
        loss_function = self._make_loss_function()

        losses = []
        for epoch in range(epochs):
            total_loss = 0
            correct = 0
            seen = 0
            for examples, labels in train_data:
                examples = examples.to(model_device)
                labels = labels.to(model_device)

                self.model.zero_grad()
                predictions = self.model(examples.long())
                loss, predicted, targets = self._score_batch(loss_function, predictions, labels)

                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                correct += int((predicted == targets).sum().item())
                seen += labels.shape[0]

            # The scheduler watches the summed epoch loss.
            scheduler.step(total_loss)
            losses.append(total_loss)
            accuracy = self._percentage(correct, seen)
            print(f'Epoch {epoch} ----> loss={total_loss:.5f} accuracy={accuracy:.5f}')
            print('==========================================================')
        return self.model, loss_function, losses

    def test_validate(self,debug=False,model=None,test_data=[],loss_fn=None):
        """Evaluate a model on ``test_data`` without tracking gradients.

        Args:
            debug - print actual vs. predicted label for every example
            model - model to evaluate; defaults to ``self.model``
            test_data - iterable of ``(examples, labels)`` batches
            loss_fn - loss to accumulate; defaults to the loss matching
                      ``cfg['if_softmax']``
        Return:
            (test_loss, accuracy, all_predictions): summed batch loss,
            accuracy in percent over the examples seen (float tensor) and the
            list of raw per-batch model outputs
        """
        model = self.model if model is None else model
        loss_fn = self._make_loss_function() if loss_fn is None else loss_fn
        model_device = self._model_device(model)

        test_loss = 0
        correct = 0
        seen = 0
        all_predictions = []
        # Evaluation mode + no_grad: no dropout noise, no autograd graph kept.
        model.eval()
        with torch.no_grad():
            for idx,(examples, labels) in enumerate(test_data):
                examples = examples.to(model_device)
                labels = labels.to(model_device)

                outputs = model(examples.long())
                all_predictions.append(outputs)

                loss, predicted, targets = self._score_batch(loss_fn, outputs, labels)
                test_loss += loss.item()
                correct += int((predicted == targets).sum().item())
                seen += labels.shape[0]

                if(debug):
                    for target, pred in zip(targets, predicted):
                        print('{}: actual = {} ---> pred = {}'.format(idx,target.item(),pred.item()))

        accuracy = self._percentage(correct, seen)
        return test_loss, accuracy, all_predictions

## Run the Model

In [101]:
# Load the corpus named in the config, hand it to the trainer (which encodes
# and pads it), then show the padded id matrix and the label column.
dp = DataPrepper(dataset=cfg['dataset'], config=cfg)
train_op = Trainer(DataPrepper=dp, config=cfg)
print(train_op.x_train, train_op.y_train, sep='\n')

quote.tok.gt9.5000
subjdata.README.1.0
plot.tok.gt9.5000


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Train data size: x_train = { (8000, 150) } -- y_train = { (8000, 1) }
Test data size: x_test = { (2000, 150) } -- y_test = { (2000, 1) }
-- Batch size  8 --
-- Batch size  8 --
[[     0      0      0 ...  19736  25238    896]
 [     0      0      0 ...   1119   2915    896]
 [     0      0      0 ...      7   3256    896]
 ...
 [     0      0      0 ...    939 199937    896]
 [     0      0      0 ...   4669   4418    896]
 [     0      0      0 ...  94566   1564    896]]
[[1]
 [0]
 [0]
 ...
 [1]
 [0]
 [1]]


In [102]:
# Train on the training DataLoader. The call returns the trained model, the
# loss function that was used and the list of summed losses, one per epoch.
model,criterion,losses = train_op.train(train_data=train_op.train_dataloader)

ShallowCNN(
  (pretrained_embedding_table): Embedding(200000, 100)
  (conv1): Conv1d(150, 400, kernel_size=(4,), stride=(1,))
  (mp1): MaxPool1d(kernel_size=2, stride=1, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=38400, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)
Sending model to Tesla K80  GPU
Epoch 0 ----> loss=497.25155 accuracy=75.05000
Epoch 1 ----> loss=340.23682 accuracy=85.86250
Epoch 2 ----> loss=302.44014 accuracy=87.52500
Epoch 3 ----> loss=289.52657 accuracy=88.10000
Epoch 4 ----> loss=277.07646 accuracy=88.53751
Epoch 5 ----> loss=261.78875 accuracy=89.33750
Epoch 6 ----> loss=244.35835 accuracy=90.61250
Epoch 7 ----> loss=232.91747 accuracy=90.17500
Epoch 8 ----> loss=216.20346 accuracy=91.23750
Epoch 9 ----> loss=211.17028 accuracy=91.56250
Epoch 10 ----> loss=205.96724 accuracy=91.70000
Epoch 11 ----> loss=186.72225 accuracy=92.47500
Epoch 12 ----> loss=176.01773 accuracy=93.15000
Epoch 13 ----> los

In [103]:
# Evaluate on the test DataLoader with the loss function returned by training.
# debug=True prints "actual ---> pred" for every test example.
test_loss,acc,preds = train_op.test_validate(debug=True,model=train_op.model,test_data=train_op.test_dataloader,loss_fn=criterion)
print('Test Accuracy = {}%'.format(acc))
# NOTE(review): the lines below look like accuracies noted from earlier runs
# with other epoch counts / learning rates - confirm which setup they refer to.
# 50 Epochs lr=0.0001 - 75% 
# 100 Epochs lr=0.0001 - 81% & 78.125
# 200 Epoch lr = 0.00001 - 76.5625
# 5000 Epoch lr = 0.000001 - 60.937%

0: actual = 1.0 ---> pred = 1.0
0: actual = 0.0 ---> pred = 0.0
0: actual = 1.0 ---> pred = 1.0
0: actual = 0.0 ---> pred = 0.0
0: actual = 0.0 ---> pred = 0.0
0: actual = 1.0 ---> pred = 1.0
0: actual = 0.0 ---> pred = 0.0
0: actual = 1.0 ---> pred = 1.0
1: actual = 0.0 ---> pred = 0.0
1: actual = 0.0 ---> pred = 0.0
1: actual = 0.0 ---> pred = 0.0
1: actual = 0.0 ---> pred = 0.0
1: actual = 1.0 ---> pred = 0.0
1: actual = 1.0 ---> pred = 1.0
1: actual = 1.0 ---> pred = 1.0
1: actual = 1.0 ---> pred = 1.0
2: actual = 0.0 ---> pred = 0.0
2: actual = 0.0 ---> pred = 0.0
2: actual = 1.0 ---> pred = 1.0
2: actual = 1.0 ---> pred = 1.0
2: actual = 1.0 ---> pred = 1.0
2: actual = 1.0 ---> pred = 1.0
2: actual = 0.0 ---> pred = 1.0
2: actual = 1.0 ---> pred = 1.0
3: actual = 1.0 ---> pred = 1.0
3: actual = 0.0 ---> pred = 0.0
3: actual = 0.0 ---> pred = 0.0
3: actual = 1.0 ---> pred = 1.0
3: actual = 1.0 ---> pred = 1.0
3: actual = 0.0 ---> pred = 0.0
3: actual = 1.0 ---> pred = 1.0
3: actua