In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as tud
from collections import Counter, defaultdict
import operator
import os, math
import numpy as np
import random
import copy
import pandas as pd

In [2]:
import pandas as pd

In [3]:
# set the random seeds so the experiments can be replicated exactly
seed = 30255
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

In [4]:
data = pd.read_csv('Data/data_prepr.csv')
data = data[['tagged_rel', 'tagged_str']]

data.tagged_str = data.tagged_str.str.replace('AGGRESOR', ' AGGRESOR ')
data.tagged_str = data.tagged_str.str.replace('VICTIM', ' VICTIM ')

In [5]:
def split_data(data):
    # reset index
    data_idx = data.reset_index()
    
    # subsetting in .75 train and even random split the other
    dev_test_size = round(len(data_idx)*0.25)
    tr_df = data_idx[dev_test_size:]
    dev_test_df = data_idx[:dev_test_size]
    ran_idx = np.random.choice(dev_test_size, round(dev_test_size/2), replace=False)
    dev_df = dev_test_df.iloc[ran_idx]
    test_df = dev_test_df.iloc[~ran_idx]
    
    #shuffling training so it doesn't see similar cases one after the other 
    tr_df = tr_df.sample(frac=1).reset_index(drop=True)
        
    # make them lists
    train_l = tr_df.values.tolist()
    dev_l = dev_df.values.tolist()
    test_l = test_df.values.tolist()
    all_l = data_idx.values.tolist()

    return train_l, dev_l, test_l, all_l

In [6]:
train_list, dev_list, test_list, all_list = split_data(data)

In [7]:
#Mini subset of data for initial test

small_train = train_list[:20]

In [8]:
### Process text

def remove_stopwords(l):
    STOP  = ['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than'] 
    l_clean = []
    for i in STOP: 
        if i in l:
            l_clean.append(i)
    return l_clean 

def word_tokenize(s):
    split_l = s.lower().replace('.', '').replace(',', '').replace(';', '').replace(':', '').replace('!', '').replace('?', '').replace('(', '').replace(')', '').split()
    #clean_l = remove_stopwords(split_l)
    #return clean_l
    
    return split_l

VOCAB_SIZE = 5000
class textModel:
    def __init__(self, data):
        # Vocabulary is a set that stores every word seen in the 
        # training data
        self.vocab_counts = Counter([word for ix, label, content in data 
                              for word in word_tokenize(content)]
                            ).most_common(VOCAB_SIZE-1)
        # word to index mapping
        self.word_to_idx = {k[0]: v+1 for v, k in 
                            enumerate(self.vocab_counts)}
        # all the unknown words will be mapped to index 0
        self.word_to_idx["UNK"] = 0 
        self.idx_to_word = {v:k for k, v in self.word_to_idx.items()}

        self.vocab = set(self.word_to_idx.keys())
        self.vocab_size = len(self.vocab)
        
        
    def train_model(self, data):
        '''
        Train the model with the provided training data
        '''
        raise NotImplementedError
        
    def classify(self, data):
        '''
        Classify the documents with the model
        '''
        raise 
        
class labels:
    def __init__(self, data):
        # Vocabulary is a set that stores every word seen in the 
        # training data
        self.label_counts = list(Counter([label for ix, label, content in data]).items())
        # word to index mapping
        self.label_to_idx = {k[0]: v+1 for v, k in 
                            enumerate(self.label_counts)}
        # all the unknown words will be mapped to index 0
        self.idx_to_label = {v:k for k, v in self.label_to_idx.items()}

        self.labels = set(self.label_to_idx.keys())
        self.labels_size = len(self.labels)

        
class TextClassificationDataset(tud.Dataset):
    '''
    PyTorch provides a common dataset interface. 
    See https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
    The dataset encodes documents into indices. 
    With the PyTorch dataloader, you can easily get batched data for 
    training and evaluation. 
    '''
    def __init__(self, data, labd):
        
        self.data = data
        self.vocab_size = VOCAB_SIZE
        self.tm = textModel(data)
        self.labs = labd

        
    def __len__(self):
        return len(self.data)
    
    def wordToTensor(self, word):
        tensor = torch.zeros(1, VOCAB_SIZE, dtype=torch.long)
        tensor[0][tm.word_to_idx[word]] = 1
        return tensor

    # Turn text into a <line_length x 1 x n_letters>,
    def lineToTensor(self, textstr):
        text = word_tokenize(textstr)
        to_ix = tm.word_to_idx
        tensor = autograd.Variable(torch.LongTensor([to_ix[w] if w in to_ix else to_ix['UNK']for w in text]))

#         text = word_tokenize(textstr)
#         tensor = torch.zeros(len(text), 1, VOCAB_SIZE, dtype=torch.long)
#         for wi, word in enumerate(text):
#             if word in self.tm.word_to_idx: 
#                 tensor[wi][0][self.tm.word_to_idx[word]] = 1
#             else: 
#                 # I add this exception in order to map those words not in vocab
#                 tensor[wi][0][self.tm.word_to_idx['UNK']] = 1

        return tensor
    
    def targetToTensor(self, lab): 

        label_to_ix = self.labs.label_to_idx
        l = label_to_ix [lab]
#         tensor = torch.zeros(1, len(label_to_ix), dtype=torch.long)
#         print(len(label_to_ix))
#         tensor[0][l] = 1
        target_idx = torch.tensor([[l]])
        return target_idx
    
    
    def __getitem__(self, idx):        
        datum = self.data[idx]
        label = self.targetToTensor(datum[1])
        item = self.lineToTensor(datum[2])
        
        return item, label


In [10]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader
import torch.autograd as autograd

# https://github.com/claravania/lstm-pytorch/blob/master/model.py
torch.manual_seed(1)

class LSTMRNN(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, output_size):
        super(LSTMRNN, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,batch_first=True, bidirectional=False)
        self.hidden2cat = nn.Linear(hidden_dim, output_size)
        self.hidden = self.initHidden()
        self.optimizer = torch.optim.Adam(self.parameters())

        # From HW
        self.loss_fn = nn.NLLLoss(size_average=False)
        self.num_epochs = 50
        self.best_model = {}
        
    def initHidden(self):
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                    autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        #print(batch.size(-1))
        embeds = self.word_embeddings(sentence)
        # embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        
        # ht is the last hidden state of the sequences
        out_space = self.hidden2cat(self.hidden[0][-1])
        #out_space = self.hidden2cat(lstm_out)
        output = F.log_softmax(out_space)
        
        return output
    

    def train_epoch(self, train_data, l):
        '''
        '''
        data = TextClassificationDataset(train_data, l)
        data = tud.DataLoader(data, batch_size = 1, shuffle = True)
        # Forward pass
            
        for sent, targets in data: 
            self.zero_grad() 
            self.hidden = self.initHidden()
            pred = self.forward(sent)
            
            # Calculate loss with prediction and target labels

            loss = self.loss_fn(pred, targets[0][0])

            # Backward pass
            loss.backward()
            
            # Apply optimizer step. 
            self.optimizer.step()
            
    def classify(self, val_data, l):
        '''
        This function classifies/predicts documents into their categories. 
        '''
        classif = []
        cuml_loss = 0
        
        data = TextClassificationDataset(val_data, l)
        data = tud.DataLoader(data, batch_size = 1, shuffle = True)
        with torch.no_grad():
            for sent, targets in data: 
                self.hidden = self.initHidden()
                pred = self.forward(sent)

                # Calculate loss with prediction and target labels
                loss = self.loss_fn(pred, targets[0][0])
                cuml_loss += loss

                # fingin the max label for accuracy
                pred_idx = ((torch.topk(pred, 1)[1])[0]).item()
                
                pred_rel = l.idx_to_label[pred_idx]

                # storing into output
                targ_rel = l.idx_to_label[(targets[0][0].item())]
                classif.append((targ_rel, sent, pred_rel))

        return classif, cuml_loss            
                
                
    def evaluate_classifier_accuracy(self, classif_dat):
        '''
        This function evaluates the data with the current model. 
        data contains both documents and labels. 
        It calls classify() to make predictions, 
        and compares with the correct labels to return 
        the model accuracy on "data". 
        '''
        num_errors = 0
        # compares target to pred and calculates error rate
        for target, note, pred in classif_dat:

            if pred != target: 
                num_errors += 1
        pred_error = num_errors/len(classif_dat) 
        
        return (1 - pred_error)
    
    
    def train_model(self, train_data, dev_data, l):
        """
        This function processes the entire training set for multiple epochs.
        After each training epoch, evaluate your model on the DEV set. 
        Save the best performing model on the DEV set to best_model
        """          

        best_acc = 0
        for i in range(self.num_epochs):
            self.train_epoch(train_data, l)
            classif, cuml_loss = self.classify(dev_data, l)
            acc = self.evaluate_classifier_accuracy(classif)
            print('epoch: {}; accuracy: {}; NLLLoss: {}'.format(i, acc, cuml_loss))
            if acc > best_acc:
                best_acc = acc 
                self.best_model['model'] = copy.deepcopy(self)
                self.best_model['acc'] = acc

In [11]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader
import torch.autograd as autograd

# https://github.com/claravania/lstm-pytorch/blob/master/model.py
torch.manual_seed(1)

class BiDirLSTMRNN(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, output_size):
        super(BiDirLSTMRNN, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,batch_first=True, bidirectional=True)
        self.hidden2cat = nn.Linear(hidden_dim, output_size)
        self.hidden = self.initHidden()
        self.optimizer = torch.optim.Adam(self.parameters())

        # From HW
        self.loss_fn = nn.NLLLoss(size_average=False)
        self.num_epochs = 50
        self.best_model = {}
        
    def initHidden(self):
        return (autograd.Variable(torch.randn(2, 1, self.hidden_dim // 2)),
            autograd.Variable(torch.randn(2, 1, self.hidden_dim // 2)))
        


In [12]:
EMB_DIM = 6
HID_DIM = 6
labs = labels(all_list)
tm = textModel(small_train)
tm.vocab_size


366

In [None]:
# Running 50 epochs of non-bidirectional LSTM 
model_lstm = LSTMRNN(EMB_DIM, HID_DIM, tm.vocab_size, labs.labels_size)

model_lstm.train_model(small_train, dev_list, labs)



epoch: 0; accuracy: 0.061849873488895146; NLLLoss: 13887.16015625
epoch: 1; accuracy: 0.07197076187798712; NLLLoss: 13762.19921875
epoch: 2; accuracy: 0.08462187236435204; NLLLoss: 13647.7646484375


In [182]:
# Running 50 epochs of non-bidirectional LSTM 
pass 
# ----- code for tuning 

In [84]:
# Running 50 epochs of bidirectional LSTM 
model_bd = BiDirLSTMRNN(EMB_DIM, HID_DIM, tm.vocab_size, labs.labels_size)


model_bd.train_model(train_list, dev_list, labs)
# ----- code for tuning 

TypeError: super(type, obj): obj must be an instance or subtype of type

In [None]:
# ------ evaluate both's best model on test
pass

In [14]:
label_to_ix = labs.label_to_idx
tensor = torch.zeros(1, len(label_to_ix), dtype=torch.long)
tensor[0][12] = 1

In [16]:
tensor.shape

torch.Size([1, 59])

In [77]:
35+8+8+8


59