In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence

from tqdm.notebook import tqdm
import os
import json
import matplotlib.pyplot as plt
from itertools import permutations
import re
import numpy as np
import random

torch.manual_seed(1)
torch.cuda.empty_cache()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [2]:
# read in vocab and embeddings from GloVe
glove_path = 'glove.6B.50d.txt'

embedding_dim = 50
glove_vocab_size = 400000

glove_vocab = []
glove_vectors = torch.empty((glove_vocab_size, embedding_dim), dtype=torch.float)

with open(glove_path, 'r', encoding='UTF-8') as f:
    for i, line in enumerate(tqdm(f.readlines())):
        parsed_line = line.split()
        glove_vocab += [parsed_line[0]]
        glove_vectors[i,:] = torch.reshape(torch.tensor(list(map(float, parsed_line[1:])), dtype=torch.float), (1,embedding_dim))

  0%|          | 0/400000 [00:00<?, ?it/s]

In [3]:
vocab = ['<pad>'] + glove_vocab
padding_index = 0

vocab_dict = {}
for idx in range(len(vocab)):
    vocab_dict[vocab[idx]] = idx
    
# read in train filepath
train_dir = 'review_polarity/txt_sentoken/train'
train_data_unshuffled = []

pos_dir = train_dir + '/pos'
for filename in tqdm(os.listdir(pos_dir)):
    if filename.endswith(".txt"):
        filepath = pos_dir + '/' + filename
        train_data_unshuffled += [(filepath, 1)]

neg_dir = train_dir + '/neg'
for filename in tqdm(os.listdir(neg_dir)):
    if filename.endswith(".txt"):
        filepath = neg_dir + '/' + filename
        train_data_unshuffled += [(filepath, 0)]
        
train_data = random.sample(train_data_unshuffled, len(train_data_unshuffled))
train_size = len(train_data)

train_vocab = []
for filename, _ in tqdm(train_data):
    with open(filename, 'r', encoding='UTF-8') as f:
        for line in f.readlines():
            parsed_line = line.split()
            for token in parsed_line:
                if vocab_dict.get(token) == None:
                    token_split = re.split('([^a-zA-Z0-9])', token)
                    for sub_token in token_split:
                        if vocab_dict.get(sub_token) == None:
                            if sub_token != '':
                                vocab_dict[sub_token] = len(vocab_dict)
                                vocab += [sub_token]
                                train_vocab += [sub_token]

vocab_size = len(vocab)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

In [4]:
print(train_data)

[('review_polarity/txt_sentoken/train/pos/cv002_15918.txt', 1), ('review_polarity/txt_sentoken/train/neg/cv003_12683.txt', 0), ('review_polarity/txt_sentoken/train/pos/cv001_18431.txt', 1), ('review_polarity/txt_sentoken/train/pos/cv003_11664.txt', 1), ('review_polarity/txt_sentoken/train/neg/cv001_19502.txt', 0), ('review_polarity/txt_sentoken/train/neg/cv002_17424.txt', 0)]


In [5]:
print(random.sample(train_data_unshuffled, len(train_data_unshuffled)))

[('review_polarity/txt_sentoken/train/pos/cv003_11664.txt', 1), ('review_polarity/txt_sentoken/train/neg/cv001_19502.txt', 0), ('review_polarity/txt_sentoken/train/pos/cv001_18431.txt', 1), ('review_polarity/txt_sentoken/train/pos/cv002_15918.txt', 1), ('review_polarity/txt_sentoken/train/neg/cv003_12683.txt', 0), ('review_polarity/txt_sentoken/train/neg/cv002_17424.txt', 0)]


In [6]:
# read in train dataset
train_dir = 'review_polarity/txt_sentoken/train'

train_file_to_idx = {}
train_idx_to_file = np.empty(train_size, dtype=np.dtype('U100'))
train_idx_to_label = np.empty(train_size, dtype=int)

train_idx_to_vector = {}
train_idx_to_length = np.empty(train_size, dtype=int)

for idx, (filename, label) in tqdm(enumerate(train_data)):
    train_file_to_idx[filename] = idx
    train_idx_to_file[idx] = filename
    train_idx_to_label[idx] = label
    
    vector = []
    with open(filename, 'r', encoding='UTF-8') as f:
        for line in f.readlines():
            parsed_line = line.split()
            for token in parsed_line:
                if vocab_dict.get(token) != None:
                    vector += [vocab_dict[token]]
                else:
                    token_split = re.split('([^a-zA-Z0-9])', token)
                    for sub_token in token_split:
                        if vocab_dict.get(sub_token) != None and sub_token != '':
                            vector += [vocab_dict[sub_token]]
    train_idx_to_vector[idx] = vector
    train_idx_to_length[idx] = len(vector)
    
train_idx = list(range(train_size))

0it [00:00, ?it/s]

In [7]:
'''print(train_file_to_idx)
print(train_idx_to_file)
print(train_idx_to_label)
print(train_idx_to_vector)
print(train_idx_to_length)'''

'print(train_file_to_idx)\nprint(train_idx_to_file)\nprint(train_idx_to_label)\nprint(train_idx_to_vector)\nprint(train_idx_to_length)'

In [8]:
for idx, (file, length, label) in enumerate(zip(train_idx_to_file, train_idx_to_length, train_idx_to_label)):\
    print(file, label)

review_polarity/txt_sentoken/train/pos/cv002_15918.txt 1
review_polarity/txt_sentoken/train/neg/cv003_12683.txt 0
review_polarity/txt_sentoken/train/pos/cv001_18431.txt 1
review_polarity/txt_sentoken/train/pos/cv003_11664.txt 1
review_polarity/txt_sentoken/train/neg/cv001_19502.txt 0
review_polarity/txt_sentoken/train/neg/cv002_17424.txt 0


In [9]:
batch_size = 3
train_loader = torch.utils.data.DataLoader(train_idx, batch_size=batch_size)

for index, batch_idxs in enumerate(tqdm(train_loader)):
    print(index, batch_idxs)


  0%|          | 0/2 [00:00<?, ?it/s]

0 tensor([0, 1, 2])
1 tensor([3, 4, 5])


In [10]:
# functions
def batch_to_padded_tensors(batch, mode):
    if mode == 'train':
        idx_to_vector = train_idx_to_vector
        idx_to_length = train_idx_to_length
    elif mode == 'dev':
        idx_to_vector = dev_idx_to_vector
        idx_to_length = dev_idx_to_length
    elif mode == 'test':
        idx_to_vector = test_idx_to_vector
        idx_to_length = test_idx_to_length
    
    size = len(batch)
    lengths = idx_to_length[batch]
    max_length = max(lengths)
    #print(lengths, max_length)
    
    tensors = torch.zeros((size, max_length), dtype=int).long()
    #print(tensors.shape)
    for i, idx in enumerate(batch):
        #print(i, idx, train_idx_to_length[idx])
        #print(train_idx_to_vector[idx.item()])
        tensors[i, 0:idx_to_length[idx]] = torch.LongTensor(idx_to_vector[idx.item()])
               
    return tensors.to(device)

def batch_to_labels(batch, mode):
    if mode == 'train':
        idx_to_label = train_idx_to_label
    elif mode == 'dev':
        idx_to_label = dev_idx_to_label
    elif mode == 'test':
        idx_to_label = test_idx_to_label
        
    return torch.LongTensor(idx_to_label[batch]).to(device)

In [84]:
 print(train_idx_to_vector[0])

[82, 58, 37139, 406, 1340, 851, 119677, 440, 74, 21, 8932, 5, 3, 7, 461, 5, 160, 1, 320, 8, 1046, 2, 65, 40, 41, 5, 89, 16, 1785, 56, 2709, 815, 6, 5063, 1570, 2, 34, 102, 594, 1, 2492, 11, 60, 56, 607, 6, 128, 5199, 1, 2244, 3, 85, 568, 2051, 16, 792, 6, 64, 15, 37, 30, 930, 47, 24066, 5227, 7, 21, 58, 1535, 720, 24, 21, 58, 1535, 5204, 8, 1337, 303339, 4, 1, 2856, 205, 1, 2604, 2, 92, 1024, 8, 307, 1195, 17826, 25, 3, 4656, 2, 21, 1433, 99, 6, 26109, 65, 8446, 4, 220, 3227, 18081, 3, 21, 58, 1535, 11014, 17615, 6, 23, 247, 14908, 48622, 2, 37, 5, 3863, 192, 31632, 3, 35, 3203, 2, 198, 19613, 13, 12533, 15, 3, 35, 64, 391, 31, 646, 69, 74, 1, 7029, 6, 12533, 13, 908, 1, 1006, 162, 20, 144, 20, 21, 261, 2, 114, 42, 3961, 13073, 1, 772, 3513, 275, 17811, 21066, 2, 32654, 7, 2557, 3, 21, 163652, 58, 2160, 34, 52, 1, 8045, 2, 114, 151, 1589, 36, 61363, 22, 1, 216, 788, 3, 42, 6015, 58, 2160, 1690, 554, 8155, 67, 103, 42, 5573, 101, 182, 60, 82, 58, 37139, 406, 1340, 2, 35, 128, 379, 2, 15

In [11]:
for index, batch in enumerate(tqdm(train_loader)):
    print(index, batch)
    batch_tensors = batch_to_padded_tensors(batch, 'train')
    batch_labels= batch_to_labels(batch, 'train')
    
    print(batch_tensors)
    print(batch_labels)

  0%|          | 0/2 [00:00<?, ?it/s]

0 tensor([0, 1, 2])
tensor([[   82,    58, 37139,  ...,     0,     0,     0],
        [    9,  6404,    11,  ...,     0,     0,     0],
        [  360,   115,     6,  ...,     8,  6713,     3]], device='cuda:0')
tensor([1, 0, 1], device='cuda:0')
1 tensor([3, 4, 5])
tensor([[    9, 20309,     9,  ...,     4,  2291,     3],
        [    1,  1752, 32308,  ...,     0,     0,     0],
        [   21,    15,  2460,  ...,     0,     0,     0]], device='cuda:0')
tensor([1, 0, 0], device='cuda:0')


In [76]:
def filename_to_tensor(filename):
    with open(filename, 'r') as f:
        text = f.read()
        parsed_text = text.split()
        #print(parsed_text)
        
        vector = []
        for i in range(len(parsed_text)):
            if parsed_text[i] in vocab:
                vector += [vocab_dict[parsed_text[i]]]

    return torch.LongTensor(vector).to(device)

def filename_to_vector(filename):
    with open(filename, 'r') as f:
        text = f.read()
        parsed_text = text.split()
        #print(parsed_text)
        
        vector = []
        for i in range(len(parsed_text)):
            if parsed_text[i] in vocab:
                vector += [vocab_dict[parsed_text[i]]]
    return vector, len(vector)

def filenames_to_vectors(filenames):
    size = len(filenames)
    max_length = 0
    vectors = []
    for filename in filenames:
        vector = filename_to_vector(filename)
        if len(vector) > max_length:
            max_length = len(vector)
        vectors += [filename_to_vector(filename)]
        
    return vectors, max_length

def filenames_to_padded_tensors(filenames):
    size = len(filenames)
    vectors, max_length = filenames_to_vectors(filenames)
    padded_vectors = []
    for vector in vectors:
        for i in range(len(vector), max_length):
            vector += [padding_index]
        padded_vectors += [vector]
    
    return torch.LongTensor(padded_vectors).to(device)

def batch_to_padded_tensors(batch_vectors, max_length):
    size = len(batch_vectors)
    padded_vectors = []
    for vector in batch_vectors:
        for i in range(len(vector), max_length):
            vector += [padding_index]
        padded_vectors += [vector]
    
    return torch.LongTensor(padded_vectors).to(device)

In [77]:
'''train_data_vectors = {}

for filename, _ in tqdm(train_data):
    file_vector = filename_to_vector(filename)
    train_data_vectors[filename] = file_vector'''

  0%|          | 0/6 [00:00<?, ?it/s]

In [11]:
'''dev_data_vectors = {}

for filename, _ in tqdm(dev_data):
    file_vector = filename_to_vector(filename)
    dev_data_vectors[filename] = file_vector'''

  0%|          | 0/300 [00:00<?, ?it/s]

In [12]:
#parameters
hidden_size = 32
num_labels = 2

num_epochs = 5
learning_rate = 0.1

batch_size = 3

In [13]:
'''cwd = os.getcwd()
train_dir = cwd + '/review_polarity/txt_sentoken'
path1 = train_dir + '/example1.txt'
path2 = train_dir + '/example2.txt'
paths = [path1, path2]
print(filename_to_vector(path1))
print(filename_to_vector(path2))
print(filenames_to_vectors(paths))
print(filenames_to_padded_tensors(paths))'''

"cwd = os.getcwd()\ntrain_dir = cwd + '/review_polarity/txt_sentoken'\npath1 = train_dir + '/example1.txt'\npath2 = train_dir + '/example2.txt'\npaths = [path1, path2]\nprint(filename_to_vector(path1))\nprint(filename_to_vector(path2))\nprint(filenames_to_vectors(paths))\nprint(filenames_to_padded_tensors(paths))"

In [14]:
class RNN_Classifier(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
    
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_index)
        
        # initialize embeddings with glove embeddings
        with torch.no_grad():
            for i in range(glove_vocab_size):
                self.embedding.weight[i] = glove_vectors[i]
        
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.pooling = nn.AdaptiveMaxPool1d(1)
        self.hidden_to_label = nn.Linear(hidden_size, num_labels)
    
    def forward(self, document_tensors):
        """
        document_vectors : batch_size x sequence_length
        """
        batch_size, sequence_length = document_tensors.shape
        hidden_0 = torch.zeros((1, batch_size, self.hidden_size),dtype=torch.float).to(device)
        
        #print(document_vectors.shape)
        
        document_embeddings = self.embedding(document_tensors)
        
        #print(document_embeddings.shape)
        rnn_out, _ = self.rnn(document_embeddings.view(batch_size, sequence_length, embedding_dim), hidden_0)
        pooling_out = self.pooling(rnn_out.transpose(1,2)).transpose(1,2)
        
        score = self.hidden_to_label(pooling_out).view(batch_size, -1)
        
        return F.log_softmax(score, dim=1)

In [15]:
model = RNN_Classifier(hidden_size).to(device)

In [16]:
def evaluate(model, data_loader, mode):
    true_positive_count = 0
    false_positive_count = 0
    true_negative_count = 0
    false_negative_count = 0

    with torch.no_grad():
        for index, batch in enumerate(tqdm(data_loader)):
            print(index, batch)
            current_batch_size = len(batch)
            
            batch_tensors = batch_to_padded_tensors(batch, mode)
            batch_labels = batch_to_labels(batch, mode)
            print(batch_tensors)
            print(batch_labels)

            log_probs = model(batch_tensors)
            
            print(log_probs)
            
            predicted_labels = torch.argmax(log_probs, 1)
            print(predicted_labels)

            for i in range(current_batch_size):
                if batch_labels[i] == 1 and predicted_labels[i] == 1:
                    true_positive_count += 1
                    #print('true pos', true_positive_count)
                elif batch_labels[i] == 0 and predicted_labels[i] == 1:
                    false_positive_count += 1
                    #print('false pos', false_positive_count)
                elif batch_labels[i] == 0 and predicted_labels[i] == 0:
                    true_negative_count += 1
                    #print('true neg', true_negative_count)
                elif batch_labels[i] == 1 and predicted_labels[i] == 0:
                    false_negative_count += 1
                    #print('false neg', false_negative_count)

    print(true_positive_count)
    print(false_positive_count)
    print(true_negative_count)
    print(false_negative_count)

    accuracy = (true_positive_count + true_negative_count) / (true_positive_count + false_positive_count + true_negative_count + false_negative_count)
    
    if true_positive_count + false_positive_count != 0:
        precision = true_positive_count / (true_positive_count + false_positive_count)
    else: 
        precision = 0
        
    recall = true_positive_count / (true_positive_count + false_negative_count)

    F1 = 2 * (precision * recall) / (precision + recall)

    #print('accuracy = {val:.3f}'.format(val = accuracy))
    #print('F1 = {val:.3f}'.format(val = F1))
    
    return accuracy, F1

In [108]:
def train(model, train_loader):
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    epoch_train_accuracy = [0] * num_epochs
    epoch_train_F1 = [0] * num_epochs
    
    epoch_dev_accuracy = [0] * num_epochs
    epoch_dev_F1 = [0] * num_epochs
    
    for epoch in range(num_epochs):
        print('epoch', epoch)
        for index, batch in enumerate(tqdm(train_loader)):
            print(index, batch)
            current_batch_size = len(batch)
            
            batch_tensors = batch_to_padded_tensors(batch, 'train')
            batch_labels = batch_to_labels(batch, 'train')
            print(batch_tensors)
            print(batch_labels)
    

            #document_tensors = filenames_to_padded_tensors(batch_files)
            #labels = labels.to(device)
            #print(document_tensors)
            #print(labels)
            
            log_probs = model(batch_tensors)
            
            print(log_probs)
            
            model.zero_grad()
            
            loss = F.nll_loss(log_probs, batch_labels)
            
            print(loss)
            loss.backward()
            optimizer.step()
        
        print('train')
        epoch_train_accuracy[epoch], epoch_train_F1[epoch] = evaluate(model, train_loader, 'train')
        print(epoch_train_accuracy[epoch], epoch_train_F1[epoch] )
        #print('dev')
        #epoch_dev_accuracy[epoch], epoch_dev_F1[epoch] = evaluate(model, dev_loader, 'dev')

In [109]:
train(model, train_loader)

epoch 0


  0%|          | 0/2 [00:00<?, ?it/s]

0 tensor([0, 1, 2])
tensor([[   82,    58, 37139,  ...,     0,     0,     0],
        [   21,    15,  2460,  ...,    65,  1139,     3],
        [    1,  1752, 32308,  ...,     0,     0,     0]], device='cuda:0')
tensor([1, 0, 0], device='cuda:0')
tensor([[-1.3651, -0.2948],
        [-1.2103, -0.3540],
        [-1.0695, -0.4204]], device='cuda:0', grad_fn=<LogSoftmaxBackward>)
tensor(0.8582, device='cuda:0', grad_fn=<NllLossBackward>)
1 tensor([3, 4, 5])
tensor([[    9,  6404,    11,  ...,     0,     0,     0],
        [  360,   115,     6,  ...,     0,     0,     0],
        [    9, 20309,     9,  ...,     4,  2291,     3]], device='cuda:0')
tensor([0, 1, 1], device='cuda:0')
tensor([[-0.3360, -1.2541],
        [-0.4240, -1.0626],
        [-0.4013, -1.1069]], device='cuda:0', grad_fn=<LogSoftmaxBackward>)
tensor(0.8352, device='cuda:0', grad_fn=<NllLossBackward>)
train


  0%|          | 0/2 [00:00<?, ?it/s]

0 tensor([0, 1, 2])
tensor([[   82,    58, 37139,  ...,     0,     0,     0],
        [   21,    15,  2460,  ...,    65,  1139,     3],
        [    1,  1752, 32308,  ...,     0,     0,     0]], device='cuda:0')
tensor([1, 0, 0], device='cuda:0')
tensor([[-1.3566, -0.2978],
        [-1.1300, -0.3902],
        [-0.9811, -0.4699]], device='cuda:0')
tensor([1, 1, 1], device='cuda:0')
1 tensor([3, 4, 5])
tensor([[    9,  6404,    11,  ...,     0,     0,     0],
        [  360,   115,     6,  ...,     0,     0,     0],
        [    9, 20309,     9,  ...,     4,  2291,     3]], device='cuda:0')
tensor([0, 1, 1], device='cuda:0')
tensor([[-1.2033, -0.3570],
        [-1.4420, -0.2698],
        [-1.4271, -0.2745]], device='cuda:0')
tensor([1, 1, 1], device='cuda:0')
3
3
0
0
0.5 0.6666666666666666


In [110]:
evaluate(model, train_loader, 'train')

  0%|          | 0/2 [00:00<?, ?it/s]

0 tensor([0, 1, 2])
tensor([[   82,    58, 37139,  ...,     0,     0,     0],
        [   21,    15,  2460,  ...,    65,  1139,     3],
        [    1,  1752, 32308,  ...,     0,     0,     0]], device='cuda:0')
tensor([1, 0, 0], device='cuda:0')
tensor([[-1.3566, -0.2978],
        [-1.1300, -0.3902],
        [-0.9811, -0.4699]], device='cuda:0')
tensor([1, 1, 1], device='cuda:0')
1 tensor([3, 4, 5])
tensor([[    9,  6404,    11,  ...,     0,     0,     0],
        [  360,   115,     6,  ...,     0,     0,     0],
        [    9, 20309,     9,  ...,     4,  2291,     3]], device='cuda:0')
tensor([0, 1, 1], device='cuda:0')
tensor([[-1.2033, -0.3570],
        [-1.4420, -0.2698],
        [-1.4271, -0.2745]], device='cuda:0')
tensor([1, 1, 1], device='cuda:0')
3
3
0
0


(0.5, 0.6666666666666666)

In [30]:
L = ['a', 'b', 'c']

In [31]:
L = np.array(L, dtype=str)

In [32]:
idx = [0, 2]

In [33]:
L[idx]

array(['a', 'c'], dtype='<U1')