### Make a dictionary, dataloader

In [1]:
import sys
import jsonlines
from tqdm import tqdm
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split, Dataset, RandomSampler, SequentialSampler
import pickle
import random
import numpy as np

In [20]:
# Import prepocessed Dataset(already tokenized)
with open("./data/ground_truths_df.p", 'rb') as handle:
    datasets = pickle.load(handle)

In [21]:
datasets=datasets[datasets['review'].apply(lambda x: len(x)<=30)]

In [22]:
datasets.head()

Unnamed: 0,review,flagged_word,flagged_index,problematic,true_pos
9133,"[great, job, as, usual-above, and, beyond, cal...",great,0,1,0.0
14637,"[he, is, intelligent, ,, dedicated, ,, passion...",intelligent,2,1,1.0
15243,"[great, work, hire, him, again]",great,0,1,0.0
15541,"[strong, ability, to, execute, feedback, with,...",great,6,1,0.0
16159,"[awesome, +, dependable, designer, !]",dependable,2,1,1.0


In [23]:
class Dictionary(object):
    def __init__(self, datasets, include_valid=False):
        self.tokens = []
        self.ids = {}
        self.counts = {}
        
        # add special tokens
        self.add_token('<pad>')
        self.add_token('<unk>')
        
        for line in tqdm(datasets['review']):
            for w in line:
                self.add_token(w)

                            
    def add_token(self, w):
        if w not in self.tokens:
            self.tokens.append(w)
            _w_id = len(self.tokens) - 1
            self.ids[w] = _w_id
            self.counts[w] = 1
        else:
            self.counts[w] += 1

    def get_id(self, w):
        return self.ids[w]
    
    def get_token(self, idx):
        return self.tokens[idx]
    
    def decode_idx_seq(self, l):
        return [self.tokens[i] for i in l]
    
    def encode_token_seq(self, l):
        return [self.ids[i] if i in self.ids else self.ids['<unk>'] for i in l]
    
    def __len__(self):
        return len(self.tokens)

In [24]:
# Make a dictionary
review_dict = Dictionary(datasets, include_valid=False)

100%|██████████| 95/95 [00:00<00:00, 7860.70it/s]


In [None]:
pickle_dict = open(data_dir + "dictionary.p","wb")
pickle.dump(review_dict, pickle_dict)
pickle_dict.close()

In [25]:
review_dict.get_id("good")

131

In [26]:
review_dict.encode_token_seq(datasets.iloc[0,0])

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

In [27]:
def indexize_dataset(datasets, dictionary):
    indexized_datasets = []
    for l in tqdm(datasets["review"]):
        encoded_l = dictionary.encode_token_seq(l)
        indexized_datasets.append(encoded_l)
        
    return indexized_datasets

In [28]:
indexized_datasets = indexize_dataset(datasets, review_dict)

100%|██████████| 95/95 [00:00<00:00, 9843.84it/s]


In [29]:
class TensoredDataset(object):
    def __init__(self, list_of_lists_of_tokens, list_of_labels,list_of_flagged_indexes):
        self.input_tensors = []
        self.label_tensors = []
        self.flagged_index = []
        self.problematic = []
        
        for sample in list_of_lists_of_tokens:
            self.input_tensors.append(torch.tensor([sample], dtype=torch.long))
        for sample in list_of_labels:
            self.label_tensors.append(torch.tensor(sample, dtype=torch.long))
        for sample in list_of_flagged_indexes:
            self.flagged_index.append(torch.tensor(sample, dtype=torch.long))
        
    def __len__(self):
        return len(self.input_tensors)
    
    def __getitem__(self, idx):
        # return a (input, target) tuple
        return (self.input_tensors[idx], self.label_tensors[idx], self.flagged_index[idx])

In [30]:
tensor_dataset = TensoredDataset(indexized_datasets,datasets["true_pos"].to_list(), datasets["flagged_index"].to_list())

In [31]:
# check the first example
tensor_dataset[0]

(tensor([[ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11]]), tensor(0), tensor(0))

In [32]:
def pad_list_of_tensors(list_of_tensors, pad_token):
    max_length = 30
    padded_list = []
    
    for t in list_of_tensors:    
        padded_tensor = torch.cat([t, torch.tensor([[pad_token]*(max_length - t.size(-1))], dtype=torch.long)], dim = -1)
        padded_list.append(padded_tensor[:max_length])
        
    padded_tensor = torch.cat(padded_list, dim=0)
    
    return padded_tensor

def pad_collate_fn(batch):
    # batch is a list of sample tuples
    token_list = [s[0] for s in batch]
    label_list = torch.LongTensor([s[1] for s in batch])
    idx_list = torch.LongTensor([s[2] for s in batch])
    
    #pad_token = persona_dict.get_id('<pad>')
    pad_token = 0
    
    input_tensor = pad_list_of_tensors(token_list, pad_token)
    
    return input_tensor, label_list, idx_list

In [33]:
# Random seed
seed = 1029

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
np.random.seed(seed)  # Numpy module.
random.seed(seed)  # Python random module.
torch.manual_seed(seed)
torch.backends.cudnn.enabled = False 
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

def _init_fn(worker_id):
    np.random.seed(int(seed))


# Divide into train(95%), valid(5%) dataset
batch_size = 32
n_train_samples = int(0.95 * len(datasets))
n_val_samples = len(datasets) - n_train_samples

train_dataset, val_dataset = random_split(tensor_dataset, [n_train_samples, n_val_samples])

# train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=pad_collate_fn, worker_init_fn=_init_fn)
# val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=pad_collate_fn, worker_init_fn=_init_fn)
dataloader = DataLoader(tensor_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate_fn)

In [34]:
for i, x in enumerate(val_dataloader):
    print(x[0][0],x[2][0])
    break

tensor([6328,   69,   34,   44,   19,   30,    7,   77,  230,    9,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0]) tensor(8)


In [36]:
path = os.getcwd()
data_dir = path + '/data/'

# pickle_train_dataloader = open(data_dir + "train_dataloader.p","wb")
# pickle.dump(train_dataloader, pickle_train_dataloader)
# pickle_train_dataloader.close()

# pickle_val_dataloader = open(data_dir + "val_dataloader.p","wb")
# pickle.dump(val_dataloader, pickle_val_dataloader)
# pickle_val_dataloader.close()

pickle_dataloader = open(data_dir + "ground_truth_dataloader.p","wb")
pickle.dump(dataloader, pickle_dataloader)
pickle_dataloader.close()