# Create Dictionary and Dataloader
This notebook builds the token dictionary and the dataloaders for the baseline model, its three variations, and the LSTM model. It uses our custom functions from `generate_dataloaders.py`.

In [1]:
import sys
from tqdm import tqdm
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split, Dataset, RandomSampler, SequentialSampler
import pickle
import random
import numpy as np
import pandas as pd
from generate_dataloaders import *
from sklearn.model_selection import train_test_split
from ast import literal_eval

import nltk
import re
from nltk.stem import WordNetLemmatizer
# NOTE(review): not referenced by any other visible cell - confirm it is still needed.
wordnet_lemmatizer = WordNetLemmatizer()

## Filter out Sentences with lengths greater than 30

In [2]:
# Load the master spreadsheet; its first column becomes the DataFrame index.
datasets = pd.read_excel(
    "./data/master_df_labeled.xlsx",
    index_col=0,
)

In [3]:
# Parse each `review` cell from its string form into a Python object
# (presumably a list of tokens - confirm against the spreadsheet).
# Gotcha: literal_eval only accepts strings, so re-running this cell on the
# already-parsed column raises; reload the data first.
datasets["review"] = datasets["review"].apply(literal_eval)

In [4]:
# Number of rows before the length filter is applied.
datasets.shape[0]

111029

In [6]:
# Keep only the rows whose parsed `review` has at most 30 elements.
datasets = datasets.loc[datasets["review"].map(len) <= 30]

In [8]:
# Number of rows left after the length filter.
datasets.shape[0]

107271

## Train/Validation/Test Split and Minority-Class Upsampling

In [9]:
# Split the rows by annotation status: rows whose `true_pos` is 0 or 1 go to
# `labeled`, rows whose `true_pos` is -1 go to `unlabeled`.
labeled = datasets[datasets["true_pos"].isin([0, 1])]
unlabeled = datasets.loc[datasets["true_pos"] == -1]

In [10]:
# Hold out 20% of the labeled rows, stratified on the label so both parts keep
# the same class ratio. The fixed random_state (same value the resampling step
# uses) makes the split repeatable; without it a fresh kernel would pick a
# different held-out set on every run.
train_labeled_dataset, remains_label_dataset = train_test_split(
    labeled,
    test_size=0.2,
    stratify=labeled['true_pos'],
    random_state=1029,
)

In [11]:
# Separate the labeled training rows by class so the class sizes can be balanced.
class1 = train_labeled_dataset.loc[train_labeled_dataset["true_pos"] == 1]
class0 = train_labeled_dataset.loc[train_labeled_dataset["true_pos"] == 0]

In [12]:
from sklearn.utils import resample

# Draw as many class-0 rows (with replacement) as there are class-1 rows, so
# that both classes end up the same size. The fixed random_state keeps the
# draw repeatable.
df_0_upsampled = resample(
    class0,
    n_samples=class1.shape[0],
    replace=True,
    random_state=1029,
)

# Stack the untouched class-1 rows on top of the resampled class-0 rows.
upsampled_train_labeled = pd.concat([class1, df_0_upsampled], axis=0)

In [13]:
# Rows per class after resampling, printed as "<class 0 count> <class 1 count>".
print(
    (upsampled_train_labeled["true_pos"] == 0).sum(),
    (upsampled_train_labeled["true_pos"] == 1).sum(),
)

3613 3613


In [14]:
# Split the held-out labeled rows in half, stratified on the label, into the
# validation and test sets. The fixed random_state keeps the membership of both
# sets stable across kernel restarts, which matters because the resulting
# dataloaders are pickled for later use.
val_dataset, test_dataset = train_test_split(
    remains_label_dataset,
    test_size=0.5,
    stratify=remains_label_dataset['true_pos'],
    random_state=1029,
)

In [15]:
# Sanity check: the three labeled splits together account for every labeled row.
len(labeled) == sum(
    len(split_frame)
    for split_frame in (train_labeled_dataset, val_dataset, test_dataset)
)

True

In [16]:
# Index labels of the validation and test rows, to be excluded from the training data.
val_idx = val_dataset.index.to_list()
test_idx = test_dataset.index.to_list()

remove_idx = [*val_idx, *test_idx]

In [17]:
# Everything except the validation and test rows is training data
# (this includes the rows with true_pos == -1).
train_dataset = datasets.drop(index=remove_idx)

In [18]:
# Remove the labeled training rows from the training data; what remains is
# used as the unlabeled training set.
train_label_idx = train_labeled_dataset.index.to_list()
train_unlabeled_dataset = train_dataset.drop(index=train_label_idx)

In [20]:
# Sanity check: labeled and unlabeled training rows add up to the full training set.
train_dataset.shape[0] == train_labeled_dataset.shape[0] + train_unlabeled_dataset.shape[0]

True

In [21]:
# Sanity check: train, validation and test together cover every filtered row.
len(datasets) == sum(map(len, (train_dataset, val_dataset, test_dataset)))

True

## Create the Dictionary, Indexized and Tensored Datasets, and Dataloaders

In [22]:
# Build the dictionary from the full filtered frame. At this point `datasets`
# still contains the validation and test rows.
# NOTE(review): `Dictionary` and the meaning of `include_valid` are defined in
# generate_dataloaders.py - confirm that including held-out rows here is intended.
review_dict = Dictionary(datasets, include_valid=False)

100%|██████████| 107271/107271 [00:27<00:00, 3836.54it/s]


In [23]:
# Serialize the dictionary to <current working directory>/data/dictionary.p.
path = os.getcwd()
# The trailing '' makes os.path.join end the result with a path separator, so
# string concatenation such as `data_dir + filename` keeps working.
data_dir = os.path.join(path, 'data', '')

# The context manager closes the file even if pickling raises.
with open(data_dir + "dictionary.p", "wb") as pickle_dict:
    pickle.dump(review_dict, pickle_dict)

In [24]:
# Spot check: look up the id stored in the dictionary for the token "great".
review_dict.get_id("great")

34

In [25]:
# Spot check: encode the value in the first row, first column of `datasets`
# (presumably the parsed `review` - confirm the column order).
review_dict.encode_token_seq(datasets.iloc[0,0])

[2, 3, 4, 5, 6, 7, 8, 9]

In [26]:
# Run every split through indexize_dataset with the shared dictionary. The
# frames are processed in the listed order, and each result is bound to the
# name at the same position in the target tuple.
(
    indexized_train_dataset_labeled,
    indexized_train_dataset_unlabeled,
    indexized_val_datasets,
    indexized_test_datasets,
    indexized_train_datasets,
) = [
    indexize_dataset(split_frame, review_dict)
    for split_frame in (
        upsampled_train_labeled,   # -> indexized_train_dataset_labeled
        train_unlabeled_dataset,   # -> indexized_train_dataset_unlabeled
        val_dataset,               # -> indexized_val_datasets
        test_dataset,              # -> indexized_test_datasets
        train_dataset,             # -> indexized_train_datasets
    )
]

100%|██████████| 7226/7226 [00:00<00:00, 102622.93it/s]
100%|██████████| 102097/102097 [00:00<00:00, 214529.22it/s]
100%|██████████| 517/517 [00:00<00:00, 127443.74it/s]
100%|██████████| 518/518 [00:00<00:00, 159215.12it/s]
100%|██████████| 106236/106236 [00:00<00:00, 150504.47it/s]


In [27]:
def _as_tensored(indexized, frame):
    """Build a TensoredDataset from an indexized split and its source frame.

    Args:
        indexized: Output of indexize_dataset for the split.
        frame: DataFrame the split was indexized from; its `true_pos` and
            `flagged_index` columns are passed along as plain lists.

    Returns:
        The TensoredDataset for the split.
    """
    return TensoredDataset(
        indexized,
        frame["true_pos"].to_list(),
        frame["flagged_index"].to_list(),
    )


tensor_train_dataset_labeled = _as_tensored(indexized_train_dataset_labeled, upsampled_train_labeled)
tensor_train_dataset_unlabeled = _as_tensored(indexized_train_dataset_unlabeled, train_unlabeled_dataset)
tensor_val_dataset = _as_tensored(indexized_val_datasets, val_dataset)
tensor_test_dataset = _as_tensored(indexized_test_datasets, test_dataset)
tensor_train_dataset = _as_tensored(indexized_train_datasets, train_dataset)

In [28]:
# Inspect the example at position 100 (not the first one) of the unlabeled
# training set. The item is a tuple of three tensors - presumably
# (token ids, true_pos label, flagged_index), matching the argument order
# given to TensoredDataset; confirm in generate_dataloaders.py.
tensor_train_dataset_unlabeled[100]

(tensor([[311,  77, 312,  20, 313,  41, 225,   3, 249,  20,   7, 314, 239,  89,
            9,   3, 315, 316, 317,   9]]), tensor(-1), tensor(18))

In [29]:
# Fix every random number generator used by this notebook to one seed.
seed = 1029

random.seed(seed)                 # Python's built-in RNG
np.random.seed(seed)              # NumPy's global RNG
torch.manual_seed(seed)           # PyTorch default generator
torch.cuda.manual_seed_all(seed)  # every CUDA device; silently ignored without CUDA

# Disable cuDNN and its autotuner, and request deterministic kernels.
torch.backends.cudnn.enabled = False
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

def _init_fn(worker_id):
    """Seed NumPy's global RNG inside a DataLoader worker process.

    Each worker is seeded with ``seed + worker_id``: the result is still
    reproducible from run to run, but the workers no longer all replay the
    same NumPy random stream, which is what seeding every worker with the
    bare ``seed`` would do.

    Args:
        worker_id: Index of the worker, supplied by DataLoader when it calls
            ``worker_init_fn``.
    """
    np.random.seed(int(seed) + worker_id)


# Wrap every tensored dataset in a DataLoader. No splitting happens here; the
# splits were fixed when the datasets were built.
batch_size = 32

# All loaders share one configuration. num_workers=0 (also DataLoader's
# default) keeps loading in the main process.
loader_kwargs = dict(
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=pad_collate_fn,
    worker_init_fn=_init_fn,
)

train_labeled_dataloader = DataLoader(tensor_train_dataset_labeled, **loader_kwargs)
# This name must match the one the save cell pickles (`train_unlabeled_dataloader`).
train_unlabeled_dataloader = DataLoader(tensor_train_dataset_unlabeled, **loader_kwargs)
# NOTE(review): the validation and test loaders are shuffled as well; consider
# shuffle=False if per-example predictions need to line up with the frames.
val_dataloader = DataLoader(tensor_val_dataset, **loader_kwargs)
test_dataloader = DataLoader(tensor_test_dataset, **loader_kwargs)
train_dataloader = DataLoader(tensor_train_dataset, **loader_kwargs)

## Save Dataloaders

In [31]:
# Write every dataloader to <current working directory>/data/, the same
# directory the input spreadsheet is read from and the dictionary is pickled to.
path = os.getcwd()
# os.path.join supplies the separator that plain concatenation of the working
# directory and '../data/' left out; the trailing '' keeps a separator at the
# end so `data_dir + file_name` works.
data_dir = os.path.join(path, 'data', '')

# NOTE(review): pickle stores functions by reference, so code that loads these
# files needs `_init_fn` and `pad_collate_fn` resolvable under the same module
# names - confirm the consumers provide them.
dataloaders_to_save = {
    "train_labeled_dataloader.p": train_labeled_dataloader,
    "train_unlabeled_dataloader.p": train_unlabeled_dataloader,
    "val_dataloader.p": val_dataloader,
    "test_dataloader.p": test_dataloader,
    "train_dataloader.p": train_dataloader,
}

for file_name, dataloader in dataloaders_to_save.items():
    # The context manager closes the file even if pickling raises.
    with open(data_dir + file_name, "wb") as pickle_file:
        pickle.dump(dataloader, pickle_file)