# Generate Dataloaders for BERT models
BERT requires input data in a specific format, so this notebook creates BERT-specific dataloaders. The overall flow is the same as in the `generate_dataloaders` notebook.

In [1]:
import sys
from tqdm import tqdm
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split, Dataset, RandomSampler, SequentialSampler
import pickle
import random
import numpy as np
import pandas as pd
from generate_dataloaders import *
from sklearn.model_selection import train_test_split
from ast import literal_eval

from transformers import (
    BertModel,
    BertTokenizer
)

from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers.data.processors.utils import InputExample
from torch.utils.data import TensorDataset, DataLoader, Dataset

## Filter out Sentences with lengths greater than 30

In [2]:
# Resolve the data directory relative to `path`.
# NOTE(review): `path` is not defined in the visible cells; it is presumably
# provided by `from generate_dataloaders import *` -- confirm.
# os.path.join supplies the separator when `path` has no trailing slash (plain
# string concatenation would produce e.g. "proj../data/"). The trailing ''
# keeps the final separator so code that appends a file name directly to
# `data_dir` keeps working.
data_dir = os.path.join(path, '..', 'data', '')

# Load the master sheet, using its first column as the DataFrame index.
datasets = pd.read_excel(os.path.join(data_dir, "master_df_labeled.xlsx"), index_col=0)

# The `review` cells are parsed with literal_eval so each one becomes a Python
# object instead of its string representation.
datasets['review'] = datasets['review'].apply(literal_eval)

In [5]:
# Keep only the rows whose review holds at most 30 elements.
short_enough = datasets['review'].apply(lambda tokens: len(tokens) <= 30)
datasets = datasets[short_enough]

## Train-Test split

In [6]:
# Separate rows by annotation status: true_pos of 0 or 1 goes to `labeled`
# (source of the train/val/test splits), true_pos of -1 goes to `unlabeled`.
labeled = datasets[datasets['true_pos'].isin([0, 1])]
unlabeled = datasets.loc[datasets['true_pos'] == -1]

In [7]:
# Labeled training split: keep 80% of the labeled rows for training and hold
# out 20% (split into validation/test afterwards). Stratifying on true_pos
# keeps the class ratio the same in both parts.
# random_state pins the shuffle so re-running the notebook reproduces the same
# partition; with the default (None) the split depends on the state of NumPy's
# global RNG, so the pickled dataloaders could change from run to run.
train_labeled_dataset, remains_label_dataset = train_test_split(
    labeled,
    test_size=0.2,
    stratify=labeled['true_pos'],
    random_state=42,
)

In [8]:
# Validation / test splits: halve the held-out labeled rows, again stratified
# on true_pos so both halves keep the same class ratio.
# random_state pins the shuffle so the validation and test sets stay the same
# across notebook runs; with the default (None) the split depends on the state
# of NumPy's global RNG.
val_dataset, test_dataset = train_test_split(
    remains_label_dataset,
    test_size=0.5,
    stratify=remains_label_dataset['true_pos'],
    random_state=42,
)

In [9]:
# Index labels of every held-out row (validation first, then test); these rows
# are removed from the full frame to form the training pool.
val_idx, test_idx = val_dataset.index.tolist(), test_dataset.index.tolist()

remove_idx = [*val_idx, *test_idx]

In [10]:
# Training pool: every row (labeled or not) that is not in the validation or
# test split. It is only an intermediate frame and is not exported itself.
train_dataset = datasets.drop(index=remove_idx)

In [11]:
# Unlabeled training split: take the labeled training rows out of the training
# pool, leaving only rows that are not part of `labeled`.
train_label_idx = train_labeled_dataset.index.tolist()
train_unlabeled_dataset = train_dataset.drop(index=train_label_idx)

In [12]:
# Registry of the four splits, keyed by the names used when building the
# dataloaders.
all_datasets = dict(
    train_labeled_dataset=train_labeled_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    train_unlabeled_dataset=train_unlabeled_dataset,
)

## Preprocess and Create Dataloaders

In [13]:
def preprocessing(all_df):
  """Convert each split's DataFrame into a list of ``InputExample`` objects.

  Every row becomes a sentence pair cut at the flagged position: ``text_a`` is
  the review up to and including ``flagged_index`` and ``text_b`` is the review
  from ``flagged_index`` onwards, so the flagged element appears in both
  segments. Elements are joined with single spaces.

  Args:
    all_df: Mapping of split name to a DataFrame with ``review`` (a sequence
      of strings), ``flagged_index`` and ``true_pos`` columns.

  Returns:
    A dict with the same keys as ``all_df``. Each value is a list of
    ``InputExample`` in row order, with ``guid`` set to the stringified row
    index and ``label`` set to the stringified integer ``true_pos``.
  """
  preprocess_dict = {}
  for split_name, df in all_df.items():
    examples = []
    for row_id, row in df.iterrows():
      tokens = row.review
      # Normalise to a plain int: a float-typed value (e.g. 3.0) cannot be
      # used as a slice bound.
      flagged = int(row.flagged_index)
      examples.append(
          InputExample(
              guid=str(row_id),
              text_a=" ".join(tokens[:flagged + 1]),
              text_b=" ".join(tokens[flagged:]),
              # Cast before stringifying so a float-typed label yields "1"
              # rather than "1.0"; to_tensored's label_list holds the strings
              # '0', '1' and '-1'.
              label=str(int(row.true_pos)),
          )
      )
    preprocess_dict[split_name] = examples
  return preprocess_dict

In [None]:
# Turn every split into the list-of-InputExample form that the BERT feature
# conversion in to_tensored consumes.
preprocess_dict = preprocessing(all_df=all_datasets)

In [15]:
# Load the pretrained tokenizer for the 'bert-base-cased' checkpoint.
# to_tensored reads this module-level `tokenizer` when encoding examples.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [17]:
def to_tensored(dataset):
  """Encode a list of ``InputExample`` objects into a ``TensorDataset``.

  The examples are converted with the module-level ``tokenizer`` using
  ``max_length=40`` and right-side padding (``pad_on_left=False``).

  Args:
    dataset: List of ``InputExample`` objects, as produced by ``preprocessing``.

  Returns:
    A ``TensorDataset`` of four ``torch.long`` tensors, in this order:
    input_ids, attention_mask, token_type_ids, label.
  """
  # NOTE(review): glue_convert_examples_to_features should encode each label
  # as its position in label_list, which would make unlabeled '-1' rows class
  # id 2 -- confirm downstream code expects that.
  # NOTE(review): pad_on_left / pad_token / pad_token_segment_id appear to be
  # accepted only by older transformers releases -- confirm the pinned version.
  encoded = convert_examples_to_features(
      dataset,
      tokenizer,
      label_list=['0','1','-1'],
      max_length=40,
      output_mode='classification',
      pad_on_left=False,
      pad_token=tokenizer.pad_token_id,
      pad_token_segment_id=0,
  )
  # One tensor per feature attribute, in the order the TensorDataset exposes.
  field_order = ("input_ids", "attention_mask", "token_type_ids", "label")
  columns = [
      torch.tensor([getattr(feature, field) for feature in encoded], dtype=torch.long)
      for field in field_order
  ]
  return TensorDataset(*columns)

In [18]:
# One DataLoader per split, all with batches of 32. No shuffle or sampler
# argument is passed, so every loader uses DataLoader's default ordering.
bert_train_labeled_dataloader = DataLoader(
    dataset=to_tensored(preprocess_dict["train_labeled_dataset"]),
    batch_size=32,
)
bert_train_unlabeled_dataloader = DataLoader(
    dataset=to_tensored(preprocess_dict["train_unlabeled_dataset"]),
    batch_size=32,
)
bert_val_dataloader = DataLoader(
    dataset=to_tensored(preprocess_dict["val_dataset"]),
    batch_size=32,
)
bert_test_dataloader = DataLoader(
    dataset=to_tensored(preprocess_dict["test_dataset"]),
    batch_size=32,
)


In [19]:
# Persist each dataloader as its own pickle file inside data_dir.
pickle_targets = (
    ("bert_train_labeled_dataloader.p", bert_train_labeled_dataloader),
    ("bert_train_unlabeled_dataloader.p", bert_train_unlabeled_dataloader),
    ("bert_val_dataloader.p", bert_val_dataloader),
    ("bert_test_dataloader.p", bert_test_dataloader),
)
for file_name, loader in pickle_targets:
  with open(os.path.join(data_dir, file_name), "wb") as f:
    pickle.dump(loader, f)
