In [1]:
import sys
from tqdm import tqdm
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split, Dataset, RandomSampler, SequentialSampler
import pickle
import random
import numpy as np
import pandas as pd
from generate_dataloaders import *
from sklearn.model_selection import train_test_split
from ast import literal_eval

from transformers import (
    BertModel,
    BertTokenizer
)

from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers.data.processors.utils import InputExample
from torch.utils.data import TensorDataset, DataLoader, Dataset

In [2]:
datasets = pd.read_excel("./data/master_df_labeled.xlsx",index_col=0)
datasets.review = datasets.review.apply(literal_eval)

### Check for uncaught 11's, update accordingly

In [3]:
#check for weird 11's
datasets[datasets.true_pos==11]

Unnamed: 0,review,flagged_word,flagged_index,problematic,lemmatized,true_pos


In [4]:
#replace them!
datasets.true_pos[datasets.true_pos==11]=datasets.true_pos[datasets.true_pos==11].replace(11,1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [5]:
datasets=datasets[datasets['review'].apply(lambda x: len(x)<=30)]

In [6]:
# Divide ground_truth(test) and train/val dataset
labeled = datasets.loc[datasets['true_pos'].isin([1,0])]
unlabeled = datasets[datasets.true_pos==-1]

In [7]:
#train_labeled
train_labeled_dataset, remains_label_dataset = train_test_split(labeled, test_size=0.2, stratify=labeled['true_pos'])

In [8]:
#val_dataset, test_dataset
val_dataset, test_dataset = train_test_split(remains_label_dataset, test_size=0.5, stratify=remains_label_dataset['true_pos'])

In [9]:
val_idx = val_dataset.index.tolist()
test_idx = test_dataset.index.tolist()

remove_idx = val_idx + test_idx

In [10]:
#train_dataset - Atul doesn't really need
train_dataset = datasets.drop(remove_idx, axis=0)

In [11]:
train_label_idx = train_labeled_dataset.index.tolist()
#train_unlabeled
train_unlabeled_dataset = train_dataset.drop(train_label_idx,axis=0)

In [12]:
all_datasets = {"train_labeled_dataset":train_labeled_dataset,
               "val_dataset": val_dataset,
               "test_dataset":test_dataset,
               "train_unlabeled_dataset":train_unlabeled_dataset}

In [13]:
def preprocessing(all_df):
  preprocess_dict = {}
  for key, df in all_df.items():
    list_of_dicts=[]
    for row in df.iterrows():
      review = row[1].review
      idx = row[1].flagged_index
      text_a = " ".join(review[:idx+1])
      text_b = " ".join(review[idx:])
      temp_obj=InputExample(guid=str(row[0]), text_a=text_a, text_b=text_b, label=str(row[1].true_pos))
      list_of_dicts.append(temp_obj)
    preprocess_dict[key] = list_of_dicts
  return preprocess_dict

In [14]:
#Make dataset into format expected for creating bert dataloaders
preprocess_dict = preprocessing(all_datasets)

In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [16]:
preprocess_dict["train_labeled_dataset"]

[{
   "guid": "21264",
   "label": "1",
   "text_a": "as usual , jared did a great",
   "text_b": "great work and understood exactly what we were looking for ."
 }, {
   "guid": "35999",
   "label": "1",
   "text_a": "good php/db skills",
   "text_b": "skills for complex project 4 ."
 }, {
   "guid": "72400",
   "label": "0",
   "text_a": "i felt very trusting and had confidence in him both personally and professionally when sharing",
   "text_b": "sharing my data , a class job , thanks"
 }, {
   "guid": "84688",
   "label": "1",
   "text_a": "she is an extremely creative",
   "text_b": "creative and clever designer and we are delighted to have been able to work with her ."
 }, {
   "guid": "81397",
   "label": "1",
   "text_a": "professional and pro active",
   "text_b": "active freelancer"
 }, {
   "guid": "88113",
   "label": "1",
   "text_a": "nothing was an issue and he provided a great",
   "text_b": "great service ."
 }, {
   "guid": "127998",
   "label": "1",
   "text_a": "grea

In [17]:
def to_tensored(dataset):
  features = convert_examples_to_features(dataset,
                                          tokenizer,
                                          label_list=['0','1','-1'],
                                          max_length=40,
                                          output_mode='classification',
                                          pad_on_left=False,
                                          pad_token=tokenizer.pad_token_id,
                                          pad_token_segment_id=0)
  tensored_dataset = TensorDataset(torch.tensor([f.input_ids for f in features], dtype=torch.long), 
                                torch.tensor([f.attention_mask for f in features], dtype=torch.long), 
                                torch.tensor([f.token_type_ids for f in features], dtype=torch.long), 
                                torch.tensor([f.label for f in features], dtype=torch.long))
  return tensored_dataset

In [18]:
bert_train_labeled_dataloader = DataLoader(to_tensored(preprocess_dict["train_labeled_dataset"]), batch_size=32)
bert_train_unlabeled_dataloader = DataLoader(to_tensored(preprocess_dict["train_unlabeled_dataset"]), batch_size=32)
bert_val_dataloader = DataLoader(to_tensored(preprocess_dict["val_dataset"]), batch_size=32)
bert_test_dataloader = DataLoader(to_tensored(preprocess_dict["test_dataset"]), batch_size=32)


In [19]:
path = os.getcwd()
data_dir = path + '/data/'

with open(data_dir + "bert_train_labeled_dataloader.p","wb") as f:
  pickle.dump(bert_train_labeled_dataloader, f)

with open(data_dir + "bert_train_unlabeled_dataloader.p","wb") as f:
  pickle.dump(bert_train_unlabeled_dataloader, f)

with open(data_dir + "bert_val_dataloader.p","wb") as f:
  pickle.dump(bert_val_dataloader, f)

with open(data_dir + "bert_test_dataloader.p","wb") as f:
  pickle.dump(bert_test_dataloader, f)
