### Build the dictionary and dataloaders

In [1]:
import sys
from tqdm import tqdm
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split, Dataset, RandomSampler, SequentialSampler
import pickle
import random
import numpy as np
import pandas as pd
from generate_dataloaders import *
from sklearn.model_selection import train_test_split
from ast import literal_eval

import nltk
import re
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [47]:
#Update our running master_df_labeled with multiple new files
def batch_merge_into_master(master_file, batch_folder):
    """Merge newly labeled batch workbooks into the master workbook.

    Every file in ``batch_folder`` whose name is exactly ``batch_<digits>.xlsx``
    is read, and its ``true_pos`` column is written over the rows of the master
    frame that share the same index labels. The merged frame is then saved
    back to ``master_file``.

    Args:
        master_file: Path of the master Excel workbook. It is read with its
            first column as the index and overwritten at the end.
        batch_folder: Directory containing the labeled batch workbooks. A
            trailing path separator is optional.

    Returns:
        The updated master DataFrame.
    """
    batch_files = os.listdir(batch_folder)
    print(batch_files)
    master_df_labeled = pd.read_excel(master_file, index_col=0)
    # Raw string avoids the invalid "\." escape; fullmatch (below) rejects
    # names that merely start with the pattern, e.g. "batch_1.xlsx.bak".
    batch_name = re.compile(r'batch_[0-9]*\.xlsx')
    for new_file in batch_files:
        print(new_file)
        if batch_name.fullmatch(new_file):
            # os.path.join works whether or not batch_folder ends with a separator.
            new_df = pd.read_excel(os.path.join(batch_folder, new_file), index_col=0)
            # DataFrame.update aligns on the index and only copies non-NA values.
            master_df_labeled.update(new_df.true_pos)
            print_tf_pos(master_df_labeled)
    master_df_labeled.to_excel(master_file)
    return master_df_labeled

def print_tf_pos(master_df_labeled):
    """Print how many rows are labeled as true and as false positives.

    Rows with ``true_pos == 1`` count as true positives and rows with
    ``true_pos == 0`` as false positives; any other value is not counted.

    Args:
        master_df_labeled: DataFrame with ``true_pos`` and ``problematic``
            columns.

    Raises:
        AssertionError: If any row has ``problematic == 0``.
    """
    labels = master_df_labeled.true_pos
    true_positive_count = int((labels == 1).sum())
    print(f"Count of true positives: {true_positive_count}")
    false_positive_count = int((labels == 0).sum())
    print(f"Count of false positives: {false_positive_count}")
    # Sanity check: the frame must not contain rows with problematic == 0.
    assert not (master_df_labeled.problematic == 0).any()

In [51]:
# All Excel file paths are configured here. Pickle paths stay hard-coded where
# they are written because other scripts rely on them.
master_file = "./data/master_df_labeled.xlsx"

# Folder holding the labeled batch workbooks. The commented-out value is the
# repo-local folder; the active value is a machine-specific absolute path and
# must exist on the current machine before the merge cell is run.
#batch_labeled = "./data/please_label_batch/labeled/"
batch_labeled = "/mnt/g/Google/Drive File Stream/My Drive/2019 Fall/Capstone group/FairFrame Annotations/labeled/"

In [None]:
# Load the current master sheet (first column as index), then print its
# true/false positive counts and its shape.
#master_df_labeled.to_excel("./data/master_df_labeled.xlsx")
master_df_labeled = pd.read_excel(master_file,index_col=0)

print_tf_pos(master_df_labeled)
print(master_df_labeled.shape)

In [52]:
# Merge every labeled batch workbook into the master sheet, then report the
# refreshed label counts and the frame's shape. The `master_file` setting is
# used instead of repeating the path literal, so the path is defined once.
master_df_labeled = batch_merge_into_master(master_file, batch_labeled)
print_tf_pos(master_df_labeled)
print(master_df_labeled.shape)

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/g/Google/Drive File Stream/My Drive/2019 Fall/Capstone group/FairFrame Annotations/labeled/'

In [None]:
C:\Users\Jubly\AppData\Local\Packages\CanonicalGroupLimited.UbuntuonWindows_79rhkp1fndgsc\LocalState\rootfs\home\jubly\Fairframe

In [None]:
../../../../../../../../../../../../../../

In [2]:
# # Import prepocessed Dataset(already tokenized)
# with open("./data/master_df_labeled.p", 'rb') as handle:
#     datasets = pickle.load(handle)

In [2]:
datasets = pd.read_excel("./data/master_df_labeled.xlsx",index_col=0)

In [3]:
# Parse every `review` cell with ast.literal_eval (safe literal parsing, no
# code execution). Presumably the workbook stores each token list as its
# string form — verify against the file.
datasets.review = datasets.review.apply(literal_eval)

In [4]:
len(datasets)

111029

In [5]:
# Remove the 'Unnamed: 0.1' column from the frame.
datasets = datasets.drop(columns='Unnamed: 0.1')

In [6]:
# Keep only the rows whose review has at most 30 elements.
review_lengths = datasets['review'].apply(len)
datasets = datasets[review_lengths <= 30]

In [7]:
datasets.head()

Unnamed: 0,review,flagged_word,flagged_index,problematic,lemmatized,true_pos
0,"[thank, you, for, your, help, and, support, !]",support,6,1,support,-1
1,"[excellent, and, professional, -, highly, reco...",excellent,0,1,excellent,-1
2,"[superb, quality, .]",superb,0,1,superb,-1
3,"[outstanding, work, ,, will, definitely, use, ...",outstanding,0,1,outstanding,-1
4,"[good, provider]",provider,1,1,provider,-1


In [8]:
len(datasets)

107271

In [9]:
# Separate the rows that carry a label (true_pos of 1 or 0) from the rows
# whose true_pos is -1.
has_label = datasets['true_pos'].isin([1, 0])
labeled = datasets.loc[has_label]
unlabeled = datasets[datasets['true_pos'] == -1]

In [10]:
# Hold out 20% of the labeled rows, stratified on the label. A fixed
# random_state makes the split repeatable; without it the held-out rows
# change on every run. 1029 matches the `seed` value used later in this
# notebook.
train_labeled_dataset, remains_label_dataset = train_test_split(
    labeled,
    test_size=0.2,
    stratify=labeled['true_pos'],
    random_state=1029,
)

In [11]:
# Split the held-out labeled rows evenly into validation and test sets,
# stratified on the label. A fixed random_state keeps the validation/test
# membership stable across runs. 1029 matches the `seed` value used later
# in this notebook.
val_dataset, test_dataset = train_test_split(
    remains_label_dataset,
    test_size=0.5,
    stratify=remains_label_dataset['true_pos'],
    random_state=1029,
)

In [12]:
len(labeled) == len(train_labeled_dataset) + len(val_dataset) + len(test_dataset)

True

In [13]:
# Collect the index labels of the validation and test rows; the combined list
# is what gets dropped from the full frame to form the training set.
val_idx = val_dataset.index.tolist()
test_idx = test_dataset.index.tolist()

remove_idx = val_idx + test_idx

In [14]:
# Training frame: every row except the validation and test rows.
train_dataset = datasets.drop(index=remove_idx)

In [15]:
# Remove the labeled training rows from the training frame; what remains is
# the unlabeled part of the training data.
train_label_idx = train_labeled_dataset.index.tolist()
train_unlabeled_dataset = train_dataset.drop(index=train_label_idx)

In [16]:
train_labeled_dataset.head()

Unnamed: 0,review,flagged_word,flagged_index,problematic,lemmatized,true_pos
21745,"[excellent, designer, ,, works, well, and, qui...",excellent,0,1,excellent,1
42833,"[sooo, helpful, .]",helpful,1,1,helpful,1
9995,"[great, !]",great,0,1,great,1
49152,"[a, really, great, freelancer, .]",great,2,1,great,1
51524,"[very, professional, ,, patient, ,, understand...",understanding,5,1,understanding,1


In [17]:
len(train_dataset) == len(train_labeled_dataset) + len(train_unlabeled_dataset)

True

In [18]:
len(datasets) == len(train_dataset) + len(val_dataset) + len(test_dataset)

True

In [19]:
# Build the dictionary from the full filtered frame.
# NOTE(review): `datasets` still contains the validation and test rows at this
# point, so they also feed the dictionary — confirm this is intended.
# `Dictionary` presumably comes from the generate_dataloaders star import;
# the meaning of `include_valid` is not visible here.
review_dict = Dictionary(datasets, include_valid=False)

100%|██████████| 107271/107271 [00:17<00:00, 6217.81it/s]


In [21]:
# Persist the dictionary to <current working directory>/data/dictionary.p.
path = os.getcwd()
data_dir = path + '/data/'

# The context manager closes the file even if pickle.dump raises.
with open(data_dir + "dictionary.p", "wb") as pickle_dict:
    pickle.dump(review_dict, pickle_dict)

In [22]:
review_dict.get_id("great")

34

In [23]:
review_dict.encode_token_seq(datasets.iloc[0,0])

[2, 3, 4, 5, 6, 7, 8, 9]

In [40]:
# Run every split through indexize_dataset with the shared dictionary.
# Presumably this turns each review's tokens into dictionary ids — confirm
# against generate_dataloaders.
# `train_dataset` is the labeled plus unlabeled training rows together.
indexized_train_dataset_labeled = indexize_dataset(train_labeled_dataset, review_dict)
indexized_train_dataset_unlabeled = indexize_dataset(train_unlabeled_dataset, review_dict)
indexized_val_datasets = indexize_dataset(val_dataset, review_dict)
indexized_test_datasets = indexize_dataset(test_dataset, review_dict)
indexized_train_datasets = indexize_dataset(train_dataset, review_dict)

100%|██████████| 845/845 [00:00<00:00, 116092.47it/s]
100%|██████████| 106214/106214 [00:00<00:00, 151229.60it/s]
100%|██████████| 106/106 [00:00<00:00, 113013.78it/s]
100%|██████████| 106/106 [00:00<00:00, 36604.33it/s]
100%|██████████| 107059/107059 [00:00<00:00, 157041.21it/s]


In [42]:
def _as_tensored(indexized, frame):
    """Wrap one split: its indexized reviews plus its true_pos and flagged_index columns."""
    return TensoredDataset(
        indexized,
        frame["true_pos"].to_list(),
        frame["flagged_index"].to_list(),
    )


# One TensoredDataset per split, each built from the matching frame.
tensor_train_dataset_labeled = _as_tensored(indexized_train_dataset_labeled, train_labeled_dataset)
tensor_train_dataset_unlabeled = _as_tensored(indexized_train_dataset_unlabeled, train_unlabeled_dataset)
tensor_val_dataset = _as_tensored(indexized_val_datasets, val_dataset)
tensor_test_dataset = _as_tensored(indexized_test_datasets, test_dataset)
tensor_train_dataset = _as_tensored(indexized_train_datasets, train_dataset)

In [43]:
# check the example at index 100
tensor_train_dataset_unlabeled[100]

(tensor([[304, 305,  30,  41,  19,  20,  41, 166, 167,  20,   7, 306,  53,  17]]),
 tensor(-1),
 tensor(7))

In [44]:
# Random seed shared by every random number generator used below.
seed = 1029

torch.manual_seed(seed)
# Safe without a GPU: torch silently ignores CUDA seeding when CUDA is unavailable.
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
np.random.seed(seed)  # Numpy module.
random.seed(seed)  # Python random module.
torch.backends.cudnn.enabled = False
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


def _init_fn(worker_id):
    """Seed NumPy inside a DataLoader worker process.

    Each worker is seeded with ``seed + worker_id``, so runs stay reproducible
    without every worker drawing the same NumPy random stream.

    Args:
        worker_id: Integer id that the DataLoader passes to each worker.
    """
    np.random.seed(int(seed) + worker_id)


# Mini-batch size used by every dataloader.
batch_size = 32

# Settings shared by all loaders. With num_workers=0 the data is loaded in the
# main process, so worker_init_fn only matters if workers are enabled later.
# NOTE(review): the validation and test loaders are shuffled as well —
# confirm that is intended for evaluation.
loader_kwargs = dict(
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=pad_collate_fn,
    worker_init_fn=_init_fn,
)

train_labeled_dataloader = DataLoader(tensor_train_dataset_labeled, **loader_kwargs)
# The name keeps its existing spelling ("unlabeld").
train_unlabeld_dataloader = DataLoader(tensor_train_dataset_unlabeled, **loader_kwargs)
val_dataloader = DataLoader(tensor_val_dataset, **loader_kwargs)
test_dataloader = DataLoader(tensor_test_dataset, **loader_kwargs)
train_dataloader = DataLoader(tensor_train_dataset, **loader_kwargs)

In [31]:
# Print elements 1 and 2 of the first batch only, then stop.
# NOTE(review): `centroids_dataloader` is not defined in the cells shown
# here — confirm which loader this cell was meant to inspect.
for first_batch in centroids_dataloader:
    print(first_batch[1], first_batch[2])
    break

tensor([0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 1, 0, 1, 1, 0]) tensor([ 1,  1,  0,  2,  0,  1, 22,  8,  0,  1,  0,  1,  0,  0,  3,  0, 11,  0,
         3,  3,  0,  5,  3,  0,  0,  9,  3,  3, 16,  2, 19, 26])


In [37]:
# Pickle every dataloader into <current working directory>/data/.
# File names are kept exactly as before, including the "unlabeld" spelling,
# so anything that loads these pickles by name keeps working.
path = os.getcwd()
data_dir = path + '/data/'

# Each `with` block closes its file even if pickle.dump raises.
with open(data_dir + "train_labeled_dataloader.p", "wb") as pickle_train_labeled_dataloader:
    pickle.dump(train_labeled_dataloader, pickle_train_labeled_dataloader)

# The loader variable is spelled `train_unlabeld_dataloader` where it is created.
with open(data_dir + "train_unlabeld_dataloader.p", "wb") as pickle_train_unlabeled_dataloader:
    pickle.dump(train_unlabeld_dataloader, pickle_train_unlabeled_dataloader)

with open(data_dir + "val_dataloader.p", "wb") as pickle_val_dataloader:
    pickle.dump(val_dataloader, pickle_val_dataloader)

with open(data_dir + "test_dataloader.p", "wb") as pickle_test_dataloader:
    pickle.dump(test_dataloader, pickle_test_dataloader)

with open(data_dir + "train_dataloader.p", "wb") as pickle_train_dataloader:
    pickle.dump(train_dataloader, pickle_train_dataloader)