### Build the dictionary and the dataloaders

In [1]:
import sys
from tqdm import tqdm
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split, Dataset, RandomSampler, SequentialSampler
import pickle
import random
import numpy as np
import pandas as pd
from generate_dataloaders import *
from sklearn.model_selection import train_test_split
from ast import literal_eval

In [2]:
# # Import preprocessed dataset (already tokenized)
# with open("./data/master_df_labeled.p", 'rb') as handle:
#     datasets = pickle.load(handle)

In [3]:
# Load the labeled master table; the first spreadsheet column becomes the index.
datasets = pd.read_excel(
    "./data/master_df_labeled.xlsx",
    index_col=0,
)

In [4]:
# Parse each stringified token list in the `review` column back into a real
# Python list. Values that are already lists are passed through unchanged so
# that re-running this cell does not fail on already-parsed data; any other
# unexpected value is still handed to literal_eval and raises there.
datasets["review"] = datasets["review"].apply(
    lambda review: review if isinstance(review, list) else literal_eval(review)
)

In [5]:
# Number of rows loaded from the spreadsheet.
datasets.shape[0]

111029

In [6]:
# Remove the leftover 'Unnamed: 0.1' column. errors='ignore' keeps this cell
# re-runnable: executing it again once the column is gone is a no-op instead
# of raising KeyError.
datasets = datasets.drop(columns='Unnamed: 0.1', errors='ignore')

In [7]:
# Keep only the reviews that contain at most 30 tokens.
review_lengths = datasets['review'].map(len)
datasets = datasets[review_lengths <= 30]

In [8]:
# Preview the first five rows after cleaning.
datasets.head(n=5)

Unnamed: 0,review,flagged_word,flagged_index,problematic,lemmatized,true_pos
0,"[thank, you, for, your, help, and, support, !]",support,6,1,support,-1
1,"[excellent, and, professional, -, highly, reco...",excellent,0,1,excellent,-1
2,"[superb, quality, .]",superb,0,1,superb,-1
3,"[outstanding, work, ,, will, definitely, use, ...",outstanding,0,1,outstanding,-1
4,"[good, provider]",provider,1,1,provider,-1


In [9]:
# Number of rows remaining after the length filter.
datasets.shape[0]

107271

In [10]:
# Separate the rows that carry a ground-truth label (true_pos of 0 or 1)
# from the rows whose true_pos is -1.
has_label = datasets['true_pos'].isin([0, 1])
labeled = datasets[has_label]
unlabeled = datasets[datasets['true_pos'] == -1]

In [11]:
# Stratified 80/20 split of the labeled rows: 80% form the centroid set, the
# remaining 20% are kept aside in `remains_dataset`.
# random_state pins the shuffle so the split is identical on every run; the
# value matches the `seed` constant defined further down in this notebook.
centroids_dataset, remains_dataset = train_test_split(
    labeled,
    test_size=0.2,
    stratify=labeled['true_pos'],
    random_state=1029,
)

In [13]:
# Split the held-out labeled rows in half (stratified on true_pos) into a
# validation set and a test set.
# random_state pins the shuffle so the split is identical on every run; the
# value matches the `seed` constant defined further down in this notebook.
val_dataset, test_dataset = train_test_split(
    remains_dataset,
    test_size=0.5,
    stratify=remains_dataset['true_pos'],
    random_state=1029,
)

In [14]:
# Sanity check: the three labeled splits together account for every labeled row.
sum(len(part) for part in (centroids_dataset, val_dataset, test_dataset)) == len(labeled)

True

In [15]:
# Collect the index labels of the validation and test rows; these are the
# rows that get removed from the full table to form the training set.
val_idx = val_dataset.index.to_list()
test_idx = test_dataset.index.to_list()

remove_idx = [*val_idx, *test_idx]

In [16]:
# Training set = every row of `datasets` except the validation and test rows.
train_dataset = datasets.drop(index=remove_idx)

In [17]:
# Sanity check: train, validation and test together cover the whole table.
sum(len(part) for part in (train_dataset, val_dataset, test_dataset)) == len(datasets)

True

In [18]:
# Make a dictionary (token vocabulary) from the filtered table.
# NOTE(review): it is built on `datasets`, i.e. on the training, validation
# and test rows alike — confirm that this is intended.
# NOTE(review): `Dictionary` and the meaning of include_valid=False are not
# defined in this notebook (presumably in generate_dataloaders) — verify there.
review_dict = Dictionary(datasets, include_valid=False)

100%|██████████| 107271/107271 [00:17<00:00, 6037.37it/s]


In [47]:
# Persist the dictionary object to <cwd>/data/dictionary.p.
path = os.getcwd()
data_dir = path + '/data/'

# The context manager closes the file even if pickling raises, so the handle
# is never leaked and a partially written file is always flushed and closed.
with open(data_dir + "dictionary.p", "wb") as pickle_dict:
    pickle.dump(review_dict, pickle_dict)

In [20]:
# Spot-check the dictionary: look up the id stored for the token "great".
review_dict.get_id("great")

34

In [21]:
# Smoke test: encode the tokens of the first review with the dictionary.
# The review column is selected by name instead of by position (iloc[0, 0]),
# so the check keeps working if the column order of the table ever changes.
review_dict.encode_token_seq(datasets["review"].iloc[0])

[2, 3, 4, 5, 6, 7, 8, 9]

In [27]:
# Run every split through indexize_dataset with the shared dictionary.
# The splits are processed in the order train, val, test, centroids.
(
    indexized_train_datasets,
    indexized_val_datasets,
    indexized_test_datasets,
    indexized_centroids_datasets,
) = [
    indexize_dataset(split, review_dict)
    for split in (train_dataset, val_dataset, test_dataset, centroids_dataset)
]

100%|██████████| 107059/107059 [00:00<00:00, 139953.67it/s]
100%|██████████| 106/106 [00:00<00:00, 121540.79it/s]
100%|██████████| 106/106 [00:00<00:00, 73547.76it/s]
100%|██████████| 845/845 [00:00<00:00, 107162.55it/s]


In [28]:
def _build_tensored_dataset(indexized, frame):
    """Wrap one indexized split together with its true_pos and flagged_index columns."""
    labels = frame["true_pos"].to_list()
    flagged_positions = frame["flagged_index"].to_list()
    return TensoredDataset(indexized, labels, flagged_positions)


tensor_train_dataset = _build_tensored_dataset(indexized_train_datasets, train_dataset)
tensor_val_dataset = _build_tensored_dataset(indexized_val_datasets, val_dataset)
tensor_test_dataset = _build_tensored_dataset(indexized_test_datasets, test_dataset)
tensor_centroids_dataset = _build_tensored_dataset(indexized_centroids_datasets, centroids_dataset)

In [29]:
# Check the first training example. Judging by the constructor arguments used
# for TensoredDataset, the item appears to be a 3-tuple of
# (token ids, true_pos label, flagged_index) — TODO confirm against the class.
tensor_train_dataset[0]

(tensor([[2, 3, 4, 5, 6, 7, 8, 9]]), tensor(-1), tensor(6))

In [30]:
# Random seed: seed every random number generator in use exactly once so that
# DataLoader shuffling is reproducible.
seed = 1029

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
np.random.seed(seed)  # Numpy module.
random.seed(seed)  # Python random module.

# Disable cuDNN and its autotuner and request deterministic kernels.
torch.backends.cudnn.enabled = False
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


def _init_fn(worker_id):
    """Seed NumPy inside a DataLoader worker process.

    The worker id is added to the base seed so that each worker gets its own,
    still deterministic, random stream instead of every worker replaying the
    exact same sequence. DataLoader only calls this hook in worker
    subprocesses, i.e. when num_workers > 0.
    """
    np.random.seed(int(seed) + worker_id)


batch_size = 32

# Settings shared by every DataLoader below.
# NOTE(review): shuffle=True is also applied to the validation and test
# loaders; batch order normally does not need to be random for evaluation —
# confirm that downstream code does not rely on a fixed order.
loader_kwargs = dict(
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=pad_collate_fn,
    worker_init_fn=_init_fn,
)

# One DataLoader per split (train / validation / test / centroids).
train_dataloader = DataLoader(tensor_train_dataset, **loader_kwargs)
val_dataloader = DataLoader(tensor_val_dataset, **loader_kwargs)
test_dataloader = DataLoader(tensor_test_dataset, **loader_kwargs)
centroids_dataloader = DataLoader(tensor_centroids_dataset, **loader_kwargs)

In [35]:
# Peek at a single batch of the centroids loader: print its second and third
# elements, then stop iterating.
for x in centroids_dataloader:
    print(x[1], x[2])
    break

tensor([0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
        1, 0, 0, 0, 1, 1, 0, 0]) tensor([ 0,  0,  1,  0,  2,  7,  2,  7,  5,  0, 11,  1,  7,  0,  8,  0,  3,  7,
         0,  1,  2,  5,  5,  0, 10,  8,  0,  0,  3,  5,  0,  4])


In [37]:
# Persist the four DataLoader objects as pickle files under <cwd>/data/.
path = os.getcwd()
data_dir = path + '/data/'

# Target file name -> DataLoader to store in it.
dataloaders_to_save = {
    "train_dataloader.p": train_dataloader,
    "val_dataloader.p": val_dataloader,
    "test_dataloader.p": test_dataloader,
    "centroids_dataloader.p": centroids_dataloader,
}

for file_name, dataloader in dataloaders_to_save.items():
    # The context manager closes each file even if pickling raises, so no
    # handle is leaked when one of the dumps fails.
    with open(data_dir + file_name, "wb") as handle:
        pickle.dump(dataloader, handle)