### Build the vocabulary dictionary and the dataloaders

In [77]:
import sys
from tqdm import tqdm
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split, Dataset, RandomSampler, SequentialSampler
import pickle
import random
import numpy as np
import pandas as pd
from generate_dataloaders import *

from ast import literal_eval

In [78]:
# Load the preprocessed (already tokenized) dataset from its pickle file.
# NOTE(review): `datasets` is reassigned from the .xlsx file in the next cell, so
# this load looks redundant — confirm before removing. Only unpickle trusted files.
with open("./data/master_df_labeled.p", mode="rb") as handle:
    datasets = pickle.load(handle)

In [88]:
# Read the labeled master sheet; the first spreadsheet column becomes the index.
datasets = pd.read_excel(
    "./data/master_df_labeled.xlsx",
    index_col=0,
)

In [89]:
# Parse the stringified token lists in `review` back into Python lists.
# Values that are no longer strings are left untouched, so re-running this cell
# is safe: literal_eval raises ValueError when handed an already-parsed list.
datasets["review"] = datasets["review"].apply(
    lambda review: literal_eval(review) if isinstance(review, str) else review
)

In [90]:
# Number of rows before any filtering.
datasets.shape[0]

111029

In [91]:
# Drop the leftover index column written by an earlier export.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
datasets = datasets.drop(columns='Unnamed: 0.1', errors='ignore')

In [92]:
# Keep only the reviews that contain at most 30 tokens.
datasets = datasets[datasets['review'].map(len) <= 30]

In [93]:
# Preview the first five rows of the filtered frame.
datasets.head(n=5)

Unnamed: 0,review,flagged_word,flagged_index,problematic,lemmatized,true_pos
0,"[thank, you, for, your, help, and, support, !]",support,6,1,support,-1
1,"[excellent, and, professional, -, highly, reco...",excellent,0,1,excellent,-1
2,"[superb, quality, .]",superb,0,1,superb,-1
3,"[outstanding, work, ,, will, definitely, use, ...",outstanding,0,1,outstanding,-1
4,"[good, provider]",provider,1,1,provider,-1


In [94]:
# Number of rows left after the length filter.
len(datasets.index)

107271

In [95]:
# Separate rows by label: true_pos of 0 or 1 goes to `labeled` (the pool the
# ground-truth test set is drawn from), true_pos of -1 goes to `unlabeled`.
unlabeled = datasets[datasets['true_pos'] == -1]
labeled = datasets[datasets['true_pos'].isin([0, 1])]

In [96]:
# Hold out half of the labeled rows as the ground-truth (test) set.
# random_state pins the selection: the RNG seeds in this notebook are only set
# further down, so an unseeded sample here would pick different test rows on
# every run. 1029 is the same value used for `seed` below.
ground_truth_df = labeled.sample(n=len(labeled) // 2, random_state=1029)

In [97]:
# Index labels of the held-out rows, as a plain Python list.
ground_truth_idx = ground_truth_df.index.to_list()

In [98]:
# Remaining data once the held-out ground-truth rows are removed ("wg" presumably = without ground truth).
datasets_wg = datasets.drop(index=ground_truth_idx)

In [99]:
# Preview the first five rows of the frame without the held-out rows.
datasets_wg.head(n=5)

Unnamed: 0,review,flagged_word,flagged_index,problematic,lemmatized,true_pos
0,"[thank, you, for, your, help, and, support, !]",support,6,1,support,-1
1,"[excellent, and, professional, -, highly, reco...",excellent,0,1,excellent,-1
2,"[superb, quality, .]",superb,0,1,superb,-1
3,"[outstanding, work, ,, will, definitely, use, ...",outstanding,0,1,outstanding,-1
4,"[good, provider]",provider,1,1,provider,-1


In [100]:
# Sanity check: held-out rows plus the remaining rows must account for every row.
print(len(ground_truth_df) + len(datasets_wg) == len(datasets))

True


In [101]:
# Build the token dictionary from the filtered frame.
# NOTE(review): built from the full `datasets`, i.e. including the rows that
# were sampled into `ground_truth_df` — confirm this is intended.
review_dict = Dictionary(
    datasets,
    include_valid=False,
)

100%|██████████| 107271/107271 [00:17<00:00, 6050.33it/s]


In [102]:
# Save the dictionary to <cwd>/data/dictionary.p.
path = os.getcwd()
data_dir = path + '/data/'

# The context manager closes the file even if pickle.dump raises, which the
# previous manual open()/close() pair did not guarantee.
with open(data_dir + "dictionary.p", "wb") as pickle_dict:
    pickle.dump(review_dict, pickle_dict)

In [103]:
# Spot-check the dictionary: look up the id stored for the token "great".
review_dict.get_id("great")

34

In [104]:
# Spot-check encoding on the first row; column 0 of `datasets` is `review`.
review_dict.encode_token_seq(datasets.iloc[0,0])

[2, 3, 4, 5, 6, 7, 8, 9]

In [105]:
# Run both frames through indexize_dataset with the shared dictionary
# (presumably mapping each review's tokens to dictionary ids — defined in generate_dataloaders).
# NOTE(review): `datasets` is the full frame, so it still contains the rows that
# were sampled into `ground_truth_df`; `datasets_wg` is the frame with those
# rows removed — confirm which one the train/val data should be built from.
indexized_datasets = indexize_dataset(datasets, review_dict)
indexized_ground_truth = indexize_dataset(ground_truth_df, review_dict)

100%|██████████| 107271/107271 [00:00<00:00, 191068.68it/s]
100%|██████████| 528/528 [00:00<00:00, 140164.08it/s]


In [106]:
# Wrap each indexized frame together with its labels (true_pos) and the
# position of the flagged word (flagged_index).
tensor_dataset = TensoredDataset(
    indexized_datasets,
    datasets["true_pos"].to_list(),
    datasets["flagged_index"].to_list(),
)
tensor_dataset_ground_truth = TensoredDataset(
    indexized_ground_truth,
    ground_truth_df["true_pos"].to_list(),
    ground_truth_df["flagged_index"].to_list(),
)

In [107]:
# Check the first example: a (token ids, true_pos, flagged_index) triple of tensors.
tensor_dataset[0]

(tensor([[2, 3, 4, 5, 6, 7, 8, 9]]), tensor(-1), tensor(6))

In [108]:
# Random seed shared by every RNG used below.
seed = 1029

random.seed(seed)  # Python random module.
np.random.seed(seed)  # Numpy module.
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # all GPUs; silently ignored when CUDA is unavailable
torch.backends.cudnn.enabled = False
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


def _init_fn(worker_id):
    """Seed numpy and `random` inside a DataLoader worker process.

    The seed is offset by `worker_id` so each worker gets its own reproducible
    stream; giving every worker the same seed would make them all draw
    identical random numbers. Only called when num_workers > 0.
    """
    worker_seed = int(seed) + worker_id
    np.random.seed(worker_seed)
    random.seed(worker_seed)


batch_size = 32

# Build the train/val data from `datasets_wg`, the frame WITHOUT the held-out
# ground-truth rows. Splitting `tensor_dataset` (built from the full `datasets`)
# would put the ground-truth test rows into the training data.
train_val_dataset = TensoredDataset(
    indexize_dataset(datasets_wg, review_dict),
    datasets_wg["true_pos"].to_list(),
    datasets_wg["flagged_index"].to_list(),
)

# Divide into train(95%), valid(5%) dataset. Sizes are derived from the dataset
# being split, because random_split requires them to sum to len(dataset).
n_train_samples = int(0.95 * len(train_val_dataset))
n_val_samples = len(train_val_dataset) - n_train_samples

train_dataset, val_dataset = random_split(train_val_dataset, [n_train_samples, n_val_samples])

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=pad_collate_fn, worker_init_fn=_init_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=pad_collate_fn, worker_init_fn=_init_fn)
ground_truth_dataloader = DataLoader(tensor_dataset_ground_truth, batch_size=batch_size, shuffle=True, collate_fn=pad_collate_fn)

In [109]:
# Print the labels and flagged indices of the first ground-truth batch only.
for batch in ground_truth_dataloader:
    batch_labels, batch_flagged_idx = batch[1], batch[2]
    print(batch_labels, batch_flagged_idx)
    break

tensor([1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
        0, 1, 0, 0, 0, 1, 1, 1]) tensor([ 6,  3,  6,  9,  0,  0,  4,  3,  5,  5, 11,  0,  2,  0,  0,  3,  9,  0,
        12,  1,  1,  1,  0,  0,  2,  8,  4,  2, 11,  3,  0,  4])


In [None]:
# Save the three dataloaders under <cwd>/data/.
path = os.getcwd()
data_dir = path + '/data/'

# NOTE(review): pickle stores functions by reference, so loading the train/val
# loaders elsewhere needs `_init_fn` (defined in this notebook) to be importable
# under the same name — confirm the consumer handles this.
dataloaders_to_save = {
    "train_dataloader.p": train_dataloader,
    "val_dataloader.p": val_dataloader,
    "ground_truth_dataloader.p": ground_truth_dataloader,
}

for file_name, dataloader in dataloaders_to_save.items():
    # The context manager closes each file even if pickle.dump raises.
    with open(data_dir + file_name, "wb") as pickle_file:
        pickle.dump(dataloader, pickle_file)