# Pre-Processing

In [1]:
import numpy as np
import pandas as pd

In [2]:
# List the raw dataset files found in the original-data directory.
!ls ../src/data/original

readme.txt  test.csv  train.csv


In [3]:
# Fixed seed; passed as `random_state` when the data is shuffled before splitting.
seed = 42

In [4]:
# Train / validation / test fractions, in that order (they sum to 1).
splits = np.array([0.85, 0.1, 0.05])
# Short tag for this preprocessing variant; it becomes part of the file names.
indicator = "bert"
# Sequence length used for padding. Kept as an int so the very same variable
# can be handed to the tokenizer as `max_length`; formatting it into the
# suffix below produces exactly the same text as the string "200" would.
padding = 200
# Suffix embedded in the names of the processed output files.
file_suffix = "p-{}_p-{}".format(indicator, padding)
file_suffix

'p-bert_p-200'

In [5]:
# Raw CSV files as shipped with the dataset.
orig_train_path = u"../src/data/original/train.csv"
orig_test_path = u"../src/data/original/test.csv"

# Every processed file name carries the preprocessing suffix plus the
# fraction of the data assigned to that split.
train_share, val_share, test_share = (str(share) for share in splits)
prep_train_path = u"../src/data/processed/train_{}_s{}.csv".format(file_suffix, train_share)
prep_val_path = u"../src/data/processed/val_{}_s{}.csv".format(file_suffix, val_share)
prep_test_path = u"../src/data/processed/test_{}_s{}.csv".format(file_suffix, test_share)

In [6]:
%%time
train = pd.read_csv(orig_train_path, header=None)
test = pd.read_csv(orig_test_path, header=None)

data = pd.concat([train, test])

del train, test

data.columns = ["label", "title", "review"]
data.drop(columns=['title'], inplace=True)
data["label"] = data["label"] - 1
data = data.sample(frac=1, random_state=seed).reset_index(drop=True)
train, val, test = np.array_split(data, (splits[:-1].cumsum() * len(data)).astype(int))

del data
del train, val

#train = train.values.tolist()
#val = val.values.tolist()
test = test.values.tolist()

CPU times: user 25.8 s, sys: 2.16 s, total: 27.9 s
Wall time: 28.7 s


In [7]:
# Peek at the first row of the test split: [label, review text].
test[0]

[3,
 'A very unexpected continuation of the first book, but with the same emotional feel and realism of the times and hearts of the characters. It has many new unexpected turns, with lots of suspense, but leaves you completely satisfied with how everything ties together in the end. This was better than the first book in the area of sensuality, as it was not completely focused on it, this time. In my view it was a more balanced look at life. I was a little disappointed with some characters left hanging at the end, but overall it was a well written story as part of a series.']

In [8]:
from transformers import BertModel, BertTokenizer

In [9]:
import torch

In [10]:
# First row of the test split, used as a fixed example in the cells below.
sample = test[0]
tokenizerBert = BertTokenizer.from_pretrained('bert-base-uncased')
# The model is moved to the GPU, so a CUDA device is required from here on.
embedding_model = BertModel.from_pretrained('bert-base-uncased').cuda()
# Read the special tokens through the tokenizer's public attributes; the
# underscore-prefixed fields are internals and not a stable interface.
unk_token = tokenizerBert.unk_token
pad_token = tokenizerBert.pad_token
# Integer sequence length handed to the tokenizer as `max_length`.
padding = 200

In [11]:
# The review text of the example row.
sample[1]

'A very unexpected continuation of the first book, but with the same emotional feel and realism of the times and hearts of the characters. It has many new unexpected turns, with lots of suspense, but leaves you completely satisfied with how everything ties together in the end. This was better than the first book in the area of sensuality, as it was not completely focused on it, this time. In my view it was a more balanced look at life. I was a little disappointed with some characters left hanging at the end, but overall it was a well written story as part of a series.'

In [12]:
%%time
# Tokenize
batch = []

for i in range(256):
    batch.append(tokenizerBert.encode(sample[1], add_special_tokens=False, max_length=padding, pad_to_max_length=True))

#sentence = tokenizer.encode(sample[1], add_special_tokens=False, max_length=padding, pad_to_max_length=True)
len(batch)

CPU times: user 594 ms, sys: 0 ns, total: 594 ms
Wall time: 592 ms


256

In [13]:
# Pack the token-id lists into one LongTensor and move it to the GPU.
batch_gpu = torch.LongTensor(batch).cuda()

In [14]:
# Sanity check: one row per encoded review, one column per token position.
batch_gpu.shape

torch.Size([256, 200])

In [15]:
%%time
with torch.no_grad():
    res = embedding_model(batch_gpu)

CPU times: user 214 ms, sys: 20.2 ms, total: 234 ms
Wall time: 233 ms


In [16]:
# Take the first element of the model output (one vector per token position)
# and copy it from the GPU back to host memory.
embeddings = res[0].cpu()

In [21]:
# Sanity check on the dimensions of the embedding tensor.
embeddings.shape

torch.Size([256, 200, 768])

In [22]:
# Average over dimension 1 (the token positions) to collapse each review to a
# single vector; only the resulting shape is displayed.
torch.mean(embeddings, 1).shape

torch.Size([256, 768])

In [None]:
# A host copy of the output already lives in `embeddings`, so drop the
# GPU-side result and ask PyTorch to release its cached CUDA memory.
del res
torch.cuda.empty_cache()

In [None]:
%%time
# Convert the embedding tensor to nested Python lists (timed to see the cost).
batchAsList = embeddings.tolist()

In [None]:
# Shape of one review's embedding matrix after the round trip through lists.
np.array(batchAsList[0]).shape

In [None]:
# Append each review's embedding matrix, as text, to one shared file.
with open("test_out.dat", "ab") as out_file:
    for matrix in map(np.array, batchAsList):
        np.savetxt(out_file, matrix)

In [None]:
import numpy as np

# Scratch check: several arrays can be appended to one file through a single
# handle. The `with` block closes the handle even if a write fails part-way.
with open('asd.dat', 'ab') as f:
    for iind in range(4):
        a = np.random.rand(10, 10)
        np.savetxt(f, a)

In [None]:
# Drop the notebook's reference to the embedding tensor.
del embeddings

In [None]:
def preprocess(row, padding=200,
               tokenizer=tokenizerBert,
               embedder=embedding_model,
               unk_token=tokenizerBert.unk_token,
               pad_token=tokenizerBert.pad_token):
    """Turn one ``[label, review]`` row into fixed-length token ids plus its label.

    Args:
        row: Two-element sequence; ``row[0]`` is the label and ``row[1]`` the
            review text.
        padding: Length passed to the tokenizer as ``max_length``; together
            with ``pad_to_max_length=True`` the encoding is padded up to it.
        tokenizer: Object whose ``encode`` method performs the tokenization.
        embedder: Not used by the body; kept so existing calls stay valid.
        unk_token: Not used by the body; kept so existing calls stay valid.
        pad_token: Not used by the body; kept so existing calls stay valid.

    Returns:
        Tuple ``(token_ids, label)`` where ``token_ids`` is whatever
        ``tokenizer.encode`` returns for the review.

    Note:
        Default values are evaluated once, when this ``def`` runs. The
        function therefore keeps its own reference to the tokenizer and to
        the model bound to ``embedder``, even if the notebook-level names are
        deleted later. The special-token defaults are read through the
        tokenizer's public attributes rather than its underscore-prefixed
        internals.
    """
    label, review = row[0], row[1]

    # Tokenize, vectorize and pad in a single call.
    token_ids = tokenizer.encode(review, add_special_tokens=False, max_length=padding, pad_to_max_length=True)

    # X, Y
    return token_ids, label

In [None]:
import multiprocessing as mp

In [None]:
%%time
cores = max(1, round(mp.cpu_count() / 2))
print(cores)
pool = mp.Pool(cores)

# Parallelizing, will work as long as the processing is not too fast and fillst the memory :o
processed_test = pool.imap(preprocess, test)

with open(prep_test_path, "w") as out_file:
    for X, Y in processed_test:
        stringified = [str(entry) for entry in [Y] + X]
        out_file.write(",".join(stringified) + "\n")
    
pool.close()
pool.join()

del test, processed_test

In [None]:
# Drop the notebook-level name for the BERT model.
# NOTE(review): `preprocess` bound this same object as the default value of its
# `embedder` parameter, so the model stays referenced while that function exists.
del embedding_model

# Data Loaders

# Model Training