# Pre-Processing

In [1]:
import numpy as np
import pandas as pd

In [2]:
!ls ../src/data/original

readme.txt  test.csv  train.csv


In [3]:
seed = 42

In [4]:
splits = np.array([0.85, 0.1, 0.05])
indicator = "bert"
padding = "200"
file_suffix = "p-{}_p-{}".format(indicator, padding)
file_suffix

'p-bert_p-200'

In [5]:
orig_train_path =  u"../src/data/original/train.csv"
orig_test_path = u"../src/data/original/test.csv"
prep_train_path =  u"../src/data/processed/train_{}_s{}.csv".format(file_suffix, str(splits[0]))
prep_val_path =  u"../src/data/processed/val_{}_s{}.csv".format(file_suffix, str(splits[1]))
prep_test_path =  u"../src/data/processed/test_{}_s{}.csv".format(file_suffix, str(splits[2]))

In [6]:
%%time
train = pd.read_csv(orig_train_path, header=None)
test = pd.read_csv(orig_test_path, header=None)

data = pd.concat([train, test])

del train, test

data.columns = ["label", "title", "review"]
data.drop(columns=['title'], inplace=True)
data["label"] = data["label"] - 1
data = data.sample(frac=1, random_state=seed).reset_index(drop=True)
train, val, test = np.array_split(data, (splits[:-1].cumsum() * len(data)).astype(int))

del data
del train, val

#train = train.values.tolist()
#val = val.values.tolist()
test = test.values.tolist()

CPU times: user 22.6 s, sys: 1.14 s, total: 23.8 s
Wall time: 18.2 s


In [7]:
test[0]

[3,
 'A very unexpected continuation of the first book, but with the same emotional feel and realism of the times and hearts of the characters. It has many new unexpected turns, with lots of suspense, but leaves you completely satisfied with how everything ties together in the end. This was better than the first book in the area of sensuality, as it was not completely focused on it, this time. In my view it was a more balanced look at life. I was a little disappointed with some characters left hanging at the end, but overall it was a well written story as part of a series.']

In [8]:
from transformers import BertModel, BertTokenizer
import torch

In [9]:
sample = test[0]
tokenizerBert = BertTokenizer.from_pretrained('bert-base-uncased')
embedding_model = BertModel.from_pretrained('bert-base-uncased').cuda()
unk_token = tokenizerBert._unk_token
pad_token = tokenizerBert._pad_token
padding = 200

In [10]:
sample[1]

'A very unexpected continuation of the first book, but with the same emotional feel and realism of the times and hearts of the characters. It has many new unexpected turns, with lots of suspense, but leaves you completely satisfied with how everything ties together in the end. This was better than the first book in the area of sensuality, as it was not completely focused on it, this time. In my view it was a more balanced look at life. I was a little disappointed with some characters left hanging at the end, but overall it was a well written story as part of a series.'

In [11]:
%%time
# Tokenize
batch = []

for i in range(256):
    batch.append(tokenizerBert.encode(sample[1], add_special_tokens=False, max_length=padding, pad_to_max_length=True))

#sentence = tokenizer.encode(sample[1], add_special_tokens=False, max_length=padding, pad_to_max_length=True)
len(batch)

CPU times: user 575 ms, sys: 0 ns, total: 575 ms
Wall time: 575 ms


256

In [12]:
batch_gpu = torch.LongTensor(batch).cuda()

In [13]:
batch_gpu.shape

torch.Size([256, 200])

In [8]:
from transformers import DistilBertModel, DistilBertTokenizer
import torch

In [9]:
sample = test[0]
tokenizerBert = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
embedding_model = DistilBertModel.from_pretrained('distilbert-base-uncased').cuda()
unk_token = tokenizerBert._unk_token
pad_token = tokenizerBert._pad_token
padding = 200

In [10]:
sample[1]

'A very unexpected continuation of the first book, but with the same emotional feel and realism of the times and hearts of the characters. It has many new unexpected turns, with lots of suspense, but leaves you completely satisfied with how everything ties together in the end. This was better than the first book in the area of sensuality, as it was not completely focused on it, this time. In my view it was a more balanced look at life. I was a little disappointed with some characters left hanging at the end, but overall it was a well written story as part of a series.'

In [19]:
tokenizerBert.tokenize("Hello, my dog is cute.")

['hello', ',', 'my', 'dog', 'is', 'cute', '.']

In [24]:
__padding = 200

In [26]:
len(tokenizerBert.encode("Hello, my dog is cute.", add_special_tokens=True,  max_length=__padding, pad_to_max_length=True))

200

In [22]:
torch.tensor(tokenizerBert.encode("Hello, my dog is cute.", add_special_tokens=True)).unsqueeze(0).to('cuda')  # Batch size 1

tensor([[  101,  7592,  1010,  2026,  3899,  2003, 10140,  1012,   102]],
       device='cuda:0')

In [17]:
tokenizerBert.convert_tokens_to_ids("[CLS]")

101

In [11]:
tokenizerBert.encode(sample[1], add_special_tokens=False, max_length=padding, pad_to_max_length=True)

[1037,
 2200,
 9223,
 13633,
 1997,
 1996,
 2034,
 2338,
 1010,
 2021,
 2007,
 1996,
 2168,
 6832,
 2514,
 1998,
 15650,
 1997,
 1996,
 2335,
 1998,
 8072,
 1997,
 1996,
 3494,
 1012,
 2009,
 2038,
 2116,
 2047,
 9223,
 4332,
 1010,
 2007,
 7167,
 1997,
 23873,
 1010,
 2021,
 3727,
 2017,
 3294,
 8510,
 2007,
 2129,
 2673,
 7208,
 2362,
 1999,
 1996,
 2203,
 1012,
 2023,
 2001,
 2488,
 2084,
 1996,
 2034,
 2338,
 1999,
 1996,
 2181,
 1997,
 18753,
 3012,
 1010,
 2004,
 2009,
 2001,
 2025,
 3294,
 4208,
 2006,
 2009,
 1010,
 2023,
 2051,
 1012,
 1999,
 2026,
 3193,
 2009,
 2001,
 1037,
 2062,
 12042,
 2298,
 2012,
 2166,
 1012,
 1045,
 2001,
 1037,
 2210,
 9364,
 2007,
 2070,
 3494,
 2187,
 5689,
 2012,
 1996,
 2203,
 1010,
 2021,
 3452,
 2009,
 2001,
 1037,
 2092,
 2517,
 2466,
 2004,
 2112,
 1997,
 1037,
 2186,
 1012,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 

In [14]:
%%time
# Tokenize
batch = []

for i in range(256):
    batch.append(tokenizerBert.encode(sample[1], add_special_tokens=False, max_length=padding, pad_to_max_length=True))

#sentence = tokenizer.encode(sample[1], add_special_tokens=False, max_length=padding, pad_to_max_length=True)
len(batch)

CPU times: user 575 ms, sys: 4.16 ms, total: 579 ms
Wall time: 577 ms


256

In [15]:
batch_gpu = torch.LongTensor(batch).cuda()

In [22]:
%%time
with torch.no_grad():
    res = embedding_model(batch_gpu)

CPU times: user 15.4 ms, sys: 364 µs, total: 15.8 ms
Wall time: 14.2 ms


In [17]:
%%time
with torch.no_grad():
    res = embedding_model(batch_gpu)

CPU times: user 5.92 ms, sys: 27.3 ms, total: 33.3 ms
Wall time: 31.2 ms


In [16]:
embeddings = res[0].cpu()

In [21]:
embeddings.shape

torch.Size([256, 200, 768])

In [22]:
torch.mean(embeddings, 1).shape

torch.Size([256, 768])

In [None]:
del res
torch.cuda.empty_cache()

In [None]:
%%time
batchAsList = embeddings.tolist()

In [None]:
np.array(batchAsList[0]).shape

In [None]:
with open("test_out.dat", "ab") as out_file:
    for entry in batchAsList:
        X = np.array(entry)
        np.savetxt(out_file, X)

In [None]:
import numpy as np        
f=open('asd.dat','ab')
for iind in range(4):
    a=np.random.rand(10,10)
    np.savetxt(f,a)
f.close()

In [None]:
del embeddings

In [None]:
def preprocess(row, padding=200,
               tokenizer=tokenizerBert,
               embedder=embedding_model,
               unk_token=tokenizerBert._unk_token,
               pad_token=tokenizerBert._pad_token):

    # row = [label, review]
    
    # Tokenize, Vectorize and Pad
    sentence_as_int = tokenizer.encode(row[1], add_special_tokens=False, max_length=padding, pad_to_max_length=True)

    # X, Y
    return sentence_as_int, row[0]

In [None]:
import multiprocessing as mp

In [None]:
%%time
cores = max(1, round(mp.cpu_count() / 2))
print(cores)
pool = mp.Pool(cores)

# Parallelizing, will work as long as the processing is not too fast and fillst the memory :o
processed_test = pool.imap(preprocess, test)

with open(prep_test_path, "w") as out_file:
    for X, Y in processed_test:
        stringified = [str(entry) for entry in [Y] + X]
        out_file.write(",".join(stringified) + "\n")
    
pool.close()
pool.join()

del test, processed_test

In [None]:
del embedding_model

# Data Loaders

# Model Training