# Improved Pre-Processing

In [1]:
!ls ../src/data/original

readme.txt  test.csv  train.csv


In [2]:
import pandas as pd
import numpy as np
import csv

In [3]:
# Fixed seed, passed as random_state when the pooled data is shuffled so the split is repeatable.
seed = 42

In [4]:
# Fractions of the pooled data that go to the train / validation / test files, in that order.
splits = np.array([0.85, 0.1, 0.05])

# Short tags describing this preprocessing run; they only feed the output file names.
indicator, embedding, padding = "w2v", "1m", "200"

file_suffix = "p-{}_e-{}_p-{}".format(indicator, embedding, padding)
file_suffix

'p-w2v_e-1m_p-200'

In [18]:
# Locations of the two original CSV files that get pooled and re-split.
orig_train_path =  u"../src/data/original/train.csv"
orig_test_path = u"../src/data/original/test.csv"
# Output files for the processed splits. Each name embeds the run suffix and the
# fraction of the data assigned to that split (splits[0..2] = train, val, test).
prep_train_path =  u"../src/data/processed/train_{}_s{}.csv".format(file_suffix, str(splits[0]))
prep_val_path =  u"../src/data/processed/val_{}_s{}.csv".format(file_suffix, str(splits[1]))
prep_test_path =  u"../src/data/processed/test_{}_s{}.csv".format(file_suffix, str(splits[2]))

In [19]:
# Echo one generated path to sanity-check the file naming scheme.
prep_train_path

'../src/data/processed/train_p-w2v_e-1m_p-200_s0.85.csv'

In [6]:
%%time
# Read both original CSVs (no header row) and pool them so the data can be re-split
# according to `splits` instead of keeping the original train/test partition.
train = pd.read_csv(orig_train_path, header=None)
test = pd.read_csv(orig_test_path, header=None)

data = pd.concat([train, test])

# The raw frames are no longer needed once pooled; drop them to free memory.
del train, test

data.columns = ["label", "title", "review"]
# Reassign instead of mutating in place so re-running the statement cannot act on a
# half-modified frame.
data = data.drop(columns=["title"])
# Shift every label down by one (presumably 1/2 -> 0/1; confirm against readme.txt).
data["label"] = data["label"] - 1
# Seeded shuffle of all rows, then a fresh 0..n-1 index so positional slicing is unambiguous.
data = data.sample(frac=1, random_state=seed).reset_index(drop=True)

# Row positions where train ends and where validation ends. The test split takes the rest,
# so the last entry of `splits` is implied.
split_points = (splits[:-1].cumsum() * len(data)).astype(int)
train_end, val_end = int(split_points[0]), int(split_points[1])

# Plain positional slices give the same three frames as np.array_split(data, split_points),
# without routing a DataFrame through numpy's split helpers (which rely on the deprecated
# DataFrame.swapaxes in recent pandas versions).
train = data.iloc[:train_end]
val = data.iloc[train_end:val_end]
test = data.iloc[val_end:]

del data

# Convert to plain [label, review] lists: cheap to pickle row by row for the worker pool.
train = train.values.tolist()
val = val.values.tolist()
test = test.values.tolist()

CPU times: user 27.1 s, sys: 1.42 s, total: 28.5 s
Wall time: 23 s


In [7]:
from transformers import BertTokenizer

In [8]:
%%time
# Instantiate the tokenizer from the 'bert-base-uncased' pretrained identifier. Later cells use it
# both to tokenize reviews and as the source of the unknown/padding token strings.
tokenizerBert = BertTokenizer.from_pretrained('bert-base-uncased')

CPU times: user 46.3 ms, sys: 8.35 ms, total: 54.6 ms
Wall time: 705 ms


In [9]:
%%time
import gensim
import numpy as np
# Load only the first 1,000,000 vectors of the binary GoogleNews word2vec file.
embedding_model = gensim.models.KeyedVectors.load_word2vec_format(
    '../src/data/embeddings/GoogleNews-vectors-negative300.bin',
    binary=True,
    limit=1_000_000,
)

# Register the tokenizer's special tokens as extra vocabulary entries so they receive indices.
# The public `unk_token` / `pad_token` properties are used instead of the private
# `_unk_token` / `_pad_token` attributes, which are an implementation detail of the tokenizer.
# replace=False leaves an existing entry untouched if the token is already in the vocabulary.

# Unknown token: mean of all loaded vectors (computed before the padding row is appended).
embedding_model.add(
    tokenizerBert.unk_token,
    np.mean(embedding_model.vectors, axis=0),
    replace=False,
)

# Padding token: all-zero vector. Size and dtype are taken from the loaded model rather than
# hard-coding 300 / float64, so the new row matches the existing matrix.
embedding_model.add(
    tokenizerBert.pad_token,
    np.zeros(embedding_model.vector_size, dtype=embedding_model.vectors.dtype),
    replace=False,
)

CPU times: user 31.1 s, sys: 2.24 s, total: 33.3 s
Wall time: 32 s


In [10]:
def preprocess(row, padding=200,
               tokenizer=tokenizerBert,
               embedder=embedding_model,
               unk_token=tokenizerBert.unk_token,
               pad_token=tokenizerBert.pad_token):
    """Turn one ``[label, review]`` row into a fixed-length list of vocabulary indices.

    Args:
        row: Sequence whose item 0 is the label and item 1 is the review text.
        padding: Exact number of indices to return. Also used as the character
            limit applied to the review before tokenizing.
        tokenizer: Object providing ``tokenize(text) -> list of tokens``.
        embedder: Object whose ``vocab`` maps a token to an entry with an ``index``
            attribute (gensim 3.x ``KeyedVectors`` style).
        unk_token: Token substituted for anything missing from ``embedder.vocab``.
        pad_token: Token appended until the sequence has ``padding`` entries.

    Returns:
        ``(indices, label)`` where ``indices`` always has exactly ``padding`` items.
    """
    # row = [label, review]
    label, review = row[0], row[1]

    # Tokenize only the first `padding` characters of the review.
    # NOTE(review): this slice counts characters, not tokens, so most reviews yield
    # far fewer than `padding` tokens -- confirm this is intended.
    tokens = tokenizer.tokenize(review[:padding])

    # Clamp at token level as well: a tokenizer may emit more pieces than input
    # characters (e.g. when a character is decomposed during normalization), and the
    # padding step below cannot shorten a sequence. Without the clamp such a row would
    # be longer than `padding` and produce a ragged line in the output file.
    tokens = tokens[:padding]

    # Pad on the right up to the fixed length.
    tokens = tokens + [pad_token] * (padding - len(tokens))

    # Map tokens to indices, falling back to the unknown token; one lookup per token.
    vocab = embedder.vocab
    sentence_as_int = []
    for token in tokens:
        entry = vocab.get(token)
        if entry is None:
            entry = vocab.get(unk_token)
        sentence_as_int.append(entry.index)

    # X, Y
    return sentence_as_int, label

In [11]:
import multiprocessing as mp

In [None]:
%%time
# Use roughly half of the available cores, but never fewer than one.
cores = max(1, round(mp.cpu_count() / 2))
print(cores)

# The with-block tears the worker processes down even if tokenizing or writing fails
# part-way; previously an exception left the pool running.
with mp.Pool(cores) as pool:
    # imap yields results lazily and in input order. Finished results wait in memory
    # until they are consumed, so this works as long as the workers do not get far
    # ahead of the file writer.
    processed_test = pool.imap(preprocess, test)

    # One CSV line per review: the label first, then the token indices.
    with open(prep_test_path, "w") as out_file:
        for X, Y in processed_test:
            stringified = [str(entry) for entry in [Y] + X]
            out_file.write(",".join(stringified) + "\n")

    # Normal path: let the workers finish and exit cleanly before leaving the block.
    pool.close()
    pool.join()

del test, processed_test

In [12]:
%%time
# Use roughly half of the available cores, but never fewer than one.
cores = max(1, round(mp.cpu_count() / 2))
print(cores)

# The with-block tears the worker processes down even if tokenizing or writing fails
# part-way; previously an exception left the pool running.
with mp.Pool(cores) as pool:
    # imap yields results lazily and in input order. Finished results wait in memory
    # until they are consumed, so this works as long as the workers do not get far
    # ahead of the file writer.
    processed_val = pool.imap(preprocess, val)

    # One CSV line per review: the label first, then the token indices.
    with open(prep_val_path, "w") as out_file:
        for X, Y in processed_val:
            stringified = [str(entry) for entry in [Y] + X]
            out_file.write(",".join(stringified) + "\n")

    # Normal path: let the workers finish and exit cleanly before leaving the block.
    pool.close()
    pool.join()

del val, processed_val

4
CPU times: user 1min 1s, sys: 9.98 s, total: 1min 11s
Wall time: 1min 35s


In [13]:
%%time
# Use roughly half of the available cores, but never fewer than one.
cores = max(1, round(mp.cpu_count() / 2))
print(cores)

# The with-block tears the worker processes down even if tokenizing or writing fails
# part-way; previously an exception left the pool running.
with mp.Pool(cores) as pool:
    # imap yields results lazily and in input order. Finished results wait in memory
    # until they are consumed, so this works as long as the workers do not get far
    # ahead of the file writer.
    processed_train = pool.imap(preprocess, train)

    # One CSV line per review: the label first, then the token indices.
    with open(prep_train_path, "w") as out_file:
        for X, Y in processed_train:
            stringified = [str(entry) for entry in [Y] + X]
            out_file.write(",".join(stringified) + "\n")

    # Normal path: let the workers finish and exit cleanly before leaving the block.
    pool.close()
    pool.join()

del train, processed_train

4
CPU times: user 8min 59s, sys: 1min 17s, total: 10min 17s
Wall time: 13min 36s


In [14]:
# Drop the reference to the loaded embedding model now that all three splits have been written.
del embedding_model

In [16]:
import gc

In [17]:
# Force a garbage-collection pass after deleting the large objects; the cell output is
# the number of unreachable objects the collector found.
gc.collect()

88

# Word2Vec Specific

In [None]:
# Show the padding token string via the tokenizer's public property rather than the
# private `_pad_token` attribute.
tokenizerBert.pad_token

In [None]:
# Show the unknown token string via the tokenizer's public property rather than the
# private `_unk_token` attribute.
tokenizerBert.unk_token

In [None]:
%%time
import gensim
import numpy as np
# Load the first 2,000,000 vectors of the binary GoogleNews word2vec file.
# NOTE(review): the earlier preprocessing cell loads only 1,000,000 vectors, so indices
# produced with this model are not interchangeable with the files written there -- confirm
# which limit is intended.
embedding_model = gensim.models.KeyedVectors.load_word2vec_format(
    '../src/data/embeddings/GoogleNews-vectors-negative300.bin',
    binary=True,
    limit=2_000_000,
)

# Register the tokenizer's special tokens as extra vocabulary entries so they receive indices.
# The public `unk_token` / `pad_token` properties are used instead of the private
# `_unk_token` / `_pad_token` attributes, which are an implementation detail of the tokenizer.
# replace=False leaves an existing entry untouched if the token is already in the vocabulary.

# Unknown token: mean of all loaded vectors (computed before the padding row is appended).
embedding_model.add(
    tokenizerBert.unk_token,
    np.mean(embedding_model.vectors, axis=0),
    replace=False,
)

# Padding token: all-zero vector. Size and dtype are taken from the loaded model rather than
# hard-coding 300 / float64, so the new row matches the existing matrix.
embedding_model.add(
    tokenizerBert.pad_token,
    np.zeros(embedding_model.vector_size, dtype=embedding_model.vectors.dtype),
    replace=False,
)

In [None]:
def preprocess_word2vec(row, unk_token=tokenizerBert._unk_token, pad_token=tokenizerBert._pad_token):
    """Convert one ``(index, record)`` pair into vocabulary indices plus its label.

    ``row[1]`` is the record: its first element is the label and the remaining
    elements are the words of the sentence. Words missing from the module-level
    ``embedding_model`` vocabulary are replaced by ``unk_token``. ``pad_token`` is
    accepted but not used in the body.

    Returns ``(sentence_as_int, label)``, i.e. X, Y.
    """
    record = row[1]
    words = record[1:].tolist()
    label = record[0]

    vocab = embedding_model.vocab
    sentence_as_int = []
    for word in words:
        hit = vocab.get(word)
        if hit is None:
            # Out-of-vocabulary word: use the unknown token's entry instead.
            hit = vocab.get(unk_token)
        sentence_as_int.append(hit.index)

    return sentence_as_int, label

In [None]:
# Look up the literal "[UNK]" entry in the embedding vocabulary. The value is discarded
# because this is not the last expression of the cell.
embedding_model.vocab.get("[UNK]")

unk_token = tokenizerBert._unk_token

# Dry run of the unknown-word substitution over the first 100,000 rows of `data_p`.
# NOTE(review): `data_p` is not defined in any visible cell -- it must come from elsewhere.
for row_index, record in data_p.head(100_000).iterrows():
    # Everything after the first column is treated as the words of the sentence.
    sentence = record[1:]
    filled_sentence = []
    for word in sentence:
        in_vocab = embedding_model.vocab.get(word) is not None
        filled_sentence.append(word if in_vocab else unk_token)

In [None]:
%%time
# One worker per available core. The with-block shuts the workers down when the block
# exits, including when the mapping raises; previously the pool was only closed on
# success and never joined.
with mp.Pool(mp.cpu_count()) as pool:
    print(mp.cpu_count())

    # Parallelizing using Pool.map(): blocks until every row has been processed and
    # returns the results as a list in input order.
    # NOTE(review): `data_p` is not defined in any visible cell -- it must come from elsewhere.
    results = pool.map(preprocess_word2vec, data_p.iterrows())

In [None]:
# Collect the (indices, label) pairs returned by the pool into a DataFrame, one row per pair.
data_pw2v = pd.DataFrame(results)