# Improved Pre-Processing

In [1]:
# Shell escape: list the directory that holds the raw train/test CSV files.
!ls ../src/data/original

readme.txt  test.csv  train.csv


In [2]:
# Locations of the raw dataset splits and of the processed output file.
orig_train_path = "../src/data/original/train.csv"
orig_test_path = "../src/data/original/test.csv"
orig_output_path = "../src/data/processed/data.csv"

In [3]:
import pandas as pd
import gc
import csv

In [4]:
# Disabled alternative: read (label, review) pairs from both CSV files with the csv module.
#%%time

#with open(orig_train_path) as csv_file:
#    csv_reader = csv.reader(csv_file, delimiter=',')
#    train_orig = [(int(row[0]), row[2]) for row in csv_reader]
#    del csv_reader
    
#with open(orig_test_path) as csv_file:
#    csv_reader = csv.reader(csv_file, delimiter=',')
#    test_orig = [(int(row[0]), row[2]) for row in csv_reader]
#    del csv_reader

In [5]:
%%time
train = pd.read_csv(orig_train_path, header=None)
test = pd.read_csv(orig_test_path, header=None)

data = pd.concat([train, test])

del train, test

data.columns = ["label", "title", "review"]
data.drop(columns=['title'], inplace=True)
data["label"] = data["label"] - 1
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data = data.values.tolist()

CPU times: user 24.5 s, sys: 1.31 s, total: 25.8 s
Wall time: 22.1 s


In [6]:
# Disabled: merge step for the csv-module loading path (train_orig/test_orig are only built in commented-out code).
#data = train_orig + test_orig
#del train_orig
#del test_orig
#print(len(data))
#data[0]

In [7]:
from transformers import BertTokenizer

In [8]:
%%time
# Instantiate the tokenizer for the 'bert-base-uncased' checkpoint. It is the default
# tokenizer of preprocess() and the source of the unknown/padding token strings used below.
tokenizerBert = BertTokenizer.from_pretrained('bert-base-uncased')

CPU times: user 43.5 ms, sys: 12 ms, total: 55.5 ms
Wall time: 603 ms


In [9]:
%%time
import gensim
import numpy as np
embedding_model = gensim.models.KeyedVectors.load_word2vec_format('../src/data/embeddings/GoogleNews-vectors-negative300.bin', binary=True, limit=1_000_000)
embedding_model.add(tokenizerBert._unk_token, np.mean(embedding_model.vectors, axis=0),replace=False)
embedding_model.add(tokenizerBert._pad_token, np.zeros(300),replace=False)

CPU times: user 31.2 s, sys: 2.3 s, total: 33.5 s
Wall time: 32.3 s


In [10]:
def preprocess(row, padding=200,
               tokenizer=tokenizerBert,
               embedder=embedding_model,
               unk_token=tokenizerBert.unk_token,
               pad_token=tokenizerBert.pad_token):
    """Turn one ``[label, review]`` row into a fixed-length list of vocabulary indices.

    The review is tokenized, cut to at most ``padding`` tokens, right-padded
    with ``pad_token`` to exactly ``padding`` tokens, and every token is mapped
    to its index in ``embedder.vocab``. Tokens missing from the vocabulary are
    mapped to the index of ``unk_token``.

    Note: truncation is applied to *tokens*. Slicing the raw string to
    ``padding`` characters (the previous behavior) left most of each row as
    padding and did not cap the number of tokens at ``padding``.

    Note: the defaults are bound when this ``def`` is executed, so re-binding
    ``embedding_model`` or ``tokenizerBert`` later does not affect them.

    Args:
        row: Sequence whose item 0 is the label and item 1 is the review text.
        padding: Exact number of tokens/indices in the returned sequence.
        tokenizer: Object with a ``tokenize(str) -> list[str]`` method.
        embedder: Object exposing a ``vocab`` mapping whose values have ``.index``.
        unk_token: Vocabulary key used for out-of-vocabulary tokens.
        pad_token: Vocabulary key used to fill up short sequences.

    Returns:
        Tuple ``(indices, label)`` where ``indices`` has length ``padding``.

    Raises:
        KeyError: If ``unk_token`` is not registered in ``embedder.vocab``.
    """
    label, review = row[0], row[1]

    # A missing review comes out of pandas as NaN (a float); treat anything
    # that is not a string as an empty review, i.e. an all-padding row.
    text = review if isinstance(review, str) else ""

    # Tokenize first, then truncate, so `padding` always counts tokens.
    tokens = tokenizer.tokenize(text)[:padding]
    tokens = tokens + [pad_token] * (padding - len(tokens))

    # Resolve the fallback once and look each token up a single time.
    vocab = embedder.vocab
    unk_index = vocab[unk_token].index
    entries = (vocab.get(token) for token in tokens)
    sentence_as_int = [unk_index if entry is None else entry.index for entry in entries]

    # X, Y
    return sentence_as_int, label

In [11]:
import multiprocessing as mp

In [12]:
%%time
cores = max(1, round(mp.cpu_count() / 2))
#cores = 8
print(cores)
pool = mp.Pool(cores)

# Parallelizing, will work as long as the processing is not too fast and fillst the memory :o
processed_data = pool.imap(preprocess, data)

with open("data_p-w2v_e-1m_p-200.txt", "w") as out_file:
    for X, Y in processed_data:
        stringified = [str(entry) for entry in [Y] + X]
        out_file.write(",".join(stringified) + "\n")
    
pool.close()
pool.join()

#data_p = list(map(preprocess, data[:1_000_000]))

#while True:
#    print(pool._cache)
#    time.sleep(5)
    
#print("Before Join")
    
#pool.close()
#pool.join()

#print("After Join")

#res = data_p.get()

#len(res)

4
CPU times: user 9min 51s, sys: 1min 25s, total: 11min 17s
Wall time: 16min 50s


In [13]:
# Drop the reference to the imap result iterator created in the previous cell.
del processed_data

In [14]:
import gc

In [15]:
# Force a garbage-collection pass; the cell output is gc.collect()'s return value.
gc.collect()

66

# Word2Vec Specific

In [None]:
# Display the tokenizer's padding token.
# NOTE(review): this reads a private attribute; consider the public `pad_token` instead.
tokenizerBert._pad_token

In [None]:
# Display the tokenizer's unknown-word token.
# NOTE(review): this reads a private attribute; consider the public `unk_token` instead.
tokenizerBert._unk_token

In [None]:
%%time
import gensim
import numpy as np
embedding_model = gensim.models.KeyedVectors.load_word2vec_format('../src/data/embeddings/GoogleNews-vectors-negative300.bin', binary=True, limit=2_000_000)
embedding_model.add(tokenizerBert._unk_token, np.mean(embedding_model.vectors, axis=0),replace=False)
embedding_model.add(tokenizerBert._pad_token, np.zeros(300),replace=False)

In [None]:
def preprocess_word2vec(row, unk_token=tokenizerBert._unk_token, pad_token=tokenizerBert._pad_token):
    """Map one pre-tokenized row to indices in the global ``embedding_model`` vocabulary.

    Args:
        row: An ``(index, record)`` pair as yielded by ``DataFrame.iterrows()``.
            Position 0 of ``record`` is read as the label, everything after it
            as the tokens.
        unk_token: Vocabulary key substituted for tokens that are not in
            ``embedding_model.vocab``.
        pad_token: Accepted for signature compatibility; not used in the body.

    Returns:
        Tuple ``(indices, label)``.
    """
    record = row[1]
    tokens = record[1:].tolist()
    target = record[0]
    lookup = embedding_model.vocab.get

    indices = []
    for token in tokens:
        # Fall back to the unknown token for out-of-vocabulary words.
        if lookup(token) is None:
            token = unk_token
        indices.append(lookup(token).index)

    return indices, target  # X, Y

In [None]:
# Look up the vocabulary entry for "[UNK]". The value is not displayed because
# this is not the last expression of the cell.
embedding_model.vocab.get("[UNK]")

# Scratch loop: for the first 100_000 rows of `data_p`, replace out-of-vocabulary
# words with the unknown token. `filled_sentence` is overwritten on every
# iteration and never stored, so only the last row's result survives.
# NOTE(review): `data_p` is not defined in any visible cell (it only appears in
# commented-out code) — confirm where it is supposed to come from.
unk_token = tokenizerBert._unk_token
for entry in data_p.head(100_000).iterrows():
    sentence = entry[1][1:]
    filled_sentence = [word if embedding_model.vocab.get(word) is not None else unk_token for word in sentence]

In [None]:
%%time
pool = mp.Pool(mp.cpu_count())
print(mp.cpu_count())

# Parallelizing using Pool.map()
results = pool.map(preprocess_word2vec, data_p.iterrows())

pool.close()

In [None]:
# Collect the (indices, label) tuples returned by preprocess_word2vec into a DataFrame, one row per result.
data_pw2v = pd.DataFrame(results)