In [10]:
from keras.layers import TextVectorization
import pandas as pd
import numpy as np
import pickle

In [11]:
# Labelled training table; only read here, used further down for the targets.
train = pd.read_csv('data/train.csv')

# Pre-cleaned texts stored as a pickle.
# NOTE(review): pickle.load can execute arbitrary code — only load files you produced yourself.
with open('data/texts_cleaned.pkl', 'rb') as pkl_handle:
    texts = pickle.load(pkl_handle)

In [12]:
# Length, in space-separated tokens, of the longest text; used below as the
# vectorizer's fixed output sequence length. max() finds it in a single pass
# instead of sorting every tokenised text just to read the first element.
# NOTE(review): the vectorizer is configured with ngrams=(1, 2), so it may emit
# more tokens per text than this word count — confirm the truncation is intended.
max_len = max(len(text.split(' ')) for text in texts)
max_len

68

In [13]:
# Upper bound on the vocabulary size learned by the vectorizer.
max_tokens = 50_000

# standardize=None: texts are passed through as-is (they were cleaned beforehand).
# ngrams=(1, 2): vocabulary entries are built from unigrams and bigrams.
# Every output sequence is padded or truncated to max_len entries.
vectorizer = TextVectorization(
    max_tokens=max_tokens,
    standardize=None,
    ngrams=(1, 2),
    output_sequence_length=max_len,
)

# Learn the vocabulary from the corpus.
vectorizer.adapt(texts)

2022-02-02 15:01:34.008757: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


In [14]:
# Vocabulary learned by the vectorizer, in index order.
voc = vectorizer.get_vocabulary()

# Map each vocabulary entry to its position in `voc`.
word_index = {token: position for position, token in enumerate(voc)}

In [15]:
# token -> float32 vector, filled from the pre-trained GloVe text file.
embeddings_index = {}
embedding_dim = 100

# Pass the encoding explicitly so that parsing the GloVe file does not depend on
# the platform's default locale encoding (e.g. cp1252 on Windows).
with open(f'glove/glove.6B.{embedding_dim}d.txt', encoding='utf8') as f:
    for line in f:
        # Split off the leading token; the remainder is the space-separated vector.
        word, coefs = line.split(maxsplit=1)
        # "f" is the float32 type code; sep=" " selects text-mode parsing.
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [16]:
# Number of rows in the embedding matrix.
# NOTE(review): `voc` presumably already contains the padding and OOV entries, in
# which case the extra 2 rows are never indexed — confirm before changing, since
# num_tokens is saved for downstream use.
num_tokens = len(voc) + 2

# Rows start as zeros; a row is overwritten only when GloVe has a vector for that
# vocabulary entry, so entries without a match keep an all-zero embedding.
embedding_matrix = np.zeros((num_tokens, embedding_dim))

hits = 0
misses = 0
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1

print("Converted %d words (%d misses)" % (hits, misses))

Converted 9901 words (40099 misses)


In [17]:
# Model inputs: the cleaned texts as a NumPy array.
X = np.array(texts)

# Labels: the `target` column of the training table.
y = train['target'].to_numpy()

In [18]:
# Persist the arrays and scalar settings computed above as .npy files.
# Files are written in the listed order.
for _out_path, _payload in (
    ('data/X.npy', X),
    ('data/y.npy', y),
    ('data/embedding_matrix.npy', embedding_matrix),
    ('data/num_tokens.npy', num_tokens),
    ('data/embedding_dim.npy', embedding_dim),
    ('data/max_tokens.npy', max_tokens),
    ('data/max_len.npy', max_len),
):
    np.save(_out_path, _payload)