In [1]:
from keras.layers import TextVectorization
import pandas as pd
import numpy as np
import pickle
import random

In [2]:
# Training table read from CSV; its `target` column supplies the labels below.
train = pd.read_csv('data/train.csv')

# Pre-cleaned texts produced elsewhere and stored as a pickle.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you
# created yourself. Presumably one cleaned text per row of `train`; confirm.
with open('data/texts_cleaned.pkl', mode='rb') as pickle_file:
    texts = pickle.load(pickle_file)

In [3]:
# Pair each cleaned text with its label; the `target` Series is aligned to the
# new frame by index label.
df = pd.DataFrame(texts, columns=['text']).assign(target=train['target'])
df

Unnamed: 0,text,target
0,our deeds reason # earthquake may allah forgiv...,1
1,forest fire near la range ask canada,1
2,all residents asked shelter place notified off...,1
3,"13,000 people receive # wildfires evacuation o...",1
4,just got sent photo ruby # alaska smoke # wild...,1
...,...,...
7608,two giant cranes holding bridge collapse nearb...,1
7609,aria_ahrary thetawniest the control wild fires...,1
7610,m1.94 01:04 ut ? him i volcano hawaii,1
7611,police investigating bike collided car little ...,1


In [4]:
# Rows labelled target == 1, renumbered from 0.
disaster = df.loc[df['target'] == 1].reset_index(drop=True)
disaster.shape

(3271, 2)

In [5]:
# Rows labelled target == 0, renumbered from 0 (the under-sampling step that
# follows relies on this 0..n-1 index).
not_disaster = df.loc[df['target'] == 0].reset_index(drop=True)
not_disaster.shape

(4342, 2)

In [6]:
# Randomly under-sample the target == 0 rows so both classes have the same size.
# A dedicated seeded generator keeps the selection reproducible across
# "Restart & Run All" (matching the seeded shuffle below) without touching the
# global `random` state. The size is clamped at 0 so the cell does not raise
# when there is no surplus to remove.
n_surplus = max(not_disaster.shape[0] - disaster.shape[0], 0)

# `not_disaster` was re-indexed 0..n-1 above, so sampled positions are valid labels.
indexes = random.Random(0).sample(range(not_disaster.shape[0]), n_surplus)

# Reassign instead of dropping in place so the operation is explicit.
not_disaster = not_disaster.drop(index=indexes)

# Continue the index right after the `disaster` rows.
not_disaster.index = range(disaster.shape[0], disaster.shape[0] + not_disaster.shape[0])
not_disaster

Unnamed: 0,text,target
3271,i love fruits,0
3272,summer lovely,0
3273,my car fast,0
3274,what goooooooaaaaaal ! ! ! ! ! !,0
3275,london cool,0
...,...,...
6537,three days work i have pretty much wrecked hah...,0
6538,engineered great atmosphere british lion gig t...,0
6539,cramer tiger is 3 words wrecked disney is stoc...,0
6540,these boxes ready explode ! exploding kittens ...,0


In [7]:
# Stack both classes, shuffle every row with a fixed seed, and renumber from 0.
# Note that this rebinds `train`, replacing the frame read from the CSV.
train = (
    pd.concat([disaster, not_disaster], axis=0)
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)
train

Unnamed: 0,text,target
0,# breaking # news call tasmania is emergency s...,1
1,amp rare occasion i go i am complete obliterat...,0
2,do not miss chris # appy is detonation myths o...,1
3,mom we not get home fast wished why ? mom ther...,0
4,news refuge oil spill may costlier bigger proj...,1
...,...,...
6537,by replacing fear unknown curiosity open toû_,0
6538,suicide bomber kills 15 saudi security site mo...,1
6539,be careful anyone lives west beaverton forest ...,1
6540,'calgarians stunned storm insurance companies ...,1


In [8]:
# Text column of the balanced, shuffled frame (rebinds `texts`, which
# previously held the unpickled data).
texts = train['text']
texts

0       # breaking # news call tasmania is emergency s...
1       amp rare occasion i go i am complete obliterat...
2       do not miss chris # appy is detonation myths o...
3       mom we not get home fast wished why ? mom ther...
4       news refuge oil spill may costlier bigger proj...
                              ...                        
6537       by replacing fear unknown curiosity open toû_
6538    suicide bomber kills 15 saudi security site mo...
6539    be careful anyone lives west beaverton forest ...
6540    'calgarians stunned storm insurance companies ...
6541    remembering mordecai yasuda friedman 24 rama b...
Name: text, Length: 6542, dtype: object

In [9]:
# Label column, row-aligned with `texts`.
target = train['target']
target

0       1
1       0
2       1
3       0
4       1
       ..
6537    0
6538    1
6539    1
6540    1
6541    1
Name: target, Length: 6542, dtype: int64

In [10]:
# Length (in single-space-separated pieces) of the longest text. Taking the
# max of the lengths directly avoids building and sorting every token list.
# Splitting on ' ' is kept as-is, so runs of consecutive spaces still count
# empty pieces.
max_len = max(len(text.split(' ')) for text in texts)
max_len

68

In [11]:
# Upper bound on the vocabulary size learned by the vectorizer.
max_tokens = 50000

# NOTE(review): `max_len` was measured in single words, but ngrams=(1, 2) also
# emits bigrams, so vectorized sequences may be cut at output_sequence_length.
# Confirm this truncation is intended.
vectorizer_config = {
    'max_tokens': max_tokens,
    'standardize': None,  # texts are used as loaded, without further normalisation
    'ngrams': (1, 2),
    'output_sequence_length': max_len,
}
vectorizer = TextVectorization(**vectorizer_config)

# Learn the vocabulary from the balanced training texts.
vectorizer.adapt(texts)

2022-02-02 20:05:39.226397: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-02-02 20:05:39.226509: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2022-02-02 20:05:39.255152: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-02-02 20:05:39.294185: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Metal device set to: Apple M1 Pro


In [12]:
# Vocabulary learned by the vectorizer, and a lookup from token to its
# position in that vocabulary.
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [13]:
# Map each GloVe token to its vector of `embedding_dim` float32 coefficients.
embeddings_index = {}
embedding_dim = 100

# Each line is "<token> <coef> <coef> ...". The encoding is given explicitly
# because the default text encoding is platform dependent, and the GloVe
# vocabulary contains non-ASCII tokens.
with open(f'glove/glove.6B.{embedding_dim}d.txt', encoding='utf-8') as f:
    for line in f:
        # Split off the token only; the remainder is the coefficient string.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [14]:
# One row per vocabulary entry plus two spare rows; any row that is never
# assigned stays all-zero.
num_tokens = len(voc) + 2
embedding_matrix = np.zeros((num_tokens, embedding_dim))

# Count how many vocabulary entries do / do not have a pretrained vector.
hits = 0
misses = 0

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this entry: leave its row at zero.
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1

print("Converted %d words (%d misses)" % (hits, misses))

Converted 9813 words (40187 misses)


In [15]:
# Features: the texts as a NumPy array (a copy of the Series values).
X = np.array(texts)
# Labels: the matching target column as a NumPy array.
y = train['target'].to_numpy()

In [16]:
# Persist everything produced above, in the same order as before.
# NOTE(review): `X` comes from an object-dtype Series of strings, so reading
# it back with np.load will presumably need allow_pickle=True -- confirm in
# the consuming notebook.
artifacts = [
    ('data/X.npy', X),
    ('data/y.npy', y),
    ('data/embedding_matrix.npy', embedding_matrix),
    ('data/num_tokens.npy', num_tokens),
    ('data/embedding_dim.npy', embedding_dim),
    ('data/max_tokens.npy', max_tokens),
    ('data/max_len.npy', max_len),
]
for path, value in artifacts:
    np.save(path, value)