In [28]:
from keras.layers import TextVectorization
import pandas as pd
import numpy as np
import pickle
import random

In [29]:
# Texts stored by an earlier step (the file name suggests they are pre-cleaned).
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load pickles that this project produced itself.
with open('data/texts_cleaned.pkl', 'rb') as pkl_file:
    texts = pickle.load(pkl_file)

# Labelled training data; its `target` column is attached to the texts below.
train = pd.read_csv('data/train.csv')

In [30]:
# Pair every text with the label from the training frame.
# The column assignment below aligns on the index, so a length mismatch
# between `texts` and `train` would silently yield NaN / dropped labels
# instead of failing — verify the lengths up front.
assert len(texts) == len(train), (
    f"texts ({len(texts)}) and train ({len(train)}) differ in length"
)

df = pd.DataFrame(texts, columns=['text'])
df['target'] = train.target
df

Unnamed: 0,text,target
0,our deeds reason # earthquake may allah forgiv...,1
1,forest fire near la range ask canada,1
2,all residents asked shelter place notified off...,1
3,"13,000 people receive # wildfires evacuation o...",1
4,just got sent photo ruby # alaska smoke # wild...,1
...,...,...
7608,two giant cranes holding bridge collapse nearb...,1
7609,aria_ahrary thetawniest the control wild fires...,1
7610,m1.94 01:04 ut ? him i volcano hawaii,1
7611,police investigating bike collided car little ...,1


In [31]:
# Positive class: rows whose target is 1, renumbered from 0.
is_disaster = df['target'] == 1
disaster = df.loc[is_disaster].reset_index(drop=True)
disaster.shape

(3271, 2)

In [32]:
# Negative class: rows whose target is 0 (original row labels are kept here).
is_not_disaster = df['target'] == 0
not_disaster = df.loc[is_not_disaster]
not_disaster.shape

(4342, 2)

In [33]:
random.seed(0)

# Oversample the disaster class with replacement: draw as many extra rows as
# are needed for it to reach the row count of the not-disaster class.
n_disaster = disaster.shape[0]
n_extra = not_disaster.shape[0] - n_disaster

# Draw all positions first (same random.randint call sequence as drawing one
# row at a time), then fetch them with a single positional lookup instead of
# assembling the frame from one Series per row.
picked = [random.randint(0, n_disaster - 1) for _ in range(n_extra)]
duplicates = disaster.iloc[picked]

# Continue the index right after the existing disaster rows.
duplicates.index = range(n_disaster, n_disaster + duplicates.shape[0])
duplicates

Unnamed: 0,text,target
3271,selmoooooo _edvinnn imtariik dzafic_haris elmo...,1
3272,firefighters connecticut headed california fig...,1
3273,ready close ... error nope ! ! # notgoingoutin...,1
3274,alleged east bay serial arsonist arrested,1
3275,pam_palmater agree perrybellegarde get amp vot...,1
...,...,...
4337,archer weather it i it an unrelenting dangerou...,1
4338,no # news # hostages # libya # india # terrori...,1
4339,crptotech tsunami banks # banking # tech # bit...,1
4340,katunews # sri remains closed brush fire burns...,1


In [34]:
# Append the oversampled rows below the original disaster rows.
# Gotcha: this rebinds `disaster`, so running the cell a second time appends
# `duplicates` again.
disaster = pd.concat((disaster, duplicates))
disaster

Unnamed: 0,text,target
0,our deeds reason # earthquake may allah forgiv...,1
1,forest fire near la range ask canada,1
2,all residents asked shelter place notified off...,1
3,"13,000 people receive # wildfires evacuation o...",1
4,just got sent photo ruby # alaska smoke # wild...,1
...,...,...
4337,archer weather it i it an unrelenting dangerou...,1
4338,no # news # hostages # libya # india # terrori...,1
4339,crptotech tsunami banks # banking # tech # bit...,1
4340,katunews # sri remains closed brush fire burns...,1


In [35]:
# Relabel the not-disaster rows so their index continues after the last
# disaster row.
first_label = len(disaster)
not_disaster.index = range(first_label, first_label + len(not_disaster))
not_disaster

Unnamed: 0,text,target
4342,what is man ?,0
4343,i love fruits,0
4344,summer lovely,0
4345,my car fast,0
4346,what goooooooaaaaaal ! ! ! ! ! !,0
...,...,...
8679,engineered great atmosphere british lion gig t...,0
8680,cramer tiger is 3 words wrecked disney is stoc...,0
8681,these boxes ready explode ! exploding kittens ...,0
8682,sirens everywhere !,0


In [36]:
# Stack both classes, shuffle all rows with a fixed seed, and renumber them.
balanced = pd.concat((disaster, not_disaster))
train = balanced.sample(frac=1, random_state=0).reset_index(drop=True)
train

Unnamed: 0,text,target
0,wreckage conclusively confirmed from mh370 mal...,1
1,purple heart vet finds jihad threat his car ma...,0
2,the fallacy steam roller it is object whether ...,0
3,afghanistan un reports 'record-high levels civ...,1
4,is bad say i am kind of afraid storms storm ? ...,1
...,...,...
8679,noches el-bestia alexis_sanchez happy see team...,0
8680,sinking carb consultative assembly plans could...,0
8681,sexual revolution blight for women ! stories v...,0
8682,suicide bomber kills 15 saudi security site mo...,1


In [37]:
# Text column of the shuffled, balanced frame.
texts = train['text']
texts

0       wreckage conclusively confirmed from mh370 mal...
1       purple heart vet finds jihad threat his car ma...
2       the fallacy steam roller it is object whether ...
3       afghanistan un reports 'record-high levels civ...
4       is bad say i am kind of afraid storms storm ? ...
                              ...                        
8679    noches el-bestia alexis_sanchez happy see team...
8680    sinking carb consultative assembly plans could...
8681    sexual revolution blight for women ! stories v...
8682    suicide bomber kills 15 saudi security site mo...
8683    remembering mordecai yasuda friedman 24 rama b...
Name: text, Length: 8684, dtype: object

In [38]:
# Label column of the shuffled, balanced frame.
target = train['target']
target

0       1
1       0
2       0
3       1
4       1
       ..
8679    0
8680    0
8681    0
8682    1
8683    1
Name: target, Length: 8684, dtype: int64

In [3]:
# Length of the longest text, counted in single-space-separated pieces.
# max() over the lengths gives the same number as sorting all token lists
# and reading the first one, without the sort or the intermediate lists.
max_len = max(len(text.split(' ')) for text in texts)
max_len

68

In [4]:
max_tokens = 50000  # upper bound on the vocabulary size learned by adapt()

# standardize=None: the layer applies no standardization of its own to the texts.
# NOTE(review): with ngrams=(1, 2) each text presumably yields unigrams AND
# bigrams, i.e. more tokens than the word count `max_len` was computed from,
# so long texts may be truncated at output_sequence_length — TODO confirm
# whether that is intended.
vectorizer = TextVectorization(
    output_sequence_length=max_len,
    ngrams=(1, 2),
    standardize=None,
    max_tokens=max_tokens,
)
vectorizer.adapt(texts)

2022-02-02 17:45:03.944088: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-02-02 17:45:03.944202: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2022-02-02 17:45:03.994097: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-02-02 17:45:04.043225: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Metal device set to: Apple M1 Pro


In [5]:
# Vocabulary learned by adapt(), plus a token -> position lookup into it.
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [None]:
# The cell previously held a dangling `vectorizer.` (an unfinished attribute
# access), which is a SyntaxError and stops Restart & Run All. Replaced with a
# harmless inspection of the adapted layer.
vectorizer.vocabulary_size()

In [6]:
embeddings_index = {}
embedding_dim = 100

# Each line of the GloVe file is "<word> <v1> <v2> ...": split off the word,
# parse the remainder as a float32 vector.
# The encoding is given explicitly so parsing does not depend on the platform's
# default text encoding (GloVe text files are distributed as UTF-8).
with open(f'glove/glove.6B.{embedding_dim}d.txt', encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [7]:
# One row per vocabulary entry plus two spare rows; rows stay all-zero for
# tokens that have no pre-trained vector.
num_tokens = len(voc) + 2
embedding_matrix = np.zeros((num_tokens, embedding_dim))

hits = 0
misses = 0
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # Token not present in the pre-trained embeddings: leave its row at zero.
        misses += 1
        continue
    embedding_matrix[row] = vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 9901 words (40099 misses)


In [8]:
# Plain NumPy versions of the texts and labels for saving.
X, y = np.array(texts), train['target'].to_numpy()

In [9]:
# Persist the prepared arrays and settings, one .npy file each.
# NOTE(review): X presumably has object dtype (strings), in which case
# np.load needs allow_pickle=True to read it back — confirm in the consumer.
artifacts = {
    'data/X.npy': X,
    'data/y.npy': y,
    'data/embedding_matrix.npy': embedding_matrix,
    'data/num_tokens.npy': num_tokens,
    'data/embedding_dim.npy': embedding_dim,
    'data/max_tokens.npy': max_tokens,
    'data/max_len.npy': max_len,
}
for path, value in artifacts.items():
    np.save(path, value)