In [36]:
import pandas as pd
# Load the training set from the local CSV file.
df = pd.read_csv('train.csv')

# Print how many rows fall into each value of `target`, then preview the
# first rows of the frame as the cell's rich output.
print(df['target'].value_counts())
df.head()

0    4342
1    3271
Name: target, dtype: int64


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [37]:
# Pull the tweet text column out as a plain Python list.
raw_tweets = df['text'].tolist()

# Peek at the first four tweets.
raw_tweets[:4]

['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
 'Forest fire near La Ronge Sask. Canada',
 "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
 '13,000 people receive #wildfires evacuation orders in California ']

In [38]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

max_vocab = 10000      # upper bound on vocabulary size (passed as max_tokens)
max_seq_length = 15    # every tweet is padded/truncated to this many token ids

# Layer that turns raw strings into fixed-length sequences of integer ids.
# `standardize` is not passed, so the layer's default applies
# (documented default: "lower_and_strip_punctuation").
text_vectorizer = TextVectorization(
    output_sequence_length=max_seq_length,
    output_mode="int",
    max_tokens=max_vocab,
)

In [39]:
# Learn the vocabulary from the training tweets.
text_vectorizer.adapt(raw_tweets)

# Fetch the learned vocabulary once; later cells index into this list to
# decode token ids, so reuse it here instead of calling get_vocabulary() again.
vocabulary = text_vectorizer.get_vocabulary()
vocabulary[:10]

['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is']

In [40]:
# Encode every training tweet as a row of `max_seq_length` integer token ids.
proc_tweets = text_vectorizer(raw_tweets)

# Show the first five encoded tweets.
print(proc_tweets[0:5])

tf.Tensor(
[[ 110 6241   22    2  835    6   19  250  125 1689 5990   70   40    0
     0]
 [ 180   42  222  694 9621 9422 1434    0    0    0    0    0    0    0
     0]
 [  40 1739 1685    5 2150    4  663   22  123    1   18 1759   41  428
   249]
 [3475   56 5118 1353  249 1388    4   88    0    0    0    0    0    0
     0]
 [  29   95 1193   19  327   20 9557 2364   26  257   20 1353    1   66
     3]], shape=(5, 15), dtype=int64)


In [41]:
# Sanity check: look each token id of the first encoded tweet up in the
# vocabulary to confirm the ids map back to the expected words.

for token_id in proc_tweets[0]:
    idx = int(token_id)
    print(f"{idx} --> {vocabulary[idx]}")

110 --> our
6241 --> deeds
22 --> are
2 --> the
835 --> reason
6 --> of
19 --> this
250 --> earthquake
125 --> may
1689 --> allah
5990 --> forgive
70 --> us
40 --> all
0 --> 
0 --> 


In [42]:
# Labels as a plain Python list, in the same row order as the tweets.
targets = df['target'].tolist()

# Preview the first twenty labels.
targets[:20]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]

In [43]:
# train_x = []

# for tweet in proc_tweets:
#     bag = []
#     for word in tweet:
#         bag.append(int(word))
#     train_x.append(bag)

# train_x[:4]

In [44]:
import numpy as np

# Convert the encoded tweets and their labels to NumPy arrays for model.fit.
train_x, train_y = np.array(proc_tweets), np.array(targets)

In [45]:
from tensorflow.keras import layers
from tensorflow.keras.models import Model

# Fix TensorFlow's global seed before any layer is created so that weight
# initialisation and the TensorFlow random ops that follow are repeatable on a
# fresh Restart & Run All. Some GPU kernels may still be non-deterministic.
tf.random.set_seed(42)

# Input: one row of `max_seq_length` integer token ids per tweet.
visible = layers.Input( shape=[max_seq_length] )

# Map each token id to a 128-dimensional embedding vector.
# NOTE(review): `mask_zero` is left at its default, so the padding id 0 is fed
# to the LSTM like any other token - consider mask_zero=True; confirm impact.
x = layers.Embedding(
    input_dim = max_vocab,
    output_dim = 128,
    input_length = max_seq_length,
)(visible)
# x = layers.GlobalAveragePooling1D()(x)

# Bidirectional LSTM over the embedded sequence.
x = layers.Bidirectional( layers.LSTM(64) )(x)

# Two small fully connected layers before the classifier head.
x = layers.Dense(32, activation='relu')(x)
x = layers.Dense(32, activation='relu')(x)

# Single sigmoid unit: a score in [0, 1] for the positive class (target == 1).
output = layers.Dense(1,activation="sigmoid")(x)

model = Model( inputs=[visible], outputs=[output] )

In [46]:
# Binary classification setup: cross-entropy loss, Adam optimizer, and
# accuracy reported alongside the loss.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train for two epochs. `validation_split` holds out 5% of the data for
# validation; per the Keras docs the slice is taken from the end of the
# arrays, before shuffling.
history = model.fit(x=train_x, y=train_y, epochs=2, validation_split=0.05)


Epoch 1/2
Epoch 2/2


In [47]:
# import matplotlib.pyplot as plt

# plt.plot( history.history['loss'] )
# plt.plot( history.history['val_loss'] )
# plt.show()

In [48]:
# Load the test set from the local CSV file.
test_df = pd.read_csv('test.csv')

# Tweet text as a plain Python list, mirroring how the training text is handled.
test_tweets = test_df['text'].tolist()

In [49]:
# Encode the test tweets with the vectorizer that was adapted on the training
# text, so both sets share the same vocabulary and sequence length.
proc_test = text_vectorizer(test_tweets)

# Preview the first three encoded test tweets.
proc_test[0:3]

<tf.Tensor: shape=(3, 15), dtype=int64, numpy=
array([[  29,  893,    3, 1910,  122,   85,    0,    0,    0,    0,    0,
           0,    0,    0,    0],
       [ 450,   53,  250,    9, 1159, 2754,  592, 1940,  228,    0,    0,
           0,    0,    0,    0],
       [  75,    9,    3,  180,   42,   17,  767, 3034,    1,   22, 6003,
         865,    2,  712,    8]], dtype=int64)>

In [50]:
# test_x = []

# for tweet in proc_test:
#     bag = []
#     for word in tweet:
#         bag.append(int(word))
#     test_x.append(bag)

# test_x[:4]

In [51]:
# Score every encoded test tweet; each row of `predictions` holds one sigmoid
# output.
predictions = model.predict(proc_test)

# Print only a short preview. Slicing up front avoids walking every
# prediction just to skip all but the first few.
n_preview = 30
for prob, tweet in zip(predictions[:n_preview], test_tweets[:n_preview]):
    # Floor-divide by 0.01 to show the score as a whole-number percentage.
    print(f"{prob[0]//0.01} % distress --> {tweet}")

84.0 % distress --> Just happened a terrible car crash
54.0 % distress --> Heard about #earthquake is different cities, stay safe everyone.
93.0 % distress --> there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all
42.0 % distress --> Apocalypse lighting. #Spokane #wildfires
98.0 % distress --> Typhoon Soudelor kills 28 in China and Taiwan
88.0 % distress --> We're shaking...It's an earthquake
8.0 % distress --> They'd probably still show more life than Arsenal did yesterday, eh? EH?
11.0 % distress --> Hey! How are you?
7.0 % distress --> What a nice hat?
13.0 % distress --> Fuck off!
9.0 % distress --> No I don't like cold!
25.0 % distress --> NOOOOOOOOO! Don't do that!
18.0 % distress --> No don't tell me that!
15.0 % distress --> What if?!
11.0 % distress --> Awesome!
62.0 % distress --> Birmingham Wholesale Market is ablaze BBC News - Fire breaks out at Birmingham's Wholesale Market http://t.co/irWqCEZWEU
16.0 % distress --> @sunkxssedharr