In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Load the labelled data from train.csv in the working directory and display
# it. The displayed frame has the columns id, keyword, location, text and
# target.
train = pd.read_csv("train.csv")
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [7]:
# Labels: an independent copy of the "target" column.
Y = train["target"].copy()

In [9]:
# Features: every column of `train` except the "target" label column.
X = train.loc[:, train.columns != "target"]

In [31]:
# Display the feature frame to confirm the "target" column is no longer present.
X

Unnamed: 0,id,keyword,location,text
0,1,,,Our Deeds are the Reason of this #earthquake M...
1,4,,,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...
...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611,10872,,,Police investigating after an e-bike collided ...


In [12]:
# Hold out 7.5% of the rows as a validation/test split; the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.075,
    random_state=42,
)

In [15]:
# Upper quantiles of the whitespace-separated word count per training text;
# used to pick a sensible padded sequence length.
(
    X_train["text"]
    .map(lambda tweet: len(tweet.split()))
    .quantile([0.80, 0.85, 0.90, 0.95])
)

0.80    20.0
0.85    21.0
0.90    22.0
0.95    24.0
Name: text, dtype: float64

In [16]:
# Word-level tokenizer capped at num_words=1500, with "<OOV>" standing in
# for out-of-vocabulary words.
tokenizer = Tokenizer(
    num_words=1500,
    oov_token="<OOV>",
)

In [17]:
# Build the vocabulary from the training split only. Fitting on the full
# frame `X` would let texts from the held-out rows (X_test) influence the
# word index, leaking evaluation data into the preprocessing step.
tokenizer.fit_on_texts(X_train.text)

In [18]:
# Convert the text of each split into lists of integer token ids using the
# fitted tokenizer.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split.text) for split in (X_train, X_test)
)

In [26]:
# Bring every sequence to exactly 22 tokens (the 0.90 quantile of the
# training word counts): longer texts are cut at the end, shorter ones are
# padded at the end.
train_sequences_padded = pad_sequences(
    train_sequences,
    maxlen=22,
    padding='post',
    truncating='post',
)
test_sequences_padded = pad_sequences(
    test_sequences,
    maxlen=22,
    padding='post',
    truncating='post',
)

In [29]:
# Standardisation statistics: a single global mean and standard deviation
# taken over every entry of the padded *training* matrix only, so the
# held-out split does not influence them.
MEAN = np.mean(train_sequences_padded)
STD = np.std(train_sequences_padded)

In [30]:
# Apply the training-set statistics to both splits so they share one scale.
train_sequences_standardized, test_sequences_standardized = (
    (padded - MEAN) / STD
    for padded in (train_sequences_padded, test_sequences_padded)
)

In [33]:
# Sanity check: the feature matrix and the label vector must have the same
# number of rows.
print(*(arr.shape for arr in (train_sequences_standardized, y_train)))

(7042, 22) (7042,)


In [56]:
#build basic model
dense_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(22,1)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(2048,activation='relu'),
    tf.keras.layers.Dense(1024,activation='relu'),
    tf.keras.layers.Dense(512,activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid')
])

In [57]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked
# under the metric name 'acc'.
dense_model.compile(
    loss=tf.keras.losses.binary_crossentropy,
    optimizer="Adam",
    metrics=['acc'],
)

In [58]:
# Train the dense baseline for 100 epochs in batches of 512, evaluating on
# the held-out split after every epoch.
dense_model.fit(
    x=train_sequences_standardized,
    y=y_train,
    validation_data=(test_sequences_standardized, y_test),
    epochs=100,
    batch_size=512,
)



<keras.callbacks.History at 0x1ffacb8bf10>

In [69]:
# Embedding model: each token id is mapped to a learned `num_of_dims`-sized
# vector, the vectors are averaged over the 22 sequence positions, and a
# small dense head produces the sigmoid prediction.
# input_dim=1500 mirrors the num_words=1500 given to the Tokenizer, and
# input_length=22 mirrors the maxlen used when padding.
num_of_dims = 10
embedded_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=1500,
        output_dim=num_of_dims,
        input_length=22,
    ),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [70]:
# Same training setup as the dense baseline: Adam optimizer, binary
# cross-entropy loss, accuracy tracked under the metric name 'acc'.
embedded_model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [71]:
# Train on the raw (un-standardised) padded token ids, since the Embedding
# layer looks tokens up by integer id. The held-out split is used for
# validation after each of the 10 epochs.
embedded_model.fit(
    x=train_sequences_padded,
    y=y_train,
    validation_data=(test_sequences_padded, y_test),
    epochs=10,
)

Epoch 10/10


<keras.callbacks.History at 0x1ffade71df0>