In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('./tweets.csv')

In [3]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0


In [4]:
df = df[['text', 'target']]
df

Unnamed: 0,text,target
0,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,Telangana: Section 144 has been imposed in Bha...,1
2,Arsonist sets cars ablaze at dealership https:...,1
3,Arsonist sets cars ablaze at dealership https:...,1
4,"""Lord Jesus, your love brings freedom and pard...",0
...,...,...
11365,Media should have warned us well in advance. T...,0
11366,i feel directly attacked 💀 i consider moonbin ...,0
11367,i feel directly attacked 💀 i consider moonbin ...,0
11368,"ok who remember ""outcast"" nd the ""dora"" au?? T...",0


In [5]:
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [6]:
train_df, test_df = train_test_split(df, train_size = .85, shuffle = True, random_state = 123, stratify = df['target'])

In [9]:
train_x = train_df.text.values
train_y = train_df.target.values

In [10]:
print(train_x, train_y)

['Widow of CIA agent killed in 2009 Afghanistan suicide bomb attack breaks her silence 10 years on https://t.co/GtCOCsBtLx'
 'Staff at Mogo Zoo on the NSW South Coast saved the property, fighting fires burning at the zoo. These gorgeous gibbons were…'
 'But we can see we are not like LWs who always defended Mohammad Afzal Guru who was a Kashmiri terrorist, who was convicted…'
 ...
 'It deeply saddens me to see the negativity that has engulfed our country. It’s just getting worse everyday. 😔'
 "First Sydney then Canberra, now Melbourne - Melbourne's hazardous #AirPollution quality worst in the world overnight due to #A…"
 'Once the fires are out, water will emerge as the next mismanaged catastrophe. Catchment stream flows central north we…'] [1 1 0 ... 0 1 0]


In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(train_x)

In [12]:
len(tokenizer.word_index)

27701

In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
vocab_size = len(tokenizer.word_index) + 1
encoded_docs = tokenizer.texts_to_sequences(train_x)
padded_sequence = pad_sequences(encoded_docs, maxlen = 200)

In [17]:
train_x[0]

'Widow of CIA agent killed in 2009 Afghanistan suicide bomb attack breaks her silence 10 years on https://t.co/GtCOCsBtLx'

In [18]:
encoded_docs[0]

[3254,
 7,
 2085,
 1921,
 149,
 8,
 2287,
 1679,
 180,
 173,
 146,
 1809,
 77,
 1461,
 175,
 92,
 13,
 2,
 3,
 4]

In [19]:
padded_sequence[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [20]:
padded_sequence.shape

(9664, 200)

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense, SpatialDropout1D, Embedding
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length, input_length = 200))
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout = 0.5, recurrent_dropout = 0.5, return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(120, return_sequences = True))
model.add(Dropout(0.3))
model.add(LSTM(40))
model.add(Dense(1, activation = 'sigmoid'))

model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])



In [24]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 200, 32)           886464    
                                                                 
 spatial_dropout1d_2 (Spatia  (None, 200, 32)          0         
 lDropout1D)                                                     
                                                                 
 lstm_6 (LSTM)               (None, 200, 50)           16600     
                                                                 
 dropout_4 (Dropout)         (None, 200, 50)           0         
                                                                 
 lstm_7 (LSTM)               (None, 200, 120)          82080     
                                                                 
 dropout_5 (Dropout)         (None, 200, 120)          0         
                                                      

In [26]:
history = model.fit(padded_sequence, train_y, validation_split = 0.2, epochs = 2, batch_size = 32)

Epoch 1/2
Epoch 2/2
