In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/glove6b100dtxt/glove.6B.100d.txt
/kaggle/input/utils-courseera/Emoji_v3a.ipynb
/kaggle/input/utils-courseera/emo_utils.py
/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
train_df = pd.read_csv('../input/nlp-getting-started/train.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
train_df['text'].head(10)

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
5    #RockyFire Update => California Hwy. 20 closed...
6    #flood #disaster Heavy rain causes flash flood...
7    I'm on top of the hill and I can see a fire in...
8    There's an emergency evacuation happening now ...
9    I'm afraid that the tornado is coming to our a...
Name: text, dtype: object

In [4]:
#Checking the first sentence
train_df['text'][0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [5]:
#Checking the keywords
train_df[train_df['keyword'].notnull()].head()

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0


In [6]:
#Reading the first sentence
train_df['text'][31],train_df['text'][33]

('@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C',
 '#AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba. http://t.co/2nndBGwyEi')

In [7]:
from nltk.tokenize import word_tokenize
word_tokenize(train_df['text'][31])

['@',
 'bbcmtd',
 'Wholesale',
 'Markets',
 'ablaze',
 'http',
 ':',
 '//t.co/lHYXEOHY6C']

Tokenize will not help, we will have to remove manually the http things.

In [8]:
Text = train_df.text.str.split()

In [9]:
Text[31]

['@bbcmtd', 'Wholesale', 'Markets', 'ablaze', 'http://t.co/lHYXEOHY6C']

In [10]:
#Removing the words starting withand http
import re
for sentence in Text:
    for word in sentence:
        if re.search("http*", word):
            sentence.remove(word)

In [11]:
Text[31]

['@bbcmtd', 'Wholesale', 'Markets', 'ablaze']

Removing all the special characters

In [12]:
for i in range(len(Text)):
    for j in range(len(Text[i])):
        Text[i][j] = re.sub(r"[?|$|:|'|@|#|=|>|<|.|!]",r'',Text[i][j])
        Text[i][j] = Text[i][j].lower()

In [13]:
Text[31]

['bbcmtd', 'wholesale', 'markets', 'ablaze']

In [14]:
train_df.text[45],Text[45]

('I gained 3 followers in the last week. You? Know your stats and grow with http://t.co/TIyUliF5c6',
 ['i',
  'gained',
  '3',
  'followers',
  'in',
  'the',
  'last',
  'week',
  'you',
  'know',
  'your',
  'stats',
  'and',
  'grow',
  'with'])

Removing numbers

In [15]:
for i in range(len(Text)):
    for j in range(len(Text[i])):
        Text[i][j] = re.sub(r'[0-9]',r'',Text[i][j])

In [16]:
train_df.text[45],Text[45]

('I gained 3 followers in the last week. You? Know your stats and grow with http://t.co/TIyUliF5c6',
 ['i',
  'gained',
  '',
  'followers',
  'in',
  'the',
  'last',
  'week',
  'you',
  'know',
  'your',
  'stats',
  'and',
  'grow',
  'with'])

In [17]:
#Removing the empty strings
for sentence in Text:
    for word in sentence:
        if word == '':
            sentence.remove(word)

In [18]:
train_df.text[45],Text[45]

('I gained 3 followers in the last week. You? Know your stats and grow with http://t.co/TIyUliF5c6',
 ['i',
  'gained',
  'followers',
  'in',
  'the',
  'last',
  'week',
  'you',
  'know',
  'your',
  'stats',
  'and',
  'grow',
  'with'])

In [19]:
train_df['text'] = Text

In [20]:
from shutil import copyfile
copyfile(src = "../input/utils-courseera/emo_utils.py", dst = "../working/emo_utils.py")
from emo_utils import *

In [21]:
#importing a word_to_vec_map function given by DeepLearning.ai
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('../input/glove6b100dtxt/glove.6B.100d.txt')

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_df['text'],train_df['target'], stratify=train_df['target'])

In [23]:
maxLen = 0
for sentence in train_df['text']:
    if maxLen<len(sentence):
        maxLen = len(sentence)

In [24]:
Text[31]

['bbcmtd', 'wholesale', 'markets', 'ablaze']

In [61]:
vocab_size = len(word_to_index) + 1              # adding 1 to fit Keras embedding (requirement)
any_word = list(word_to_vec_map.keys())[0]
emb_dim = word_to_vec_map[any_word].shape[0]    # define dimensionality of your GloVe word vectors (= 50)
# Initialize the embedding matrix as a numpy array of zeros.

emb_matrix = np.zeros([vocab_size,emb_dim])
for word, idx in word_to_index.items():
        emb_matrix[idx, :] = word_to_vec_map[word]

from tensorflow.keras.layers import Embedding
embedding_layer = Embedding(vocab_size, emb_dim ,trainable = False)
# Build the embedding layer, it is required before setting the weights of the embedding layer. 

embedding_layer.build((None,))
# Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
embedding_layer.set_weights([emb_matrix])
    

In [59]:
print("weights[0][1][1] =", embedding_layer.get_weights()[0][1][1])
print("Input_dim", embedding_layer.input_dim)
print("Output_dim",embedding_layer.output_dim)

weights[0][1][1] = 0.49351
Input_dim 400001
Output_dim 100


In [62]:
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Activation,Embedding

from tensorflow.keras.models import Model

def disaster_classifier(input_shape):
    sentence_indices  = Input(shape=input_shape,dtype='int32')
    embeddings = embedding_layer(sentence_indices)

    X = LSTM(128, return_sequences=True)(embeddings)

    X = Dropout(0.6)(X)

    X = LSTM(128, return_sequences=False)(X)

    X = Dropout(0.6)(X)

    X = Dense(1, activation='sigmoid')(X)

    model = Model(inputs=sentence_indices, outputs=X)

    return model

In [66]:
#It should be of shape input_shape and dtype 'int32' (as it contains indices, which are integers).
sentence_indices = Input(shape=(maxLen,),dtype='int32')
# Propagate sentence_indices through your embedding layer
embeddings = embedding_layer(sentence_indices) 
model = disaster_classifier((maxLen,))
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_18 (InputLayer)        [(None, 31)]              0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 31, 100)           40000100  
_________________________________________________________________
lstm_14 (LSTM)               (None, 31, 128)           117248    
_________________________________________________________________
dropout_14 (Dropout)         (None, 31, 128)           0         
_________________________________________________________________
lstm_15 (LSTM)               (None, 128)               131584    
_________________________________________________________________
dropout_15 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 129 

In [53]:
m = train_df['text'].shape[0] 
X_indices = np.zeros([m,maxLen])
for i in range(m): 
    j = 0
    for w in train_df['text'][i]:
                # if w exists in the word_to_index dictionary
                if w in word_to_index:
                    # Set the (i,j)th entry of X_indices to the index of the correct word.
                    X_indices[i, j] = word_to_index[w]
                    # Increment j to j + 1
                    j =  j+1
            

In [69]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_indices, train_df['target'], test_size=0.33, random_state=42)

In [70]:
X_train.shape,X_test.shape

((5100, 31), (2513, 31))

In [54]:
m

7613

In [55]:
X_indices.shape[0]

7613

In [71]:
model.fit(X_train, y_train, epochs = 25, batch_size = 32, shuffle=True)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f6446265650>

In [118]:
from sklearn.metrics import classification_report
print(classification_report(model.predict(X_test)>0.5, y_test))

              precision    recall  f1-score   support

       False       0.89      0.89      0.89      1455
        True       0.85      0.86      0.85      1058

    accuracy                           0.88      2513
   macro avg       0.87      0.87      0.87      2513
weighted avg       0.88      0.88      0.88      2513



In [72]:
loss, acc = model.evaluate(X_test, y_test)



It gives 0.87 dev set accuracy which is reasonable.

In [73]:
test_df = pd.read_csv('../input/nlp-getting-started/test.csv')

In [74]:
Text_test = test_df.text.str.split()

In [75]:
#Removing the words starting withand http
for sentence in Text_test:
    for word in sentence:
        if re.search("http*", word):
            sentence.remove(word)

In [77]:
for i in range(len(Text_test)):
    for j in range(len(Text_test[i])):
        Text_test[i][j] = re.sub(r'[0-9]',r'',Text_test[i][j])

In [79]:
for i in range(len(Text_test)):
    for j in range(len(Text_test[i])):
        Text_test[i][j] = re.sub(r"[?|$|:|'|@|#|=|>|<|.|!]",r'',Text_test[i][j])
        Text_test[i][j] = Text_test[i][j].lower()

In [81]:
#Removing the empty strings
for sentence in Text_test:
    for word in sentence:
        if word == '':
            sentence.remove(word)

Taking 3 random sentences

In [82]:
Text_test.sample(3)

682     [the, chemical, brothers, to, play, the, armor...
2965    [rt, mme_austin, why, marijuana, is, critical,...
543     [im, security, so, they, want, me, to, help, o...
Name: text, dtype: object

In [83]:
test_df['text'] = Text_test

maxLen should remains the same as same number of words are assumed in train and test. But it should not be more than assumed previously

In [84]:
maxLen1 = 0
for sentence in test_df['text']:
    if maxLen1<len(sentence):
        maxLen1 = len(sentence)

In [85]:
maxLen1

31

It is also same. We can continue with our assumption

In [86]:
m1 = test_df['text'].shape[0] 

X_indices_test = np.zeros([m1,maxLen])
for i in range(m1): 
    j = 0
    for w in test_df['text'][i]:
                # if w exists in the word_to_index dictionary
                if w in word_to_index:
                    # Set the (i,j)th entry of X_indices to the index of the correct word.
                    X_indices_test[i, j] = word_to_index[w]
                    # Increment j to j + 1
                    j =  j+1
            

In [119]:
submission = model.predict(X_indices_test)

In [120]:
submission

array([[0.96946204],
       [0.9305117 ],
       [0.47127345],
       ...,
       [0.9959446 ],
       [0.96757144],
       [0.07990125]], dtype=float32)

In [121]:
submission>0.5

array([[ True],
       [ True],
       [False],
       ...,
       [ True],
       [ True],
       [False]])

In [116]:
test_df['text'][3261]

0               [just, happened, a, terrible, car, crash]
1       [heard, about, earthquake, is, different, citi...
2       [there, is, a, forest, fire, at, spot, pond,, ...
3              [apocalypse, lighting, spokane, wildfires]
4       [typhoon, soudelor, kills, in, china, and, tai...
                              ...                        
3258    [earthquake, safety, los, angeles, ûò, safety...
3259    [storm, in, ri, worse, than, last, hurricane, ...
3260               [green, line, derailment, in, chicago]
3261    [meg, issues, hazardous, weather, outlook, (hwo)]
3262    [cityofcalgary, has, activated, its, municipal...
Name: text, Length: 3263, dtype: object

In [112]:
submission_df = pd.DataFrame(test_df["id"], columns=["id"])
submission_df["target"] = submission

In [113]:
submission_df.to_csv('submission.csv', header=True, index=False)