In [1]:
import pandas as pd
import numpy as np 

In [2]:
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [3]:
train_id = train['id']
test_id = test['id']

In [4]:
train.drop(columns = ['id'], inplace = True)
test.drop(columns = ['id'], inplace = True)

Location column has a lot of missing values and keyword column contains information that is present in the text column and we will be exracting that information down the line from the text column

In [5]:
train.isnull().sum()

keyword       61
location    2533
text           0
target         0
dtype: int64

In [6]:
test.isnull().sum()

keyword       26
location    1105
text           0
dtype: int64

In [7]:
train.drop(columns = ['keyword','location'], inplace = True)
test.drop(columns = ['keyword','location'], inplace = True)

In [8]:
train.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


## Data Cleaning

In [9]:
# Converting all text to lowercase
train['text'] = [t.lower() for t in train['text']]
test['text'] = [t.lower() for t in test['text']]

In [10]:
# Removing punctuations
import re
import string
train['text'] = [re.sub('[%s]' % re.escape(string.punctuation), '', i) for i in train['text']]
test['text'] = [re.sub('[%s]' % re.escape(string.punctuation), '', i) for i in test['text']]

In [11]:
# Removing numeric characters
train['text'] = [re.sub('\d','',n) for n in train['text']]
test['text'] = [re.sub('\d','',n) for n in test['text']]

## Preprocessing Text Data

### Tokenization and Stop Words

In [12]:
import nltk
from nltk.tokenize import word_tokenize

In [13]:
# Word Tokenization

train['text'] = [word_tokenize(i) for i in train['text']]
test['text'] = [word_tokenize(i) for i in test['text']]

In [14]:
train['text'].head()

0    [our, deeds, are, the, reason, of, this, earth...
1        [forest, fire, near, la, ronge, sask, canada]
2    [all, residents, asked, to, shelter, in, place...
3    [people, receive, wildfires, evacuation, order...
4    [just, got, sent, this, photo, from, ruby, ala...
Name: text, dtype: object

In [15]:
# Stop Words Removal

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
train['text'] = [[i for i in j if not i in stop_words] for j in train['text']]
test['text'] = [[i for i in j if not i in stop_words] for j in test['text']]

In [16]:
train.head()

Unnamed: 0,text,target
0,"[deeds, reason, earthquake, may, allah, forgiv...",1
1,"[forest, fire, near, la, ronge, sask, canada]",1
2,"[residents, asked, shelter, place, notified, o...",1
3,"[people, receive, wildfires, evacuation, order...",1
4,"[got, sent, photo, ruby, alaska, smoke, wildfi...",1


### Lemmatization

In [17]:
from collections import defaultdict
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn

tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

tag_map

defaultdict(<function __main__.<lambda>()>, {'J': 'a', 'V': 'v', 'R': 'r'})

In [18]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

train['text'] = [[lemmatizer.lemmatize(word, tag_map[tag[0]]) for word, tag in pos_tag(i)] for i in train['text']]
test['text'] = [[lemmatizer.lemmatize(word, tag_map[tag[0]]) for word, tag in pos_tag(i)] for i in test['text']]

In [20]:
train['lemmatized_text'] = train['text'].apply(lambda x : ' '.join(x))
test['lemmatized_text'] = test['text'].apply(lambda x : ' '.join(x))

In [21]:
train.head()

Unnamed: 0,text,target,lemmatized_text
0,"[deed, reason, earthquake, may, allah, forgive...",1,deed reason earthquake may allah forgive u
1,"[forest, fire, near, la, ronge, sask, canada]",1,forest fire near la ronge sask canada
2,"[resident, ask, shelter, place, notify, office...",1,resident ask shelter place notify officer evac...
3,"[people, receive, wildfire, evacuation, order,...",1,people receive wildfire evacuation order calif...
4,"[get, sent, photo, ruby, alaska, smoke, wildfi...",1,get sent photo ruby alaska smoke wildfires pou...


In [22]:
train.drop(columns = ['text'], inplace = True)
test.drop(columns = ['text'], inplace = True)

### Word Embedding using TF_IDF Vectorizer

In [23]:
# Assigning numbers(vectors) to words in our data
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features = 5000) 

train_emb = tfidf.fit_transform(train['lemmatized_text']).toarray()
test_emb = tfidf.fit_transform(test['lemmatized_text']).toarray()

In [24]:
train_emb.shape[1:]

(5000,)

In [25]:
y = train['target']

## Model Training

### Multinomial Naive Bayes

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [27]:
MNB = MultinomialNB()

Splitting the train set in train and validation set to see how good is Naive Bayes for our data

In [28]:
x_train,x_valid,y_train,y_valid = train_test_split(train_emb,y,test_size = 0.3, random_state = 100) 

In [29]:
MNB.fit(x_train,y_train)
pred_MNB = MNB.predict(x_valid)

In [30]:
print("Accuracy score : {:.2f}".format(accuracy_score(y_valid, pred_MNB)))

Accuracy score : 0.81


In [31]:
print("ROC-AUC score : {:.2f}".format(roc_auc_score(y_valid, pred_MNB)))

ROC-AUC score : 0.79


In [32]:
print(classification_report(y_valid, pred_MNB))

              precision    recall  f1-score   support

           0       0.79      0.91      0.84      1322
           1       0.84      0.67      0.74       962

    accuracy                           0.81      2284
   macro avg       0.81      0.79      0.79      2284
weighted avg       0.81      0.81      0.80      2284



### Model Training

In [33]:
MNB.fit(train_emb,y)

MultinomialNB()

In [34]:
MNB_predictions = MNB.predict(test_emb)

In [35]:
Prediction_results = pd.DataFrame({"target": MNB_predictions}, index = test_id)

In [36]:
#submission_file = Prediction_results.to_csv('submission.csv')

#### Multinomial Naive Bayes got a 0.515 score which is decent but can be significantly improved.

### Support Vector Machines

In [37]:
from sklearn import svm
SVC = svm.SVC()
#SVC.fit(x_train,y_train)
#pred_SVC = SVC.predict(x_valid)

In [38]:
#print("Accuracy score : {:.2f}".format(accuracy_score(y_valid, pred_SVC)))

In [39]:
#print("ROC-AUC score : {:.2f}".format(roc_auc_score(y_valid, pred_SVC)))

## Alternative Approach using Sequencing, Padding, and using RNN(LSTM)

### Sequencing and Sentence Padding

In [40]:
from collections import Counter

# Finding the number of unique word in the corpus
def word_counter(text):
    count = Counter()
    for i in text.values:
        for word in i.split():
            count[word] += 1
    return count

In [41]:
train.head()

Unnamed: 0,target,lemmatized_text
0,1,deed reason earthquake may allah forgive u
1,1,forest fire near la ronge sask canada
2,1,resident ask shelter place notify officer evac...
3,1,people receive wildfire evacuation order calif...
4,1,get sent photo ruby alaska smoke wildfires pou...


In [42]:
train_text = train.lemmatized_text
counter = word_counter(train_text)
counter

Counter({'deed': 2,
         'reason': 31,
         'earthquake': 51,
         'may': 88,
         'allah': 9,
         'forgive': 4,
         'u': 254,
         'forest': 66,
         'fire': 357,
         'near': 55,
         'la': 29,
         'ronge': 1,
         'sask': 1,
         'canada': 13,
         'resident': 8,
         'ask': 28,
         'shelter': 6,
         'place': 35,
         'notify': 1,
         'officer': 37,
         'evacuation': 52,
         'order': 38,
         'expect': 32,
         'people': 199,
         'receive': 3,
         'wildfire': 81,
         'california': 119,
         'get': 435,
         'sent': 5,
         'photo': 65,
         'ruby': 1,
         'alaska': 7,
         'smoke': 53,
         'wildfires': 2,
         'pour': 4,
         'school': 70,
         'rockyfire': 4,
         'update': 53,
         'hwy': 10,
         'close': 32,
         'direction': 15,
         'due': 31,
         'lake': 15,
         'county': 38,
         'cafire

In [43]:
print("Number of unique words in the corpus : {:.2f}".format(len(counter)))

Number of unique words in the corpus : 19694.00


In [44]:
words = len(counter)
# maximum number of words in a sequence
max_length = 20

In [45]:
train_sent = train['lemmatized_text']
train_labels = train['target']
test_sent = test['lemmatized_text']

In [46]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=words)
tokenizer.fit_on_texts(train_sent)

Using TensorFlow backend.


In [47]:
word_index = tokenizer.word_index
word_index

{'get': 1,
 'like': 2,
 'fire': 3,
 'amp': 4,
 'im': 5,
 'go': 6,
 'u': 7,
 'new': 8,
 'via': 9,
 'one': 10,
 'people': 11,
 'say': 12,
 'news': 13,
 'bomb': 14,
 'dont': 15,
 'time': 16,
 'kill': 17,
 'video': 18,
 'make': 19,
 'emergency': 20,
 'come': 21,
 'crash': 22,
 'disaster': 23,
 'flood': 24,
 'burn': 25,
 'body': 26,
 'year': 27,
 'see': 28,
 'attack': 29,
 'look': 30,
 'day': 31,
 'police': 32,
 'home': 33,
 'take': 34,
 'know': 35,
 'family': 36,
 'would': 37,
 'building': 38,
 'still': 39,
 'storm': 40,
 'think': 41,
 'love': 42,
 'back': 43,
 'california': 44,
 'watch': 45,
 'train': 46,
 'suicide': 47,
 'want': 48,
 'world': 49,
 'car': 50,
 'man': 51,
 'life': 52,
 'collapse': 53,
 'bag': 54,
 'death': 55,
 'rt': 56,
 'first': 57,
 'pm': 58,
 'scream': 59,
 'cause': 60,
 'cant': 61,
 'need': 62,
 'nuclear': 63,
 'good': 64,
 'work': 65,
 'war': 66,
 'youtube': 67,
 'two': 68,
 'today': 69,
 'dead': 70,
 'let': 71,
 'accident': 72,
 'wreck': 73,
 'plan': 74,
 'full': 75

In [48]:
train_sequence = tokenizer.texts_to_sequences(train_sent)

In [49]:
train_sequence[0]

[3761, 450, 212, 80, 1337, 2423, 7]

In [50]:
test_sequence = tokenizer.texts_to_sequences(test_sent)

In [68]:
test_sequence[:5]

[[229, 1629, 50, 22],
 [424, 212, 1010, 141, 392, 1712, 201],
 [140, 3, 591, 2906, 2213, 711, 433, 107],
 [374, 270, 6263, 92],
 [447, 652, 17, 400, 1013]]

In [66]:
# Word index contains the index(numeral) assigned to every single word in our data
word_index = tokenizer.word_index
for word, num in word_index.items():
    print(f"{word} -> {num}")
    if num == 15: # words that have been assigned index 1 to 15 (index 0 is reserved)
        break        

get -> 1
like -> 2
fire -> 3
amp -> 4
im -> 5
go -> 6
u -> 7
new -> 8
via -> 9
one -> 10
people -> 11
say -> 12
news -> 13
bomb -> 14
dont -> 15


#### Sequence Padding

In [52]:
from keras.preprocessing.sequence import pad_sequences

train_padded = pad_sequences(train_sequence, maxlen = max_length, padding = "post", truncating = "post")

In [53]:
train_padded

array([[3761,  450,  212, ...,    0,    0,    0],
       [ 140,    3,  190, ...,    0,    0,    0],
       [1470,  515, 1793, ...,    0,    0,    0],
       ...,
       [3515,  446, 1325, ...,    0,    0,    0],
       [  32,  810, 2670, ...,    0,    0,    0],
       [ 128,   33,  405, ...,    0,    0,    0]], dtype=int32)

In [54]:
test_padded = pad_sequences(test_sequence, maxlen = max_length, padding = "post", truncating = "post")

In [55]:
test_padded

array([[ 229, 1629,   50, ...,    0,    0,    0],
       [ 424,  212, 1010, ...,    0,    0,    0],
       [ 140,    3,  591, ...,    0,    0,    0],
       ...,
       [ 762,  544,  296, ...,    0,    0,    0],
       [5134,  170,  382, ...,    0,    0,    0],
       [4871, 2019, 1326, ...,    0,    0,    0]], dtype=int32)

### Model Building using LSTM

In [56]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import Constant
from keras.optimizers import Adam
import tensorflow as tf

def leaky_relu(z, name = None):
    return tf.maximum(0.01*z,z, name = name)

model = Sequential()

model.add(Embedding(words,32,input_length = max_length)) # embedding layer
model.add(LSTM(64, dropout = 0.1)) # RNN layer
model.add(Dense(units = 32 , activation = leaky_relu)) # Dense layer with leaky_relu activation
model.add(Dense(1, activation = tf.nn.elu))

optimizer = Adam(learning_rate = 3e-4)

model.compile(loss = 'binary_crossentropy', optimizer = optimizer, metrics = ['accuracy'])

In [57]:
# Looking at the structure of the network
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 32)            630208    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 32)                2080      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 657,153
Trainable params: 657,153
Non-trainable params: 0
_________________________________________________________________


In [58]:
# training the model over 40 iterations
model.fit(train_padded, train_labels, epochs = 40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x7fe38c3d5710>

In [59]:
import h5py
#model.save('baseline_lstm_model.h5')

In [60]:
from keras.models import load_model
#model = load_model('baseline_lstm_model.h5')

In [61]:
lstm_base_pred = model.predict_classes(test_padded, verbose = 0)

In [62]:
lstm_base_pred = lstm_base_pred.reshape(-1,1).ravel()

In [64]:
Prediction_results_lstm = pd.DataFrame({"target":lstm_base_pred}, index = test_id)
Prediction_results_lstm

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0
2,1
3,1
9,1
11,1
...,...
10861,1
10865,1
10868,1
10874,1


In [65]:
#submission_lstm_elu_leaky_relu = Prediction_results_lstm.to_csv('submission_lstm_elu_leaky_relu.csv')