In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#import numpy as np # linear algebra
#import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**1. DATA DESCRIPTION**

Goal - To build an ML model that classifies whether a tweet is about a real disaster or not.

Data - 10,000 tweets from twitter that are already pre-tagged.


**2. EDA**

2.1 Let us first load the given data.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from sklearn import model_selection
from IPython.display import display
from collections import defaultdict
from collections import  Counter

# plotting
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.notebook import tqdm_notebook

from wordcloud import STOPWORDS, WordCloud
from termcolor import colored


import warnings
warnings.filterwarnings(action="ignore")
tf.__version__

In [None]:
# Read the competition's train and test splits from the "../input/" directory.
train_csv = "../input/nlp-getting-started/train.csv"
test_csv = "../input/nlp-getting-started/test.csv"
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

# Preview the first rows of the training split.
df_train.head()

In [None]:
# Number of rows in each split.
print("Length of train data", len(df_train))
print("Length of test data", len(df_test))

**2.2 Target column distribution**

In [None]:
# Class balance of the target column, shown as a pie chart and as a bar chart.
# Labels, percentages and colours are derived from the data and attached by class
# value, so a slice can never be paired with the wrong label (value_counts() orders
# by count, which is why a fixed label list is fragile).
class_names = {0: "Not a Disaster", 1: "Disaster"}
class_colors = {0: "lightskyblue", 1: "lightcoral"}

target_counts = df_train["target"].value_counts().sort_index()
target_share = target_counts / target_counts.sum()
pie_labels = [
    f"{class_names.get(cls, str(cls))}({share:.0%})"
    for cls, share in target_share.items()
]
pie_colors = [class_colors.get(cls, "lightgrey") for cls in target_counts.index]

fig, (ax_pie, ax_bar) = plt.subplots(1, 2, figsize=(15, 5))
target_counts.plot(kind="pie",
                   ax=ax_pie,
                   labels=pie_labels,
                   colors=pie_colors,
                   fontsize=14,
                   ylabel="");

sns.countplot(x="target", data=df_train, palette="RdBu", ax=ax_bar)
plt.show()

**2.3 Word cloud for each target value**

In [None]:
# Collect the raw tweet texts of each class (target 1 = disaster, target 0 = not a disaster).
# Stop words are NOT removed here; that happens in a later cell.
disaster = df_train.loc[df_train["target"] == 1, "text"].tolist()
non_disaster = df_train.loc[df_train["target"] == 0, "text"].tolist()

# Show the size and a small sample of each list instead of printing every tweet.
print("disaster", len(disaster), disaster[:5])
print("Non disaster", len(non_disaster), non_disaster[:5])

In [None]:
def Convert(string):
    """Lower-case a text and split it into a list of words.

    Splitting happens on any run of whitespace, so consecutive spaces, tabs
    and newlines never produce empty-string "words" or tokens that are still
    glued together by a line break.

    Args:
        string: the text to split.

    Returns:
        list[str]: the lower-cased words in their original order; an empty
        list for empty or whitespace-only input.
    """
    return string.lower().split()

In [None]:
# One frame per class: the raw tweet text plus its lower-cased word list.
disaster_df = pd.DataFrame({"text": disaster})
disaster_df["List of Words"] = disaster_df["text"].map(Convert)

non_disaster_df = pd.DataFrame({"text": non_disaster})
non_disaster_df["List of Words"] = non_disaster_df["text"].map(Convert)

In [None]:
### Find the words for disaster and non-disaster after removing the stop words#####
import nltk

stop_words = nltk.corpus.stopwords.words("english")
# Frozen-set copy for constant-time membership tests; `stop_words` itself stays a
# list because later cells use that name.
_stop_word_lookup = frozenset(stop_words)


def _words_without_stopwords(word_lists):
    """Flatten an iterable of word lists into one list, dropping stop words.

    Args:
        word_lists: iterable whose items are lists of words.

    Returns:
        list[str]: every non-stop-word, in encounter order (duplicates kept).
    """
    return [
        word
        for word_list in word_lists
        for word in word_list
        if word not in _stop_word_lookup
    ]


disaster_words = disaster_df["List of Words"]
disaster_allwords = _words_without_stopwords(disaster_words)

non_disaster_words = non_disaster_df["List of Words"]
non_disaster_allwords = _words_without_stopwords(non_disaster_words)

In [None]:
# Print how many words were collected and a short sample, rather than every word.
print("Disaster Words", len(disaster_allwords), disaster_allwords[:20])
print("Non-Disaster Words", len(non_disaster_allwords), non_disaster_allwords[:20])

In [None]:
# Let us build the word cloud for the disaster tweets from their 2500 most common words.
from nltk.probability import FreqDist

mostcommon_disaster = FreqDist(disaster_allwords).most_common(2500)
# Hand the word -> count mapping to WordCloud directly so that word size reflects
# how often a word occurs. Rendering str(mostcommon_disaster) instead would make
# WordCloud re-tokenise the printed list, in which each listed word appears only
# once and its count is lost. Blank tokens are skipped because they cannot be drawn.
disaster_frequencies = {word: count for word, count in mostcommon_disaster if word.strip()}
wordcloud = WordCloud(width=1800, height=1000, background_color='white').generate_from_frequencies(disaster_frequencies)
fig = plt.figure(figsize=(30,10), facecolor='white')
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('Most used words for Disaster ', fontsize=50)
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Word cloud for the NON-disaster tweets: count non_disaster_allwords here
# (counting disaster_allwords would just redraw the disaster cloud).
from nltk.probability import FreqDist

mostcommon_nondisaster = FreqDist(non_disaster_allwords).most_common(2500)
# Pass the word -> count mapping directly so that word size reflects frequency;
# blank tokens are skipped because they cannot be drawn.
nondisaster_frequencies = {word: count for word, count in mostcommon_nondisaster if word.strip()}
wordcloud = WordCloud(width=1800, height=1000, background_color='white').generate_from_frequencies(nondisaster_frequencies)
fig = plt.figure(figsize=(30,10), facecolor='white')
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('Most used words for Non - Disaster ', fontsize=50)
plt.tight_layout(pad=0)
plt.show()

From the above word clouds, we can see that the data needs to be cleaned, e.g. by removing punctuation, URLs, numbers, emoticons etc., as they will not be helpful.

**3. DATA CLEANING**

In [None]:
# The patterns are compiled once, not on every call, and every regex is a raw
# string (a plain '\w*\d\w*' literal relies on invalid escape sequences).
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_X89_RE = re.compile(r'\x89\S+')
_DIGIT_WORD_RE = re.compile(r'\w*\d\w*')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_EMOJI_RE = re.compile("["
                       u"\U0001F600-\U0001F64F"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       u"\U00002500-\U00002BEF"
                       u"\U00002702-\U000027B0"
                       u"\U000024C2-\U0001F251"  # very wide range: also spans many non-emoji code points
                       u"\U0001f926-\U0001f937"
                       u"\U00010000-\U0010ffff"
                       u"\u2640-\u2642"
                       u"\u2600-\u2B55"
                       u"\u200d"
                       u"\u23cf"
                       u"\u23e9"
                       u"\u231a"
                       u"\ufe0f"
                       u"\u3030"
                       "]+", flags=re.UNICODE)


def clean_dataset(text):
    """Normalise one tweet for modelling.

    Steps, in order: lower-case; drop URLs (http(s)://... and www....); drop
    <...> tags; replace runs that start with the character \\x89 by a space;
    drop every word that contains a digit; drop everything that is neither a
    word character nor whitespace; drop the code points matched by the emoji
    pattern. Whitespace is not collapsed, so removed tokens can leave several
    consecutive spaces behind.

    Args:
        text (str): the raw tweet.

    Returns:
        str: the cleaned, lower-cased text.
    """
    text = text.lower()
    text = _URL_RE.sub('', text)         # websites
    text = _HTML_TAG_RE.sub('', text)    # <...> tags
    text = _X89_RE.sub(' ', text)        # strings starting with \x89
    text = _DIGIT_WORD_RE.sub('', text)  # whole words containing a digit, not only the digits
    text = _PUNCTUATION_RE.sub('', text)
    return _EMOJI_RE.sub(r'', text)


**4. DATA PREPARATION**

Let us now clean and prepare our training data.

In [None]:
# Clean every training tweet; this overwrites the raw text column of df_train.
df_train['text'] = df_train['text'].map(clean_dataset)

**4.1 EMBEDDING THE DATASET**

Let us first create a corpus before embedding our text by breaking the tweets into individual words and then removing the stop words from them.

In [None]:
# Work on an explicit copy so that adding the word-list column never touches
# df_train (a frame built with pd.DataFrame(df_train) may share its data with
# df_train, depending on the pandas version).
cleaned_df = df_train.copy()
cleaned_df["List of Words"] = cleaned_df["text"].map(Convert)


In [None]:

# Build the corpus with ONE entry per tweet (stop words removed), so that the
# tokenised and padded sequences stay row-aligned with cleaned_df and its targets.
# A flat list of single words would give one "sequence" per word instead of
# one per tweet.
cleaned_words = cleaned_df["List of Words"]
_corpus_stop_words = set(stop_words)
corpus = [
    " ".join(word for word in wordlist if word and word not in _corpus_stop_words)
    for wordlist in cleaned_words
]

Let us embed the dataset using GloVe vectors. To add them, use "Add data" in the top right and add the dataset from the URL "https://www.kaggle.com/rtatman/glove-global-vectors-for-word-representation".

In [None]:
# Load the pre-trained GloVe vectors into a {word: vector} lookup.
# Each line is split on whitespace: the first field is the word, the remaining
# fields are the vector components.
embedding_dict = {}
# The encoding is given explicitly because the default of open() depends on the
# platform (assumes the GloVe file is UTF-8 text — TODO confirm for this copy).
with open('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors
# No explicit f.close() needed: the with-block closes the file on exit.

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Length of every padded sequence.
MAX_LEN = 50

# Learn the vocabulary from the corpus, then map each corpus entry to integer ids.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences = tokenizer_obj.texts_to_sequences(corpus)

# Pad and truncate at the end so that every row holds exactly MAX_LEN ids.
tweet_pad = pad_sequences(
    sequences,
    maxlen=MAX_LEN,
    padding='post',
    truncating='post',
)

In [None]:
# Vocabulary learnt by the tokenizer: word -> integer id.
word_index = tokenizer_obj.word_index
vocabulary_size = len(word_index)
print('Number of unique words:', vocabulary_size)

In [None]:
from tqdm import tqdm

# Width of one GloVe vector (the file loaded earlier is glove.6B.100d).
EMBEDDING_DIM = 100

# One row per word id plus row 0, which the Keras Tokenizer never assigns to a word.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in tqdm(word_index.items()):
    # Valid rows are 0 .. num_words - 1, so an id equal to num_words must be skipped too.
    if i >= num_words:
        continue

    emb_vec = embedding_dict.get(word)
    # Words without a GloVe vector keep their all-zero row.
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec

**5. MODEL BUILDING**

Let us build an LSTM model. 

MODEL ARCHITECTURE :

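Our model consists of the following layers:
* one embedding layer (initialised with the GloVe vectors),
* one bidirectional LSTM layer,
* a flatten layer and a small dense (ReLU) layer followed by dropout, and
* one softmax output layer.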

In [None]:
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D,Bidirectional,Flatten,Dropout
from keras.initializers import Constant
from tensorflow.keras.optimizers import Adam

# Layer stack: embedding initialised from embedding_matrix -> bidirectional LSTM
# -> flatten -> dense (ReLU) -> dropout -> 2-unit softmax output.
model = Sequential([
    Embedding(num_words, 100, input_length=MAX_LEN, weights=[embedding_matrix]),
    Bidirectional(LSTM(12, dropout=0.2, return_sequences=False)),
    Flatten(),
    Dense(10, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])
model.summary()


In [None]:
# Adam with a fixed learning rate of 1e-5; accuracy is tracked during training.
optimzer = Adam(learning_rate=1e-5)
model.compile(
    optimizer=optimzer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

Let us split the data set into train and validation sets and use them for model training.

In [None]:
#epochs = 50
#batch_size = 32
#history = model.fit(x=after_padding_sequence,y=y,validation_split=0.2,epochs=epochs,batch_size=batch_size,verbose=1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

In [None]:
#print(type(df_train['target'].values))


In [None]:
# Number of training tweets.
print(len(df_train))

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so that the train / validation split is identical on every run.
SPLIT_SEED = 42

train = tweet_pad[:cleaned_df.shape[0]]
targets = cleaned_df['target'].values
print(train.shape)
print(targets.shape)

# Stratify on the target so that both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    targets,
    test_size=0.15,
    random_state=SPLIT_SEED,
    stratify=targets,
)
print('Shape of train', X_train.shape)
print("Shape of Validation ", X_test.shape)

In [None]:
# Train for 15 epochs with the held-out split passed as validation data.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    batch_size=4,
    verbose=2,
)

**6. INFERENCE**

In [None]:
import gc
gc.collect()

# Loss per epoch on the training and validation data.
fig, ax = plt.subplots(figsize=(20, 5))
for history_key, curve_label in (('loss', "train"), ('val_loss', "Validation")):
    ax.plot(history.history[history_key], '-o', label=curve_label)
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Loss change over epoch for Model")
ax.legend()
ax.grid()

In [None]:
import gc
gc.collect()

# Accuracy per epoch on the training and validation data.
fig, ax = plt.subplots(figsize=(20, 5))
for history_key, curve_label in (('accuracy', "train"), ('val_accuracy', "Validation")):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy change over epoch")
ax.legend()

In [None]:
# Prepare the TEST tweets the same way as the training tweets (clean -> split ->
# drop stop words -> tokenise -> pad) and predict on them. Slicing tweet_pad here
# would feed the model rows built from the training data instead of the test set.
cleaned_df_test = df_test.copy()
cleaned_df_test['text'] = cleaned_df_test['text'].apply(clean_dataset)

_test_stop_words = set(stop_words)
test_texts = [
    " ".join(word for word in Convert(text) if word and word not in _test_stop_words)
    for text in cleaned_df_test['text']
]
# Words the tokenizer has never seen are dropped by texts_to_sequences.
test_sequences = tokenizer_obj.texts_to_sequences(test_texts)
test_padded = pad_sequences(test_sequences, maxlen=MAX_LEN, truncating='post', padding='post')
pred = model.predict(test_padded)

In [None]:
# Predicted class = index of the largest output for each row of `pred`.
preds = [np.argmax(class_scores) for class_scores in pred]

In [None]:
# Submission frame: the test ids next to the predicted class.
submission_csv = pd.DataFrame({
    'id': cleaned_df_test['id'],
    'target': preds,
})

In [None]:
# Write the submission file without the index column.
submission_csv.to_csv('/kaggle/working/submission.csv', index=False)

In [None]:
# Preview the first 25 rows of the submission.
submission_csv.head(n=25)

**FUTURE DEVELOPMENTS**

The accuracy of the model can be further increased by :

* Increasing the number of epochs
* Adding more data cleaning steps like spell check etc
* Using different word embeddings or advanced models like BERT, Auto-encoders etc.
* We can also increase the number of memory cells in the LSTM network, try different optimisers (e.g. RMSprop or SGD) or activation functions (e.g. PReLU), or change the dropout rate.