In [1]:
import os
import re
import string
import numpy as np 
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib
from matplotlib import patches as mpatches
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from tensorflow.keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
%matplotlib inline

# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [2]:
df_train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
df_test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
df_train.head()

In [3]:
df_test.head()

In [4]:
df_train.shape, df_test.shape

# Exploratory Analysis

In [5]:
df_train.describe(include='object')

In [6]:
df_train.target.value_counts()

In [7]:
df_train.target.value_counts(normalize=True)

In [8]:
df_train.location.value_counts()

In [9]:
df_train.keyword.value_counts()

In [10]:
df_train['chars_len'] = df_train.text.apply(len)

In [11]:
df_train[['text','chars_len']].head(20)

In [12]:
df_train.text.str.split()

In [13]:
df_train['words_len'] = df_train.text.str.split().apply(len)
df_train[['text','words_len']].head()

In [14]:
sns.displot(df_train.words_len)

In [15]:
sns.displot(df_train[df_train.target==1].words_len)

In [16]:
sns.displot(df_train[df_train.target==0].words_len)

In [17]:
sns.displot(df_train.chars_len)

In [18]:
sns.displot(df_train[df_train.target==1].chars_len)

In [19]:
sns.displot(df_train[df_train.target==0].chars_len)

In [20]:
df_train[df_train.target==0].describe()

In [21]:
df_train[df_train.target==1].describe()

We can't see a relevant difference in the distribution of the word counts, but looking at the mean and median of the text length we see a small tendency for real disaster tweets to have more characters.


Let's clean the data so we can do more stuff.

In [22]:
# Translation table mapping every ASCII punctuation character to a space.
_PUNCTUATION_TO_SPACE = str.maketrans(dict.fromkeys(string.punctuation, ' '))


def remove_punctuation(text):
    """Replace every ASCII punctuation character in ``text`` with a space.

    Parameters
    ----------
    text : str or any
        Value to clean. Non-string values (for example the NaN that pandas
        uses for missing cells) are returned unchanged, so the function can be
        passed to ``Series.apply`` on columns that contain missing data.

    Returns
    -------
    str or any
        The cleaned string, or the input itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    # A single pass over the string instead of one ``str.replace`` scan per
    # punctuation character encountered.
    return text.translate(_PUNCTUATION_TO_SPACE)

In [23]:
def preprocess_series(series):
    """Clean a Series of strings: drop URLs and @mentions, blank out symbols, lowercase.

    The substitutions are applied in order:

    1. remove ``http...`` up to the next whitespace,
    2. remove any leftover literal ``http``,
    3. remove ``@`` followed by non-whitespace characters,
    4. replace every character outside the allowed set with a space,
    5. spell out any remaining ``@`` as ``at``.

    Parameters
    ----------
    series : pandas.Series
        Series of strings; missing values are propagated by the ``.str``
        accessor.

    Returns
    -------
    pandas.Series
        The cleaned, lower-cased Series.
    """
    substitutions = (
        (r"http\S+", ""),
        (r"http", ""),
        (r"@\S+", ""),
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),
        (r"@", "at"),
    )
    for pattern, replacement in substitutions:
        # regex=True must be explicit: since pandas 2.0 ``str.replace`` treats
        # the pattern as a literal string by default, which would silently
        # disable every substitution above.
        series = series.str.replace(pattern, replacement, regex=True)
    return series.str.lower()

In [24]:
# Strip URLs and @mentions first: their regexes need the original punctuation
# ("://", ".", "@") to match. Only afterwards replace whatever punctuation is
# left with spaces. Doing it the other way round leaves URL fragments and
# user names behind as ordinary words.
for col in ('location', 'keyword', 'text'):
    df_train[col + '_cln'] = preprocess_series(df_train[col]).apply(remove_punctuation)

In [25]:
# Keep only the rows whose cleaned keyword is among the first 20 entries of
# value_counts(), so the count plot stays readable.
keywords = df_train[df_train.keyword_cln.isin(df_train.keyword_cln.value_counts()[:20].index)]
plt.figure(figsize=(18,10))
sns.countplot(x='keyword_cln', data=keywords, hue='target')
plt.xticks(rotation=45)
plt.show()

In [26]:
# Keep only the rows whose cleaned location is among the first 20 entries of
# value_counts(), so the count plot stays readable.
location = df_train[df_train.location_cln.isin(df_train.location_cln.value_counts()[:20].index)]
plt.figure(figsize=(18,10))
sns.countplot(x='location_cln', data=location, hue='target')
plt.xticks(rotation=45)
plt.show()

In [27]:
df_train[df_train.target == 1].location_cln.value_counts()[:20].plot(kind='bar')

In [28]:
df_train[df_train.target == 0].location_cln.value_counts()[:20].plot(kind='bar')

In [29]:
df_train[df_train.target == 1].keyword_cln.value_counts()[:20].plot(kind='bar')

In [30]:
df_train[df_train.target == 0].keyword_cln.value_counts()[:20].plot(kind='bar')

In [31]:
def preprocess_text(text):
    """Clean a single string: drop URLs and @mentions, blank out symbols, lowercase.

    The (pattern, replacement) pairs are applied in the order listed, and the
    result is lower-cased at the end.
    """
    substitutions = [
        (r"http\S+", ""),
        (r"http", ""),
        (r"@\S+", ""),
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),
        (r"@", "at"),
    ]
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.lower()

In [32]:
#Creating a corpus with frequencies
stopwords = set(STOPWORDS)
def create_corpus_freq(series, stop_words=None):
    """Count how often each non-stopword token occurs in a Series of texts.

    Parameters
    ----------
    series : pandas.Series
        Series of strings. Each entry is split on whitespace; missing entries
        are skipped instead of raising while iterating.
    stop_words : collection of str, optional
        Tokens to ignore. Defaults to the module-level ``stopwords`` set.

    Returns
    -------
    dict
        ``{token: count}``, with keys in first-seen order.
    """
    if stop_words is None:
        stop_words = stopwords
    freq = {}
    for tokens in series.dropna().str.split():
        for word in tokens:
            if word not in stop_words:
                freq[word] = freq.get(word, 0) + 1
    return freq

In [33]:
df_train.text_cln.value_counts(dropna=False)

In [34]:
# One frequency table for the whole corpus and one per class. The per-class
# tables must be built from the rows of that class only, otherwise the three
# word clouds drawn from them are identical.
full_freq = create_corpus_freq(df_train.text_cln)
target1_freq = create_corpus_freq(df_train.loc[df_train.target == 1, 'text_cln'])
target0_freq = create_corpus_freq(df_train.loc[df_train.target == 0, 'text_cln'])

In [35]:
    
plt.figure(figsize=(10,8))
wordcloud = WordCloud(width=600, height=400, ).generate_from_frequencies(frequencies=full_freq)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [36]:

plt.figure(figsize=(10,8))
wordcloud = WordCloud(width=600, height=400,).generate_from_frequencies(frequencies=target1_freq)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [37]:
plt.figure(figsize=(10,8))
wordcloud = WordCloud(width=600, height=400,).generate_from_frequencies(frequencies=target0_freq)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [38]:
top = sorted(full_freq.items(), key=lambda item: item[1], reverse=True)[:20]
x , y =zip(*top)
print(x)
print(y)

In [39]:
plt.figure(figsize=(10,8))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments (the first positional slot is `data`).
sns.barplot(x=np.array(x), y=np.array(y))
plt.xticks(rotation=45)
plt.show()

Let's remove the purely numeric tokens to see whether other words show up.

In [40]:
full_freq_no_digits = {k: v for k, v in full_freq.items() if not k.isdigit()}

In [41]:
top = sorted(full_freq_no_digits.items(), key=lambda item: item[1], reverse=True)[:20]
x, y = zip(*top)
print(x)
print(y)

In [42]:
plt.figure(figsize=(10,8))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments (the first positional slot is `data`).
sns.barplot(x=np.array(x), y=np.array(y))
plt.xticks(rotation=45)
plt.show()

In [43]:
def cv(data):
    """Fit a ``CountVectorizer`` on ``data``.

    Returns a 2-tuple: the result of ``fit_transform(data)`` and the fitted
    vectorizer, so the same vectorizer can later be used to transform other
    texts.
    """
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(data), vectorizer

# Features are the cleaned tweet texts, labels are the `target` column.
corpus = df_train["text_cln"]
target = df_train["target"]

# Hold out 20% of the rows; random_state fixes the split between runs.
X_train, X_test, y_train, y_test = train_test_split(corpus, target, 
                                                    test_size=0.2, random_state=13)

# Fit the vectorizer on the training texts only, then reuse it for the
# held-out texts.
X_train_counts, count_vectorizer = cv(X_train)
X_test_counts = count_vectorizer.transform(X_test)

In [44]:
def plot_LSA(test_data, test_labels, plot=True):
    """Project ``test_data`` onto two components with TruncatedSVD and scatter-plot it.

    Parameters
    ----------
    test_data : array-like or sparse matrix
        Feature matrix accepted by ``TruncatedSVD`` (here the CountVectorizer
        output).
    test_labels : array-like
        One label per row of ``test_data``, used to colour the points.
        The legend assumes the lower label is "Irrelevant" and the higher
        one "Disaster".
    plot : bool, default True
        When False the projection is computed but nothing is drawn.

    Returns
    -------
    None
        The points are drawn on the current matplotlib figure.
    """
    lsa_scores = TruncatedSVD(n_components=2).fit_transform(test_data)
    if not plot:
        return
    # Two classes, so two colours are enough; the colormap is stretched over
    # the range of label values.
    palette = ['orange', 'blue']
    plt.scatter(lsa_scores[:, 0], lsa_scores[:, 1], s=8, alpha=.8,
                c=test_labels, cmap=matplotlib.colors.ListedColormap(palette))
    irrelevant_patch = mpatches.Patch(color='orange', label='Irrelevant')
    disaster_patch = mpatches.Patch(color='blue', label='Disaster')
    plt.legend(handles=[irrelevant_patch, disaster_patch], prop={'size': 30})


# Draw the 2-component projection of the training bag-of-words matrix,
# coloured by label.
fig = plt.figure(figsize=(16, 16))          
plot_LSA(X_train_counts, y_train)
plt.show()

Plotting the bag-of-words projection shows that our two classes lie very close to each other in this space, so they will probably be difficult to separate.

In [45]:
# Read the GloVe text file into a {word: vector} lookup. Each line holds a
# token followed by its vector components, separated by whitespace.
embedding_dict = {}
# The encoding is given explicitly so the read does not depend on the
# platform default. The `with` block closes the file, so no f.close() is
# needed.
with open('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt',
          'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors

In [46]:
# Fit the tokenizer vocabulary on the training texts, then encode each
# training tweet as a list of integer word ids.
tk = Tokenizer()
tk.fit_on_texts(X_train)
sequences = tk.texts_to_sequences(X_train)
sequences[:5]

Let's pad the sequences so we can construct a matrix.

In [47]:
MAX_LEN = 31 #max of words in a tweet in words_len.describe
# Bring every sequence to exactly MAX_LEN ids; both padding and truncation
# are applied at the end of the sequence ('post').
tweet_pad = pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',
                          padding='post')

In [48]:
# Encode the held-out texts with the tokenizer exactly as it was fitted on the
# training split. Re-fitting on X_test would leak test vocabulary and rebuild
# `word_index` by frequency, so the already-encoded training sequences would
# no longer point at the same words (or embedding rows).
seq_test = tk.texts_to_sequences(X_test)
tweet_pad_test = pad_sequences(seq_test, maxlen=MAX_LEN, truncating='post',
                               padding='post')

In [49]:
tweet_pad.shape

In [50]:
df_train.shape

In [51]:
len(X_train)

Constructing a new matrix using the information from GloVe about our words 

In [52]:
word_index = tk.word_index

# One row per vocabulary entry plus one extra row for index 0, which the
# tokenizer never assigns to a word.
num_words = len(word_index)+1
embedding_matrix = np.zeros((num_words,100)) #using GloVe with 100 dim

# Tokenizer ids run from 1 to len(word_index), so every id is a valid row and
# no bounds check is needed. Words missing from GloVe keep their all-zero row.
for word, i in tqdm(word_index.items()):
    emb_vec = embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec

In [53]:
# Embedding initialised from the GloVe matrix and frozen (trainable=False).
embedding = Embedding(num_words, 100,
                      embeddings_initializer=Constant(embedding_matrix),
                      input_length=MAX_LEN, trainable=False)

# Embedding -> spatial dropout -> LSTM -> small dense layer -> sigmoid output.
model = Sequential([
    embedding,
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
])

optimzer = Adam(learning_rate=1e-5)
model.compile(loss='binary_crossentropy', optimizer=optimzer, metrics=['accuracy'])

In [54]:
# Carve a validation set out of the padded training sequences. random_state
# makes the split reproducible between runs.
# NOTE: this rebinds X_train / y_train, so the cell cannot be re-run on its
# own; re-run the earlier train/test split cell first.
X_train, X_valid, y_train, y_valid = train_test_split(tweet_pad, y_train, test_size=0.15,
                                                      random_state=13)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_valid.shape)

In [55]:
# Train for 10 epochs in mini-batches of 4, reporting validation metrics on
# (X_valid, y_valid) after each epoch; `history` keeps the returned object.
history = model.fit(X_train, y_train, batch_size=4, epochs=10, 
                    validation_data=(X_valid, y_valid), verbose=2)

In [56]:
# Score the model on the padded held-out sequences; the model was compiled
# with metrics=['accuracy'], so evaluate returns loss and accuracy.
loss,acc = model.evaluate(tweet_pad_test, y_test, )
print('Results on test vaues: \nLoss:{:.2f} \nAcc:{:.2f}'.format(loss,acc))

With this model we correctly classify around 60% of the tweets.