In [1]:
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import numpy as np

from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

from collections import defaultdict
from collections import Counter

import re
import gensim
import string

import os


In [49]:
# Read the competition's two CSV files; the frames are independent of each other.
test = pd.read_csv("../input/nlp-getting-started/test.csv")
train = pd.read_csv("../input/nlp-getting-started/train.csv")

# Preview the first five rows (head() defaults to n=5).
train.head()

In [3]:
# Report the dimensions of the train frame, then of the test frame.
for frame in (train, test):
    n_rows, n_cols = frame.shape
    print(f"There are {n_rows} rows and {n_cols} columns.")

## Class Distribution 

In [4]:
# Bar chart of how many rows carry each value of `target`.
class_counts = train['target'].value_counts()
ax = sns.barplot(x=class_counts.index, y=class_counts)
ax.set_ylabel("samples")
ax.set_xlabel("No DIsaster vs Disaster")

## EDA

In [5]:
# Character count of every tweet, split by class.
train_len1 = train.loc[train['target'] == 1, 'text'].str.len()
train_len2 = train.loc[train['target'] == 0, 'text'].str.len()

# One histogram per class, side by side.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
panels = (
    (ax1, train_len1, 'red', "Disaster Tweet"),
    (ax2, train_len2, 'green', "Non disaster Tweet"),
)
for axis, lengths, colour, title in panels:
    axis.hist(lengths, color=colour)
    axis.set_title(title)
fig.suptitle("Characters in tweet")
plt.show()

print(train_len1.shape, train_len2.shape)

In [6]:
# Compare the number of whitespace-separated words per tweet for the two classes.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12,6))

# Words per tweet where target == 1.
train_len1 = train[train['target']==1]['text'].str.split().map(len)
ax1.hist(train_len1, color='red')
ax1.set_title("Disaster Tweets")

# Words per tweet where target == 0.
train_len2 = train[train['target']==0]['text'].str.split().map(len)
# The right-hand panel must show the target == 0 series (it previously re-plotted train_len1).
ax2.hist(train_len2, color='green')
ax2.set_title("Non Disaster Tweets")
fig.suptitle("Words in tweet")
plt.show()

In [7]:
# Distribution of the mean word length (in characters) per tweet, one panel per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# For each target == 1 tweet: list of word lengths, then their mean.
train_len1 = train[train['target'] == 1]['text'].str.split().apply(lambda words: [len(w) for w in words])
sns.histplot(train_len1.map(np.mean), ax=ax1, color='red', kde=True)
ax1.set_title('Disaster')

# Same for target == 0 tweets. The right panel previously re-plotted train_len1
# and its title was written onto ax1, overwriting 'Disaster'.
train_len2 = train[train['target'] == 0]['text'].str.split().apply(lambda words: [len(w) for w in words])
sns.histplot(train_len2.map(np.mean), ax=ax2, color='green', kde=True)
ax2.set_title('Non Disaster')
fig.suptitle("Words")

## Creating Corpus

In [8]:
def create_corpus(target):
    """Return a flat list of whitespace-split words from every tweet of one class.

    Reads the module-level ``train`` frame, keeps the rows whose ``target``
    column equals ``target``, splits each ``text`` value on whitespace and
    concatenates the resulting words in row order.
    """
    token_lists = train.loc[train['target'] == target, 'text'].str.split()
    return [token for tokens in token_lists for token in tokens]

In [9]:
# Tally how often each stopword occurs in the target == 0 tweets.
corpus = create_corpus(0)
stopword_counts = Counter(word for word in corpus if word in stop)

# Twenty most frequent stopwords, highest count first.
top = stopword_counts.most_common(20)

x, y = zip(*top)
plt.bar(x, y)

In [10]:
# Same stopword tally as the previous cell, but for the target == 1 tweets.
corpus = create_corpus(1)
stopword_counts = Counter(token for token in corpus if token in stop)

# Keep only the 20 stopwords with the largest counts (descending).
top = stopword_counts.most_common(20)

x, y = zip(*top)
plt.bar(x, y)

## Punctuation

In [11]:
plt.figure(figsize=(12,6))
corpus = create_corpus(1)

punc = string.punctuation
# `punc` is a str, so `in` is a substring test: a token counts when it appears
# as a contiguous run inside string.punctuation.
punct_counts = Counter(token for token in corpus if token in punc)

# Bars appear in first-seen order of each punctuation token.
x, y = zip(*punct_counts.items())
plt.bar(x, y)

In [12]:
plt.figure(figsize=(12,6))
corpus = create_corpus(0)

punc = string.punctuation
# Substring membership against the punctuation string (not per-character set membership).
punct_counts = Counter(tok for tok in corpus if tok in punc)

# Unzip (token, count) pairs into bar labels and heights.
x, y = zip(*punct_counts.items())
plt.bar(x, y)

## Common words

In [13]:
corpus = create_corpus(0)
counter = Counter(corpus)

# All (word, count) pairs, most frequent first.
most = counter.most_common()

# From the 50 most frequent tokens, keep those that are not stopwords.
non_stop_pairs = [(word, count) for word, count in most[:50] if word not in stop]
x = [word for word, _ in non_stop_pairs]
y = [count for _, count in non_stop_pairs]

sns.barplot(x=x, y=y)

In [14]:
def get_top_tweet_bigrams(corpus, n=None):
    """Return the ``n`` most frequent bigrams in ``corpus`` as (bigram, count) pairs.

    A ``CountVectorizer`` restricted to 2-grams is fitted on ``corpus``; the
    per-document counts are summed per bigram and sorted in descending order.
    The shape of the document-term matrix is printed as a side effect.
    With ``n=None`` the full ranking is returned.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    doc_term = vectorizer.transform(corpus)
    print(doc_term.shape)

    # Column-wise totals: one summed count per vocabulary entry.
    totals = doc_term.sum(axis=0)
    ranked = sorted(
        ((bigram, totals[0, column]) for bigram, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

In [15]:
plt.figure(figsize=(12,6))
# Ten most frequent bigrams across all training tweets.
top_tweet_bigrams = get_top_tweet_bigrams(train['text'], n=10)
x, y = (list(column) for column in zip(*top_tweet_bigrams))
sns.barplot(x=x, y=y)

## Data Cleaning

In [16]:
# Stack train on top of test so the cleaning steps run once over all tweets.
frames_to_clean = [train, test]
df = pd.concat(frames_to_clean)
print(df.shape)

## Removing URLs

In [17]:
# Sample string containing a URL, used to demonstrate remove_url.
example="My github profile :https://www.github.com/chakiAunkit"

In [18]:
def remove_url(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` token removed.

    A URL is matched up to the next whitespace character; surrounding text
    (including adjacent spaces) is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Show the sample string with its URL stripped.
print(remove_url(example))

In [19]:
# Strip URLs from every tweet in the combined frame.
df['text'] = df['text'].apply(remove_url)

## Removing html tags

In [20]:
# Small multi-line HTML snippet used to demonstrate remove_html.
example = """<div>
<h1>Real or Fake</h1>
<p>Kaggle </p>
<a href="https://www.kaggle.com/c/nlp-getting-started">getting started</a>
</div>"""

In [21]:
def remove_html(text):
    """Return ``text`` with every ``<...>`` tag removed.

    Matching is non-greedy and ``.`` does not cross newlines, so only tags
    that open and close on the same line are stripped.
    """
    return re.sub(r'<.*?>', r'', text)

# Show the sample snippet with its tags stripped.
print(remove_html(example))

In [22]:
# Strip HTML tags from every tweet in the combined frame.
df['text'] = df['text'].apply(remove_html)

In [23]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
# Compiled once at definition time. Each entry is an explicit Unicode symbol
# block; the earlier catch-all range U+24C2..U+1F251 also covered CJK, Hangul,
# kana and many other scripts, so ordinary non-Latin text was being deleted.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F000-\U0001F251"  # tiles, playing cards, enclosed alphanumerics/ideographs, flags
    "\U000025A0-\U000027BF"  # geometric shapes, miscellaneous symbols, dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled capital M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only code points in the symbol blocks listed in ``_EMOJI_PATTERN`` are
    deleted; letters of any script (Latin, CJK, Hangul, ...) are preserved.
    """
    return _EMOJI_PATTERN.sub(r'', text)

remove_emoji("Omg another Earthquake 😔😔")

In [24]:
# Strip emoji from every tweet in the combined frame.
df['text'] = df['text'].apply(remove_emoji)

In [25]:
!pip install pyspellchecker

In [26]:
from spellchecker import SpellChecker

# Module-level spell-checker instance; correct_spelling below reads it as a global.
spell = SpellChecker()

def correct_spelling(text):
    """Return ``text`` with words flagged by the spell checker replaced.

    The text is split on whitespace once; every word that the module-level
    ``spell`` object reports as unknown is swapped for ``spell.correction``'s
    suggestion, and the words are re-joined with single spaces.

    If the checker has no suggestion for a word (``correction`` returns
    ``None``, as recent pyspellchecker releases do), the original word is kept
    instead of letting ``None`` reach ``str.join`` and raise ``TypeError``.
    """
    words = text.split()
    misspelled_words = spell.unknown(words)

    corrected = []
    for word in words:
        if word in misspelled_words:
            suggestion = spell.correction(word)
            # Fall back to the untouched token when no correction is available.
            corrected.append(suggestion if suggestion is not None else word)
        else:
            corrected.append(word)

    return " ".join(corrected)

# Quick demonstration of correct_spelling on a sentence with deliberate typos.
ex = "what if teh speling was incorect"
correct_spelling(ex)

In [27]:
# Spell-correct every tweet in the combined frame.
df['text'] = df['text'].apply(correct_spelling)

In [28]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Bidirectional
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

## GloVe vectorization

In [29]:
def create_corpus(df):
    """Tokenize ``df['text']`` into one list of cleaned, lower-cased words per row.

    Each text is passed through ``word_tokenize``; a token is kept only if it
    is purely alphabetic and its lower-cased form is not in ``stop``.
    Progress is reported through ``tqdm``.

    Note: this definition replaces the earlier ``create_corpus(target)`` in
    this notebook, which took a class label and returned a flat word list.
    """
    corpus = []
    for tweet in tqdm(df['text']):
        kept = []
        for token in word_tokenize(tweet):
            lowered = token.lower()
            if token.isalpha() and lowered not in stop:
                kept.append(lowered)
        corpus.append(kept)

    return corpus

In [30]:
# Tokenize every cleaned tweet (train and test rows together) into word lists.
corpus = create_corpus(df)

In [31]:
# Map each GloVe token to its 100-dimensional float32 vector.
embedding_dict = {}

# The encoding is given explicitly so decoding does not depend on the platform
# default. The `with` block closes the file, so no separate close() is needed.
with open('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt', 'r', encoding='utf-8') as f:

    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors

In [32]:
# Every tweet is padded or truncated to this many token ids.
MAX_LENGTH = 50

# Fit the word index on the tokenized corpus, then encode each tweet as ids.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences = tokenizer_obj.texts_to_sequences(corpus)

# Pad and truncate at the end of each sequence.
text_padded = pad_sequences(
    sequences,
    maxlen=MAX_LENGTH,
    padding='post',
    truncating='post',
)

In [33]:
# Word -> integer id mapping learned by the tokenizer.
word_index = tokenizer_obj.word_index
vocab_size = len(word_index)
print('Number of unique words: ', vocab_size)

In [34]:
# One row per tokenizer id plus one extra row; rows for words without a GloVe
# vector (and any id never assigned) stay all-zero.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 100))

for word, i in tqdm(word_index.items()):
    # Valid row indices are 0..num_words-1, so the bound check must be >=
    # (the previous `>` would have let i == num_words index out of range).
    if i >= num_words:
        continue

    emb_vec = embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec

In [35]:
# Frozen embedding layer initialised from the GloVe matrix.
embedding = Embedding(
    num_words,
    100,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_LENGTH,
    trainable=False,
)

# Embedding -> spatial dropout -> bidirectional LSTM -> single sigmoid output.
model = Sequential([
    embedding,
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
    Dense(1, activation='sigmoid'),
])

optimizer = Adam(learning_rate=1e-5)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [36]:
# Print the layer-by-layer architecture of the compiled model.
model.summary()

In [37]:
# df was built as train followed by test, so the first len(train) rows of the
# padded matrix belong to train and the remainder to test.
n_train_rows = train.shape[0]
train2, test2 = text_padded[:n_train_rows], text_padded[n_train_rows:]

In [38]:
# Hold out 20% of the labelled tweets for validation. A fixed random_state makes
# the split identical on every run, so results are comparable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train2, train['target'].values, test_size=0.2, random_state=42
)
print("Shape of training: ", X_train.shape)
print("Shape of testing: ", X_test.shape)

In [39]:
# Train for 10 epochs, evaluating on the held-out split after each epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=4,
    verbose=2,
)

In [40]:
# Training accuracy per epoch, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'])
ax.set_title("Model accuracy")
ax.set_ylabel("Accuracy")
ax.set_xlabel("Epoch")
ax.legend(['train'], loc='upper left')
plt.show()

In [41]:
# Training loss per epoch. The title and y-label previously said "accuracy"
# although the plotted series is history.history['loss'].
plt.plot(history.history['loss'])
plt.title("Model loss")
plt.ylabel("Loss")
plt.xlabel("Epoch")
plt.legend(['train'], loc='upper left')
plt.show()

## Submission


In [46]:
# Template whose `target` column is overwritten with the model's predictions.
sample_submission_path = '../input/nlp-getting-started/sample_submission.csv'
sample_sub = pd.read_csv(sample_submission_path)

In [51]:
# Model outputs for the padded test tweets (sigmoid activations from the final Dense layer).
y_pred=model.predict(test2)

In [53]:
# Round each predicted probability to the nearest integer label and save the file.
predicted_labels = y_pred.round().astype(int)
sample_sub['target'] = predicted_labels
sample_sub.to_csv('submission.csv', index=False)

In [55]:
# Preview the first rows of the submission frame.
sample_sub.head()