# importing libraries

In [1]:
#for data preparation
import numpy as np
import pandas as pd

#for data visualisation
import matplotlib.pyplot as plt
import seaborn as sns

#for string
import re
import string


#for text processing
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from textblob import TextBlob

#model 
from sklearn.linear_model import LogisticRegression
from sklearn import naive_bayes
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

#for deep learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models

import warnings
warnings.simplefilter(action='ignore')


In [2]:
#load data
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
#check for null values
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
id          7613 non-null int64
keyword     7552 non-null object
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


There are a lot of null values in 2 columns.let us drop the columns that are useless

In [4]:
train.drop(['keyword','location','id'],inplace=True,axis=1)
test.drop(['location','keyword','id'],inplace=True,axis=1)

In [5]:
test.tail()

Unnamed: 0,text
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,Storm in RI worse than last hurricane. My city...
3260,Green Line derailment in Chicago http://t.co/U...
3261,MEG issues Hazardous Weather Outlook (HWO) htt...
3262,#CityofCalgary has activated its Municipal Eme...


In [6]:
stop=stopwords.words('english')
stop[:5]

['i', 'me', 'my', 'myself', 'we']

# Text Preprocessing

In [7]:
def lowercasing(df):
    df['cleaned_text']=df['text'].apply(lambda x:' '.join(x.lower() for x in x.split()))

def remove_links(df):
    df['cleaned_text']=df['cleaned_text'].apply(lambda x:' '.join([x for x in x.split() if x[:3] != 'http']))

def remove_punctuation(df):
    df['cleaned_text']=df['cleaned_text'].str.replace('[^\w\s]','')

def remove_stopwords(df):
    df['cleaned_text']=df['cleaned_text'].apply(lambda x:' '.join([x for x in x.split() if x not in stop]))

def lemmatization(df):
    lemm=WordNetLemmatizer()
    df['cleaned_text']=df['cleaned_text'].apply(lambda x:' '.join([lemm.lemmatize(x) for x in x.split()]))
    return (df[['text','cleaned_text']].head())
    
    

In [8]:
#transforming training data
lowercasing(train)
remove_links(train)
remove_punctuation(train)
remove_stopwords(train)
lemmatization(train)



Unnamed: 0,text,cleaned_text
0,Our Deeds are the Reason of this #earthquake M...,deed reason earthquake may allah forgive u
1,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,resident asked shelter place notified officer ...
3,"13,000 people receive #wildfires evacuation or...",13000 people receive wildfire evacuation order...
4,Just got sent this photo from Ruby #Alaska as ...,got sent photo ruby alaska smoke wildfire pour...


In [9]:
#transforming test data
lowercasing(test)
remove_links(test)
remove_punctuation(test)
remove_stopwords(test)
lemmatization(test)

Unnamed: 0,text,cleaned_text
0,Just happened a terrible car crash,happened terrible car crash
1,"Heard about #earthquake is different cities, s...",heard earthquake different city stay safe ever...
2,"there is a forest fire at spot pond, geese are...",forest fire spot pond goose fleeing across str...
3,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire
4,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill 28 china taiwan


# Bag of words model

In [10]:
#countvectorizer
cv=CountVectorizer()
X_train_cv=cv.fit_transform(train['cleaned_text'])
X_test_cv=cv.transform(test['cleaned_text'])

y_train=train['target']

In [11]:
#tfidf vectorizer
tf=TfidfVectorizer()
X_train_tf=cv.fit_transform(train['cleaned_text'])
X_test_tf=cv.transform(test['cleaned_text'])



In [12]:
clf=LogisticRegression()
clf2=LogisticRegression()

In [13]:
#fitting on countvectorizer data
clf.fit(X_train_cv,y_train)
predicts_cv=clf.predict(X_test_cv)
print('Score on training data:',clf.score(X_train_cv,y_train))

Score on training data: 0.9655851832391961


In [14]:
#fitting on tfidf vectorizer
clf2.fit(X_train_tf,y_train)
predicts_tf=clf2.predict(X_test_tf)
print('Score on training data:',clf2.score(X_train_tf,y_train))

Score on training data: 0.9655851832391961


In [15]:
#countvectorizer classificaiton with C value
clf3=LogisticRegression(C=1)
clf3.fit(X_train_cv,y_train)
predicts_cv=clf3.predict(X_test_cv)
print('Score on training data:',clf3.score(X_train_cv,y_train))

Score on training data: 0.9655851832391961


# Deep learning model

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [17]:
tokenizer=Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train['cleaned_text'])
train_matrix=tokenizer.texts_to_sequences(train['cleaned_text'])
test_matrix=tokenizer.texts_to_sequences(test['cleaned_text'])

In [18]:
#finding the maxlen of tweet
train['length']=train['cleaned_text'].apply(lambda x:len([x for x in x.split()]))
train['length'].max()

25

In [19]:
maxlen=25
train_matrix=pad_sequences(train_matrix,padding='post',maxlen=maxlen,truncating='post')
test_matrix=pad_sequences(test_matrix,padding='post',maxlen=maxlen,truncating='post')


In [20]:
train_matrix.shape,test_matrix.shape

((7613, 25), (3263, 25))

In [21]:
model=models.Sequential([
    layers.Embedding(10000,20,input_length=maxlen),
    layers.Flatten(),
    layers.Dense(32,activation='relu'),
    layers.Dense(1,activation='sigmoid')
])

In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 20)            200000    
_________________________________________________________________
flatten (Flatten)            (None, 500)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                16032     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 216,065
Trainable params: 216,065
Non-trainable params: 0
_________________________________________________________________


In [23]:
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])

In [24]:
hist=model.fit(train_matrix,y_train,epochs=12,verbose=1,validation_split=0.3)

Train on 5329 samples, validate on 2284 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


# Deep learning model 2

In [25]:
model2=models.Sequential([
    layers.Embedding(10000,20,input_length=maxlen),
    layers.Bidirectional(layers.LSTM(16)),
    layers.Dense(32,activation='relu'),
    layers.Dense(1,activation='sigmoid')
])

In [26]:
model2.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])

In [27]:
hist2=model2.fit(train_matrix,y_train,epochs=8,verbose=1,validation_split=0.3)

Train on 5329 samples, validate on 2284 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


# Deep learning model 3

In [36]:
model3=models.Sequential([
    layers.Embedding(10000,20,input_length=maxlen),
    layers.Bidirectional(layers.LSTM(16)),
    layers.Dense(32,activation='relu'),
    layers.Dense(1,activation='sigmoid')
])

In [37]:
model3.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [38]:
hist3=model3.fit(train_matrix,y_train,epochs=8,verbose=1,validation_split=0.3)

Train on 5329 samples, validate on 2284 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
