# Natural Language Processing with Disaster Tweets

## Importing Libraries and Loading Data

In [1]:
#Import Libraries
import numpy as np
import pandas as pd
from html import unescape
import re
import nltk
import string
from collections import Counter
from itertools import chain
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#Loading Data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
#Checking for NULL Values
train.isna().sum(), test.isna().sum()

(id             0
 keyword       61
 location    2533
 text           0
 target         0
 dtype: int64,
 id             0
 keyword       26
 location    1105
 text           0
 dtype: int64)

Since the most significant features to be considered are the text, target and ID, and since these three features do not contain any null values, no rows need to be deleted. The missing values can be filled in with the most frequent value of that feature.

In [4]:
#Filling NULL Values
train_mode = train[['keyword', 'location']].mode()
train['location'] = train['location'].fillna(train_mode['location'][0])
train['keyword'] = train['keyword'].fillna(train_mode['keyword'][0])

test_mode = test[['keyword', 'location']].mode()
test['location'] = test['location'].fillna(test_mode['location'][0])
test['keyword'] = test['keyword'].fillna(test_mode['keyword'][0])

#Confirming there are no more NULL Values
train.isna().sum(), test.isna().sum()

(id          0
 keyword     0
 location    0
 text        0
 target      0
 dtype: int64,
 id          0
 keyword     0
 location    0
 text        0
 dtype: int64)

# Processing Data

In [5]:
train['processed text'] = train['text'].apply(lambda x: unescape(x))
test['processed text'] = test['text'].apply(lambda x: unescape(x))

In [6]:
#Making words lower case
train['processed text'] = train['processed text'].str.lower()
test['processed text'] = test['processed text'].str.lower()

In [7]:
#Removing punctuation
tb = str.maketrans('', '', string.punctuation)
train['processed text'] = [train['processed text'][r].translate(tb) for r in range(train['processed text'].shape[0])]
test['processed text'] = [test['processed text'][r].translate(tb) for r in range(test['processed text'].shape[0])]

In [8]:
#Removing words with less than 3 letters
train['processed text'] = train['processed text'].apply(lambda x: ' '.join([w for w in x.split() if len(w) >= 3]))
test['processed text'] = test['processed text'].apply(lambda x: ' '.join([w for w in x.split() if len(w) >= 3]))

In [9]:
#Removing words that occur only once
l_train = train['processed text'].str.split().tolist()
l_test = test['processed text'].str.split().tolist()

c_train = Counter(chain.from_iterable(l_train))
c_test = Counter(chain.from_iterable(l_test))

train['processed text'] = [' '.join([w for w in t if c_train[w] > 1]) for t in l_train]
test['processed text'] = [' '.join([w for w in t if c_test[w] > 1]) for t in l_test]

In [10]:
#Further processing
nltk.download('stopwords')
wrds = nltk.corpus.stopwords.words("english")
stm = nltk.stem.PorterStemmer()
train['processed text'] = train['text'].apply(lambda x: " ".join([stm.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in wrds]).lower())
test['processed text'] = test['text'].apply(lambda x: " ".join([stm.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in wrds]).lower())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dlytten\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Splitting Data

In [11]:
#Seperating Input and Target Data
X = train["processed text"]
Y = train["target"]
X_test = test["processed text"]

In [12]:
#Splitting into Training and Validation Data
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, stratify = Y, test_size = 0.15, random_state = 42, shuffle = True)

In [13]:
vec = TfidfVectorizer(stop_words = "english", max_df = 0.7)
train_tfidf = vec.fit_transform(X_train.values.astype('U'))
valid_tfidf = vec.transform(X_valid.values.astype('U'))
test_tfidf = vec.transform(X_test.values.astype('U'))

## Training the Model

In [14]:
model = MLPClassifier(early_stopping = True)
model.fit(train_tfidf, Y_train)
hyperparameters = dict()

hypt = GridSearchCV(model, hyperparameters, cv = 6, verbose = 0)
best_m = hypt.fit(train_tfidf, Y_train)

Checking Model Accuracy

In [15]:
best_m.score(train_tfidf, Y_train)

0.8995518467006645

In [16]:
best_m.predict(valid_tfidf)
best_m.score(valid_tfidf, Y_valid)

0.8178633975481612

## Predicting Values for Test Data

In [17]:
Y_pred = best_m.predict(test_tfidf)
Y_pred

array([1, 0, 1, ..., 1, 1, 1], dtype=int64)

## Saving the Predicted Values for Submission

In [18]:
Submission = pd.DataFrame({"id" : test["id"], "target" : Y_pred})
Submission

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [19]:
Submission.to_csv("Submission.csv", index = False)