<a href="https://colab.research.google.com/github/daaaanish17/Nlp-with-Disaster-Tweets/blob/main/Nlp_with_Disaster_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re #Regular Expression Library

import nltk #Natural Language Toolkit
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [2]:
# Load the labelled training tweets; the relative path expects train.csv in the working directory.
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Feature Selection

In [3]:
# Discard the id / keyword / location metadata so only the tweet text and its label remain.
train = train.drop(columns=['id', 'keyword', 'location'])
train.columns

Index(['text', 'target'], dtype='object')

DATA CLEANING AND PREPROCESSING

In [4]:
# Count missing values in each remaining column
train.isna().sum()

text      0
target    0
dtype: int64

In [5]:
# Removing URLs

def remove_url(text):
  """Return `text` with every http(s):// or www. URL deleted.

  A URL is matched up to the next whitespace character. Nothing is inserted
  in its place, so the spaces that surrounded the URL are left untouched.
  """
  return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Strip URLs from every training tweet
train['text'] = train['text'].map(remove_url)

In [6]:
#REMOVING HTML TAGS
def remove_html(text):
    """Strip anything that looks like an HTML tag from `text`.

    A tag is the shortest run that starts at '<' and ends at the next '>'
    without crossing a newline; the text between tags is kept.
    """
    return re.sub(r'<.*?>', r'', text)
# Strip HTML tags from every training tweet
train['text'] = train['text'].map(remove_html)

In [7]:
#removing pictures/tags/symbols and emojis 
def remove_emojis(data):
    """Delete emoji, pictographs and other high code points from `data`.

    The pattern is one character class built from the ranges below. Because
    the class contains both U+24C2-U+1F251 and U+10000-U+10FFFF, every
    character at or above U+24C2 is removed (not only emoji), together with
    the few lower code points that are listed individually.
    """
    symbol_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        u"\U00002500-\U00002BEF",  # box drawing through misc. symbols and arrows
        u"\U00002702-\U000027B0",  # dingbats
        u"\U00002702-\U000027B0",  # (same range repeated; kept so the pattern text is unchanged)
        u"\U000024C2-\U0001F251",  # very wide span: enclosed alphanumerics up to enclosed ideographs
        u"\U0001f926-\U0001f937",  # gesture emoji
        u"\U00010000-\U0010ffff",  # every supplementary-plane code point
        u"\u2640-\u2642",          # gender signs
        u"\u2600-\u2B55",          # misc. symbols up to the heavy circle
        u"\u200d",                 # zero-width joiner
        u"\u23cf",                 # eject symbol
        u"\u23e9",                 # fast-forward symbol
        u"\u231a",                 # watch
        u"\ufe0f",                 # variation selector-16
        u"\u3030",                 # wavy dash
    )
    emoj = re.compile("[" + "".join(symbol_ranges) + "]+", re.UNICODE)
    return emoj.sub('', data)
# Strip emoji and other symbol characters from every training tweet
train['text'] = train['text'].map(remove_emojis)

In [8]:
#removing punctuations
import string
def remove_punct(text):
    """Return `text` with every character listed in `string.punctuation` removed.

    Only those ASCII punctuation characters are dropped; any other symbol
    (for example typographic quotes) passes through unchanged.
    """
    # Mapping a code point to None tells str.translate to delete that character.
    drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_punctuation)
# Strip ASCII punctuation from every training tweet
train['text'] = train['text'].map(remove_punct)

In [9]:
# Download the NLTK stop-word corpus; the lemmatization cells read it via stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
# NOTE(review): no cell visible in this notebook calls a punkt-based tokenizer;
# confirm whether this download is still required.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
# WordNet data for the WordNetLemmatizer used in the lemmatization cells
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [12]:
train.head(3)

Unnamed: 0,text,target
0,Our Deeds are the Reason of this earthquake Ma...,1
1,Forest fire near La Ronge Sask Canada,1
2,All residents asked to shelter in place are be...,1


In [13]:
lemmatizer = WordNetLemmatizer()
# Build the stop-word set once. The previous version evaluated
# stopwords.words('english') inside the per-word filter, rebuilding the list
# and searching it linearly for every single word of every tweet.
english_stopwords = set(stopwords.words('english'))


def normalize_text(text):
    """Lower-case, filter and lemmatize a single tweet.

    Every character outside a-z/A-Z is replaced by a space, the result is
    lower-cased and split on whitespace, English stop words are dropped, and
    each remaining word is passed through `lemmatizer.lemmatize`.

    Parameters
    ----------
    text : str
        One tweet.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in words
        if word not in english_stopwords
    )


# Apply per value instead of looping over range(len(train)) with label lookups,
# so the cell no longer depends on `train` having a default 0..n-1 index.
train['text'] = train['text'].apply(normalize_text)

In [14]:
train.head(3)

Unnamed: 0,text,target
0,deed reason earthquake may allah forgive u,1
1,forest fire near la ronge sask canada,1
2,resident asked shelter place notified officer ...,1


In [17]:
# checking if dataset is balanced or not
train.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [18]:
#converting data into numerical matrix

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with default settings
count_vectorizer = CountVectorizer()

# Learn the vocabulary from the training tweets and encode them as a
# document-term count matrix (one row per tweet, one column per vocabulary word)
train_vectors = count_vectorizer.fit_transform(train["text"])

In [20]:
count_vectorizer.get_feature_names_out()

array(['aa', 'aaaa', 'aaaaaaallll', ..., 'zurich', 'zxathetis', 'zzzz'],
      dtype=object)

In [21]:
# NOTE: .toarray() builds the full dense matrix (7613 x 15362 per the shape cell below)
# just to preview it; slice first (e.g. train_vectors[:5].toarray()) if memory is tight.
train_vectors.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [27]:
train_vectors.shape

(7613, 15362)

Our Model

In [22]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes classifier, constructed with default hyper-parameters
clf = MultinomialNB()

In [24]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the classifier on the bag-of-words matrix.
# NOTE(review): train_vectors came from a CountVectorizer fitted on every training
# tweet, so each fold's vocabulary already includes its held-out rows; wrap both
# steps in a Pipeline if a leakage-free estimate is needed.
scores = cross_val_score(clf, train_vectors, train.target, cv=5)
scores

array([0.71766251, 0.64346684, 0.70059094, 0.70039422, 0.74375821])

In [25]:
# Fit on the full training set; this fitted model produces the submission predictions later on
clf.fit(train_vectors, train["target"])

MultinomialNB()

Working with Test Dataset

In [26]:
# Load the unlabelled test tweets; the relative path expects test.csv in the working directory.
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


Feature Selection

In [28]:
# Discard the same metadata columns as for the training frame, leaving only the tweet text.
test = test.drop(columns=['id', 'keyword', 'location'])
test.columns

Index(['text'], dtype='object')

DATA CLEANING AND PREPROCESSING

In [29]:
# Count missing values in the remaining column
test.isna().sum()

text    0
dtype: int64

In [30]:
# Strip URLs from every test tweet (same helper as for the training set)
test['text'] = test['text'].map(remove_url)

In [31]:
# Strip HTML tags from every test tweet (same helper as for the training set)
test['text'] = test['text'].map(remove_html)

In [32]:
# Strip emoji and other symbol characters from every test tweet
test['text'] = test['text'].map(remove_emojis)

In [33]:
# Strip ASCII punctuation from every test tweet
test['text'] = test['text'].map(remove_punct)

In [34]:
test.head(3)

Unnamed: 0,text
0,Just happened a terrible car crash
1,Heard about earthquake is different cities sta...
2,there is a forest fire at spot pond geese are ...


In [35]:
# Build the stop-word set once. The previous loop evaluated
# stopwords.words('english') inside the per-word filter, rebuilding the list
# and searching it linearly for every single word of every tweet.
test_stopwords = set(stopwords.words('english'))


def normalize_test_text(text):
    """Lower-case, filter and lemmatize a single test tweet.

    Mirrors the training-set preprocessing: every character outside a-z/A-Z
    becomes a space, the result is lower-cased and split on whitespace,
    English stop words are dropped, and each remaining word is passed through
    `lemmatizer.lemmatize` (the lemmatizer created in the training cell).

    Parameters
    ----------
    text : str
        One tweet.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in words
        if word not in test_stopwords
    )


# Apply per value instead of looping over range(len(test)) with label lookups,
# so the cell no longer depends on `test` having a default 0..n-1 index.
test['text'] = test['text'].apply(normalize_test_text)

In [36]:
test.head(3)

Unnamed: 0,text
0,happened terrible car crash
1,heard earthquake different city stay safe ever...
2,forest fire spot pond goose fleeing across str...


In [37]:
# transform only (no refit): reuse the vocabulary learned from the training tweets
# so the test matrix has the same columns the classifier was fitted on
test_vectors = count_vectorizer.transform(test["text"])

In [38]:
# NOTE: .toarray() builds the full dense matrix just to preview it;
# slice first (e.g. test_vectors[:5].toarray()) if memory is tight.
test_vectors.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [40]:
# Submission template (id + placeholder target); the relative path expects the file in the working directory.
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [41]:
# Overwrite the placeholder targets with the model's predictions.
# NOTE(review): this assigns by position, so it assumes sample_submission.csv lists
# ids in the same order as test.csv — confirm before submitting.
sample_submission["target"] = clf.predict(test_vectors)

In [42]:
sample_submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [45]:
# Write the predictions to submission.csv, omitting the DataFrame index column
sample_submission.to_csv("submission.csv", index=False)