In [2]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.5.4-py2.py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 4.3 MB/s eta 0:00:01
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.5.4


In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string # Library for string operations

import os

# plotly library
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_dark"
from plotly.subplots import make_subplots

import matplotlib.pyplot as plt #Another plotting libraray

# word cloud library
from wordcloud import WordCloud

#Regex library
import re

#Spell Checker
from spellchecker import SpellChecker 
spell = SpellChecker()

In [4]:
TrainDataSet= pd.read_csv('../input/nlp-getting-started/train.csv')
TestDataSet=pd.read_csv('../input/nlp-getting-started/test.csv')
TrainDataSet.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


In [5]:
TrainDataSet[TrainDataSet["target"] == 0]["text"].values[0:20]


array(["What's up man?", 'I love fruits', 'Summer is lovely',
       'My car is so fast', 'What a goooooooaaaaaal!!!!!!',
       'this is ridiculous....', 'London is cool ;)', 'Love skiing',
       'What a wonderful day!', 'LOOOOOOL',
       "No way...I can't eat that shit", 'Was in NYC last week!',
       'Love my girlfriend', 'Cooool :)', 'Do you like pasta?',
       'The end!',
       'We always try to bring the heavy. #metal #RT http://t.co/YAo1e0xngw',
       'Crying out for more! Set me ablaze',
       'On plus side LOOK AT THE SKY LAST NIGHT IT WAS ABLAZE http://t.co/qqsmshaJ3N',
       "@PhDSquares #mufc they've built so much hype around new acquisitions but I doubt they will set the EPL ablaze this season."],
      dtype=object)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(min_df=0,stop_words="english", max_features=200)
counts = cv.fit_transform(TrainDataSet["text"][0:500]).toarray().ravel()
words = np.array(cv.get_feature_names()) 
#counts = counts / float(counts.max())
#print(words)
print(counts)

[0 0 0 ... 0 0 0]


# Exploratory Data Analysis of Tweets
* **Show percentage of tweets marked as Disaster tweets **

In [7]:
Grouped_Disaster = TrainDataSet.groupby(['target'])['id'].count().reset_index()
labels = ['Disaster','Non-Disaster']


# Use `hole` to create a donut-like pie chart
fig = go.Figure(data=[go.Pie(labels=labels, values=Grouped_Disaster['id'], hole=.4)])
fig.update_layout(title_text='Disaster Tweet Percentage',
                 annotations=[dict(text='#tweet', x=0.5, y=0.5, font_size=20, showarrow=False)])
fig.show()

* **sentence length analysis**
Loook at the number of characters present in each tweet. This can give us a idea about tweet length in our dataset.

In [8]:
TrainDataSet['tweetlength'] = TrainDataSet['text'].apply(lambda x:  len(str(x)))
TrainDataSet.head(5)

Unnamed: 0,id,keyword,location,text,target,tweetlength
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,69
1,4,,,Forest fire near La Ronge Sask. Canada,1,38
2,5,,,All residents asked to 'shelter in place' are ...,1,133
3,6,,,"13,000 people receive #wildfires evacuation or...",1,65
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,88


In [13]:
Top_20_Lengthy_Tweets = TrainDataSet.sort_values('tweetlength',ascending=False)[:20][::-1]
Bottom_20_Lengthy_Tweets = TrainDataSet.sort_values('tweetlength',ascending=True)[:20][::-1]
Tweetlength_Data =TrainDataSet['tweetlength'].describe()


fig = make_subplots(
    rows=3, cols=4,
    specs=[[None,{"type": "indicator"},{"type": "indicator"},{"type": "indicator"}],
           [{"type": "bar" ,"colspan": 2},None, {"type": "bar" ,"colspan": 2},None],
           [{"type": "bar","colspan": 4}, None,None,None]],
    subplot_titles=("","","","Top 20 Tweets by length","Bottom 20 Tweet by length")
)

fig.add_trace(
    go.Indicator(
        mode="number",
        value=Tweetlength_Data[1],
        title="Mean Tweet Length",
    ),
    row=1, col=2
)

fig.add_trace(
    go.Indicator(
        mode="number",
        value=Tweetlength_Data[3],
        title="Min Tweet Length",
    ),
    row=1, col=3
)

fig.add_trace(
    go.Indicator(
        mode="number",
        value=Tweetlength_Data[7],
        title="Max Tweet Length",
    ),
    row=1, col=4
)


fig.add_trace(go.Bar(name='id',text='id', x=Top_20_Lengthy_Tweets['keyword'], y=Top_20_Lengthy_Tweets['tweetlength']),
              row=2, col=1)


fig.add_trace(go.Bar(name='id',text='id', x=Bottom_20_Lengthy_Tweets['keyword'], y=Bottom_20_Lengthy_Tweets['tweetlength']),
              row=2, col=3)
fig.add_trace(go.Bar(name='id',text='id', x=TrainDataSet['keyword'], y=TrainDataSet['tweetlength']),
              row=3, col=1)

fig.update_layout(height=1000,title_text="Tweets Length Analysis", showlegend=False)

fig.show()

* **Tweet Word count analysis**
Loook at the number of words present in each tweet. This can give us a idea about tweet word count in our dataset.

In [15]:
TrainDataSet['wordcount'] = TrainDataSet['text'].apply(lambda x:  len(str(x).split()))
TrainDataSet.head(5)

Unnamed: 0,id,keyword,location,text,target,tweetlength,wordcount
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,69,13
1,4,,,Forest fire near La Ronge Sask. Canada,1,38,7
2,5,,,All residents asked to 'shelter in place' are ...,1,133,22
3,6,,,"13,000 people receive #wildfires evacuation or...",1,65,8
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,88,16


In [16]:
Top_20_Lengthy_Tweets = TrainDataSet.sort_values('wordcount',ascending=False)[:20][::-1]
Bottom_20_Lengthy_Tweets = TrainDataSet.sort_values('wordcount',ascending=True)[:20][::-1]
Wordlength_Data =TrainDataSet['wordcount'].describe()


fig = make_subplots(
    rows=3, cols=4,
    specs=[[None,{"type": "indicator"},{"type": "indicator"},{"type": "indicator"}],
           [{"type": "bar" ,"colspan": 2},None, {"type": "bar" ,"colspan": 2},None],
           [{"type": "bar","colspan": 4}, None,None,None]],
    subplot_titles=("","","","Top 20 Tweets word count","Bottom 20 Tweet by word count")
)

fig.add_trace(
    go.Indicator(
        mode="number",
        value=Wordlength_Data[1],
        title="Mean Tweet word count",
    ),
    row=1, col=2
)

fig.add_trace(
    go.Indicator(
        mode="number",
        value=Wordlength_Data[3],
        title="Min Tweet word count",
    ),
    row=1, col=3
)

fig.add_trace(
    go.Indicator(
        mode="number",
        value=Wordlength_Data[7],
        title="Max Tweet word count",
    ),
    row=1, col=4
)


fig.add_trace(go.Bar(name='id',text='id', x=Top_20_Lengthy_Tweets['keyword'], y=Top_20_Lengthy_Tweets['wordcount']),
              row=2, col=1)


fig.add_trace(go.Bar(name='id',text='id', x=Bottom_20_Lengthy_Tweets['keyword'], y=Bottom_20_Lengthy_Tweets['wordcount']),
              row=2, col=3)
fig.add_trace(go.Bar(name='id',text='id', x=TrainDataSet['keyword'], y=TrainDataSet['wordcount']),
              row=3, col=1)

fig.update_layout(height=700,title_text="Tweets Word Count Analysis", showlegend=False)

fig.show()


* **Tweet stop word analysis**
Loook at the number of stop words present in each tweet. This can give us a idea about stop word usage in tweet.Stopwords are most commonly used in any language such as “the”,” a”,” an” etc. nltk library help us to find commonly used stopwords. Nltk contains stopwords for many languages so need to filter the English stopwords from the collection.

In [18]:
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')

wordcollection=[]
TempTextCol= TrainDataSet['text'].str.split()
TempTextCol=TempTextCol.values.tolist()
wordcollection=[word for i in TempTextCol for word in i]

from collections import defaultdict
stopwprddic=defaultdict(int)
for word in wordcollection:
    if word in stop:
        stopwprddic[word]+=1

stopwprddf =  pd.DataFrame(stopwprddic.items(), columns=['word', 'count'])

stopwprddf.head()

Unnamed: 0,word,count
0,are,345
1,the,2575
2,of,1722
3,this,335
4,all,178


In [20]:
Top_20_Lengthy_Tweets = stopwprddf.sort_values('count',ascending=False)[:20][::-1]
Bottom_20_Lengthy_Tweets = stopwprddf.sort_values('count',ascending=True)[:20][::-1]
Wordlength_Data =stopwprddf['count'].describe()


fig = make_subplots(
    rows=3, cols=4,
    specs=[[None,{"type": "indicator"},{"type": "indicator"},{"type": "indicator"}],
           [{"type": "bar" ,"colspan": 2},None, {"type": "bar" ,"colspan": 2},None],
           [{"type": "bar","colspan": 4}, None,None,None]],
    subplot_titles=("","","","Top 20 Tweets stopword count","Bottom 20 Tweet by stopword count")
)

fig.add_trace(
    go.Indicator(
        mode="number",
        value=Wordlength_Data[1],
        title="Mean stopwords",
    ),
    row=1, col=2
)

fig.add_trace(
    go.Indicator(
        mode="number",
        value=Wordlength_Data[3],
        title="Min stopwords",
    ),
    row=1, col=3
)

fig.add_trace(
    go.Indicator(
        mode="number",
        value=Wordlength_Data[7],
        title="Max Tweet stopwords",
    ),
    row=1, col=4
)


fig.add_trace(go.Bar(name='count',text='count', x=Top_20_Lengthy_Tweets['word'], y=Top_20_Lengthy_Tweets['count']),
              row=2, col=1)


fig.add_trace(go.Bar(name='count',text='count', x=Bottom_20_Lengthy_Tweets['word'], y=Bottom_20_Lengthy_Tweets['count']),
              row=2, col=3)
fig.add_trace(go.Bar(name='count',text='count', x=stopwprddf['word'], y=stopwprddf['count']),
              row=3, col=1)

fig.update_layout(height=600,title_text="Tweets StopWord Analysis", title_x=0.5, showlegend=False)

fig.show()


* **Tweet Puntuation and space word analysis**
Loook at the number of Puntuation/space words present in each tweet. This can give us a idea about Puntuation/space usage in tweet.Punctuations and space words also commonly used in any language. **spacy** library help us to find commonly used Puntuation/space words.

In [21]:
import spacy
nlp = spacy.load('en')

from collections import defaultdict
dicspy=defaultdict(int)


docs = TrainDataSet['text'].tolist()

def token_filter(token):
    return (token.is_punct | token.is_space )

filtered_tokens = []
for doc in nlp.pipe(docs):
    tokens = [token.lemma_ for token in doc if token_filter(token)]
    filtered_tokens.append(tokens)
    for tk in tokens:
        dicspy[tk]+=1

Punctuationdf =  pd.DataFrame(dicspy.items(), columns=['word', 'count'])

Punctuationdf.head()

Unnamed: 0,word,count
0,#,3362
1,.,2914
2,',1292
3,-,1399
4,",",1


In [23]:
Top_20_Lengthy_Tweets = Punctuationdf.sort_values('count',ascending=False)[:20][::-1]
Bottom_20_Lengthy_Tweets = Punctuationdf.sort_values('count',ascending=True)[:20][::-1]
Wordlength_Data =Punctuationdf['count'].describe()


fig = make_subplots(
    rows=3, cols=4,
    specs=[[None,{"type": "indicator"},{"type": "indicator"},{"type": "indicator"}],
           [{"type": "bar" ,"colspan": 2},None, {"type": "bar" ,"colspan": 2},None],
           [{"type": "bar","colspan": 4}, None,None,None]],
    subplot_titles=("","","","Top 20 Tweets Puntuation or Space count","Bottom 20 Tweet by Puntuation or Space count")
)

fig.add_trace(
    go.Indicator(
        mode="number",
        value=Wordlength_Data[1],
        title="Mean ",
    ),
    row=1, col=2
)

fig.add_trace(
    go.Indicator(
        mode="number",
        value=Wordlength_Data[3],
        title="Min",
    ),
    row=1, col=3
)

fig.add_trace(
    go.Indicator(
        mode="number",
        value=Wordlength_Data[7],
        title="Max",
    ),
    row=1, col=4
)


fig.add_trace(go.Bar(name='count',text='count', x=Top_20_Lengthy_Tweets['word'], y=Top_20_Lengthy_Tweets['count']),
              row=2, col=1)


fig.add_trace(go.Bar(name='count',text='count', x=Bottom_20_Lengthy_Tweets['word'], y=Bottom_20_Lengthy_Tweets['count']),
              row=2, col=3)
fig.add_trace(go.Bar(name='count',text='count', x=Punctuationdf['word'], y=Punctuationdf['count']),
              row=3, col=1)

fig.update_layout(height=600,title_text="Tweets Punctuation and Space Analysis", title_x=0.5, showlegend=False)

fig.show()


In [15]:
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
def getNgram(wordCollection, n=None):
    vectorData = CountVectorizer(ngram_range=(n, n)).fit(wordCollection)
    BagOfWords = vectorData.transform(wordCollection)
    SumWords = BagOfWords.sum(axis=0) 
    WordsFq = [(word, SumWords[0, idx]) 
                  for word, idx in vectorData.vocabulary_.items()]
    WordsFq =sorted(WordsFq, key = lambda x: x[1], reverse=True)
    return WordsFq[:10]

getBigrams=getNgram(TrainDataSet['text'],2)[:10]
x,y=map(list,zip(*getBigrams))

import plotly.express as px
fig = px.bar(x=y,y=x)
fig.show()


In [16]:
triGrams=getNgram(TrainDataSet['text'],n=3)
x,y=map(list,zip(*triGrams))
fig = px.bar(x=y,y=x)
fig.show()

# Data Cleaning

We need to clean the data to avoid errors and incorrect results

* **Remove URL from the tweet**
URL'S some error during processing so we are using regex library to remove the urls

In [None]:
TrainDataSet['text'] = TrainDataSet['text'].apply(lambda x: re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', str(x)))
TrainDataSet.head(3)

In [None]:
TestDataSet['text'] = TestDataSet['text'].apply(lambda x: re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', str(x)))
TestDataSet.tail(3)

* **Remove Emoji from the tweet**
we need to remove emoji from the tweet since people are using lot of emojies in there tweet to express emotions. We need to create function so we can specify different emoji patterns with range of unicode characters, the list is not complete but good for now.

In [None]:
def EmojiCleanser(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

In [None]:
TestDataSet['text'] = TestDataSet['text'].apply(lambda x: EmojiCleanser(str(x)))
TrainDataSet['text'] = TrainDataSet['text'].apply(lambda x: EmojiCleanser(str(x)))
TrainDataSet.tail(3)

* **Remove HTML Tags from Tweet**
we need to remove html tags so we can avoid creating unncessary tokens. we can use a regex expression to remove those tags.

In [None]:
TestDataSet['text'] = TestDataSet['text'].apply(lambda x: re.sub(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', '', str(x)))
TrainDataSet['text'] = TrainDataSet['text'].apply(lambda x: re.sub(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', '', str(x)))
TrainDataSet.tail(3)

* **Remove Punctuations from the Tweet**
we need to remove the puntuations from tweet so we are using string libraray to remove the punctuations

In [None]:
TestDataSet['text'] = TestDataSet['text'].apply(lambda x: str(x).translate(str.maketrans('','',string.punctuation)))
TrainDataSet['text'] = TrainDataSet['text'].apply(lambda x: str(x).translate(str.maketrans('','',string.punctuation)))
TrainDataSet.tail(3)


*** Spell correction in tweets**
We need to correct the spelling in tweets so we will get more accurate tokens. We can use SpellChecker in pyspellchecker library.

In [None]:
#TestDataSet['text'] = TestDataSet['text'].apply(lambda x: " ".join([spell.correction(i) for i in str(x).split()]))
#TrainDataSet['text'] = TrainDataSet['text'].apply(lambda x: " ".join([spell.correction(i) for i in str(x).split()]))
#TrainDataSet.tail(3)


* **Remove Stopwords from tweets** We need to remove Stopwords in tweets so we will get more accurate tokens. We can use stopwords in nltk library.

In [None]:
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')

def StopWordCleanser(word):
    if word in stop:
        return ""
    else:
        return word

test = TrainDataSet[:10]

test['text'] = test['text'].apply(lambda x: " ".join([StopWordCleanser(i) for i in str(x).split()]))

test

In [None]:
# filling missing values 
#TrainDataSet[['keyword']] = TrainDataSet[['keyword']].fillna('Not Identified')
#TrainDataSet[['location']] = TrainDataSet[['location']].fillna('Not Identified')

TrainDataSet = TrainDataSet[TrainDataSet.keyword.notnull()]
TrainDataSet = TrainDataSet[TrainDataSet.location.notnull()]


Grouped_Disaster = TrainDataSet.groupby(['keyword'])['id'].count().reset_index()
Grouped_Location = TrainDataSet.groupby(['location'])['id'].count().reset_index()

Grouped_Disaster = Grouped_Disaster.query('keyword !="Not Identified"' )
Grouped_Location = Grouped_Location.query('location !="Not Location"' )

Group_Disaster_filter = Grouped_Disaster.sort_values('id',ascending=False)[:20][::-1]
Grouped_Location_filter = Grouped_Location.sort_values('id',ascending=False)[:20][::-1]

fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "bar"}, {"type": "bar"}]],
    subplot_titles=("Top 20 Disaster by Tweets","Top 20 Tweet Location")
)

fig.add_trace(go.Bar(name='id',text='id', x=Group_Disaster_filter['keyword'], y=Group_Disaster_filter['id']),
              row=1, col=1)


fig.add_trace(go.Bar(name='id',text='id', x=Grouped_Location_filter['location'], y=Grouped_Location_filter['id']),
              row=1, col=2)

fig.update_layout(height=700,title_text="Tweets Breakdown", showlegend=False)

fig.show()

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
text = TrainDataSet.text.values
wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = STOPWORDS).generate(str(text))
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
#TrainDataSet.tail(20)
TrainDataSet[TrainDataSet["target"] == 0]["text"].values[0:120]
