# Sentiment Analysis using Bag-of-Words Approach

Implemented using the <a href="https://www.crowdflower.com/data-for-everyone/">Disasters on social media</a> dataset from Crowdflower. 

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

### Read Data

In [4]:
tweets = pd.read_csv('socialmedia-disaster-tweets-DFE.csv')[['text','choose_one']]
tweets.columns = ['tweet','class']
tweets = tweets[(tweets['class'] == 'Relevant') | (tweets['class'] == 'Not Relevant')]
tweets = tweets.reset_index(drop=True)
tweets.tail()

Unnamed: 0,tweet,class
10855,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,Relevant
10856,Police investigating after an e-bike collided ...,Relevant
10857,The Latest: More Homes Razed by Northern Calif...,Relevant
10858,MEG issues Hazardous Weather Outlook (HWO) htt...,Relevant
10859,#CityofCalgary has activated its Municipal Eme...,Relevant


### Remove any HTML and Emoticons from the tweets

In [6]:
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text

In [7]:
tweets['tweet'] = tweets['tweet'].apply(preprocessor)

### Reindex the tweets dataframe so the rows are in random order

In [8]:
tweets = tweets.reindex(np.random.permutation(tweets.index))

print tweets.head()
print tweets.tail()

                                                   tweet         class
10647  cutting for some celebrety and then posting th...  Not Relevant
6758   heavy rain gusty winds and vivid lightning mov...      Relevant
7748   follow up at 4700 block of sw 11th st gresham ...      Relevant
3438   dozens die as two trains derail into a river i...      Relevant
9783   billionaire mottas try getting trapped money o...  Not Relevant
                                                   tweet         class
7094   mike magner discusses a trust betrayed http t ...  Not Relevant
8815                        ik4len sirens was cancelled   Not Relevant
9940   i had trouble breathing while listening to kia...  Not Relevant
10189  2012 shell s 250 foot tall drilling rig broke ...      Relevant
1193                          eddietrunk blizzard of ozz      Relevant


### Download stopwords from the NLTK, remove them from the tweets

In [9]:
def split_into_tokens(tweet):
    tweet = unicode(tweet, 'utf8')  # convert bytes into proper unicode
    return TextBlob(tweet).words

In [10]:
tweets.tweet.head().apply(split_into_tokens)

10647    [cutting, for, some, celebrety, and, then, pos...
6758     [heavy, rain, gusty, winds, and, vivid, lightn...
7748     [follow, up, at, 4700, block, of, sw, 11th, st...
3438     [dozens, die, as, two, trains, derail, into, a...
9783     [billionaire, mottas, try, getting, trapped, m...
Name: tweet, dtype: object

In [11]:
TextBlob("hello world, how is it going?").tags  # list of (word, POS) pairs

[('hello', u'JJ'),
 ('world', u'NN'),
 ('how', u'WRB'),
 ('is', u'VBZ'),
 ('it', u'PRP'),
 ('going', u'VBG')]

In [12]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dsg191\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
stop = stopwords.words('english')
stop = stop + [u'a',u'b',u'c',u'd',u'e',u'f',u'g',u'h',u'i',u'j',u'k',u'l',u'm',u'n',u'o',u'p',u'q',u'r',u's',u't',u'v',u'w',u'x',u'y',u'z']

### Create a bag-of-words for the tweets, convert the words to lemmas and remove any stop words

In [14]:
def split_into_lemmas(tweet):
    tweet = unicode(tweet, 'utf8').lower()
    words = TextBlob(tweet).words
    # for each word, take its "base form" = lemma 
    return [word.lemma for word in words if word not in stop]

tweets.tweet.head().apply(split_into_lemmas)

10647     [cutting, celebrety, posting, wound, online, go]
6758     [heavy, rain, gusty, wind, vivid, lightning, m...
7748     [follow, 4700, block, sw, 11th, st, gresham, g...
3438     [dozen, die, two, train, derail, river, indiah...
9783     [billionaire, mottas, try, getting, trapped, m...
Name: tweet, dtype: object

In [15]:
%%time
bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(tweets['tweet'])
print len(bow_transformer.vocabulary_)

26085
Wall time: 5.3 s


In [16]:
tweet4 = tweets['tweet'][456]
print tweet4

 enews ben affleck i know there s a wife kids and other girls but i can t help it i ve loved him since armageddon eonlinechat


In [17]:
bow4 = bow_transformer.transform([tweet4])
print bow4

  (0, 2142)	1
  (0, 2824)	1
  (0, 3627)	1
  (0, 8026)	1
  (0, 8101)	1
  (0, 9813)	1
  (0, 10701)	1
  (0, 13055)	1
  (0, 13210)	1
  (0, 14131)	1
  (0, 20729)	1
  (0, 24807)	1


In [18]:
%%time
tweets_bow = bow_transformer.transform(tweets['tweet'])
print 'sparse matrix shape:', tweets_bow.shape
print 'number of non-zeros:', tweets_bow.nnz
print 'sparsity: %.2f%%' % (100.0 * tweets_bow.nnz / (tweets_bow.shape[0] * tweets_bow.shape[1]))

sparse matrix shape: (10860, 26085)
number of non-zeros: 117482
sparsity: 0.04%
Wall time: 5.02 s


### Split the tweets into a training and testing set, using the first 8000 tweets for training and the remaining for testing.

In [19]:
tweets_bow_train = tweets_bow[:8000]
tweets_bow_test = tweets_bow[8000:]
tweets_class_train = tweets['class'][:8000]
tweets_class_test = tweets['class'][8000:]

print tweets_bow_train.shape
print tweets_bow_test.shape

(8000, 26085)
(2860, 26085)


### Apply Multinomial Naive Bayes model

In [20]:
%time tweet_class = MultinomialNB().fit(tweets_bow_train, tweets_class_train)

Wall time: 23 ms


In [21]:
print 'predicted:', tweet_class.predict(bow4)[0]
print 'expected:', tweets['class'][456]

predicted: Not Relevant
expected: Not Relevant


In [22]:
predictions = tweet_class.predict(tweets_bow_test)
print predictions

['Not Relevant' 'Not Relevant' 'Not Relevant' ..., 'Not Relevant'
 'Relevant' 'Not Relevant']


In [27]:
print 'accuracy', accuracy_score(tweets_class_test, predictions)
print 'confusion matrix\n', confusion_matrix(tweets_class_test, predictions)
print '(row=expected, col=predicted)'

accuracy 0.805944055944
confusion matrix
[[1407  255]
 [ 300  898]]
(row=expected, col=predicted)


In [28]:
print classification_report(tweets_class_test, predictions)

             precision    recall  f1-score   support

Not Relevant       0.82      0.85      0.84      1662
   Relevant       0.78      0.75      0.76      1198

avg / total       0.81      0.81      0.81      2860



### Test model with some text

In [30]:
def predict_tweet(new_tweet): 
    new_sample = bow_transformer.transform([new_tweet])
    print new_tweet, np.around(tweet_class.predict_proba(new_sample), decimals=5), tweet_class.predict(new_sample),"\n"

predict_tweet('Cat stuck in a tree.')
predict_tweet('Car accident. Major damage to property.')
predict_tweet('I ate a sandwich last night.')
predict_tweet('Somehow, Mr. Dreyfuss maintains his sound comic timing even when Frank Oz''s antic direction calls for hand-waving hysteria.')

Cat stuck in a tree. [[ 0.68802  0.31198]] ['Not Relevant'] 

Car accident. Major damage to property. [[ 0.03936  0.96064]] ['Relevant'] 

I ate a sandwich last night. [[ 0.90025  0.09975]] ['Not Relevant'] 

Somehow, Mr. Dreyfuss maintains his sound comic timing even when Frank Ozs antic direction calls for hand-waving hysteria. [[ 0.96873  0.03127]] ['Not Relevant'] 

