In [1]:
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to the
# old location so the notebook still runs on legacy installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

---
Tweet Sentiment Analysis
=====
***

### Create a Twitter classifier with Naive Bayes and check the sentiment with a keyword of your choice
### Use tweet_training.csv as your training set
### Sentiment is described as 'polarity', where
### - '0' = negative
### - '4' = positive

In [2]:
# Load the training tweets; the file is semicolon-delimited.
# NOTE(review): absolute, user-specific path — presumably this only resolves on
# the original author's machine; point it at your local copy of tweet_training.csv.
df = pd.read_csv('/Users/mrgholt/GADS-22-NYC/Datasets/tweet_training.csv', delimiter=';')

In [3]:
df[df['polarity']==4].shape

(632, 3)

In [4]:
df[df['polarity']==0].shape

(1402, 3)

#### The data is "unbalanced" in the sense that the number of 0 polarity tweets outnumbers the 4 polarity tweets.

- This represents different priors. 

- Sklearn's Naive Bayes models take the different class representations into account, i.e. they estimate the class priors from the training data for you

- BUT you can alter the model by supplying your own priors ('class_prior') or by switching to uniform priors ('fit_prior=False')

- NB: Laplace correction. In addition, these algorithms use a slightly modified formula so that a word never seen in a class does not produce a zero probability. This is referred to as 'smoothing priors' and is controlled by the 'alpha' parameter

In [5]:
df.head()

Unnamed: 0,id,polarity,tweet
0,1467933112,0,the angel is going to miss the athlete this we...
1,2323395086,0,It looks as though Shaq is getting traded to C...
2,1467968979,0,@clarianne APRIL 9TH ISN'T COMING SOON ENOUGH
3,1990283756,0,drinking a McDonalds coffee and not understand...
4,1988884918,0,So dissapointed Taylor Swift doesnt have a Twi...


In [6]:
df[df['polarity']==4].head()

Unnamed: 0,id,polarity,tweet
77,1680347120,4,@ mcdonalds with my litto sis aka cuzin lol cr...
78,1835259469,4,@AnnaSaccone Love your new cards! I would de...
79,1983068285,4,@supricky06 that was one of the most enjoyable...
80,1559842363,4,Dallas vegas goodness http://twitpic.com/3lzt...
81,1999078293,4,@JBsFanArgentina Hey I luv this pic!!! was ama...


Consider also the benefit (or otherwise) of removing the Twitter "at" symbol and name. Use a regular expression to achieve this.

In [7]:
df['tweet'][294]

'@spicebugsmom A few more for you to follow: @aims7 @alirushton @dillyh @Oprah @timescolonist @JohnCleese ...Welcome to Twitter mums '

In [8]:
# Twitter handles are made of letters, digits and underscores, so the
# underscore must be part of the character class; without it a mention such
# as "@some_user" is only stripped up to the underscore.
# '*' (rather than '+') is kept on purpose so that a bare "@" is matched too.
pattern = r'@[A-Za-z0-9_]*'
# Both letter cases are listed explicitly in the class, so no IGNORECASE flag is needed.
regex = re.compile(pattern)
for i in range(290, 300):
    # Single-argument print() gives the same output on Python 2 and Python 3.
    print("{} {}".format(i, regex.findall(df['tweet'][i])))

290 []
291 []
292 []
293 []
294 ['@spicebugsmom', '@aims7', '@alirushton', '@dillyh', '@Oprah', '@timescolonist', '@JohnCleese']
295 []
296 []
297 ['@mishacollins', '@oprah']
298 ['@DaRealSunisaKim']
299 []


In [9]:
# Split tweet 294 once on its @-mentions and show two of the text fragments
# left between them (index 1 and index 7 of the split result).
fragments = regex.split(df['tweet'][294])
# Single-argument print() behaves identically on Python 2 and Python 3.
print(fragments[1])
print(fragments[7])

 A few more for you to follow: 
 ...Welcome to Twitter mums 


In [10]:
df['tweetrhtag'] = df.tweet.apply(lambda x: regex.sub('',x))

In [11]:
df['tweetrhtag'][294]

' A few more for you to follow:       ...Welcome to Twitter mums '

In [12]:
df.head()

Unnamed: 0,id,polarity,tweet,tweetrhtag
0,1467933112,0,the angel is going to miss the athlete this we...,the angel is going to miss the athlete this we...
1,2323395086,0,It looks as though Shaq is getting traded to C...,It looks as though Shaq is getting traded to C...
2,1467968979,0,@clarianne APRIL 9TH ISN'T COMING SOON ENOUGH,APRIL 9TH ISN'T COMING SOON ENOUGH
3,1990283756,0,drinking a McDonalds coffee and not understand...,drinking a McDonalds coffee and not understand...
4,1988884918,0,So dissapointed Taylor Swift doesnt have a Twi...,So dissapointed Taylor Swift doesnt have a Twi...


In [13]:
df[df['polarity']==4].head()

Unnamed: 0,id,polarity,tweet,tweetrhtag
77,1680347120,4,@ mcdonalds with my litto sis aka cuzin lol cr...,mcdonalds with my litto sis aka cuzin lol cri...
78,1835259469,4,@AnnaSaccone Love your new cards! I would de...,Love your new cards! I would definitely hir...
79,1983068285,4,@supricky06 that was one of the most enjoyable...,that was one of the most enjoyable experience...
80,1559842363,4,Dallas vegas goodness http://twitpic.com/3lzt...,Dallas vegas goodness http://twitpic.com/3lzt...
81,1999078293,4,@JBsFanArgentina Hey I luv this pic!!! was ama...,Hey I luv this pic!!! was amazing of the last...


#### Start with a Naive Bayes model using the Count Vectorizer

In [14]:
vectorizer = CountVectorizer()

In [15]:
vectorizer.fit(df['tweet'])

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [16]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    ll = list(vectorizer.get_feature_names_out())
else:
    ll = vectorizer.get_feature_names()
# Show two 10-entry slices of the vocabulary side by side (entries 0-9 and 500-509).
for i in range(0, 10):
    # Single-argument print() behaves identically on Python 2 and Python 3.
    print("{:25s} {:50s}".format(ll[i], ll[i + 500]))
# Vocabulary size (displayed as the cell's output).
len(ll)

00                        back                                              
000                       backed                                            
000gbp                    backstage                                         
00am                      backup                                            
06                        bad                                               
06h8t                     badass                                            
08tv6                     badboyofopera                                     
09                        badly                                             
0gb                       bag                                               
10                        bah                                               


5457

In [17]:
# Build the document-term matrix from the 'tweetrhtag' column (tweets with the
# @-mentions stripped by the regex).
# NOTE(review): the vectorizer was fitted on df['tweet'] (the raw text) but is
# applied here to df['tweetrhtag'] — confirm this mix is intentional; fitting and
# transforming the same column would keep the vocabulary and the matrix consistent.
X = vectorizer.transform(df['tweetrhtag'])

In [18]:
# Binary target: 1 where polarity == 0 (negative tweet), 0 otherwise (polarity 4, positive).
# The builtin int replaces np.int, a deprecated alias of the same type that was
# removed in NumPy 1.24.
y = (df['polarity'] == 0).values.astype(int)

In [19]:
# Sanity check: the feature matrix and the target must have the same number of rows.
# Single-argument print() behaves identically on Python 2 and Python 3.
print(X.shape)
print(y.shape)

(2034, 5457)
(2034,)


In [20]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.5, random_state=15)

In [21]:
def accuracy_report(clf, xtrain, ytrain, xtest, ytest):
    """Print the accuracy of a fitted classifier on the training and test sets.

    Parameters
    ----------
    clf : fitted estimator exposing ``score(X, y)``
    xtrain, ytrain : training features and labels
    xtest, ytest : held-out features and labels

    Returns
    -------
    None. Three lines are printed, each a percentage with two decimals.
    """
    # Score each split exactly once; the "model reported score" and the test
    # accuracy are the same quantity, so the test result is reused.
    test_accuracy = clf.score(xtest, ytest)
    training_accuracy = clf.score(xtrain, ytrain)

    # Single-argument print() behaves identically on Python 2 and Python 3.
    print("Accuracy (model reported score): {:0.2f}".format(100.0 * test_accuracy))
    print("Accuracy on training data: {:0.2f}".format(100.0 * training_accuracy))
    print("Accuracy on test data: {:0.2f}".format(100.0 * test_accuracy))

#### Try a Naive Bayes Multinomial model

In [22]:
# Fit a multinomial Naive Bayes model on the training split and report its accuracy.
# Single-argument print() behaves identically on Python 2 and Python 3.
print("Multinomial")
clf_mn = MultinomialNB().fit(xtrain, ytrain)
accuracy_report(clf_mn, xtrain, ytrain, xtest, ytest)

Multinomial
Accuracy (model reported score): 81.12
Accuracy on training data: 94.59
Accuracy on test data: 81.12


In [23]:
pd.crosstab(ytest, clf_mn.predict(xtest), rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,155,161
1,31,670


#### Try a Naive Bayes Bernoulli model

In [24]:
# Fit a Bernoulli Naive Bayes model on the training split and report its accuracy.
# binarize=0.0 is the threshold used to turn the count features into 0/1 values.
# Single-argument print() behaves identically on Python 2 and Python 3.
print("bernoulli")
clf_bn = BernoulliNB(binarize = 0.0).fit(xtrain, ytrain)
accuracy_report(clf_bn, xtrain, ytrain, xtest, ytest)

bernoulli
Accuracy (model reported score): 71.39
Accuracy on training data: 79.94
Accuracy on test data: 71.39


In [25]:
# Predict once and reuse the result instead of running the model for every print.
bn_predictions = clf_bn.predict(xtest)
# Single-argument print() behaves identically on Python 2 and Python 3.
print(len(bn_predictions))
# Labels are 0/1, so the sum is the number of tweets predicted as class 1 (negative).
print(bn_predictions.sum())
clf_bn.score(xtest, ytest)

1017
988


0.71386430678466073

In [26]:
pd.crosstab(ytest, clf_bn.predict(xtest), rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,27,289
1,2,699


In [27]:
def AnalyzeTweet(test_tweet, vectorizer, clf):
    """Classify a single piece of text and print a human-readable verdict.

    Parameters
    ----------
    test_tweet : str
        Raw text to classify.
    vectorizer : fitted vectorizer exposing ``transform``
    clf : fitted classifier exposing ``predict``

    Returns
    -------
    The predicted label for the text; 0 is reported as positive, any other
    label as negative.
    """
    # Single-argument print() behaves identically on Python 2 and Python 3.
    print("\"" + test_tweet + "\" is judged by clasifier to be...")
    features = vectorizer.transform([test_tweet])

    # Predict once and reuse the label for both the message and the return value.
    label = clf.predict(features)[0]
    if label == 0:
        print("... Positive Tweet.")
    else:
        print("... Negative Tweet.")
    return label

#### Here are some good sentiment phrases correctly classified

In [28]:
#compare 'have a good day' with 'a good day'
# Single-argument print() behaves identically on Python 2 and Python 3.
print(AnalyzeTweet("a good day", vectorizer, clf_mn))
print(AnalyzeTweet("this is fantastic", vectorizer, clf_mn))
print(AnalyzeTweet("congrats on the new job", vectorizer, clf_mn))

"a good day" is judged by clasifier to be...
... Positive Tweet.
0
"this is fantastic" is judged by clasifier to be...
... Positive Tweet.
0
"congrats on the new job" is judged by clasifier to be...
... Positive Tweet.
0


#### Here are some negative sentiment phrases correctly classified

In [29]:
# Single-argument print() behaves identically on Python 2 and Python 3.
print(AnalyzeTweet("what a pain that was", vectorizer, clf_mn))
print(AnalyzeTweet("back luck about the interveiw", vectorizer, clf_mn))
#compare 'hopefully time' with 'hopefully next time'
print(AnalyzeTweet("hopefully time", vectorizer, clf_mn))

"what a pain that was" is judged by clasifier to be...
... Negative Tweet.
1
"back luck about the interveiw" is judged by clasifier to be...
... Negative Tweet.
1
"hopefully time" is judged by clasifier to be...
... Negative Tweet.
1


---
Question
=====
***
1. Re-run the models using the tweets with the @-mentions removed (the 'tweetrhtag' column). Which is the more accurate classifier?
2. Come up with a phrase where the classifier gets it completely wrong!