In [1]:
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

try:
    # train_test_split lives in model_selection on current scikit-learn releases.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the cross_validation module.
    from sklearn.cross_validation import train_test_split

---
Tweet Sentiment Analysis
=====
***

### Create a Twitter classifier with Naive Bayes and check the sentiment with a keyword of your choice
### Use tweet_training.csv as your training set
### Sentiment is described as 'polarity', where
### - '0' = negative
### - '4' = positive

In [2]:
# Load the semicolon-delimited training tweets. The path is machine-specific:
# point it at your local copy of tweet_training.csv.
tweet_csv_path = '/Users/mrgholt/GADS-22-NYC/Datasets/tweet_training.csv'
df = pd.read_csv(tweet_csv_path, sep=';')

In [3]:
# (rows, columns) of the tweets labelled positive (polarity 4).
df.loc[df['polarity'] == 4].shape

In [4]:
# (rows, columns) of the tweets labelled negative (polarity 0).
df.loc[df['polarity'] == 0].shape

#### The data is "unbalanced" in the sense that the 0 polarity tweets outnumber the 4 polarity tweets.

- This represents different priors. 

- Sklearn's Naive Bayes models take the different class representations (i.e. the priors) into account for you

- BUT you can alter the model by supplying your own priors ('class_prior'), or ignore the observed class frequencies and use a uniform prior with 'fit_prior=False'

- NB: Laplace correction. In addition, these algorithms use a very slightly modified formula so as to avoid zero probabilities. This is referred to as 'smoothing' and is controlled by the 'alpha' parameter

In [5]:
# Preview the first rows of the loaded training frame.
df.head()

In [6]:
# Preview the first rows among the positive (polarity 4) tweets.
df.loc[df['polarity'] == 4].head()

Consider also the benefit (or otherwise) of removing the Twitter "at" symbol and name. Use a regular expression to achieve this.

In [7]:
# Raw text of the tweet at row label 294.
df.loc[294, 'tweet']

In [8]:
# Twitter handle: '@' followed by letters, digits or underscores.
# The underscore has to be part of the character class; without it a handle
# such as '@some_user' is only matched up to '@some' and '_user' is left behind.
pattern = r'@[A-Za-z0-9_]*'
regex = re.compile(pattern, flags=re.IGNORECASE)
# Show the handles found in a small sample of tweets (row labels 290-299).
# A single pre-formatted argument keeps print() identical on Python 2 and 3.
for i in range(290, 300):
    print("{} {}".format(i, regex.findall(df['tweet'][i])))

In [9]:
# Split tweet 294 on its handles once, then show two of the text fragments
# that remain between the matches.
fragments = regex.split(df['tweet'][294])
print(fragments[1])
print(fragments[7])

In [10]:
# New column: each tweet with every handle match replaced by an empty string.
df['tweetrhtag'] = df['tweet'].map(lambda text: regex.sub('', text))

In [11]:
# The same tweet (row label 294) after the handles were stripped.
df.loc[294, 'tweetrhtag']

In [12]:
# Preview the frame again, now including the 'tweetrhtag' column.
df.head()

In [13]:
# First rows among the positive (polarity 4) tweets, with the stripped column alongside.
df.loc[df['polarity'] == 4].head()

#### Start with a Naive Bayes model using the Count Vectorizer

In [14]:
# Bag-of-words vectorizer constructed with its default settings.
vectorizer = CountVectorizer()

In [15]:
# Learn the vocabulary from the original 'tweet' column (handles not stripped).
vectorizer.fit(df['tweet'])

In [16]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    ll = list(vectorizer.get_feature_names_out())
else:
    ll = vectorizer.get_feature_names()
# Print ten vocabulary entries next to the entries 500 positions further on.
for i in range(0, 10):
    print("{:25s} {:50s}".format(ll[i], ll[i+500]))
# Vocabulary size (displayed as the cell output).
len(ll)

In [17]:
# Encode the handle-stripped tweets with the fitted vectorizer.
# NOTE(review): the vocabulary was fitted on df['tweet'] but is applied to
# df['tweetrhtag'] here — confirm that fitting and transforming on different
# columns is intended.
X = vectorizer.transform(df['tweetrhtag'])

In [18]:
# Binary target: 1 where polarity is 0 (negative), 0 otherwise.
# The builtin int replaces np.int, an alias for the same type that NumPy
# deprecated in 1.20 and removed in 1.24.
y = (df['polarity'] == 0).values.astype(int)

In [19]:
# Sanity check: the feature matrix and the target must have the same number of rows.
print(X.shape)
print(y.shape)

In [20]:
# Hold out half of the rows for testing; the fixed random_state keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=15
)

In [21]:
def accuracy_report(clf, xtrain, ytrain, xtest, ytest):
    """Print a classifier's accuracy on the test and training data as percentages.

    Parameters
    ----------
    clf : fitted estimator exposing ``score(X, y)``.
    xtrain, ytrain : training features and labels.
    xtest, ytest : held-out features and labels.

    Returns
    -------
    None. Three report lines are written to stdout.
    """
    # The "model reported score" line and the test-data line show the same
    # quantity, so the test split is scored once and the value reused.
    test_accuracy = clf.score(xtest, ytest)
    print("Accuracy (model reported score): {:0.2f}".format(100.0 * test_accuracy))

    training_accuracy = clf.score(xtrain, ytrain)

    # A single pre-formatted argument keeps print() identical on Python 2 and 3.
    print("Accuracy on training data: {:0.2f}".format(100.0 * training_accuracy))
    print("Accuracy on test data: {:0.2f}".format(100.0 * test_accuracy))

#### Try a Naive Bayes Multinomial model

In [22]:
# Fit a multinomial Naive Bayes model on the training split and report its accuracy.
print("Multinomial")
clf_mn = MultinomialNB().fit(xtrain, ytrain)
accuracy_report(clf_mn, xtrain, ytrain, xtest, ytest)

In [23]:
# Confusion table for the multinomial model: actual labels in rows, predictions in columns.
pd.crosstab(
    ytest,
    clf_mn.predict(xtest),
    rownames=["Actual"],
    colnames=["Predicted"]
)

#### Try a Naive Bayes Bernoulli model

In [24]:
# Fit a Bernoulli Naive Bayes model (counts above 0.0 are binarized to 1)
# on the training split and report its accuracy.
print("bernoulli")
clf_bn = BernoulliNB(binarize = 0.0).fit(xtrain, ytrain)
accuracy_report(clf_bn, xtrain, ytrain, xtest, ytest)

In [25]:
# Predict once and reuse the result: number of test predictions, then their
# sum (the count of 1 predictions when labels are 0/1), then the test score.
bn_predictions = clf_bn.predict(xtest)
print(len(bn_predictions))
print(bn_predictions.sum())
clf_bn.score(xtest, ytest)

In [26]:
# Confusion table for the Bernoulli model: actual labels in rows, predictions in columns.
pd.crosstab(
    ytest,
    clf_bn.predict(xtest),
    rownames=["Actual"],
    colnames=["Predicted"]
)

In [27]:
def AnalyzeTweet(test_tweet, vectorizer, clf):
    """Classify a single tweet, print the verdict and return the predicted label.

    Parameters
    ----------
    test_tweet : str
        Raw tweet text to classify.
    vectorizer : fitted vectorizer exposing ``transform``.
    clf : fitted classifier exposing ``predict``.

    Returns
    -------
    The first element of ``clf.predict`` for the tweet. Label 0 is reported
    as positive, any other label as negative.
    """
    print("\""  + test_tweet + "\" is judged by clasifier to be...")
    features = vectorizer.transform([test_tweet])

    # Predict once; the same label drives both the message and the return value.
    label = clf.predict(features)[0]
    if label == 0:
        print("... Positive Tweet.")
    else:
        print("... Negative Tweet.")
    return label

#### Here are some positive sentiment phrases correctly classified

In [28]:
#compare 'have a good day' with 'a good day'
# AnalyzeTweet prints its verdict; the outer print() shows the returned label.
print(AnalyzeTweet("a good day", vectorizer, clf_mn))
print(AnalyzeTweet("this is fantastic", vectorizer, clf_mn))
print(AnalyzeTweet("congrats on the new job", vectorizer, clf_mn))

#### Here are some negative sentiment phrases correctly classified

In [29]:
# AnalyzeTweet prints its verdict; the outer print() shows the returned label.
print(AnalyzeTweet("what a pain that was", vectorizer, clf_mn))
print(AnalyzeTweet("back luck about the interveiw", vectorizer, clf_mn))
#compare 'hopefully time' with 'hopefully next time'
print(AnalyzeTweet("hopefully time", vectorizer, clf_mn))

---
Question
=====
***
1. Re-run the models using the tweets with the @-handles removed (the `tweetrhtag` column). Which is the more accurate classifier?
2. Come up with a phrase where the classifier gets it completely wrong!