<font color="#5642C2" size="5">Excersise 2 in Natural Language Processing</font>
<br>
<font color="#5642C2" size="4">Tweet Classification and Sentiment Analysis</font>
<br>
<font color="#5642C2" size="3">Author:</font><font color="#ADADAD" size="3">Angeliki Mylonaki</font>
<br>


In [1]:
import csv
import glob
import os

def getData(regex):
    trainFiles = glob.glob(os.path.join(".", regex)) #make list of paths

    tsvreader=[]

    for fd in trainFiles:
        with open(fd) as tsvfile:
            tsvreader.extend(csv.reader(tsvfile, delimiter="\t"))

    return [line[2] for line in tsvreader], [line[1] for line in tsvreader]
            
def peekOnData(x,y):
    print "Previewing data"
    print "....................................."
    for i in range(1,10):
        print ">> "+ y[i] +": "+x[i]+"\n"
    


<font color="#5642C2" size="5">Tweet Classifier</font>

<font color="#cc00ff" size="3">Reading +peeking on all training data</font>

In [4]:
trainX,trainY=getData("*train*.tsv")
devX,devY=getData("*dev*.tsv")
testX,testY=getData("*test*.tsv")

peekOnData(testX[:10],testY[:10])

Previewing data
.....................................
>> neutral: @fakethom Have android tab and don't use phone much, in fact very little. May go the Sony route then:-)

>> positive: Finally I get my ps4 back I sent it to Sony cause my HDMI was mess up now I can play MG's Tuesday yeaaaaa buddy

>> neutral: Sony's 1st teaser package for the launch of the original Playstation seems to feature a dominatrix? https://t.co/xbisCRkPL4 #MistressSophia

>> neutral: #tv Ind vs SL 3rd Test Day 3: Cricket live score and Sony Six live streaming info: Watch the live teleca... http://t.co/mUlHw4cN00 #Sony

>> neutral: @TruthInsider @bertymufc @gamerxone720 @PNF4LYFE @Yanks2013 @VirtuaMe Lol it's all about Sony Sony Sony, if Sony gave Bj's u be the 1st

>> positive: When you remember Sony is trying to make bible study mandatory on Sunday nights @xo_taylorbang http://t.co/CfljDMvMv3

>> positive: @InfinityInq Everyone is playing 3.0 and I'm just sitting here playing 2.0 because Sony is making me wait 

<font color="#cc00ff" size="3">Mining Tweets</font>

In [5]:
from nltk import corpus
from nltk import word_tokenize
from nltk import sent_tokenize
import string
import re
from autocorrect import spell


def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

def mineTweets(tweetList):
    minedList=[]
    
    stopWords = corpus.stopwords.words("english")
    tweetList=[s.translate(None, string.punctuation) for s in tweetList]
    
    for tweet in tweetList:
        
        #splitting tweet into phrases
        sentences = sent_tokenize(tweet)
        for sentence in sentences:
            #removing digits
            sentence=sentence.translate(None, string.digits)
            #removing urls
            sentence=re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            #getting words of tweet phrase
            words=word_tokenize(sentence)
            
            #fixing spelling 
            words=[reduce_lengthening(word) for word in words]
            #should uncomment that, but it introduces HUGE DELAY
            #words=[spell(word) for word in words]

            
            #removing stopwords
            sentence = ' '.join([word for word in words
                             if word not in stopWords])
            
            minedList.append(sentence)
            
    return minedList

trainX=mineTweets(trainX)
trainX=trainX[:100]
trainY=trainY[:100]
testX=mineTweets(testX)
testX=testX[:100]
testY=testY[:100]
devX=mineTweets(devX)
devX=devX[:100]
devY=devY[:100]

<font color="#cc00ff" size="3">Vectorizing</font>

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words="english", binary="true")
X_train_counts = count_vect.fit_transform(trainX)
X_dev_counts = count_vect.transform(devX)
X_test_counts=  count_vect.transform(testX)

<font color="#cc00ff" size="3"> Calculating TF-IDF</font>

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_dev_tfidf = tfidf_transformer.transform(X_dev_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

<font color="#cc00ff" size="3"> Training Multiple Classifiers</font>

In [8]:
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score



classifiers=[[SGDClassifier(),"Linear Gradient Descent"],
             [RandomForestClassifier(),"Random Forests"],
             [MultinomialNB(),"Multinomial Naive Bayes"],
             [KNeighborsClassifier(),"KNN"],
             [PassiveAggressiveClassifier(),"PassiveAggressive"],
             [LinearSVC(),"LinearSVC"]]

for clf in classifiers:
    text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', clf[0]),
    ])
    text_clf.fit(trainX, trainY)  
    predicted = text_clf.predict(testX)
    print str(clf[1]+": ").ljust(30),text_clf.score(testX, testY)




Linear Gradient Descent:       0.44
Random Forests:                0.47
Multinomial Naive Bayes:       0.48
KNN:                           0.35
PassiveAggressive:             0.41
LinearSVC:                     0.44


<font color="#cc00ff" size="3"> Fine Tuning LinearSVC</font>

In [9]:
#tuning hyperParameters
from sklearn.model_selection import RandomizedSearchCV
import scipy 

#Using RandomizedSearch to tube SVN hyoeroaraneter C

param_grid = {'C': scipy.stats.expon(scale=100), 
              'dual': [True,False],
              'tol': scipy.stats.expon(scale=.1)}

rsearch = RandomizedSearchCV(estimator=LinearSVC(), param_distributions=param_grid, n_iter=100, random_state=7)
rsearch.fit(X_dev_tfidf,devY)

predicted = rsearch.predict(X_test_tfidf)
print str("LinearSVC tuned: ").ljust(30),np.mean(predicted == testY)

LinearSVC tuned:               0.84


<font color="#5642C2" size="5">Introducing Sentiment Analysis</font>

<font color="#cc00ff" size="3">Function to calculate polarity for all Tweets</font>

In [46]:
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
from textblob.classifiers import DecisionTreeClassifier
from textblob.classifiers import MaxEntClassifier

def generateInputforClassification(x,y):
    objList=[]
    for i,j in zip(x,y):
        objList.append((i,j))
    return objList

def createTextBlobs(cl,tweets):
    objList=[]
    for tweet in tweets:
        twt=TextBlob(tweet,classifier=cl)
        objList.append(twt)
    return objList


<font color="#cc00ff" size="3">A bit more mining</font>

In [57]:
def calculatePolarity(tweets):
    predicted=[]
    
    for tweet in tweets:
        if tweet.classify()=="positive":
            predicted.append("positive")
        elif tweet.classify()=="negative":
            predicted.append("negative")
        else:
            predicted.append("neutral")
    

    return predicted


def polarityDistribution(lst):
    print "Negative tweets percentage: ".ljust(30),100*float(lst.count("negative"))/len(lst),"%"
    print "Neutral tweets percentage: ".ljust(30), 100*float(lst.count("neutral"))/len(lst),"%"
    print "Positive tweets percentage: ".ljust(30),100*float(lst.count("positive"))/len(lst),"%"
    
def evaluateResults(clName,predicted,testX,testY,original=False):
    if not original:
        docs=generateInputforClassification(testX,testY)
        predicted=calculatePolarity(textBlobs)

        print "Classifier: "+ clName
        print "Sentiment Analysis Result: polarity"
        print "..................................."
        polarityDistribution(predicted)
    else:
        print "\n\nActual Distribution: polarity"
        print "..................................."
        polarityDistribution(testY)

   




Sentiment Analysis Result: polarity
...................................
Negative tweets percentage:    5.0 %
Neutral tweets percentage:     0.0 %
Positive tweets percentage:    95.0 %


Actual Distribution: polarity
...................................
Negative tweets percentage:    10.0 %
Neutral tweets percentage:     40.0 %
Positive tweets percentage:    50.0 %


TextBlob accuracy:           0.48


In [59]:
docs=generateInputforClassification(trainX,trainY)

classifiers=[["NaiveBayesClassifier",NaiveBayesClassifier(docs)],
             ["DecisionTreeClassifier",DecisionTreeClassifier(docs)],
             ["MaxEntClassifier",MaxEntClassifier(docs)]]


for cl in classifiers: 
    textBlobs=createTextBlobs(cl[1],testX)
    evaluateResults(cl[0],predicted,testX,testY)

TypeError: evaluateResults() takes exactly 3 arguments (4 given)