<font color="#5642C2" size="5">Excersise 2 in Natural Language Processing</font>
<br>
<font color="#5642C2" size="4">Tweet Classification and Sentiment Analysis</font>
<br>
<font color="#5642C2" size="3">Author:</font><font color="#ADADAD" size="3">Angeliki Mylonaki</font>
<br>


In [1]:
import csv
import glob
import os

def getData(regex):
    trainFiles = glob.glob(os.path.join(".", regex)) #make list of paths

    tsvreader=[]

    for fd in trainFiles:
        with open(fd) as tsvfile:
            tsvreader.extend(csv.reader(tsvfile, delimiter="\t"))

    return [line[2] for line in tsvreader], [line[1] for line in tsvreader]
            
def peekOnData(x,y):
    print "Previewing data"
    print "....................................."
    for i in range(1,10):
        print ">> "+ y[i] +": "+x[i]+"\n"
    


<b><font color="#5642C2" size="5" >Tweet Classifier</font></b>
<br>
<font color="#5642C2" size="3">Reading +peeking on all training data</font>


In [96]:
trainX,trainY=getData("*train*.tsv")
devX,devY=getData("*dev*.tsv")
testX,testY=getData("*test*.tsv")

#peekOnData(testX[:10],testY[:10])

<font color="#5642C2" size="3">Mining Tweets</font>

In [82]:
from nltk import corpus
from nltk import word_tokenize
from nltk import sent_tokenize
import string
import re
from autocorrect import spell


def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

def mineTweets(tweetList):
    minedList=[]
    
    stopWords = corpus.stopwords.words("english")
    tweetList=[s.translate(None, string.punctuation) for s in tweetList]
    
    for tweet in tweetList:
        
        #splitting tweet into phrases
        sentences = sent_tokenize(tweet)
        for sentence in sentences:
            #removing digits
            sentence=sentence.translate(None, string.digits)
            #removing urls
            sentence=re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
            #getting words of tweet phrase
            words=word_tokenize(sentence)
            
            #fixing spelling 
            words=[reduce_lengthening(word) for word in words]

            #Poor laptop could not handle spell check
            #words=[spell(word) for word in words]

            
            #removing stopwords
            sentence = ' '.join([word for word in words
                             if word not in stopWords])
            
            minedList.append(sentence)
            
    return minedList

trainX=mineTweets(trainX)
testX=mineTweets(testX)
devX=mineTweets(devX)

#had to drop a lot of data for efficiency.
#Feel free to comment the list slices :)
trainX=trainX[:100]
trainY=trainY[:100]
testX=testX[:100]
testY=testY[:100]
devX=devX[:100]
devY=devY[:100]

<font color="#5642C2" size="3">Vectorizing </font><font color="#ADADAD" size="3">(used in textblob solution)</font>

In [83]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words="english", binary="true")
X_train_counts = count_vect.fit_transform(trainX)
X_dev_counts = count_vect.transform(devX)
X_test_counts=  count_vect.transform(testX)

<font color="#5642C2" size="3">TF-IDF </font><font color="#ADADAD" size="3">(used in textblob solution)</font>

In [84]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_dev_tfidf = tfidf_transformer.transform(X_dev_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

<font color="#5642C2" size="3"> Training Multiple Classifiers</font>

In [85]:
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score



classifiers=[[SGDClassifier(),"Linear Gradient Descent"],
             [RandomForestClassifier(),"Random Forests"],
             [MultinomialNB(),"Multinomial Naive Bayes"],
             [KNeighborsClassifier(),"KNN"],
             [LinearSVC(),"LinearSVC"]]

for clf in classifiers:
    text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', clf[0]),
    ])
    text_clf.fit(trainX, trainY)  
    predicted = text_clf.predict(testX)
    print str(clf[1]+": ").ljust(30),text_clf.score(testX, testY)


Linear Gradient Descent:       0.44
Random Forests:                0.49
Multinomial Naive Bayes:       0.48
KNN:                           0.35
PassiveAggressive:             0.43
LinearSVC:                     0.44


<font color="#5642C2" size="3"> Fine Tuning LinearSVC</font>

In [86]:
#tuning hyperParameters
from sklearn.model_selection import RandomizedSearchCV
import scipy 

#Using RandomizedSearch to tube SVN hyoeroaraneter C

param_grid = {'C': scipy.stats.expon(scale=100), 
              'dual': [True,False],
              'tol': scipy.stats.expon(scale=.1)}

rsearch = RandomizedSearchCV(estimator=LinearSVC(), param_distributions=param_grid, n_iter=100, random_state=7)
rsearch.fit(X_dev_tfidf,devY)

predicted = rsearch.predict(X_test_tfidf)
print str("LinearSVC tuned: ").ljust(30),np.mean(predicted == testY)

LinearSVC tuned:               0.84


<b><font color="#5642C2" size="5">Introducing Sentiment Analysis</font></b>
<br>
<font color="#5642C2" size="3">Mining Textblob Objects function definitions</font>

In [89]:
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
from textblob.classifiers import DecisionTreeClassifier
from textblob.classifiers import MaxEntClassifier

def generateInputforClassification(x,y):
    objList=[]
    for i,j in zip(x,y):
        objList.append((i,j))
    return objList

def createTextBlobs(cl,tweets):
    objList=[]
    for tweet in tweets:
        twt=TextBlob(tweet,classifier=cl)
        objList.append(twt)
    return objList


<font color="#5642C2" size="3">Evaluation funtion definitions</font>

In [93]:
def polarityDistribution(lst):
    print "    Negative tweets percentage: ".ljust(35),100*float(lst.count("negative"))/len(lst),"%"
    print "    Neutral tweets percentage: ".ljust(35), 100*float(lst.count("neutral"))/len(lst),"%"
    print "    Positive tweets percentage: ".ljust(35),100*float(lst.count("positive"))/len(lst),"%"
    
def evaluateResults(clName,cl,predicted,testX,testY):
    docs=generateInputforClassification(testX,testY)
    predicted=calculatePolarity(textBlobs)

    print ">>Classifier: "+ clName
    polarityDistribution(predicted)
    print "    Accuracy: ".ljust(35),cl.accuracy(docs),"\n\n"

def summarizeOriginalData(data):
    print "\n\nActual Distribution: polarity"
    print "..................................."
    polarityDistribution(data)

def calculatePolarity(tweets):
    predicted=[]
    
    for tweet in tweets:
        if tweet.classify()=="positive":
            predicted.append("positive")
        elif tweet.classify()=="negative":
            predicted.append("negative")
        else:
            predicted.append("neutral")
    
    return predicted
   

<font color="#5642C2" size="3">Training Algortithms</font>

In [94]:
docs=generateInputforClassification(trainX,trainY)

classifiers=[["NaiveBayesClassifier",NaiveBayesClassifier(docs)],
             ["DecisionTreeClassifier",DecisionTreeClassifier(docs)],
             ["MaxEntClassifier",MaxEntClassifier(docs)]]

<font color="#5642C2" size="3">Evaluating Results</font>

In [95]:
print "Sentiment Analysis Result"
print "..................................."
for cl in classifiers: 
    textBlobs=createTextBlobs(cl[1],testX)
    evaluateResults(cl[0],cl[1],predicted,testX,testY)

summarizeOriginalData(testY)

Sentiment Analysis Result
...................................
>>Classifier: NaiveBayesClassifier
    Negative tweets percentage:     5.0 %
    Neutral tweets percentage:      0.0 %
    Positive tweets percentage:     95.0 %
    Accuracy:                       0.48 


>>Classifier: DecisionTreeClassifier
    Negative tweets percentage:     8.0 %
    Neutral tweets percentage:      7.0 %
    Positive tweets percentage:     85.0 %
    Accuracy:                       0.48 


  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.09861        0.520
             2          -0.04879        1.000
             3          -0.00385        1.000
             4          -0.00054        1.000
             5          -0.00010        1.000
             6          -0.00002        1.000
             7          -0.00000        1.000
             8          -0.00000        1.000
             9          -0.0000