In [216]:
import csv
import glob
import os

def getData(regex):
    trainFiles = glob.glob(os.path.join(".", regex)) #make list of paths

    tsvreader=[]

    for fd in trainFiles:
        with open(fd) as tsvfile:
            tsvreader.extend(csv.reader(tsvfile, delimiter="\t"))

    return [line[2] for line in tsvreader], [line[1] for line in tsvreader]
            
def peekOnData(x,y):
    print "Previewing data"
    print "....................................."
    for i in range(1,10):
        print ">> "+ y[i] +": "+x[i]+"\n"
    


<font color="#cc00ff" size="3">Reading +peeking on all training data</font>

In [217]:
trainX,trainY=getData("*train*.tsv")
devX,devY=getData("*dev*.tsv")
testX,testY=getData("*test*.tsv")

peekOnData(testX[:10],testY[:10])

Previewing data
.....................................
>> neutral: @fakethom Have android tab and don't use phone much, in fact very little. May go the Sony route then:-)

>> positive: Finally I get my ps4 back I sent it to Sony cause my HDMI was mess up now I can play MG's Tuesday yeaaaaa buddy

>> neutral: Sony's 1st teaser package for the launch of the original Playstation seems to feature a dominatrix? https://t.co/xbisCRkPL4 #MistressSophia

>> neutral: #tv Ind vs SL 3rd Test Day 3: Cricket live score and Sony Six live streaming info: Watch the live teleca... http://t.co/mUlHw4cN00 #Sony

>> neutral: @TruthInsider @bertymufc @gamerxone720 @PNF4LYFE @Yanks2013 @VirtuaMe Lol it's all about Sony Sony Sony, if Sony gave Bj's u be the 1st

>> positive: When you remember Sony is trying to make bible study mandatory on Sunday nights @xo_taylorbang http://t.co/CfljDMvMv3

>> positive: @InfinityInq Everyone is playing 3.0 and I'm just sitting here playing 2.0 because Sony is making me wait 

<font color="#cc00ff" size="3">Mining Tweets</font>

In [218]:
import nltk
import string
import re
from autocorrect import spell


def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

def mineTweets(tweetList):
    minedList=[]
    
    stopWords = nltk.corpus.stopwords.words("english")
    tweetList=[s.translate(None, string.punctuation) for s in tweetList]
    
    for tweet in tweetList:
        
        #splitting tweet into phrases
        sentences = nltk.sent_tokenize(tweet)
        for sentence in sentences:
            
            #getting words of tweet phrase
            words=nltk.word_tokenize(sentence)
            
            #fixing spelling 
            words=[reduce_lengthening(word) for word in words]
            #should uncoment that, but it introduces HUGE DELAY
            #words=[spell(word) for word in words]

            
            #removing stopwords
            sentence = ' '.join([word for word in words
                             if word not in stopWords])
            
            minedList.append(sentence)
            
    return minedList

trainX=mineTweets(trainX)
testX=mineTweets(testX)
devX=mineTweets(devX)

<font color="#cc00ff" size="3">Vectorizing</font>

In [223]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words="english", binary="true")
X_dev_counts = count_vect.fit_transform(devX)
X_test_counts=  count_vect.transform(testX)

<font color="#cc00ff" size="3"> Calculating TF-IDF</font>

In [224]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_dev_tfidf = tfidf_transformer.transform(X_dev_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

<font color="#cc00ff" size="3"> Training Multiple Classifiers</font>

In [262]:
from sklearn.linear_model import SGDClassifier, Perceptron
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import LinearSVC


classifiers=[[SGDClassifier(),"Linear Gradient Descent"],[RandomForestClassifier(),"Random Forests"],
             [MultinomialNB(),"Multinomial Naive Bayes"],[KNeighborsClassifier(),"KNN"],
             [NearestCentroid,"NearestCentroid"],[PassiveAggressiveClassifier(),"PassiveAggressive"],
             [LinearSVC(),"LinearSVC"]]

for clf in classifiers:
    ext_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', clf[0]),
    ])
    text_clf.fit(trainX, trainY)  
    predicted = text_clf.predict(testX)
    print str(clf[1]+": ").ljust(30),np.mean(predicted == testY)


Linear Gradient Descent:       0.589858226955
Random Forests:                0.589328784046
Multinomial Naive Bayes:       0.59127007471
KNN:                           0.591975998588
NearestCentroid:               0.588916995117
PassiveAggressive:             0.590564150832
LinearSVC:                     0.59121124772


<font color="#cc00ff" size="3"> Fine Tuning LinearSVC</font>

In [269]:
#tuning hyperParameters
from sklearn.model_selection import RandomizedSearchCV
import scipy 

#Using RandomizedSearch to tube SVN hyoeroaraneter C

param_grid = {'C': scipy.stats.expon(scale=100), 
              'dual': [True,False],
              'tol': scipy.stats.expon(scale=.1)}

rsearch = RandomizedSearchCV(estimator=LinearSVC(), param_distributions=param_grid, n_iter=100, random_state=7)
rsearch.fit(X_dev_tfidf,devY)

predicted = rsearch.predict(X_test_tfidf)
print str("LinearSVC tuned: ").ljust(30),np.mean(predicted == testY)

LinearSVC tuned:               0.508971115948
