# Target Dependent Twitter Sentiment Predictions

By Gautam Borgohain 

In [1]:
# All imports

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.base import BaseEstimator, TransformerMixin
from nltk import pos_tag
from nltk import word_tokenize
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_recall_curve, classification_report
from sklearn.metrics import roc_curve, auc
from sklearn import cross_validation
from pycorenlp import StanfordCoreNLP

from sklearn.externals import joblib

lexicon_path = '/Users/gautamborgohain/PycharmProjects/DataScience/Twitter_target_dependent_SA/subjectivity.csv'


In [2]:
pd.options.display.max_rows = None
pd.options.display.notebook_repr_html = True
pd.options.display.max_columns = None

data = pd.read_excel('/Users/gautamborgohain/Desktop/Tweets_labeled_325.xlsx')


In [3]:
data.index = np.arange(len(data))
y = data.Sentiment

X = data.Tweet

In [4]:
# Funciton to create the dummy variableso of the features that are extracted

def getFeatureDF(feature_list):
    vectorizer = CountVectorizer()
    docmatrix = vectorizer.fit_transform(feature_list).toarray()
    columns = vectorizer.get_feature_names()
    columns = [word.upper() for word in columns]  # uppercasing to avoid conflict of in and other words
    df = pd.DataFrame(data=docmatrix, columns=columns)
    return df

# Regex word normalization

In [5]:
import re
def regexStuff(tweet):
#     tweet = re.sub(r'@SMRT_singapore|@smrt_singapore|@smrt|@SMRT_Singapore','TARGET',tweet)
    tweet = re.sub(r'@[^ ]*','TARGET',tweet)
    #Clear the http and other characters that are causing problems
    tweet = re.sub('((www\.[^ ]+)|(https?://[^ ]+))', '', tweet)
    tweet = re.sub(r'http?[^ ]+','',tweet)
    tweet = re.sub(r'[\n]','',tweet)
    tweet = re.sub('[\s]+', ' ', tweet)
    tweet = re.sub('[\.]+', '.', tweet)
    tweet = re.sub('…','',tweet)
    tweet = re.sub('[-—]','',tweet)
    tweet = re.sub(r'&gt;|&amp;|&lt;','',tweet)
    #Substitue common shorthands with the appropriate words for POS tagging and type dependecy to work
    tweet = re.sub(r' u ',' you ',tweet)
    tweet = re.sub(r' n ',' no ',tweet)
    tweet = re.sub(r' y ',' why ',tweet)
    tweet = re.sub(r' nt ',' not ',tweet)
    tweet = re.sub(r' dwn ',' down ',tweet)
    tweet = re.sub(r' frver ',' forever ',tweet)
    tweet = re.sub(r' bc ',' because ',tweet)
    tweet = re.sub(r' bcoz ',' because ',tweet)
    tweet = re.sub(r' cuz ',' because ',tweet)
    tweet = re.sub(r' im ',' I am ',tweet)
    tweet = re.sub(r' zzz ',' sleep ',tweet)
    tweet = re.sub(r' y\'all ',' you all ',tweet)
    #Set the hash tags
    tweet = re.sub(r'#','HASH_',tweet)
    return tweet


In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class regExProcesses(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, df):
        return [regexStuff(tweet) for tweet in df]


# POS Tags

In [7]:
from nltk import pos_tag
from nltk import word_tokenize

class posTagTweets(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, df):
        tagsoftweet = []
        for tweet in df:
            postaggedtweet = pos_tag(word_tokenize(tweet))  # this one is pos atgged..list inside list : token[1] for tag
            tags = []
            for token in postaggedtweet:
                tags.append(token[1])
            tagsoftweet.append(' '.join(tags))
#             print(' '.join(tags))
            
#         df = getFeatureDF(tagsoftweet)
        return tagsoftweet

# Subjectivity Lexicon - Wilson et al

In [8]:
def cleantweet(tweet):
    tweet = re.sub('url|at_user|rt|\.', '', tweet)  ## removing these from the tweets
    return tweet

class subjectivityLexicon(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, df):
        lexicon = pd.read_csv(lexicon_path)
        tweet_tags = []
        count_tweet = 1
        for tweet in df:
            tweet = cleantweet(tweet)
            typeList = []
            priorpolarityList = []
            count_word = 0  
            count_tweet += 1
            for word in word_tokenize(tweet):
                result = lexicon[lexicon.word1 == word]
                if len(result) != 0:  # word is there in the lexicon
                    if len(result) == 1:  # this case is handling the ones where the there is only one record of the word
                        typeList.append(result.iloc[0][0])
                        priorpolarityList.append(result.iloc[0][5])
                    if len(result) > 1:  
    #                     print('Have to tag POS, Hold On!')
                        poslist = pos_tag(word_tokenize(tweet))#Tag the tweet
                        postag = poslist[count_word][1]#Using the position of the word, find the POS tag
                        if postag in ['NN', 'NNP', 'NNS',
                                      'NNPS']:  # make the POS tags to the format used by the MPQA lexicon
                            postag = 'noun'
                        elif postag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
                            postag = 'verb'
                        elif postag in ['RB', 'RBR', 'RBS']:
                            postag = 'adverb'
                        elif postag in ['JJ', 'JJR', 'JJS']:
                            postag = 'adj'
                        second_result = result[result.pos1 == postag]#get the word-postag combination from the lexicon
                        if len(second_result) != 0:  
                            typeList.append(second_result.iloc[0][0])
                            priorpolarityList.append(second_result.iloc[0][5])

                count_word += 1

            tweet_tags.append(' '.join(typeList) + ' ' + ' '.join(priorpolarityList))

        return tweet_tags


# Target Features

In [9]:
keywords = ['SMRT','smrt','smrt_singapore','SMRT_SINGAPORE','train','mrt','TARGET','people','LRT','lrt']

def get_hastags(tweet):
    hash_tags = re.findall('HASH_([^ ]*)', tweet)
    return hash_tags

def getAdjectives(tweet):
    poslist = pos_tag(word_tokenize(tweet))
    adjectives = []
    for pos in poslist:
        if pos[1] in ['JJ', 'JJR', 'JJS']:
            adjectives.append(pos[0])
            
    return adjectives
            

class enhancedTargetFeats(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, frame):
        tweet_target_features = []
        for tweet in frame:
            tweet = cleantweet(tweet)
            tags = get_hastags(tweet)
    #         keywords = ['SMRT', 'mrt','lrt','LRT', 'MRT', 'smrt', 'Singapore_MRT',"TARGET"]
            tokens = word_tokenize(tweet)  
            targets_feature = []
            for keyword in keywords:
                if keyword in tags:  
                    feature = keyword + '_hash'
                    targets_feature.append(feature)
                if keyword in tokens:
                    adjectives = getAdjectives(tweet)  # This will get all the adjectives, not just one
                    features = []
                    for adjective in adjectives:
                        adjective = re.sub('-', '_',
                                           adjective)  
                        features.append(keyword + '_' + adjective)
                    feature = ' '.join(features)
                    targets_feature.append(feature)

            tweet_target_features.append(' '.join(targets_feature))
        return tweet_target_features

# Lexicon - Hu and Liu

In [10]:
#
# Get the positiv - negative word lexicon
#
positive_lexicon_path = '/Users/gautamborgohain/PycharmProjects/DT_Labs/PLayground/Gautam_Borg/HuLiuLexicon/positive-words.txt'
negative_lexicon_path ='/Users/gautamborgohain/PycharmProjects/DT_Labs/PLayground/Gautam_Borg/HuLiuLexicon/negative-words.txt'
poshand = open(positive_lexicon_path)
neghand = open(negative_lexicon_path)
poslist = []
neglist = []
for line in poshand:
    poslist.append(re.sub(r'\n','',line))
for line in neghand:
    neglist.append(re.sub(r'\n','',line))
print(len(poslist),len(neglist))

poshand.close()
neghand.close()

2006 4783


In [11]:
def getPositiveWordCount(tweet):
    countPos = 0
    for word in word_tokenize(tweet):
        if len(word)>=2 and word in poslist: countPos+=1
    return countPos

def getNegativeWordCount(tweet):
    countNeg = 0
    for word in word_tokenize(tweet):
        if len(word)>=2 and word in neglist: countNeg+=1
    return countNeg

class lexiconSent(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    
    
    def transform(self, frame):
        df = pd.DataFrame(columns=['POS_LEX','NEG_LEX'])
        df['POS_LEX'] =   [getPositiveWordCount(tweet) for tweet in frame]
        df['NEG_LEX'] =   [getNegativeWordCount(tweet) for tweet in frame]
             
        return df.as_matrix()

# Punctuations

In [12]:

class punctuations(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, frame):
        df = pd.DataFrame()
        df['PUNC_EXCL'] = [len(re.findall(r'!',tweet)) for tweet in frame]
        df['PUNC_QUES'] = [len(re.findall(r'\?',tweet)) for tweet in frame]
        return df.as_matrix()

## Emoticons

In [13]:
# data_unlab_copy['EMOT_HAPPY'] = [len(re.findall(r':-D|:D|:-\)|:\)',tweet)) for tweet in data_unlab_copy['Tweet']]
# data_unlab_copy['EMOT_SAD'] = [len(re.findall(r':-\(|:\(',tweet)) for tweet in data_unlab_copy['Tweet']]
# data_unlab_copy['EMOT_WINK'] = [len(re.findall(r';-\)|;\)',tweet)) for tweet in data_unlab_copy['Tweet']]
# data_unlab_copy['ANGRY_EMO'] = [len(re.findall(r'\U0001F621|\U0001F624|\U0001F63E|\U0001F449|\U0001F44A',tweet)) for tweet in data_unlab_copy['Tweet']]
# data_unlab_copy['LOVE_EMO'] = [len(re.findall(r'\U0001F618|\U0001F61A|\U0001F63B|\U0001F63D',tweet)) for tweet in data_unlab_copy['Tweet']]
# data_unlab_copy['HAPPY_EMO'] = [len(re.findall(r'\U0001F602|\U0001F603|\U0001F604|\U0001F605|\U0001F606|\U0001F609|\U0001F60A|\U0001F60B|\U0001F60C|\U0001F60D|\U0001F60F|\U0001F612|\U0001F61C|\U0001F61D|\U0001F638|\U0001F639|\U0001F63A|\U0001F63C|\U0001F44C|\U0001F44D|\U0001F44F|\U0001F450|\U0001F451|\U0001F600|\U0001F607|\U0001F608|\U0001F60E|\U0001F617|\U0001F619|\U0001F61B|\U0001f917|\U0001f595|\U0001f389|\U0001f38a',tweet)) for tweet in data_unlab_copy['Tweet']]
# data_unlab_copy['NEUTRAL_EMO'] = [len(re.findall(r'U0001F601|\U0001F633|\U0001F645|\U0001F646|\U0001F647|\U0001F648|\U0001F649|\U0001F64A|\U0001F64B|\U0001F64C|\U0001F64D|\U0001F64E|\U0001F64F|\U0001F448|\U0001F44B|\U0001F610|\U0001F611|\U0001F615|\U0001F62C|\U0001F636',tweet)) for tweet in data_unlab_copy['Tweet']]
# data_unlab_copy['SAD_EMO'] = [len(re.findall(r'\U0001F613|\U0001F614|\U0001F616|\U0001F61E|\U0001F620|\U0001F622|\U0001F623|\U0001F625|\U0001F628|\U0001F629|\U0001F62A|\U0001F62B|\U0001F62D|\U0001F630|\U0001F631|\U0001F632|\U0001F635|\U0001F637|\U0001F63F|\U0001F640|\U0001F44E|\U0001f634|\U0001F61F|\U0001F626|\U0001F627|\U0001F62E|\U0001F62F|\U0001F634',tweet)) for tweet in data_unlab_copy['Tweet']]

class emoticons(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, frame):
        df = pd.DataFrame()
        posEmot,negEmot,posEmo,negEmo,neutralEmo = [],[],[],[],[]
        for tweet in frame:
            posEmot.append(len(re.findall(r':-D|:D|:-\)|:\)|;-\)|;\)',tweet)))
            negEmot.append(len(re.findall(r':-\(|:\(',tweet)))
            posEmo.append(len(re.findall(r'\U0001F618|\U0001F61A|\U0001F63B|\U0001F63D|\U0001F602|\U0001F603|\U0001F604|\U0001F605|\U0001F606|\U0001F609|\U0001F60A|\U0001F60B|\U0001F60C|\U0001F60D|\U0001F60F|\U0001F612|\U0001F61C|\U0001F61D|\U0001F638|\U0001F639|\U0001F63A|\U0001F63C|\U0001F44C|\U0001F44D|\U0001F44F|\U0001F450|\U0001F451|\U0001F600|\U0001F607|\U0001F608|\U0001F60E|\U0001F617|\U0001F619|\U0001F61B|\U0001f917|\U0001f595|\U0001f389|\U0001f38a',tweet)))
            negEmo.append(len(re.findall(r'\U0001F621|\U0001F624|\U0001F63E|\U0001F449|\U0001F44A|\U0001F613|\U0001F614|\U0001F616|\U0001F61E|\U0001F620|\U0001F622|\U0001F623|\U0001F625|\U0001F628|\U0001F629|\U0001F62A|\U0001F62B|\U0001F62D|\U0001F630|\U0001F631|\U0001F632|\U0001F635|\U0001F637|\U0001F63F|\U0001F640|\U0001F44E|\U0001f634|\U0001F61F|\U0001F626|\U0001F627|\U0001F62E|\U0001F62F|\U0001F634',tweet)) )
            neutralEmo.append(len(re.findall(r'U0001F601|\U0001F633|\U0001F645|\U0001F646|\U0001F647|\U0001F648|\U0001F649|\U0001F64A|\U0001F64B|\U0001F64C|\U0001F64D|\U0001F64E|\U0001F64F|\U0001F448|\U0001F44B|\U0001F610|\U0001F611|\U0001F615|\U0001F62C|\U0001F636',tweet)) )
            
        df['EMOT_POS'] = posEmot
        df['EMOT_NEG'] = negEmot
        df['EMO_POS'] = posEmo
        df['EMO_NEG'] = negEmo
        df['EMO_NEUTRAL'] = neutralEmo
        
        return df.as_matrix()

## Type Dependency


### Stanford Core NLP dependency tree features

cd stanford-corenlp-full-2015-12-09/

export CLASSPATH="`find . -name '*.jar'`" 

java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer

In [14]:
from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')
# Sample output from the Stanford dependency parser

text = ("U guys better wake up ur idea! People pay so much for ur crappy service. People late for work now. ATSMRT")
output = nlp.annotate(text, properties={
        'annotators': 'parse,relation',
        'outputFormat': 'json'
    })
# print(output)
print(output['sentences'][0]['basic-dependencies'])

[{'dep': 'ROOT', 'governorGloss': 'ROOT', 'dependentGloss': 'guys', 'governor': 0, 'dependent': 2}, {'dep': 'compound', 'governorGloss': 'guys', 'dependentGloss': 'U', 'governor': 2, 'dependent': 1}, {'dep': 'amod', 'governorGloss': 'wake', 'dependentGloss': 'better', 'governor': 4, 'dependent': 3}, {'dep': 'dep', 'governorGloss': 'guys', 'dependentGloss': 'wake', 'governor': 2, 'dependent': 4}, {'dep': 'case', 'governorGloss': 'idea', 'dependentGloss': 'up', 'governor': 7, 'dependent': 5}, {'dep': 'compound', 'governorGloss': 'idea', 'dependentGloss': 'ur', 'governor': 7, 'dependent': 6}, {'dep': 'nmod', 'governorGloss': 'wake', 'dependentGloss': 'idea', 'governor': 4, 'dependent': 7}, {'dep': 'punct', 'governorGloss': 'guys', 'dependentGloss': '!', 'governor': 2, 'dependent': 8}]


In [15]:
def posorneg(word):
    if(len(poslist)>0 and len(neglist)>0):
        word = word.lower()
        if word in poslist:
            return 'POSITIVE'
        elif word in neglist:
            return 'NEGATIVE'
        else: return word

In [16]:
targets = ['SMRT','smrt','smrt_singapore','SMRT_SINGAPORE','train','mrt','TARGET','people','LRT','lrt']
verbs = ['VBG','VB','VBD','VBN','VBP','VBZ']
adjnon = ['JJ','JJR','JJS','NN','NNS','NNP','NNPS']
objects = ['obj','pobj','dobj','iobj','nmod']
adverbs = ['RB','RBR','RBS']
subjs = ['nsubj','dep']

#helper functions
def getDependentGloss(dep,word, depToSearch):
    return dep.get('dependentGloss') if dep.get('governorGloss') == word and dep.get('dep') in depToSearch else ''

def getDependentGloss_WithTargets(dep,word):
    return dep.get('dependentGloss') if dep.get('governorGloss') == word and dep.get('dependentGloss') in targets else ''

def isTransitive(verb,depdicts):
    for dep in depdicts:
        if dep.get('governorGloss')  == verb and dep.get('dep') in objects:
            return True        
    return False

class typeDependency(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, frame):
        dependecyfeatures = []
        for tweet in frame:
            tweet = tweet.encode('utf-8')
            output = nlp.annotate(tweet, properties={
                    'annotators': 'parse,relation',
                    'outputFormat': 'json'
                })
            features = []
            for i in range(len(output['sentences'])):
                posdicts = output['sentences'][i]['tokens']
                depdicts = output['sentences'][i]['basic-dependencies']
                count = 0
                for pos in posdicts:
                    verb = pos.get('word') if pos.get('pos') in verbs else '' # Get Verbs
                    if verb != '':
                        if(isTransitive(verb, depdicts)):
                            for dep in depdicts:                  
                                #Rule 1
                                dependent = getDependentGloss(dep,verb,objects)
                                if dependent != '' and dependent in targets:
                                    features.append(posorneg(verb)+'_arg2')
                                #Rule 2
                                dependent = getDependentGloss(dep,verb,subjs)
                                if (dependent != '' and dependent in targets):
                                    features.append(posorneg(verb)+'_arg1')
                        else:
                            #Rule 3
                            for dep in depdicts:
                                dependent = getDependentGloss(dep,verb,subjs)
                                if dependent != '' and dependent in targets:
                                    features.append(posorneg(verb)+'_it_arg1')
                    #Rule 4            
                    adj_noun = pos.get('word') if pos.get('pos') in adjnon else '' # Get Adjectives and Nouns
                    if adj_noun != '':
                        for dep in depdicts:
                            dependent = getDependentGloss_WithTargets(dep,adj_noun)
                            if dependent != '':features.append(posorneg(adj_noun)+'_arg1')
                    #Rule 7
                    adv = pos.get('word') if pos.get('pos') in adverbs else ''
                    if adv != '':    
                        for dep in depdicts:
                            verb = dep.get('governorGloss') if dep.get('dependentGloss') == adv else ''# Get the verb it modifies
                            if verb != '':
                                for dep1 in depdicts:#Loop again and check the target
                                    dependent = getDependentGloss_WithTargets(dep1,verb)
                                    if dependent != '':
                                        # Rule 8 - negation 
                                        if dep.get('dep') == 'neg':
                                            features.append('arg1_v_neg_'+posorneg(verb))
                                        else:
                                            features.append('arg1_v_'+posorneg(adv))
                #Rule 5
                for dep in depdicts:
                    dependent = dep.get('dependentGloss') if i>0 and dep.get('governorGloss') =='ROOT' and len(features)>0 else ''
                    if dependent != '' : 
                        for pos in posdicts:
                            if pos.get('word') == dependent and pos.get('pos') in adjnon:
                                features.append(posorneg(dependent)+'_arg')
                    # If in the second sentence,for the root, theres is already a target dependent feature in the list 
                    #  suggesting that that target appears in the previous snentence  and it is an adjective or noun


            dependecyfeatures.append(' '.join(features))

        return dependecyfeatures

# Pipeline

Bring all the work together

In [17]:
# Temp for testing the pipeline on jsut 10 records
Xtemp = X[0:10]
Ytemp = y[0:10]

### Dataset partition

In [18]:
perc = 0.6
training_set = data.sample(frac = perc, random_state=0)
testing_set = data.loc[~data.index.isin(training_set.index)]
ytrain = training_set.Sentiment
Xtrain = training_set.Tweet
print(len(Xtrain))

2374


In [19]:
Xtest = testing_set.Tweet
ytest = testing_set.Sentiment
print(len(Xtest))

1583


In [23]:
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

pipeline = Pipeline([
    ('regexProcess', regExProcesses()),
    ('features', FeatureUnion([
                    ('bow', CountVectorizer(min_df=4,max_df = 500,stop_words='english',lowercase=True,ngram_range=(1,4))),
                    ('lex',lexiconSent()),
                    ('punc',punctuations()),
                    ('emot',emoticons()),
                    ('pos', Pipeline([
                                ('pos',posTagTweets()),
                                ('pos_vect', CountVectorizer())
                            ])),
                    ('subj',Pipeline([
                                ('subj',subjectivityLexicon()),
                                ('subj_vect', CountVectorizer())
                            ])),
                    ('targ',Pipeline([
                                ('targ',enhancedTargetFeats()),
                                ('targ_vect', CountVectorizer())
                            ])),
                    ('typd',Pipeline([
                                ('typd',typeDependency()),
                                ('typd_vect', CountVectorizer())
                            ]))
                ])
    ),
    ('svm', LinearSVC())    
    ])

# clf = pipeline.fit(Xtemp,Ytemp)

pipeline.fit(Xtrain,ytrain)

Pipeline(steps=[('regexProcess', regExProcesses()), ('features', FeatureUnion(n_jobs=1,
       transformer_list=[('bow', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=500, max_featur...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [None]:
temp_train = pipeline.transform(Xtemp)
temp_train.shape

In [None]:
test_train = pipeline.transform(temp)
test_train.shape

In [24]:
pipeline.score(Xtrain, ytrain)

0.98399326032013479

In [25]:
predictions = pipeline.predict(Xtest)
accuracy_score(predictions,ytest)

0.83891345546430829

### Baseline accuracy of the datset

In [112]:
pipeline = Pipeline([
    ('regexProcess', regExProcesses()),
    ('features', FeatureUnion([
                    ('bow', CountVectorizer(min_df=4,max_df = 500,stop_words='english',lowercase=True,ngram_range=(1,4))),
                    ('lex',lexiconSent()),
                    ('punc',punctuations()),
                    ('emot',emoticons())
                ])
    ),
    ('svm', LinearSVC())    
    ])

# clf = pipeline.fit(Xtemp,Ytemp)

pipeline.fit(Xtrain,ytrain)
print(pipeline.score(Xtrain,ytrain))
predictions = pipeline.predict(Xtest)
print(accuracy_score(predictions, ytest)) 

0.960404380792
0.834491471889


Using only the BOW features, the accuracy on the test set was 81% and with the other features (as seen in the pipeline),
the accuracy increased to 83.4%

# Model Evaluation

In [None]:
def evaluateModel(X,y,clf):
    
    predictions_training = clf.predict(X)
#     Calculate the accuracy
    accuracy_training = accuracy_score(y, predictions_training)
#     cv_scores = cross_validation.cross_val_score(clf, X, y, cv=10)
    #Print the accuracy, cross validation scores and the crosstab of the predictions 
    print("Accuracy on the training data : ", accuracy_training)
#     print("Cross Validation Accuracy scores - ",cv_scores)
#     print("Cross Validation Accuracy - Training set: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
    print("Training data Crosstab: \n", pd.crosstab(y, predictions_training))

    # ROC Curve
    y_score = clf.decision_function(X)
    target_testting_dummies = pd.get_dummies(y)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(0,len(np.unique(y))):
        fpr[i], tpr[i], _ = roc_curve(target_testting_dummies[target_testting_dummies.columns[i]], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    plt.figure()
    plt.plot(fpr[2], tpr[2], label='Positive ROC curve (area = %0.2f)' % roc_auc[2])
    plt.plot(fpr[0], tpr[0], label='Negative ROC curve (area = %0.2f)' % roc_auc[0])
    plt.plot(fpr[1], tpr[1], label='Neutral ROC curve (area = %0.2f)' % roc_auc[1])
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()


In [None]:
evaluateModel(Xtemp,Ytemp,pipeline)

# Save to pickle and load it back to test on different datasets

In [26]:
joblib.dump(pipeline,'TwitterSentiment_SMRT_model_All_60.pkl')

['TwitterSentiment_SMRT_model_All_60.pkl',
 'TwitterSentiment_SMRT_model_All_60.pkl_01.npy',
 'TwitterSentiment_SMRT_model_All_60.pkl_02.npy',
 'TwitterSentiment_SMRT_model_All_60.pkl_03.npy']

In [None]:
from sklearn.externals import joblib
pipe = joblib.load('TwitterSentiment_SMRT_model_All_60.pkl')

In [None]:
# pipe.predict(['If you are in Vancouver this weekend, check out @staticstars on Sat. at 20:00 @ The Commo in Vancouver,'])

pipe.predict(Xtemp)

## The Sem Eval dataset

In [27]:
import pandas as pd
df = pd.read_excel('/Users/gautamborgohain/Google Drive/CI/twitter_download20160410.xlsx')
cleaned = df[df.Tweet != 'Not Available']
cleaned.Sentiment[cleaned.Sentiment == 'neutral'] = 0
cleaned.Sentiment[cleaned.Sentiment == 'positive'] = 1
cleaned.Sentiment[cleaned.Sentiment == 'negative'] = -1
cleaned.Sentiment.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


 0    3548
 1    2970
-1    1331
Name: Sentiment, dtype: int64

In [28]:
from sklearn.externals import joblib
pipe = joblib.load('TwitterSentiment_SMRT_model_All_60.pkl')

In [29]:
X = cleaned.Tweet
y = cleaned.Sentiment
len(X)

7849

In [30]:
SE_predicted = pipe.predict(X)

In [94]:
y_native = [int(pred) for pred in y]
type(np.unique(y_native)[0])

numpy.int64

In [85]:
type(np.unique(SE_predicted)[0])

numpy.int64

In [84]:
accuracy_score(SE_predicted,y_native)

0.4181424385272009

Very low accuracy because 
- The training of the model was done on a different dataset and the testing was done on a dataset that was of entirely a differnt domain.
- The data set that the model was trained on was labelled in a target dependent manner
- The model takes into account a static number of targets which is used for generating the target dependent features, which would not be available in this dataset that has nothing to do with SMRT

The problem of the dataset being not labelled properly can also be seen here

In [90]:
temp = ['I do not like SMRT', 'SMRT is just not bad']
pipeline.predict(temp)

array([-1, -1])

## Pipeline to train a model using the SEm Eval data set, without any target dependent features

In [107]:
perc = 0.6
training_set = cleaned.sample(frac = perc, random_state=0)
testing_set = cleaned.loc[~cleaned.index.isin(training_set.index)]
ytrain = training_set.Sentiment
ytrain = [int(label) for label in ytrain]
Xtrain = training_set.Tweet
ytest = testing_set.Sentiment
ytest = [int(label) for label in ytest]
Xtest = testing_set.Tweet
print(len(Xtrain))
print(len(Xtest))

4709
3140


In [100]:
pipeline = Pipeline([
    ('regexProcess', regExProcesses()),
    ('features', FeatureUnion([
                    ('bow', CountVectorizer(min_df=4,max_df = 500,stop_words='english',lowercase=True,ngram_range=(1,4))),
                    ('lex',lexiconSent()),
                    ('punc',punctuations()),
                    ('emot',emoticons()),
                    ('pos', Pipeline([
                                ('pos',posTagTweets()),
                                ('pos_vect', CountVectorizer())
                            ])),
                    ('subj',Pipeline([
                                ('subj',subjectivityLexicon()),
                                ('subj_vect', CountVectorizer())
                            ])),
#                     ('targ',Pipeline([
#                                 ('targ',enhancedTargetFeats()),
#                                 ('targ_vect', CountVectorizer())
#                             ])),
#                     ('typd',Pipeline([
#                                 ('typd',typeDependency()),
#                                 ('typd_vect', CountVectorizer())
#                             ]))
                ])
    ),
    ('svm', LinearSVC())    
    ])

# clf = pipeline.fit(Xtemp,Ytemp)

pipeline.fit(Xtrain,ytrain)

Pipeline(steps=[('regexProcess', regExProcesses()), ('features', FeatureUnion(n_jobs=1,
       transformer_list=[('bow', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=500, max_featur...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [101]:
print(pipeline.score(Xtrain,ytrain))
predictions  = pipeline.predict(Xtest)
print(accuracy_score(predictions, ytest)) 

0.95986409004
0.644585987261


In [106]:
## Applyin gthis on our SMRT dataset
# we got 83% accuracy on this using the model that we trained using the target dependent features and the data the we labeled

predictions  = pipeline.predict(Xtest)
print(accuracy_score(predictions, ytest)) 

0.400505369551


In [108]:
# Tryin another one with just the BOW features to get the baseline accuracy and see if there is incerease in accuracy

pipeline = Pipeline([
    ('regexProcess', regExProcesses()),
    ('features', FeatureUnion([
                    ('bow', CountVectorizer(min_df=4,max_df = 500,stop_words='english',lowercase=True,ngram_range=(1,4))),
#                     ('lex',lexiconSent()),
#                     ('punc',punctuations()),
#                     ('emot',emoticons()),
#                     ('pos', Pipeline([
#                                 ('pos',posTagTweets()),
#                                 ('pos_vect', CountVectorizer())
#                             ])),
#                     ('subj',Pipeline([
#                                 ('subj',subjectivityLexicon()),
#                                 ('subj_vect', CountVectorizer())
#                             ])),
#                     ('targ',Pipeline([
#                                 ('targ',enhancedTargetFeats()),
#                                 ('targ_vect', CountVectorizer())
#                             ])),
#                     ('typd',Pipeline([
#                                 ('typd',typeDependency()),
#                                 ('typd_vect', CountVectorizer())
#                             ]))
                ])
    ),
    ('svm', LinearSVC())    
    ])

# clf = pipeline.fit(Xtemp,Ytemp)

pipeline.fit(Xtrain,ytrain)
print(pipeline.score(Xtrain,ytrain))
predictions  = pipeline.predict(Xtest)
print(accuracy_score(predictions, ytest)) 

0.939052877469
0.617515923567


So the baseline accuracy of this dataset is 61% !

# With the new dataset

In [31]:
data = pd.read_excel('/Users/gautamborgohain/Desktop/CI_Data_Labelled2.xlsx')
data.head()

Unnamed: 0,Created At,DayWeek,DayYear,Hour,ID,In Reply To,Language,Place,ReTweet Count,Sentiment,Tweet,Unnamed: 2,User Handle,isSwarm,POSITVE_LEX,NEGATIVE_LEX,PUNC_EXCL,PUNC_QUES,EMOT_HAPPY,EMOT_SAD,EMOT_WINK,ANGRY_EMO,LOVE_EMO,HAPPY_EMO,NEUTRAL_EMO,SAD_EMO
0,2016-03-23 07:58:09,2,83,7,712428180989042048,,en,,11,0,RT NOT_TARGET TODAY'S TOP STORY: HASH_MRT trai...,,vxnnnn,False,0,2,0,0,0,0,0,0,0,0,0,0
1,2016-03-23 07:55:48,2,83,7,712427591685136000,,en,,0,0,"On Facebook, condolences pour forth for 2 SMRT...",,ThinaeshS,False,0,1,0,0,0,0,0,0,0,0,0,0
2,2016-03-23 07:53:36,2,83,7,712427035713404032,,en,,289,0,RT NOT_TARGET SMRT's fatal accident near HASH_...,,LforLana,False,0,1,0,0,0,0,0,0,0,0,0,0
3,2016-03-23 07:46:04,2,83,7,712425139594328064,,en,,181,0,RT NOT_TARGET JUST IN: SMRT releases 2 photos ...,,vlxhh,False,0,1,0,0,0,0,0,0,0,0,0,0
4,2016-03-23 07:45:34,2,83,7,712425016323778048,,en,,10,0,RT NOT_TARGET HAPPENING TODAY: HASH_LKY’s 1st ...,,adorenew,False,0,2,0,0,0,0,0,0,0,0,0,0


In [32]:
data.Sentiment.value_counts()

 0    1841
-1     733
 1     275
Name: Sentiment, dtype: int64

In [21]:
data = pd.read_excel('/Users/gautamborgohain/Desktop/CI_Data_Labelled2.xlsx')
data.index = np.arange(len(data))

perc = 0.6
training_set = data.sample(frac = perc, random_state=0)
testing_set = data.loc[~data.index.isin(training_set.index)]
ytrain = training_set.Sentiment
Xtrain = training_set.Tweet
print(len(Xtrain))
Xtest = testing_set.Tweet
ytest = testing_set.Sentiment
print(len(Xtest))

1709
1140


In [23]:
#The baseline score using only the BOW
pipeline = Pipeline([
    ('regexProcess', regExProcesses()),
    ('features', FeatureUnion([
                    ('bow', CountVectorizer(min_df=4,max_df = 500,stop_words='english',lowercase=True,ngram_range=(1,4))),
#                     ('lex',lexiconSent()),
#                     ('punc',punctuations()),
#                     ('emot',emoticons()),
#                     ('pos', Pipeline([
#                                 ('pos',posTagTweets()),
#                                 ('pos_vect', CountVectorizer())
#                             ])),
#                     ('subj',Pipeline([
#                                 ('subj',subjectivityLexicon()),
#                                 ('subj_vect', CountVectorizer())
#                             ])),
#                     ('targ',Pipeline([
#                                 ('targ',enhancedTargetFeats()),
#                                 ('targ_vect', CountVectorizer())
#                             ])),
#                     ('typd',Pipeline([
#                                 ('typd',typeDependency()),
#                                 ('typd_vect', CountVectorizer())
#                             ]))
                ])
    ),
    ('svm', LinearSVC())    
    ])

# clf = pipeline.fit(Xtemp,Ytemp)

pipeline.fit(Xtrain,ytrain)

print(pipeline.score(Xtrain,ytrain))
predictions  = pipeline.predict(Xtest)
print(accuracy_score(predictions, ytest)) 

0.953188999415
0.744736842105


In [22]:
#The baseline score using no target dependent features
pipeline = Pipeline([
    ('regexProcess', regExProcesses()),
    ('features', FeatureUnion([
                    ('bow', CountVectorizer(min_df=4,max_df = 500,stop_words='english',lowercase=True,ngram_range=(1,4))),
                    ('lex',lexiconSent()),
                    ('punc',punctuations()),
                    ('emot',emoticons()),
#                     ('pos', Pipeline([
#                                 ('pos',posTagTweets()),
#                                 ('pos_vect', CountVectorizer())
#                             ])),
#                     ('subj',Pipeline([
#                                 ('subj',subjectivityLexicon()),
#                                 ('subj_vect', CountVectorizer())
#                             ])),
#                     ('targ',Pipeline([
#                                 ('targ',enhancedTargetFeats()),
#                                 ('targ_vect', CountVectorizer())
#                             ])),
#                     ('typd',Pipeline([
#                                 ('typd',typeDependency()),
#                                 ('typd_vect', CountVectorizer())
#                             ]))
                ])
    ),
    ('svm', LinearSVC())    
    ])

# clf = pipeline.fit(Xtemp,Ytemp)

pipeline.fit(Xtrain,ytrain)

print(pipeline.score(Xtrain,ytrain))
predictions  = pipeline.predict(Xtest)
print(accuracy_score(predictions, ytest)) 

0.959040374488
0.765789473684


The performance does increase as we add the different chuncks of features into the model.However we need to do a T-test to see if it the idfference is statistically significant.

In [24]:
pipeline = Pipeline([
    ('regexProcess', regExProcesses()),
    ('features', FeatureUnion([
                    ('bow', CountVectorizer(min_df=4,max_df = 500,stop_words='english',lowercase=True,ngram_range=(1,4))),
                    ('lex',lexiconSent()),
                    ('punc',punctuations()),
                    ('emot',emoticons()),
                    ('pos', Pipeline([
                                ('pos',posTagTweets()),
                                ('pos_vect', CountVectorizer())
                            ])),
                    ('subj',Pipeline([
                                ('subj',subjectivityLexicon()),
                                ('subj_vect', CountVectorizer())
                            ])),
                    ('targ',Pipeline([
                                ('targ',enhancedTargetFeats()),
                                ('targ_vect', CountVectorizer())
                            ])),
                    ('typd',Pipeline([
                                ('typd',typeDependency()),
                                ('typd_vect', CountVectorizer())
                            ]))
                ])
    ),
    ('svm', LinearSVC())    
    ])

# clf = pipeline.fit(Xtemp,Ytemp)

pipeline.fit(Xtrain,ytrain)

print(pipeline.score(Xtrain,ytrain))
predictions  = pipeline.predict(Xtest)
print(accuracy_score(predictions, ytest)) 

0.99122293739
0.771929824561


In [25]:
joblib.dump(pipeline,'TwitterSentiment_NEW_SMRT_model_All_60.pkl')

['TwitterSentiment_NEW_SMRT_model_All_60.pkl',
 'TwitterSentiment_NEW_SMRT_model_All_60.pkl_01.npy',
 'TwitterSentiment_NEW_SMRT_model_All_60.pkl_02.npy',
 'TwitterSentiment_NEW_SMRT_model_All_60.pkl_03.npy']

### Now apply this model to the SEM Eval dataset. fingers crossed


In [29]:
X = cleaned.Tweet
y = cleaned.Sentiment

predictions  = pipeline.predict(X)
print(accuracy_score(predictions, y)) 

ValueError: Can't handle mix of multiclass and unknown

In [30]:
y_native = [int(pred) for pred in y]
print(accuracy_score(predictions, y_native)) 

0.421327557651


Again very low. But this is expected because we are testing on a different dataset of different domain altogether