**Training datasets**

**'9.Sentiment Training 1.csv'** was downloaded from https://www.kaggle.com/kazanova/sentiment140. It contains the following 6 fields:
- target: the polarity of the tweet (0 = negative, 4 = positive)
- ids: the id of the tweet (2087)
- date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
- flag: The query (lyx). If there is no query, then this value is NO_QUERY.
- user: the user that tweeted (robotickilldozr)
- text: the text of the tweet (Lyx is cool)

This file is over 200MB

**'9.Sentiment Training 2.csv'** is another tweet sentiment dataset, obtained from https://www.kaggle.com/crowdflower/twitter-airline-sentiment.
This file is only 4MB.

# Testing the unsupervised NLTK VADER sentiment score:

http://www.nltk.org/howto/sentiment.html

***Using the large training file***

In [3]:
def vader_sentiment_score():
    """Score NLTK's VADER analyzer against the Sentiment140 labels.

    Reads '../data/9.Sentiment Training 1.csv', keeps a reproducible 10%
    sample, maps the positive label 4 to 1 and holds out a test split
    (``train_test_split`` defaults, ``random_state=0``).  Each held-out tweet
    is labelled 0 when VADER's compound score is negative and 1 otherwise, so
    a compound score of exactly 0 counts as positive.

    Returns:
        The ROC AUC of those hard 0/1 predictions against the true labels.
    """
    import pandas as pd
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score

    # training data
    sentiment_data_df = pd.read_csv(
        '../data/9.Sentiment Training 1.csv',
        encoding='latin-1',
        names=['target', 'ids', 'date', 'flag', 'user', 'text'],
    )
    sentiment_data_df = sentiment_data_df.sample(frac=0.1, random_state=0)
    sentiment_data_df['target'] = sentiment_data_df['target'].replace(4, 1)

    # VADER is unsupervised, so only the held-out part of the split is used.
    # The split itself is kept so the evaluation set stays the same.
    _, X_test, _, y_test = train_test_split(
        sentiment_data_df['text'], sentiment_data_df['target'], random_state=0
    )

    # vader model
    sid = SentimentIntensityAnalyzer()

    # evaluate vader model: threshold the compound score into a 0/1 label
    vader_predictions = [
        0 if sid.polarity_scores(tweet)['compound'] < 0 else 1
        for tweet in X_test
    ]
    return roc_auc_score(y_test, vader_predictions)
vader_sentiment_score()

0.662510525914678

***Using the small training file***

In [4]:
# import data from file
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score

df = pd.read_csv('../data/9.Sentiment Training 2.csv', usecols=['text', 'airline_sentiment'])

# get sentiment using nltk vader
sid = SentimentIntensityAnalyzer()


def _vader_label(text):
    """Map VADER's compound score for *text* to a sentiment label string."""
    compound = sid.polarity_scores(text)['compound']
    if compound > 0:
        return "positive"
    if compound < 0:
        return "negative"
    return "neutral"


# Build the column in a single assignment. Writing element by element through
# `df.vader_sentiment[i] = ...` is chained assignment: it raises
# SettingWithCopyWarning and does not update `df` under pandas Copy-on-Write.
df["vader_sentiment"] = df["text"].map(_vader_label)

# Prediction Accuracy Score
accuracy_score(df.airline_sentiment, df.vader_sentiment)

0.5465163934426229

*NLTK VADER does not appear to perform very well on either file.*

# Testing Supervised Sentiment Predict Models

***Using the large training file***

In [3]:
def test_classifiers():
    """Benchmark several scikit-learn text classifiers on Sentiment140 tweets.

    A reproducible 1% sample of the training file is split into train and
    test parts.  For every combination of ``TfidfVectorizer`` ``min_df``
    (1, 2), upper n-gram bound (1, 2) and classifier, a TF-IDF + classifier
    pipeline is fitted and scored on the held-out split, then 2-fold
    cross-validated on the training split.  The elapsed time of both steps
    together is recorded in seconds.

    Side effects:
        Writes the result table to
        '../data/9.Sentiment predict models scores.csv'.

    Returns:
        pandas.DataFrame with one row per combination and the columns
        Classifier, TfidfVectorizer_ngram, TfidfVectorizer_mindf, Ac,
        crossval_train_score_mean, crossval_test_score_mean, Time, Ac_rank
        and Time_rank.
    """
    import itertools
    import time

    import pandas as pd
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import PassiveAggressiveClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import RidgeClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import KFold, cross_validate, train_test_split
    from sklearn.naive_bayes import BernoulliNB, MultinomialNB
    from sklearn.pipeline import make_pipeline
    from sklearn.svm import LinearSVC

    classifiers = [
        MultinomialNB(),
        BernoulliNB(),
        LogisticRegression(),
        LinearSVC(),
        AdaBoostClassifier(),
        RidgeClassifier(),
        PassiveAggressiveClassifier(),
        Perceptron(),
    ]

    # training data
    # The file name carries the " 1" suffix, matching the dataset description
    # and the VADER evaluation that reads the same file.
    sentiment_data_df = pd.read_csv(
        '../data/9.Sentiment Training 1.csv',
        encoding='latin-1',
        names=['target', 'ids', 'date', 'flag', 'user', 'text'],
    )
    sentiment_data_df = sentiment_data_df.sample(frac=0.01, random_state=0)
    sentiment_data_df['target'] = sentiment_data_df['target'].replace(4, 1)
    X_train, X_test, y_train, y_test = train_test_split(
        sentiment_data_df['text'], sentiment_data_df['target'], random_state=0
    )

    # The fold definition does not depend on the loop variables.
    kfold = KFold(n_splits=2, shuffle=False, random_state=None)

    data = []
    # Same iteration order as nested loops: min_df outermost, classifier innermost.
    for mindf, ngram, clf in itertools.product(range(1, 3), range(1, 3), classifiers):
        start = time.perf_counter()

        vect = TfidfVectorizer(min_df=mindf, ngram_range=(1, ngram))
        model = make_pipeline(vect, clf)
        model.fit(X_train, y_train)
        labels = model.predict(X_test)
        ac = accuracy_score(y_test, labels)

        # Train scores are not returned by default in scikit-learn >= 0.21;
        # request them explicitly so results['train_score'] exists.
        results = cross_validate(
            model, X_train, y_train, cv=kfold, return_train_score=True
        )
        crossval_test_score_mean = results['test_score'].mean()
        crossval_train_score_mean = results['train_score'].mean()

        elapsed = time.perf_counter() - start

        data.append([
            clf, ngram, mindf, ac,
            crossval_train_score_mean, crossval_test_score_mean, elapsed,
        ])

    d = pd.DataFrame(
        data,
        columns=[
            'Classifier', 'TfidfVectorizer_ngram', 'TfidfVectorizer_mindf', 'Ac',
            'crossval_train_score_mean', 'crossval_test_score_mean', 'Time',
        ],
    )
    d['Ac_rank'] = d['Ac'].rank(ascending=False)
    # NOTE(review): descending rank makes the slowest run rank 1 - confirm
    # this is intended (Ac_rank uses the same direction, where 1 = best).
    d['Time_rank'] = d['Time'].rank(ascending=False)
    d.to_csv('../data/9.Sentiment predict models scores.csv')
    return d
test_classifiers()



Unnamed: 0,Classifier,TfidfVectorizer_ngram,TfidfVectorizer_mindf,Ac,crossval_train_score_mean,crossval_test_score_mean,Time,Ac_rank,Time_rank
0,"MultinomialNB(alpha=1.0, class_prior=None, fit...",1,1,0.7365,0.929917,0.7235,0.987393,21.0,28.0
1,"BernoulliNB(alpha=1.0, binarize=0.0, class_pri...",1,1,0.74225,0.927,0.727167,1.035883,16.0,22.0
2,"LogisticRegression(C=1.0, class_weight=None, d...",1,1,0.7525,0.878917,0.742,1.026617,7.0,24.0
3,"LinearSVC(C=1.0, class_weight=None, dual=True,...",1,1,0.7465,0.985917,0.73175,1.02866,11.0,23.0
4,"(DecisionTreeClassifier(class_weight=None, cri...",1,1,0.67475,0.698333,0.670417,3.80479,32.0,3.0
5,"RidgeClassifier(alpha=1.0, class_weight=None, ...",1,1,0.74625,0.963333,0.736667,1.173594,12.0,19.0
6,"PassiveAggressiveClassifier(C=1.0, average=Fal...",1,1,0.736,0.992083,0.717333,1.022122,22.0,25.0
7,"Perceptron(alpha=0.0001, class_weight=None, et...",1,1,0.7015,0.967083,0.70025,1.000475,27.0,27.0
8,"MultinomialNB(alpha=1.0, class_prior=None, fit...",2,1,0.75825,0.98875,0.73625,2.41746,1.0,13.0
9,"BernoulliNB(alpha=1.0, binarize=0.0, class_pri...",2,1,0.73275,0.984167,0.715417,2.355972,23.5,14.0
