#Model Building Using Text Analysis

In [2]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

from sklearn.feature_extraction.text import CountVectorizer

In [3]:
critics = pd.read_csv('comments1_with_sentiments.csv')
#let's drop rows with missing quotes
critics['positive'] = critics['Answer1'] >= 0
critics.head()

Unnamed: 0,HITID,index,body,company,date,score,Worker1,Answer1,Avg,Date,positive
0,3L21G7IH4773YI2L6RV6T5G4RTC1YW,1199,"I shall do that! I was thinking ""I've had this...",Fitbit,1449270750,3,A2R0YYUAWNT7UD,2,2,2015-12-05 06:58:53 UTC,True
1,3I7KR83SNAOQ3IGZ6P98Z3JCRQR9KI,1178,Love it. Has helped keep me on track for weigh...,Fitbit,1449265917,2,A1NM7ZPZ3NH412,2,2,2015-12-05 06:59:05 UTC,True
2,3P7RGTLO6EO481Q4YVN8VYUWX89KA8,1174,I love it because I do not have to stop and pu...,Fitbit,1449251995,2,A3ITZNJQUTIZ4C,2,2,2015-12-05 07:02:17 UTC,True
3,373L46LKP7HF9UT8S10LOXXFNV3KJO,409,"I never wore a watch, now I wear an Apple Watc...",Apple,1449290608,1,A2R0YYUAWNT7UD,2,2,2015-12-05 06:59:32 UTC,True
4,3G9UA71JVV5REFMO97BCKSSTHB2J7G,1465,"The fallout soundtracks are great, but IMO the...",Spotify,1449249457,1,A1FP3SH704X01V,2,2,2015-12-05 06:59:41 UTC,True


In [15]:
grp = critics.groupby('company')
avg = grp.Answer1.sum()

In [16]:
avg

company
Airbnb        -3
Amazon       -12
Apple        -39
Asana          0
Buzzfeed       1
Coursera      -1
DoorDash       0
Dropbox        2
Fitbit        -8
Google       -10
Hubspot        0
Instagram     18
Jawbone       -1
Kayak          0
Laserfiche    -1
LinkedIn      -1
Lyft         -25
Microsoft     -2
Pinterest      1
Quora          0
Riot_Games    -3
Snapchat      11
Spotify        1
Tinder       -20
Twitter        1
Uber           5
Vivint         0
YikYak         0
Name: Answer1, dtype: int64

#Make X and Y

In [110]:
def make_xy(critics, vectorizer=None):
    #Your code here    
    if vectorizer is None:
        vectorizer = CountVectorizer(encoding = 'utf-8', strip_accents = 'ascii', stop_words='english')
    X = vectorizer.fit_transform(critics.body)
    X = X.tocsc()  # some versions of sklearn return COO format
    y = (critics.Answer1).values.astype(np.int)
    return X, y
def make_xyBinary(critics, vectorizer=None):
    #Your code here    
    if vectorizer is None:
        vectorizer = CountVectorizer(encoding = 'utf-8', strip_accents = 'ascii', stop_words='english')
    X = vectorizer.fit_transform(critics.body)
    X = X.tocsc()  # some versions of sklearn return COO format
    y = (critics.positive).values.astype(np.int)
    return X, y
X, y = make_xy(critics)
Xbin, ybin = make_xyBinary(critics)

In [111]:
make_xy(critics)

(<1846x5952 sparse matrix of type '<type 'numpy.int64'>'
 	with 22897 stored elements in Compressed Sparse Column format>,
 array([ 2,  2,  2, ..., -1, -1, -1]))

In [122]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
clf = MultinomialNB().fit(xtrain, ytrain)
print "MN Accuracy: %0.2f%%" % (100 * clf.score(xtest, ytest))

another = RandomForestClassifier(n_estimators=100, min_samples_split=2, n_jobs=-1).fit(xtrain,ytrain)
print "RF Accuracy: %.02f%%" % (100*another.score(xtest,ytest))



MN Accuracy: 54.98%
RF Accuracy: 56.28%


In [123]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression(penalty="l1").fit(xtrain, ytrain)
print "Logistic Accuracy: %.02f%%" % (100*logistic.score(xtest,ytest))

Logistic Accuracy: 58.23%


In [124]:
training_accuracy = clf.score(xtrain, ytrain)
test_accuracy = clf.score(xtest, ytest)

rf_train_accuracy = another.score(xtrain, ytrain)
rf_test_accuracy = another.score(xtest, ytest)

lr_train_accuracy = logistic.score(xtrain,ytrain)
lr_test_accuracy = logistic.score(xtest,ytest)

print "Accuracy on training data: %0.2f" % (training_accuracy)
print "Accuracy on test data:     %0.2f" % (test_accuracy)

print "Accuracy on training data: %0.2f" % (rf_train_accuracy)
print "Accuracy on test %0.2f" % (rf_test_accuracy)

print "Accuracy on training data: %0.2f" % (lr_train_accuracy)
print "Accuracy on test %0.2f" % (lr_test_accuracy)

Accuracy on training data: 0.88
Accuracy on test data:     0.55
Accuracy on training data: 1.00
Accuracy on test 0.56
Accuracy on training data: 0.81
Accuracy on test 0.58


#Validation

In [121]:
from sklearn.cross_validation import KFold
def cv_score(clf, X, y, scorefunc):
    result = 0.
    nfold = 5
    for train, test in KFold(y.size, nfold): # split data into train/test groups, 5 times
        clf.fit(X[train], y[train]) # fit
        result += scorefunc(clf, X[test], y[test]) # evaluate score function on held-out data
    return result / nfold # average

In [106]:
def log_likelihood(clf, x, y):
    prob = clf.predict_log_proba(x)
    rotten = y < 0
    fresh = ~rotten
    return prob[rotten, 0].sum() + prob[fresh, 1].sum()

In [107]:
from sklearn.cross_validation import train_test_split
itrain, itest = train_test_split(xrange(critics.shape[0]), train_size=0.7)
mask=np.ones(critics.shape[0], dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)

#Naive Bayes

In [136]:
alphas = [0, .1, 1, 5, 10, 50]
min_dfs = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

#Find the best value for alpha and min_df, and the best classifier
best_alpha = None
best_min_df = None
maxscore=-np.inf
for alpha in alphas:
    for min_df in min_dfs:         
        vectorizer = CountVectorizer(min_df = min_df)       
        Xthis, ythis = make_xy(critics, vectorizer)
        Xtrainthis=Xthis[mask]
        ytrainthis=ythis[mask]
        #your code here
        clf = MultinomialNB(alpha=alpha)
        cvscore = cv_score(clf, Xtrainthis, ytrainthis, log_likelihood)

        if cvscore > maxscore:
            maxscore = cvscore
            best_alpha, best_min_df = alpha, min_df

In [137]:
vectorizer = CountVectorizer(min_df=best_min_df)
X, y = make_xy(critics, vectorizer)
xtrain=X[mask]
ytrain=y[mask]
xtest=X[~mask]
ytest=y[~mask]




clf = MultinomialNB(alpha=best_alpha).fit(xtrain, ytrain)

# Your code here. Print the accuracy on the test and training dataset
training_accuracy = clf.score(xtrain, ytrain)
test_accuracy = clf.score(xtest, ytest)



print "Accuracy on training data: %0.2f" % (training_accuracy)
print "Accuracy on test data:     %0.2f" % (test_accuracy)




Accuracy on training data: 0.62
Accuracy on test data:     0.59


#Random Forest

In [131]:
n_ests = [10, 20, 50, 100, 150, 200]
min_dfs = [0,.1,.01,.001,.0001,1]
max_depths = [5,10,15,20,25,40,50,70,80,90]

#Find the best value for alpha and min_df, and the best classifier
best_n_est = 10
maxscore=-np.inf
for max_depth in max_depths:
    for n_est in n_ests:  
        for min_df in min_dfs:
            vectorizer = CountVectorizer(min_df = min_df)       
            Xthis, ythis = make_xy(critics, vectorizer)
            Xtrainthis=Xthis[mask]
            ytrainthis=ythis[mask]
            #your code here
            rf = RandomForestClassifier(n_estimators=n_est,max_depth = max_depth, min_samples_split=2, n_jobs=-1)
            cvscore = cv_score(rf, Xtrainthis, ytrainthis, log_likelihood)

            if cvscore > maxscore:
                maxscore = cvscore
                best_n_est, best_min_df, best_max_depth = n_est, min_df, max_depth

In [221]:
n_est = range(best_n_est-5, best_n_est+5)
min_dfs = range(int(best_min_df), int(best_min_df+5.))
max_depths = range(max_depth-5, max_depth+5)

best_n_est = n_est[0]
maxscore=-np.inf
for max_depth in max_depths:
    for n_est in n_ests:  
        for min_df in min_dfs:
            vectorizer = CountVectorizer(min_df = min_df)       
            Xthis, ythis = make_xy(critics, vectorizer)
            Xtrainthis=Xthis[mask]
            ytrainthis=ythis[mask]
            #your code here
            rf = RandomForestClassifier(n_estimators=n_est,max_depth = max_depth, min_samples_split=2, n_jobs=-1)
            cvscore = cv_score(rf, Xtrainthis, ytrainthis, log_likelihood)

            if cvscore > maxscore:
                maxscore = cvscore
                best_n_est, best_min_df, best_max_depth = n_est, min_df, max_depth
                
print best_n_est, best_min_df, best_max_depth

In [135]:
vectorizer = CountVectorizer(min_df=best_min_df)
X, y = make_xy(critics, vectorizer)
xtrain=X[mask]
ytrain=y[mask]
xtest=X[~mask]
ytest=y[~mask]

best_rf = RandomForestClassifier(n_estimators=best_n_est, min_samples_split=2, n_jobs=-1).fit(xtrain,ytrain)
rf_train_accuracy = best_rf.score(xtrain, ytrain)
rf_test_accuracy = best_rf.score(xtest, ytest)

print "Accuracy on training data: %0.2f" % (rf_train_accuracy)
print "Accuracy on test %0.2f" % (rf_test_accuracy)

Accuracy on training data: 0.97
Accuracy on test 0.58


#Random NLTK Sentiment

In [209]:
reddit = []
for row in range(len(critics.body)):
    words = critics.iloc[row].body.split()
    temp = (words, critics.iloc[row].Answer1)
    reddit.append(temp)

    

In [210]:
import nltk
def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    return all_words
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features


In [211]:
word_features = get_word_features(get_words_in_tweets(reddit))

In [212]:

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features


In [213]:
training_set = nltk.classify.apply_features(extract_features, reddit)

In [214]:
classifier = nltk.NaiveBayesClassifier.train(training_set)


In [215]:
from nltk.probability import ELEProbDist, FreqDist
from nltk import NaiveBayesClassifier
from collections import defaultdict
def train(labeled_featuresets, estimator=ELEProbDist):
    label_probdist = estimator(label_freqdist)
    feature_probdist = {}
    return NaiveBayesClassifier(label_probdist, feature_probdist)


In [220]:
tweet = 'customer help'
print classifier.classify(extract_features(tweet.split()))


0


In [206]:
def gen_bow(text):
    words = text.split()
    bow = {}
    for word in words:
        bow[word.lower()] = True
    return bow


def get_labeled_features(samples):
    word_freqs = {}
    for tokens, label in reddit:
        for token in tokens:
            if token not in word_freqs:
                word_freqs[token] = {'pos': 0, 'neg': 0}
            word_freqs[token][label] += 1
    return word_freqs


def get_label_probdist(labeled_features):
    label_fd = FreqDist()
    for item,counts in labeled_features.items():
        for label in ['neg','pos']:
            if counts[label] > 0:
                label_fd.inc(label)
    label_probdist = ELEProbDist(label_fd)
    return label_probdist


def get_feature_probdist(labeled_features):
    feature_freqdist = defaultdict(FreqDist)
    feature_values = defaultdict(set)
    num_samples = len(train_samples) / 2
    for token, counts in labeled_features.items():
        for label in ['neg','pos']:
            feature_freqdist[label, token].inc(True, count=counts[label])
            feature_freqdist[label, token].inc(None, num_samples - counts[label])
            feature_values[token].add(None)
            feature_values[token].add(True)
    for item in feature_freqdist.items():
        print item[0],item[1]
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = ELEProbDist(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label,fname] = probdist
    return feature_probdist

In [207]:
labeled_features = get_labeled_features(reddit)

label_probdist = get_label_probdist(labeled_features)

feature_probdist = get_feature_probdist(labeled_features)

classifier = NaiveBayesClassifier(label_probdist, feature_probdist)

for sample in test_samples:
    print "%s | %s" % (sample, classifier.classify(gen_bow(sample)))

classifier.show_most_informative_features()

KeyError: 2