# Gillette Tweet Modelling 

In [1]:
#Import libraries to start your analysis

import pandas as pd
import numpy as np
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import string 
from nltk.stem import WordNetLemmatizer

In [2]:
#Import the gillette tweets for our analysis, use this encoding to avoid encoding errors
#Save it to a dataframe 

df1 = pd.read_csv("gilettesent4.csv", encoding='ISO-8859-1')

In [3]:
#Call head to see the first six records

df1.head()

Unnamed: 0,Tweets,Label
0,Barbasol Once Showed Gillette How To Make a Co...,Negative
1,BoycottGillette,Negative
2,MeetBarbasol arbasol Once Showed Gillette How ...,Negative
3,BoycottGillette MeetBarbasol destinyisbright C...,Negative
4,MeetBarbasol MeetBarbasol Barbasol Once Showed...,Negative


In [4]:
#Make two lists one with the tweets and one with the sentiment label

Tweet = []
Labels = []

for row in df1["Tweets"]: #Take the first column Tweets to clean it more
    #tokenize words
    words = word_tokenize(row) #Tokenize it or break it down into rows
    #remove punctuations
    clean_words = [word.lower() for word in words if word not in set(string.punctuation)] #Take out any string punctuation
    english_stops = set(stopwords.words('english')) #remove stop words
    characters_to_remove = ["''",'``',"rt","https","’","“","”","\u200b","--","n't","'s","...","//t.c","'re" ,"'m"] #remove other
    #characters that may intefere
    clean_words = [word for word in clean_words if word not in english_stops] #take out the stop words
    clean_words = [word for word in clean_words if word not in set(characters_to_remove)] #take out characters for clean words
    wordnet_lemmatizer = WordNetLemmatizer() #get the lemmas which breaks down the word but still keeps the semantic meaning
    lemma_list = [wordnet_lemmatizer.lemmatize(word) for word in clean_words] #create a list of those lemmas
    Tweet.append(lemma_list) #append them into the new Tweet list

    for row in df1["Label"]:
        Labels.append(row) #Get a separate list for the label 

In [5]:
combined = zip(Tweet, Labels) #Zip both lists togther after the cleaning

In [6]:
def bag_of_words(words): #create a bag of words function 
    return dict([(word, True) for word in words]) #This will create a new dictionary with key value pairs of the tweet and label

#The bag-of-words model is a simplifying representation used in natural language processing and information retrieval (IR). 
#In this model, a text (such as a sentence or a document) is represented as the bag (multiset) of its words, disregarding 
#grammar and even word order but keeping multiplicity. This will help with our model

In [7]:
Final_Data = [] #Create a new list

for r, v in combined:
    bag_of_words(r)
    Final_Data.append((bag_of_words(r),v)) #make the dictionary keys from the tweets and sentiment and put it in new list

In [8]:
#Randomize the data that will be used for the model
import random
random.shuffle(Final_Data)
print(len(Final_Data)) #This is how many records we have in our dataset

265


### INFO on Naive Bayes PRE-MODEL

The Naive Bayes algorithm is an intuitive method that uses the probabilities of each attribute belonging to each class to make a prediction. It is the supervised learning approach you would come up with if you wanted to model a predictive modeling problem probabilistically.

Naive bayes simplifies the calculation of probabilities by assuming that the probability of each attribute belonging to a given class value is independent of all other attributes. This is a strong assumption but results in a fast and effective method.

The probability of a class value given a value of an attribute is called the conditional probability. By multiplying the conditional probabilities together for each attribute for a given class value, we have a probability of a data instance belonging to that class.

To make a prediction we can calculate probabilities of the instance belonging to each class and select the class value with the highest probability.

Naive bases is often described using categorical data because it is easy to describe and calculate using ratios. A more useful version of the algorithm for our purposes supports numeric attributes and assumes the values of each numerical attribute are normally distributed (fall somewhere on a bell curve). Again, this is a strong assumption, but still gives robust results.

In [9]:
train_set, test_set = Final_Data[0:188], Final_Data[188:] #As a standard, split the data 30/70 of the 265 for the train and 
#test set. The train set is the portion of data that we are using to train our model, and the test set is where after the 
#the model has been trained, we will test its predictions on the test set and see how accurate it is in predicting what 
#was the sentiment of the test set from the sentiment given in the train set. Essentially, how accurate is our model in
#predicting sentiment of tweets? 

import nltk
import collections
from nltk.metrics.scores import (accuracy, precision, recall, f_measure) 
from nltk import metrics

refsets = collections. defaultdict(set)
testsets = collections.defaultdict(set)

classifier = nltk.NaiveBayesClassifier.train(train_set) #This is our Naive Bayes classifier
 
for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

print("Naive Bayes Performance with Unigrams ")    
print("Accuracy:",nltk.classify.accuracy(classifier, test_set))

Naive Bayes Performance with Unigrams 
Accuracy: 0.7402597402597403


### INFO on Confusion Matrix (which is after this cell)

Compute confusion matrix to evaluate the accuracy of a classification

By definition a confusion matrix  is such that  is equal to the number of observations known to be in group  but predicted to be in group.

Thus in binary classification, the count of true brand damaging negative (how many tweets that were negative the model predicted correctly), false brand damaging negatives - recall (how many tweets that were labeled as negative were positive), true brand positives (how many tweets that were positive were predicted correctly), and false brand positive positives - precision (how many tweets that were labeled as positive were actually negative). 

The F measure (F1 score or F score) is a measure of a test's accuracy and is defined as the weighted harmonic mean of the precision and recall of the test.

In [12]:
print("UnigramNB Results")
print('Brand Positive Precision:', precision(refsets['Positive'], testsets['Positive']))
print('Brand Positive Recall:', recall(refsets['Positive'], testsets['Positive']))
print('Brand Positive F-measure:', f_measure(refsets['Positive'], testsets['Positive']))
print('Brand Damaging Precision:', precision(refsets['Negative'], testsets['Negative']))
print('Brand Damaging Recall:', recall(testsets['Negative'], refsets['Negative']))
print('Brand Damaging F-measure:', f_measure(refsets['Negative'], testsets['Negative']))
print("")

UnigramNB Results
Brand Positive Precision: 0.5853658536585366
Brand Positive Recall: 0.8888888888888888
Brand Positive F-measure: 0.7058823529411764
Brand Damaging Precision: 0.9166666666666666
Brand Damaging Recall: 0.9166666666666666
Brand Damaging F-measure: 0.7674418604651162



In [13]:
#Top features from the model based on the tweets
classifier.show_most_informative_features(n=50)

Most Informative Features
                    away = True           Positi : Negati =      5.2 : 1.0
                    take = True           Positi : Negati =      4.4 : 1.0
       thebestamancanget = True           Positi : Negati =      4.4 : 1.0
                    real = True           Negati : Positi =      4.1 : 1.0
                   itâs = True           Positi : Negati =      3.6 : 1.0
                   point = True           Negati : Positi =      3.0 : 1.0
                  people = True           Positi : Negati =      2.9 : 1.0
                      'i = True           Positi : Negati =      2.8 : 1.0
                     son = True           Positi : Negati =      2.8 : 1.0
                 outrage = True           Positi : Negati =      2.8 : 1.0
                      pa = True           Positi : Negati =      2.8 : 1.0
                    look = True           Positi : Negati =      2.8 : 1.0
                     day = True           Positi : Negati =      2.8 : 1.0

### INFO on Decision Tree Algorithm PRE-MODEL

A decision tree is a flowchart-like tree structure where an internal node represents feature(or attribute), the branch represents a decision rule, and each leaf node represents the outcome. The topmost node in a decision tree is known as the root node. It learns to partition on the basis of the attribute value. It partitions the tree in recursively manner call recursive partitioning. This flowchart-like structure helps you in decision making. It's visualization like a flowchart diagram which easily mimics the human level thinking. That is why decision trees are easy to understand and interpret

The basic idea behind any decision tree algorithm is as follows:

Select the best attribute using Attribute Selection Measures(ASM) to split the records.
Make that attribute a decision node and breaks the dataset into smaller subsets.
Starts tree building by repeating this process recursively for each child until one of the condition will match:
All the tuples belong to the same attribute value.
There are no more remaining attributes.
There are no more instances.

In [14]:
from nltk.classify import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier.train(train_set,  #making the necessary cutoffs to prune the tree
                                             binary=True, 
                                             entropy_cutoff=0.8, 
                                             depth_cutoff=5, 
                                             support_cutoff=30)
refset = collections.defaultdict(set)
testset = collections.defaultdict(set)
 
for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = dt_classifier.classify(feats) #Start the model with the classifier
    testset[observed].add(i)

print("UnigramDT Results")
print("Accuracy:",nltk.classify.accuracy(dt_classifier, test_set))
print('Brand Positive Precision:', precision(refsets['Positive'], testsets['Positive']))
print('Brand Positive Recall:', recall(refsets['Positive'], testsets['Positive']))
print('Brand Positive F-measure:', f_measure(refsets['Positive'], testsets['Positive']))
print('Brand Damaging Precision:', precision(refsets['Negative'], testsets['Negative']))
print('Brand Damaging Recall:', recall(testset['Negative'], refset['Negative']))
print('Brand Damaging F-measure:', f_measure(refsets['Negative'], testsets['Negative']))
print("")

UnigramDT Results
Accuracy: 0.6883116883116883
Brand Positive Precision: 0.5853658536585366
Brand Positive Recall: 0.8888888888888888
Brand Positive F-measure: 0.7058823529411764
Brand Damaging Precision: 0.9166666666666666
Brand Damaging Recall: 0.7407407407407407
Brand Damaging F-measure: 0.7674418604651162



### INFO on Logistic Regression PRE-MODEL

Logistic Regression is a Machine Learning classification algorithm that is used to predict the probability of a categorical dependent variable. In logistic regression, the dependent variable is a binary variable that contains data coded as 1 (yes, success, etc.) or 0 (no, failure, etc.). In other words, the logistic regression model predicts P(Y=1) as a function of X.

Logistic Regression Assumptions
Binary logistic regression requires the dependent variable to be binary.
For a binary regression, the factor level 1 of the dependent variable should represent the desired outcome.
Only the meaningful variables should be included.
The independent variables should be independent of each other. That is, the model should have little or no multicollinearity.
The independent variables are linearly related to the log odds.
Logistic regression requires quite large sample sizes.
Keeping the above assumptions in mind, let’s look at our dataset.

In [15]:
from nltk.classify import MaxentClassifier

logit_classifier = MaxentClassifier.train(train_set, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5)

for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = logit_classifier.classify(feats)
    testset[observed].add(i)
    
print("UnigramsLogit Results")
print("Accuracy:",nltk.classify.accuracy(logit_classifier, test_set))
print('Brand Positive Precision:', precision(refsets['Positive'], testsets['Positive']))
print('Brand Positive Recall:', recall(refsets['Positive'], testsets['Positive']))
print('Brand Positive F-measure:', f_measure(refsets['Positive'], testsets['Positive']))
print('Brand Damaging Precision:', precision(refsets['Negative'], testsets['Negative']))
print('Brand Damaging Recall:', recall(testset['Negative'], refset['Negative']))
print('Brand Damaging F-measure:', f_measure(refsets['Negative'], testsets['Negative']))
print("")
 

UnigramsLogit Results
Accuracy: 0.7142857142857143
Brand Positive Precision: 0.5853658536585366
Brand Positive Recall: 0.8888888888888888
Brand Positive F-measure: 0.7058823529411764
Brand Damaging Precision: 0.9166666666666666
Brand Damaging Recall: 0.7540983606557377
Brand Damaging F-measure: 0.7674418604651162



### INFO on Support Vector Machine

Support vector machines (SVMs) are a set of supervised learning methods used for classification, regression and outliers detection.

The advantages of support vector machines are:

Effective in high dimensional spaces.
Still effective in cases where number of dimensions is greater than the number of samples.
Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.
Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided, but it is also possible to specify custom kernels.
The disadvantages of support vector machines include:

If the number of features is much greater than the number of samples, avoid over-fitting in choosing Kernel functions and regularization term is crucial.
SVMs do not directly provide probability estimates, these are calculated using an expensive five-fold cross-validation (see Scores and probabilities, below).

In [18]:
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
SVM_classifier = SklearnClassifier(SVC(), sparse=False).train(train_set)
 
for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = SVM_classifier.classify(feats)
    testset[observed].add(i)
    
print("UniigramSVM Recall")
print("Accuracy:",nltk.classify.accuracy(SVM_classifier, test_set))
print('Brand Positive Precision:', precision(refsets['Positive'], testsets['Positive']))
print('Brand Positive Recall:', recall(refsets['Positive'], testsets['Positive']))
print('Brand Positive F-measure:', f_measure(refsets['Positive'], testsets['Positive']))
print('Brand Damaging Precision:', precision(refsets['Negative'], testsets['Negative']))
print('Brand Damaging Recall:', recall(testsets['Negative'], refsets['Negative']))
print('Brand Damaging F-measure:', f_measure(refsets['Negative'], testsets['Negative']))

UniigramSVM Recall
Accuracy: 0.6493506493506493
Brand Positive Precision: 0.5853658536585366
Brand Positive Recall: 0.8888888888888888
Brand Positive F-measure: 0.7058823529411764
Brand Damaging Precision: 0.9166666666666666
Brand Damaging Recall: 0.9166666666666666
Brand Damaging F-measure: 0.7674418604651162


## Bigrams!!

A bigram or digram is a sequence of two adjacent elements from a string of tokens, which are typically letters, syllables, or words. A bigram is an n-gram for n=2

Here is our sentence "I read a book about the history of America."
 The machine wants to get the meaning of the sentence by separating it into small pieces. How should it do that? 
1. It can regard words one by one. This is unigram; each word is a gram.
 "I", "read", "a", "book", "about", "the", "history", "of", "America"
2. It can regard words two at a time. This is bigram (digram); each two adjacent words create a bigram.
"I read", "read a", "a book", "book about", "about the", "the history", "history of", "of America"
3. It can regard words three at a time. This is trigram; each three adjacent words create a trigram.
"I read a", "read a book", "a book about", "book about the", "about the history", "the history of", "history of America"

In [19]:
from nltk import bigrams, trigrams
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

In [20]:
combined = zip(Tweet,Labels)

In [21]:
def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)  
    bigrams = bigram_finder.nbest(score_fn, n)  
    return bag_of_words(bigrams) #Create the bigrams

In [22]:
Final_Data2 =[]

for z, e in combined:
    bag_of_bigrams_words(z)
    Final_Data2.append((bag_of_bigrams_words(z),e))

In [23]:
import random
random.shuffle(Final_Data2)
print(len(Final_Data2))

train_set, test_set = Final_Data2[0:218], Final_Data2[218:]

import nltk
import collections
from nltk.metrics.scores import (accuracy, precision, recall, f_measure) 
from nltk import metrics



refsets = collections. defaultdict(set)
testsets = collections.defaultdict(set)

classifier = nltk.NaiveBayesClassifier.train(train_set)

 
for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)


print("Naive Bayes Performance with Unigrams ")    
print("Accuracy:",nltk.classify.accuracy(classifier, test_set))

265
Naive Bayes Performance with Unigrams 
Accuracy: 0.7659574468085106


In [24]:
classifier.show_most_informative_features(n=20)

Most Informative Features
      ('gillette', 'ad') = True           Positi : Negati =      4.9 : 1.0
('gillette', 'commercial') = True           Positi : Negati =      4.3 : 1.0
('boycotting', 'gillette') = True           Positi : Negati =      3.8 : 1.0
           ('ad', 'men') = True           Positi : Negati =      3.8 : 1.0
 ('threatened', 'razor') = True           Positi : Negati =      3.0 : 1.0
         ('real', 'men') = True           Negati : Positi =      2.9 : 1.0
     ('thing', 'offend') = True           Positi : Negati =      2.1 : 1.0
    ('razor', 'company') = True           Positi : Negati =      2.1 : 1.0
     ('buy', 'gillette') = True           Positi : Negati =      2.1 : 1.0
('commercial', 'iâ\x92ve') = True           Positi : Negati =      2.1 : 1.0
 ('razor', 'commerical') = True           Positi : Negati =      2.1 : 1.0
       ('hint', 'razor') = True           Positi : Negati =      2.1 : 1.0
   ('boy', 'gillettead') = True           Positi : Negati =      2.1

In [25]:
print("BigramDT Results")
print('Brand Positive Precision:', precision(refsets['Positive'], testsets['Positive']))
print('Brand Positive Recall:', recall(refsets['Positive'], testsets['Positive']))
print('Brand Positive F-measure:', f_measure(refsets['Positive'], testsets['Positive']))
print('Brand Damaging Precision:', precision(refsets['Negative'], testsets['Negative']))
print('Brand Damaging Recall:', recall(testsets['Negative'], refsets['Negative']))
print('Brand Damaging F-measure:', f_measure(refsets['Negative'], testsets['Negative']))
print("")

BigramDT Results
Brand Positive Precision: 0.7272727272727273
Brand Positive Recall: 0.5
Brand Positive F-measure: 0.5925925925925926
Brand Damaging Precision: 0.7777777777777778
Brand Damaging Recall: 0.7777777777777778
Brand Damaging F-measure: 0.835820895522388



In [31]:
from nltk.classify import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier.train(train_set, 
                                             binary=True, 
                                             entropy_cutoff=0.8, 
                                             depth_cutoff=5, 
                                             support_cutoff=30)
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
 
for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = dt_classifier.classify(feats)
    testsets[observed].add(i)
    
print("BigramDT Results")
print("Accuracy:",nltk.classify.accuracy(dt_classifier, test_set))
print('Brand Positive Precision:', precision(refsets['Positive'], testsets['Positive']))
print('Brand Positive Recall:', recall(refsets['Positive'], testsets['Positive']))
print('Brand Positive F-measure:', f_measure(refsets['Positive'], testsets['Positive']))
print('Brand Damaging Precision:', precision(refsets['Negative'], testsets['Negative']))
print('Brand Damaging Recall:', recall(testsets['Negative'], refsets['Negative']))
print('Brand Damaging F-measure:', f_measure(refsets['Negative'], testsets['Negative']))
print("")

BigramDT Results
Accuracy: 0.723404255319149
Brand Positive Precision: 0.8
Brand Positive Recall: 0.25
Brand Positive F-measure: 0.38095238095238093
Brand Damaging Precision: 0.7142857142857143
Brand Damaging Recall: 0.7142857142857143
Brand Damaging F-measure: 0.8219178082191781



In [32]:
from nltk.classify import MaxentClassifier

logit_classifier = MaxentClassifier.train(train_set, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5)

for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = logit_classifier.classify(feats)
    testset[observed].add(i)
    
print("BigramsLogit Results")
print("Accuracy:",nltk.classify.accuracy(logit_classifier, test_set))
print('Brand Positive Precision:', precision(refsets['Positive'], testsets['Positive']))
print('Brand Positive Recall:', recall(refsets['Positive'], testsets['Positive']))
print('Brand Positive F-measure:', f_measure(refsets['Positive'], testsets['Positive']))
print('Brand Damaging Precision:', precision(refsets['Negative'], testsets['Negative']))
print('Brand Damaging Recall:', recall(testsets['Negative'], refsets['Negative']))
print('Brand Damaging F-measure:', f_measure(refsets['Negative'], testsets['Negative']))
print("")
 

BigramsLogit Results
Accuracy: 0.6808510638297872
Brand Positive Precision: 0.8
Brand Positive Recall: 0.25
Brand Positive F-measure: 0.38095238095238093
Brand Damaging Precision: 0.7142857142857143
Brand Damaging Recall: 0.7142857142857143
Brand Damaging F-measure: 0.8219178082191781



In [34]:
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
SVM_classifier = SklearnClassifier(SVC(), sparse=False).train(train_set)
 
for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = SVM_classifier.classify(feats)
    testset[observed].add(i)
    
print("Bigrams Recall")
print("Accuracy:",nltk.classify.accuracy(SVM_classifier, test_set))
print('Brand Positive Precision:', precision(refsets['Positive'], testsets['Positive']))
print('Brand Positive Recall:', recall(refsets['Positive'], testsets['Positive']))
print('Brand Positive F-measure:', f_measure(refsets['Positive'], testsets['Positive']))
print('Brand Damaging Precision:', precision(refsets['Negative'], testsets['Negative']))
print('Brand Damaging Recall:', recall(testsets['Negative'], refsets['Negative']))
print('Brand Damaging F-measure:', f_measure(refsets['Negative'], testsets['Negative']))

Bigrams Recall
Accuracy: 0.6595744680851063
Brand Positive Precision: 0.8
Brand Positive Recall: 0.25
Brand Positive F-measure: 0.38095238095238093
Brand Damaging Precision: 0.7142857142857143
Brand Damaging Recall: 0.7142857142857143
Brand Damaging F-measure: 0.8219178082191781


### Trigrams!!

Trigrams are a special case of the n-gram, where n is 3. They are often used in natural language processing for performing statistical analysis of texts and in cryptography for control and use of ciphers and codes.

Here is our sentence "I read a book about the history of America."
 The machine wants to get the meaning of the sentence by separating it into small pieces. How should it do that? 
1. It can regard words one by one. This is unigram; each word is a gram.
 "I", "read", "a", "book", "about", "the", "history", "of", "America"
2. It can regard words two at a time. This is bigram (digram); each two adjacent words create a bigram.
"I read", "read a", "a book", "book about", "about the", "the history", "history of", "of America"
3. It can regard words three at a time. This is trigram; each three adjacent words create a trigram.
"I read a", "read a book", "a book about", "book about the", "about the history", "the history of", "history of America"


In [35]:
combined = zip(Tweet,Labels)

In [36]:
from nltk import bigrams, trigrams
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures

def bag_of_trigrams_words(words, score_fn=TrigramAssocMeasures.chi_sq, n=200):
    trigram_finder = TrigramCollocationFinder.from_words(words)  
    trigrams = trigram_finder.nbest(score_fn, n)  
    return bag_of_words(trigrams)

In [37]:
Final_Data3 =[]

for z, e in combined:
    bag_of_trigrams_words(z)
    Final_Data3.append((bag_of_trigrams_words(z),e))

import random
random.shuffle(Final_Data3)
print(len(Final_Data3))

train_set, test_set = Final_Data3[0:218], Final_Data3[218:]

import nltk
import collections
from nltk.metrics.scores import (accuracy, precision, recall, f_measure) 
from nltk import metrics


refsets = collections. defaultdict(set)
testsets = collections.defaultdict(set)

classifier = nltk.NaiveBayesClassifier.train(train_set)

 
for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)


print("Naive Bayes Performance with Trigrams ")    
print("Accuracy:",nltk.classify.accuracy(classifier, test_set))


265
Naive Bayes Performance with Trigrams 
Accuracy: 0.7659574468085106


In [39]:
print('Brand Positive Precision:', precision(refsets['Positive'], testsets['Positive']))
print('Brand Positive Recall:', recall(refsets['Positive'], testsets['Positive']))
print('Brand Positive F-measure:', f_measure(refsets['Positive'], testsets['Positive']))
print('Brand Damaging Precision:', precision(refsets['Negative'], testsets['Negative']))
print('Brand Damaging Recall:', recall(testset['Negative'], refset['Negative']))
print('Brand Damaging F-measure:', f_measure(refsets['Negative'], testsets['Negative']))

Brand Positive Precision: 1.0
Brand Positive Recall: 0.3888888888888889
Brand Positive F-measure: 0.56
Brand Damaging Precision: 0.725
Brand Damaging Recall: 0.6595744680851063
Brand Damaging F-measure: 0.8405797101449275


In [40]:
classifier.show_most_informative_features(n=10)

Most Informative Features
('gillette', 'ad', 'men') = True           Positi : Negati =      3.9 : 1.0
('hint', 'razor', 'commerical') = True           Positi : Negati =      1.3 : 1.0
 ('offend', 'u', 'hint') = True           Positi : Negati =      1.3 : 1.0
('truly', 'appreciate', 'men') = True           Positi : Negati =      1.3 : 1.0
('thing', 'offend', 'u') = True           Positi : Negati =      1.3 : 1.0
  ('u', 'hint', 'razor') = True           Positi : Negati =      1.3 : 1.0
('company', 'truly', 'appreciate') = True           Positi : Negati =      1.3 : 1.0
('commercial', 'asking', 'better') = True           Negati : Positi =      1.3 : 1.0
('new', 'shaver', 'need') = True           Negati : Positi =      1.3 : 1.0
('shaver', 'need', 'new') = True           Negati : Positi =      1.3 : 1.0


In [43]:
from nltk.classify import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier.train(train_set, 
                                             binary=True, 
                                             entropy_cutoff=0.8, 
                                             depth_cutoff=5, 
                                             support_cutoff=30)
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
 
for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = dt_classifier.classify(feats)
    testsets[observed].add(i)
    
print("TrigramDT Results")
print("Accuracy:",nltk.classify.accuracy(dt_classifier, test_set))
print('Brand Positive Precision:', precision(refsets['Positive'], testsets['Positive']))
print('Brand Positive Recall:', recall(refsets['Positive'], testsets['Positive']))
print('Brand Positive F-measure:', f_measure(refsets['Positive'], testsets['Positive']))
print('Brand Damaging Precision:', precision(refsets['Negative'], testsets['Negative']))
print('Brand Damaging Recall:', recall(testsets['Negative'], refsets['Negative']))
print('Brand Damaging F-measure:', f_measure(refsets['Negative'], testsets['Negative']))
print("")

TrigramDT Results
Accuracy: 0.6170212765957447
Brand Positive Precision: None
Brand Positive Recall: 0.0
Brand Positive F-measure: None
Brand Damaging Precision: 0.6170212765957447
Brand Damaging Recall: 0.6170212765957447
Brand Damaging F-measure: 0.7631578947368421



In [46]:
from nltk.classify import MaxentClassifier

logit_classifier = MaxentClassifier.train(train_set, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5)

for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = logit_classifier.classify(feats)
    testsets[observed].add(i)
    
print("TrigramsLogit Results")
print("Accuracy:",nltk.classify.accuracy(logit_classifier, test_set))
print('Brand Positive Precision:', precision(refsets['Positive'], testsets['Positive']))
print('Brand Positive Recall:', recall(refsets['Positive'], testsets['Positive']))
print('Brand Positive F-measure:', f_measure(refsets['Positive'], testsets['Positive']))
print('Brand Damaging Precision:', precision(refsets['Negative'], testsets['Negative']))
print('Brand Damaging Recall:', recall(testsets['Negative'], refsets['Negative']))
print('Brand Damaging F-measure:', f_measure(refsets['Negative'], testsets['Negative']))
print("")

TrigramsLogit Results
Accuracy: 0.5106382978723404
Brand Positive Precision: 0.4358974358974359
Brand Positive Recall: 0.9444444444444444
Brand Positive F-measure: 0.5964912280701755
Brand Damaging Precision: 0.6170212765957447
Brand Damaging Recall: 0.6170212765957447
Brand Damaging F-measure: 0.7631578947368421



In [47]:
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
SVM_classifier = SklearnClassifier(SVC(), sparse=False).train(train_set)
 
for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = SVM_classifier.classify(feats)
    testset[observed].add(i)
    
print("Trigrams Results")
print("Accuracy:",nltk.classify.accuracy(SVM_classifier, test_set))
print('Brand Positive Precision:', precision(refsets['Positive'], testsets['Positive']))
print('Brand Positive Recall:', recall(refsets['Positive'], testsets['Positive']))
print('Brand Positive F-measure:', f_measure(refsets['Positive'], testsets['Positive']))
print('Brand Damaging Precision:', precision(refsets['Negative'], testsets['Negative']))
print('Brand Damaging Recall:', recall(testsets['Negative'], refsets['Negative']))
print('Brand Damaging F-measure:', f_measure(refsets['Negative'], testsets['Negative']))

Trigrams Results
Accuracy: 0.6170212765957447
Brand Positive Precision: 0.4358974358974359
Brand Positive Recall: 0.9444444444444444
Brand Positive F-measure: 0.5964912280701755
Brand Damaging Precision: 0.6170212765957447
Brand Damaging Recall: 0.6170212765957447
Brand Damaging F-measure: 0.7631578947368421


### N-grams!!! (combining all the grams!)

N-grams are contiguous sequences of n-items in a sentence. N can be 1, 2 or any other positive integers, although usually we do not consider very large N because those n-grams rarely appears in many different places.

When performing machine learning tasks related to natural language processing, we usually need to generate n-grams from input sentences. For example, in text classification tasks, in addition to using each individual token found in the corpus, we may want to add bi-grams or tri-grams as features to represent our documents. This post describes several different ways to generate n-grams quickly from input sentences in Python.

In [48]:
combined = zip(Tweet,Labels)

In [49]:
def bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq,
n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bigrams

from nltk.collocations import TrigramCollocationFinder

# Import Bigram metrics - we will use these to identify the top 200 bigrams
from nltk.metrics import TrigramAssocMeasures

def trigrams_words(words, score_fn=TrigramAssocMeasures.chi_sq,
n=200):
    trigram_finder = TrigramCollocationFinder.from_words(words)
    trigrams = trigram_finder.nbest(score_fn, n)
    return trigrams


def bag_of_Ngrams_words(words):
    bigramBag = bigrams_words(words)
    
    #The following two for loops convert tuple into string
    for b in range(0,len(bigramBag)):
        bigramBag[b]=' '.join(bigramBag[b])
   
    trigramBag = trigrams_words(words)
    for t in range(0,len(trigramBag)):
        trigramBag[t]=' '.join(trigramBag[t])

    return bag_of_words(trigramBag + bigramBag + words)


In [50]:
Final_Data4 =[]

for z, e in combined:
    bag_of_Ngrams_words(z)
    Final_Data4.append((bag_of_Ngrams_words(z),e))

In [56]:
import random
random.shuffle(Final_Data4)
print(len(Final_Data4))

train_set, test_set = Final_Data4[0:218], Final_Data4[218:]

import nltk
import collections
from nltk.metrics.scores import (accuracy, precision, recall, f_measure) 
from nltk import metrics


refset = collections. defaultdict(set)
testset = collections.defaultdict(set)

classifier = nltk.NaiveBayesClassifier.train(train_set)

 
for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = classifier.classify(feats)
    testset[observed].add(i)


print("Naive Bayes Performance with Ngrams ")    
print("Accuracy:",nltk.classify.accuracy(classifier, test_set))


265
Naive Bayes Performance with Ngrams 
Accuracy: 0.6382978723404256


In [57]:
classifier.show_most_informative_features(n=20)

Most Informative Features
             gillette ad = True           Positi : Negati =      7.0 : 1.0
                    away = True           Positi : Negati =      5.7 : 1.0
                    love = True           Positi : Negati =      4.5 : 1.0
     gillette commercial = True           Positi : Negati =      4.5 : 1.0
                    tell = True           Positi : Negati =      3.9 : 1.0
                     son = True           Positi : Negati =      3.9 : 1.0
                      pa = True           Positi : Negati =      3.9 : 1.0
                    iâm = True           Positi : Negati =      3.9 : 1.0
                    take = True           Positi : Negati =      3.9 : 1.0
                     guy = True           Positi : Negati =      3.9 : 1.0
                   right = True           Positi : Negati =      3.9 : 1.0
         gilletteboycott = True           Negati : Positi =      3.8 : 1.0
                    time = True           Positi : Negati =      3.6 : 1.0

In [59]:
print('Brand Positive Precision:', precision(refset['Positive'], testset['Positive']))
print('Brand Positive Recall:', recall(refset['Positive'], testset['Positive']))
print('Brand Positive F-measure:', f_measure(refset['Positive'], testset['Positive']))
print('Brand Damaging Precision:', precision(refset['Negative'], testset['Negative']))
print('Brand Damaging Recall:', recall(testset['Negative'], refset['Negative']))
print('Brand Damaging F-measure:', f_measure(refset['Negative'], testset['Negative']))

Brand Positive Precision: 0.52
Brand Positive Recall: 0.7222222222222222
Brand Positive F-measure: 0.6046511627906977
Brand Damaging Precision: 0.7727272727272727
Brand Damaging Recall: 0.7727272727272727
Brand Damaging F-measure: 0.6666666666666666


In [60]:
from nltk.classify import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier.train(train_set, 
                                             binary=True, 
                                             entropy_cutoff=0.8, 
                                             depth_cutoff=5, 
                                             support_cutoff=30)
refset = collections.defaultdict(set)
testset = collections.defaultdict(set)
 
for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = dt_classifier.classify(feats)
    testset[observed].add(i)
    
print("NgramDT Results")
print("Accuracy:",nltk.classify.accuracy(dt_classifier, test_set))
print('Brand Positive Precision:', precision(refset['Positive'], testset['Positive']))
print('Brand Positive Recall:', recall(refset['Positive'], testset['Positive']))
print('Brand Positive F-measure:', f_measure(refset['Positive'], testset['Positive']))
print('Brand Damaging Precision:', precision(refset['Negative'], testset['Negative']))
print('Brand Damaging Recall:', recall(testset['Negative'], refset['Negative']))
print('Brand Damaging F-measure:', f_measure(refset['Negative'], testset['Negative']))
print("")

NgramDT Results
Accuracy: 0.6170212765957447
Brand Positive Precision: 0.5
Brand Positive Recall: 0.2777777777777778
Brand Positive F-measure: 0.35714285714285715
Brand Damaging Precision: 0.6486486486486487
Brand Damaging Recall: 0.6486486486486487
Brand Damaging F-measure: 0.7272727272727273



In [61]:
from nltk.classify import MaxentClassifier

logit_classifier = MaxentClassifier.train(train_set, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5)

for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = logit_classifier.classify(feats)
    testset[observed].add(i)
    
print("NgramsLogit Recall")
print("Accuracy:",nltk.classify.accuracy(logit_classifier, test_set))
print('Brand Positive Precision:', precision(refset['Positive'], testset['Positive']))
print('Brand Positive Recall:', recall(refset['Positive'], testset['Positive']))
print('Brand Positive F-measure:', f_measure(refset['Positive'], testset['Positive']))
print('Brand Damaging Precision:', precision(refset['Negative'], testset['Negative']))
print('Brand Damaging Recall:', recall(testset['Negative'], refset['Negative']))
print('Brand Damaging F-measure:', f_measure(refset['Negative'], testset['Negative']))
print("")

NgramsLogit Recall
Accuracy: 0.7021276595744681
Brand Positive Precision: 0.5714285714285714
Brand Positive Recall: 0.6666666666666666
Brand Positive F-measure: 0.6153846153846154
Brand Damaging Precision: 0.65
Brand Damaging Recall: 0.65
Brand Damaging F-measure: 0.7536231884057971



In [62]:
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
SVM_classifier = SklearnClassifier(SVC(), sparse=False).train(train_set)
 
for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = SVM_classifier.classify(feats)
    testset[observed].add(i)
    
print("NgramsSVM Recall")
print("Accuracy:",nltk.classify.accuracy(SVM_classifier, test_set))
print('Brand Positive Precision:', precision(refset['Positive'], testset['Positive']))
print('Brand Positive Recall:', recall(refset['Positive'], testset['Positive']))
print('Brand Positive F-measure:', f_measure(refset['Positive'], testset['Positive']))
print('Brand Damaging Precision:', precision(refset['Negative'], testset['Negative']))
print('Brand Damaging Recall:', recall(testset['Negative'], refset['Negative']))
print('Brand Damaging F-measure:', f_measure(refset['Negative'], testset['Negative']))
print("")

NgramsSVM Recall
Accuracy: 0.6170212765957447
Brand Positive Precision: 0.5714285714285714
Brand Positive Recall: 0.6666666666666666
Brand Positive F-measure: 0.6153846153846154
Brand Damaging Precision: 0.6170212765957447
Brand Damaging Recall: 0.6170212765957447
Brand Damaging F-measure: 0.7631578947368421

