# File Path Locations

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Path to all the input data
data_path = '/content/drive/MyDrive/IST 664 Project - DL/kagglemoviereviews/corpus/'

# Library

In [3]:
# Stats Packages
import pandas as pd
import numpy as np
import plotly.express as px

# NLP Packages
import os
import sys
import random
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from string import punctuation

# Sklearn Model Classifiers
from sklearn.metrics import accuracy_score
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC

#Regular Expression
import re

## Downloadable
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

## Steming and Lemmatizer
porter = nltk.PorterStemmer()
word_lemma = nltk.WordNetLemmatizer()

# movie review sentences
from nltk.corpus import sentence_polarity
nltk.download('sentence_polarity')
import random

####   adding Bigram features   ####
# set up for using bigrams
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()


###  POS tag counts
# using the default pos tagger in NLTK (the Stanford tagger)
nltk.download('averaged_perceptron_tagger')

my_colorlist = ['#ffc800','#e98f42','#ca8478','#79af96','#48a8a6']
my_colorlist2 = ['#e98f42','#ca8478','#79af96','#48a8a6']

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package sentence_polarity to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping corpora/sentence_polarity.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# Reviewing the Data

In [4]:
df_train = pd.read_csv(data_path+'train.tsv', sep='\t')
df_test = pd.read_csv(data_path+'test.tsv', sep='\t')

In [5]:
unique_sentence = len(np.unique(df_train["SentenceId"]))
total_sentence = len(df_train["SentenceId"])

print(unique_sentence)
print(total_sentence)

total_sentence/ unique_sentence

8529
156060


18.297572986282095

In [6]:
df_train['FullSentenceId'] = df_train.sort_values(['PhraseId'], ascending=[True]) \
             .groupby(['SentenceId']) \
             .cumcount() + 1

df_train[:75]


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,FullSentenceId
0,1,1,A series of escapades demonstrating the adage ...,1,1
1,2,1,A series of escapades demonstrating the adage ...,2,2
2,3,1,A series,2,3
3,4,1,A,2,4
4,5,1,series,2,5
...,...,...,...,...,...
70,71,2,introspective and entertaining,3,8
71,72,2,introspective and,3,9
72,73,2,introspective,2,10
73,74,2,and,2,11


In [7]:
parent_df = df_train[df_train["FullSentenceId"]=="1"]

In [8]:
df_test['FullSentenceId'] = df_test.sort_values(['PhraseId'], ascending=[True]) \
             .groupby(['SentenceId']) \
             .cumcount() + 1
df_test[:10]

Unnamed: 0,PhraseId,SentenceId,Phrase,FullSentenceId
0,156061,8545,An intermittently pleasing but mostly routine ...,1
1,156062,8545,An intermittently pleasing but mostly routine ...,2
2,156063,8545,An,3
3,156064,8545,intermittently pleasing but mostly routine effort,4
4,156065,8545,intermittently pleasing but mostly routine,5
5,156066,8545,intermittently pleasing but,6
6,156067,8545,intermittently pleasing,7
7,156068,8545,intermittently,8
8,156069,8545,pleasing,9
9,156070,8545,but,10


In [9]:
imdb_df_hist = df_train.groupby(["SentenceId"]).agg({"PhraseId":len}).reset_index()
imdb_df_box = df_train.groupby(["SentenceId","Sentiment"]).agg({"PhraseId":len}).reset_index()

In [10]:
px.histogram(imdb_df_hist, x = "PhraseId",
             title = "Phrase Histogram",
             labels={"PhraseId":"# Phrases"})

In [11]:
px.box(imdb_df_box, x = "Sentiment", y = "PhraseId", title = "# of Phrases by Sentiment", labels={"PhraseId":"# of Phrases"})

In [12]:
imdb_df_hist["PhraseId"].describe()

count    8529.000000
mean       18.297573
std         9.950209
min         1.000000
25%        11.000000
50%        17.000000
75%        25.000000
max        63.000000
Name: PhraseId, dtype: float64

# Text Processing

## Preprocessing Words

### Ngrams

In [13]:
#Take the training dataset, tokenize it, and reduce the tokens to lowercase.
# file0 = nltk.corpus.gutenberg.fileids( ) [0]
# emmatext = nltk.corpus.gutenberg.raw(file0)


testtext = ''.join(df_train["Phrase"])
testtokens = nltk.word_tokenize(testtext) 
testwords = [w.lower( ) for w in testtokens] 

df_filter = pd.DataFrame(columns=['filter_type','num_words'])


# show some of the words
total_words = len(testwords)

filter_type = 'Filtering Nothing'
df2 = pd.DataFrame([[filter_type, total_words]], columns=['filter_type','num_words'])
df_filter = df_filter.append(df2, ignore_index=True)

print(total_words)
print(testwords[ :25])

981478
['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally']


Need to get rid of punctuation and apostrophies

This is our Bag of words feature

In [14]:
ndist = FreqDist(testwords)

# print the top 25 tokens by frequency
df_words = pd.DataFrame(columns=['word','frequency'])
nitems = ndist.most_common(25)
for item in nitems:
    print (item[0], '\t', item[1]/total_words)
    df2 = pd.DataFrame([[item[0], item[1]/total_words]], columns=['word','frequency'])
    df_words = df_words.append(df2, ignore_index=True)



, 	 0.04279464236590122
the 	 0.04151901519952561
of 	 0.028987914145808667
and 	 0.02846625191802567
a 	 0.028286930527225265
to 	 0.019897542278074495
's 	 0.01599220767047249
in 	 0.012034910614399916
that 	 0.010320149814871041
is 	 0.009058786850036374
it 	 0.008362897589146165
as 	 0.007317535390502895
with 	 0.006610438542687661
for 	 0.006450475711121391
its 	 0.00602356853643179
film 	 0.005148357884741176
an 	 0.004837602065456383
movie 	 0.004518695273862481
this 	 0.004342430497677992
on 	 0.00412235424533204
be 	 0.004100957943020628
but 	 0.004079561640709216
n't 	 0.003939976239915719
you 	 0.003904315736063366
-- 	 0.0034967671206078996


In [15]:
px.bar(df_words, x = "word", y = "frequency", color = "word", color_discrete_sequence = my_colorlist, title = "Top 25 Words Without Filtering",
       labels={
           "word":"Words",
           "frequency":"Frequency"
       })#.update_layout(yaxis={"tickformat":',.0%'})

### Remove Punctuation

In [16]:
punction_list = list(punctuation)
punct_testwords = [testword for testword in testwords if testword not in punction_list]

# show some of the words
total_words = len(punct_testwords)
filter_type = 'Filtering Punctuation'
df2 = pd.DataFrame([[filter_type, total_words]], columns=['filter_type','num_words'])
df_filter = df_filter.append(df2, ignore_index=True)

print(total_words)
print(punct_testwords[ :25])

930730
['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', 'some', 'of', 'which', 'occasionally', 'amuses']


In [17]:
ndist = FreqDist(punct_testwords)

# print the top 30 tokens by frequency
nitems = ndist.most_common(25)
for item in nitems:
    print (item[0], '\t', item[1]/total_words)

the 	 0.043782837127845885
of 	 0.03056847850611885
and 	 0.03001837267521193
a 	 0.029829273795837677
to 	 0.020982454632385333
's 	 0.016864181878740343
in 	 0.012691113427094862
that 	 0.010882855393078551
is 	 0.009552716684752828
it 	 0.008818884101726602
as 	 0.007716523589010776
with 	 0.006970872326023659
for 	 0.006802187530218216
its 	 0.006352003266253371
film 	 0.005429071803852889
an 	 0.005101372041300914
movie 	 0.004765076875141019
this 	 0.004579201272119734
on 	 0.004347125374705876
be 	 0.004324562440235084
but 	 0.004301999505764292
n't 	 0.0041548032189786515
you 	 0.004117198328193998
-- 	 0.0036874281477979652
by 	 0.0035778367517969767


### Remove Stopwords

### Filtering Regex

In [18]:
#Now we want to remove non-alphabetical
#   lower-case characters [^a-z]+
# the beginning ^ and ending $ require the match to begin and end on a word boundary 
pattern = re.compile('^[^a-z]+$')

In [19]:
#function that takes a word and returns true if it consists only
#of non-alphabetic characters  (assumes import re)
def alpha_filter(w):
  #pattern to match word of non-alphabetical characters
  pattern = re.compile('^[^a-z]+$')
  if (pattern.match(w)):
    return True
  else:
    return False

In [20]:
#apply the above function to testwords
alphatestwords = [w for w in punct_testwords if not alpha_filter(w)]

total_words = len(alphatestwords)
filter_type = 'Filtering Regex Pt. 1'
df2 = pd.DataFrame([[filter_type, total_words]], columns=['filter_type','num_words'])
df_filter = df_filter.append(df2, ignore_index=True)


print(total_words)
print(alphatestwords[ :25])

#10% of the words gone!

920174
['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', 'some', 'of', 'which', 'occasionally', 'amuses']


In [21]:
ndist = FreqDist(alphatestwords)

# print the top 30 tokens by frequency
nitems = ndist.most_common(25)
for item in nitems:
    print (item[0], '\t', item[1]/total_words)

the 	 0.04428510260016041
of 	 0.03091915224729236
and 	 0.030362735743457214
a 	 0.030171467570263885
to 	 0.02122315996757135
's 	 0.01705764344569614
in 	 0.012836702623634226
that 	 0.011007700717473
is 	 0.009662302999215366
it 	 0.008920052077107155
as 	 0.007805045567468761
with 	 0.007050840384535968
for 	 0.006880220480039645
its 	 0.006424871817721431
film 	 0.00549135272241989
an 	 0.0051598936722837205
movie 	 0.004819740614275126
this 	 0.004631732694033955
on 	 0.004396994481478503
be 	 0.00437417271081339
but 	 0.004351350940148276
n't 	 0.0042024660553330135
you 	 0.004164429770891157
by 	 0.0036188807768965433
his 	 0.0035612829747417338


In [22]:
#okay we still have periods, lets try to remove those before including stopwords
#import re
#line = 'Q: Do I write ;/.??? No!!!'
#re.sub('\ |\?|\.|\!|\/|\;|\:', '', line)
#function that takes a word and returns true if it consists only
#of non-alphabetic characters  (assumes import re)
def alpha_filter(w):
  #pattern to match word of non-alphabetical characters
  pattern = re.compile('\ |\?|\.|\!|\/|\;|\:')
  if (pattern.match(w)):
    return True
  else:
    return False

In [23]:
#apply the above function to testwords
alphatestwords = [w for w in alphatestwords if not alpha_filter(w)]

total_words = len(alphatestwords)
filter_type = 'Filtering Regex Pt. 2'
df2 = pd.DataFrame([[filter_type, total_words]], columns=['filter_type','num_words'])
df_filter = df_filter.append(df2, ignore_index=True)

print(total_words)
print(alphatestwords[ :25])

#10% of the words gone!

904479
['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', 'some', 'of', 'which', 'occasionally', 'amuses']


In [24]:
ndist = FreqDist(alphatestwords)

# print the top 30 tokens by frequency
nitems = ndist.most_common(25)
for item in nitems:
    print (item[0], '\t', item[1]/total_words)

the 	 0.04505356122143245
of 	 0.03145567779904232
and 	 0.030889606060505552
a 	 0.030695018900383536
to 	 0.021591435511493357
's 	 0.017353636734517883
in 	 0.013059451905461598
that 	 0.01119871218679483
is 	 0.009829968412754747
it 	 0.009074837558417608
as 	 0.007940482863615407
with 	 0.0071731903117706435
for 	 0.006999609720070891
its 	 0.006536359605916776
film 	 0.005586641591457624
an 	 0.005249430887837086
movie 	 0.004903375313301912
this 	 0.004712104979772886
on 	 0.004473293465077685
be 	 0.004450075678926763
but 	 0.004426857892775841
n't 	 0.004275389478362682
you 	 0.0042366931681111445
by 	 0.00368167751821767
his 	 0.0036230802484081997


### Filtering Stop Words

In [25]:
#get a list of stopwords from nltk
nltkstopwords = nltk.corpus.stopwords.words('english')
print(len(nltkstopwords))
print(nltkstopwords)

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [26]:
stoppedtestwords = [w for w in alphatestwords if not w in nltkstopwords]
total_words = len(stoppedtestwords)
print(total_words)
# a total of 335,889 characters removed after non-alphabetical and stopwords removed

568580


In [27]:
#Test list for a better frequency distribution
ndist = FreqDist(stoppedtestwords)

# print the top 30 tokens by frequency

filter_type = 'Filtering Stopwords Pt. 1'
df2 = pd.DataFrame([[filter_type, total_words]], columns=['filter_type','num_words'])
df_filter = df_filter.append(df2, ignore_index=True)

nitems = ndist.most_common(25)
for item in nitems:
    print (item[0], '\t', item[1]/total_words)

  ##okay so there is still things that need to be removed
  #example "'s" and "-lrb-"


's 	 0.02760561398571881
film 	 0.008887051953990643
movie 	 0.007800133666326639
n't 	 0.006801153751450983
one 	 0.0045939005944634
like 	 0.0042175243589292625
story 	 0.0030549790706672765
good 	 0.0029600056280558586
much 	 0.0027542298357311197
-rrb- 	 0.0026909142073235077
-lrb- 	 0.00263111611382743
characters 	 0.002363783460550846
little 	 0.002356748390727778
even 	 0.0022072531569875833
funny 	 0.001990924759928242
time 	 0.0019645432480917373
new 	 0.0019557494108129025
way 	 0.0019416792711667663
make 	 0.0019135389918744944
comedy 	 0.0018379119912765134
love 	 0.0017658025255900664
bad 	 0.0017482148510323966
us 	 0.0016954518273593865
enough 	 0.0016778641528017166
never 	 0.0016567589433325125


In [28]:
morestopwords = ['could','would','might','must','need','sha','wo','y',"'s","'d","'ll","'t","'m","'re","'ve", "n't", "-rrb-", "-lrb-", "like","''","--","``"]

In [29]:
stopwords = nltkstopwords + morestopwords
print(len(stopwords))
print(stopwords)

201
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [30]:
stoppedtestwords = [w for w in alphatestwords if not w in stopwords]
total_words = len(stoppedtestwords)

In [31]:
#Test list for a better frequency distribution
ndist = FreqDist(stoppedtestwords)

# print the top 25 tokens by frequency
filter_type = 'Filtering Stopwords Pt. 2'
df2 = pd.DataFrame([[filter_type, total_words]], columns=['filter_type','num_words'])
df_filter = df_filter.append(df2, ignore_index=True)


df_words = pd.DataFrame(columns=['word','frequency'])
nitems = ndist.most_common(25)
for item in nitems:
    print (item[0], '\t', item[1]/total_words)

    df2 = pd.DataFrame([[item[0], item[1]/total_words]], columns=['word','frequency'])
    df_words = df_words.append(df2, ignore_index=True)

film 	 0.009383681716212804
movie 	 0.008236023829686084
one 	 0.004850618769591895
story 	 0.003225698622810536
good 	 0.0031254178366091723
much 	 0.0029081427998395508
characters 	 0.0024958773454561664
little 	 0.00248844913907088
even 	 0.002330599753383548
funny 	 0.002102182407035997
time 	 0.0020743266330911737
new 	 0.002065041375109566
way 	 0.0020501849623389937
make 	 0.002020472136797849
comedy 	 0.001940618918156022
love 	 0.0018644798027068383
bad 	 0.001845909286743623
us 	 0.0017901977388539764
enough 	 0.0017716272228907608
never 	 0.0017493426037349022
many 	 0.0017474855521385805
life 	 0.0017400573457532944
movies 	 0.0017010592622305417
best 	 0.0016657752819004322
something 	 0.0016620611787077893


In [32]:

px.bar(df_words, x = "word", y = "frequency", color = "word", color_discrete_sequence = my_colorlist, title = "Top 25 Words After Filtering",
       labels={
           "word":"Words",
           "frequency":"Frequency"
       })#.update_layout(yaxis={"tickformat":'0.00%'})

### Stemming and Lemmatize Words
This is merging words together.  This should increase the frequency of words.

In [33]:
# Stemming
stemmedtestwords = [porter.stem(stoppedtestword) for stoppedtestword in stoppedtestwords]
total_words = len(stemmedtestwords)
print(total_words)

# Lemmatize
lemmatizetestwords = [word_lemma.lemmatize(stemmedtestword) for stemmedtestword in stemmedtestwords] 
total_words = len(lemmatizetestwords)
print(total_words)


538488
538488


In [34]:
#Test list for a better frequency distribution
ndist = FreqDist(lemmatizetestwords)

df_words = pd.DataFrame(columns=['word','frequency'])
nitems = ndist.most_common(25)
for item in nitems:
    print (item[0], '\t', item[1]/total_words)

    df2 = pd.DataFrame([[item[0], item[1]/total_words]], columns=['word','frequency'])
    df_words = df_words.append(df2, ignore_index=True)

film 	 0.010609335769785028
movi 	 0.009937083091916626
one 	 0.005067893806361516
make 	 0.004133796853411776
charact 	 0.003938806435798012
stori 	 0.0035766813745153093
good 	 0.003181129384498819
much 	 0.0029081427998395508
time 	 0.002898857541857943
littl 	 0.00248844913907088
work 	 0.0024253093847959473
even 	 0.0024197382300069825
way 	 0.0023825971980805513
love 	 0.0022618888443196507
comedi 	 0.002156036903329322
feel 	 0.002152322800136679
funni 	 0.0021188958714028912
look 	 0.002115181768210248
new 	 0.0020706125298985308
get 	 0.0019276195569817712
bad 	 0.0018681939058994815
perform 	 0.0018626227511105169
u 	 0.0018069112032208703
see 	 0.0018031971000282271
enough 	 0.0017716272228907608


In [35]:

px.bar(df_words, x = "word", y = "frequency", color = "word", color_discrete_sequence = my_colorlist, title = "Top 25 Words After Stemming and Lemmatizing",
       labels={
           "word":"Words",
           "frequency":"Frequency"
       })#.update_layout(yaxis={"tickformat":'0.00%'})

In [36]:
starting_words = df_filter['num_words'][0]
df_filter["perc_filter"] = df_filter["num_words"] /starting_words
df_filter

Unnamed: 0,filter_type,num_words,perc_filter
0,Filtering Nothing,981478,1.0
1,Filtering Punctuation,930730,0.948294
2,Filtering Regex Pt. 1,920174,0.937539
3,Filtering Regex Pt. 2,904479,0.921548
4,Filtering Stopwords Pt. 1,568580,0.57931
5,Filtering Stopwords Pt. 2,538488,0.54865


In [37]:
my_colorlist = ['#ffc800','#e98f42','#ca8478','#79af96','#48a8a6']
px.bar(df_filter, x = "filter_type", y = "perc_filter", color = "filter_type", color_discrete_sequence = my_colorlist, title = "Filtering Words",
       labels={
           "filter_type":"Filter Method",
           "perc_filter":"% Filtered"
       }).update_layout(yaxis={"tickformat":',.0%'})

## Processing Sentences

In [4]:
# function to read kaggle training file, train and test a classifier
def processkaggle(dirPath,limitStr):
  # convert the limit argument from a string to an int
  limit = int(limitStr)
  os.chdir(dirPath)

  f = open('./train.tsv', 'r')
  # loop over lines in the file and use the first limit of them
  phrasedata = []
  for line in f:
    # ignore the first line starting with Phrase and read all lines
    if (not line.startswith('Phrase')):
      # remove final end of line character
      line = line.strip()
      # each line has 4 items separated by tabs
      # ignore the phrase and sentence ids, and keep the phrase and sentiment
      phrasedata.append(line.split('\t')[2:4])

  # Pick a random sample of length
  random.shuffle(phrasedata)
  phraselist = phrasedata[:limit]

  # Printing Phrase Data
  # print('Read', len(phrasedata), 'phrases, using', len(phraselist), 'random phrases')
  print('Read', len(phrasedata), 'phrases, using', len(phraselist), 'random phrases')


  return phraselist


## Cleaning Phrases

### Removing Punctuations

In [5]:
def cleaning_phrases(phrase_list):

  # create list of phrase documents as (list of words, label)
  phrasedocs = []
  punction_list = list(punctuation)

  # Loop through data to tokenize, make all lowercase, and expand contractions
  for phrase in phrase_list:

    # Tokenize
    tokens = nltk.word_tokenize(phrase[0])

    # Lowercase
    tokens_lower = [w.lower( ) for w in tokens]

    # Remove puncutation
    tokens_punc = [token_lower for token_lower in tokens_lower if token_lower not in punction_list]


    # Add to list
    # Removing tokens less than 1
    if len(tokens_punc) > 0: 
      phrasedocs.append((tokens_punc, int(phrase[1])))
      
  return phrasedocs

### Filtering Text

In [6]:
def filter_phrases(phrase_list):

  phrases_filtered = []
  pattern = re.compile(r'^[^a-z]+$')

  morestopwords = ['could','would','might','must','need','sha','wo','y',"'s","'d","'ll","'t","'m","'re","'ve", "n't", "-rrb-", "-lrb-", "like","''","--","``"]

  stopwords = nltk.corpus.stopwords.words('english')
  complete_stopwords = stopwords + morestopwords

  # Loop through data to tokenize, make all lowercase, and expand contractions

  for phrase in phrase_list:
    # Tokenize
    tokens = phrase[0]

    # Remove Non-alphabetical characters
    tokens_chr = [token for token in tokens if not pattern.match(token)]

    # Remove words under three characters
    tokens_three = [token_chr for token_chr in tokens_chr if len(tokens_chr) > 2]

    # Remove stopwords
    tokens_stop = [token_three for token_three in tokens_three if not token_three in complete_stopwords]

    # Add to list
    # Removing tokens less than 1
    if len(tokens_stop) > 0: 
      phrases_filtered.append((tokens_stop, int(phrase[1])))
    
      
  return phrases_filtered

### Stem and Lemmatize

In [7]:
# Combining Stemming and Lemmatize filter into a function.
# Created a word call stemmatize that filters these out.

porter = nltk.PorterStemmer()
word_lemma = nltk.WordNetLemmatizer()

def stemmatize_phrases(phrase_list):
  token_list = []
  # Loop through data to tokenize, make all lowercase, and expand contractions

  for phrase in phrase_list:
    # Tokenize
    tokens = phrase[0]

    # Stemming
    stems = [porter.stem(token) for token in tokens]

    # Lemmatize
    lemmas = [word_lemma.lemmatize(stem) for stem in stems] 

    # Add to list
    token_list.append((lemmas, int(phrase[1])))

  return token_list

## Applying Functions

In [8]:
imbd_list = processkaggle(data_path, 10000)

imbd_clean = cleaning_phrases(imbd_list)
imbd_filter = filter_phrases(imbd_clean)
imbd_stemma = stemmatize_phrases(imbd_filter)

Read 156060 phrases, using 10000 random phrases


In [9]:
print(imbd_list[17])
print(imbd_clean[17])
print(imbd_filter[14])
print(imbd_stemma[14])


['The rollerball sequences', '2']
(['the', 'rollerball', 'sequences'], 2)
(['house', 'games'], 2)
(['hous', 'game'], 2)


# Feature Engineering

## Subjectivity Feature Libraries

In [10]:
py_path = '/content/drive/MyDrive/IST 664 Project - DL/kagglemoviereviews/'

import sys  
sys.path.insert(0, py_path)

In [11]:
# from classifyKaggle.crossval import cross_validation_PRF
from sentiment_read_subjectivity import readSubjectivity
from sentiment_read_subjectivity import read_subjectivity_three_types
import sentiment_read_LIWC_pos_neg_words
import sentiment_read_LIWC_pos_neg_words

## Word List

In [12]:
# get all words from all movie_reviews and put into a frequency distribution
# note lowercase, but no stemming or stopwords

all_words_list = [word for (sent, cat) in imbd_stemma for word in sent]
all_words = nltk.FreqDist(all_words_list)
print(len(all_words))

7140


## Unigram Feature

In [224]:
# get the 1000 most frequently appearing keywords in the corpus
word_items = all_words.most_common(1000)
word_features = [word for (word,count) in word_items]

In [225]:
# define features (keywords) of a document for a BOW/unigram baseline
# each feature is 'contains(keyword)' and is true or false depending
# on whether that keyword is in the document

def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    return features

Creating variable to store documents so we can replace this instead of all the individual features

In [235]:
documents = imbd_clean

In [236]:
# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, word_features), c) for (d, c) in documents]
len(featuresets)

9998

In [237]:
featuresets[0]

({'V_film': False,
  'V_movi': False,
  'V_one': False,
  'V_stori': False,
  'V_make': False,
  'V_charact': False,
  'V_time': False,
  'V_good': False,
  'V_even': False,
  'V_work': False,
  'V_comedi': False,
  'V_love': False,
  'V_feel': False,
  'V_way': False,
  'V_look': False,
  'V_much': False,
  'V_littl': False,
  'V_perform': False,
  'V_take': False,
  'V_see': False,
  'V_u': False,
  'V_funni': False,
  'V_come': False,
  'V_life': False,
  'V_new': False,
  'V_get': False,
  'V_watch': False,
  'V_director': False,
  'V_thing': False,
  'V_act': False,
  'V_enough': False,
  'V_entertain': False,
  'V_interest': False,
  'V_audienc': False,
  'V_action': False,
  'V_well': False,
  'V_better': False,
  'V_best': False,
  'V_emot': False,
  'V_peopl': False,
  'V_seem': False,
  'V_play': False,
  'V_made': False,
  'V_cast': False,
  'V_end': False,
  'V_never': False,
  'V_mani': False,
  'V_two': False,
  'V_may': False,
  'V_bad': False,
  'V_drama': False,
  'V_l

## Bigram Features

In [238]:
####   adding Bigram features   ####
# set up for using bigrams
# from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [239]:
# create the bigram finder on all the words in sequence
print(all_words_list[:50])
finder = BigramCollocationFinder.from_words(all_words_list)

['phrase', 'fatal', 'script', 'error', 'susan', 'sarandon', 'dustin', 'hoffman', 'holli', 'hunter', 'tuba-play', 'dwarf', 'roll', 'hill', 'trash', 'candi', 'cheeki', 'work', 'probabl', 'product', 'love', 'well', 'integr', 'homag', 'least', 'bit', 'mesmer', 'emerg', 'numbingli', 'dull', 'experi', 'rock', 'solid', 'famili', 'fun', 'nobl', 'endeavor', 'bolster', 'astonish', 'voic', 'cast', 'except', 'love', 'hewitt', 'interest', 'racial', 'tension', 'storylin', 'encount', 'sinc']


In [240]:
# define the top 500 bigrams using the chi squared measure
bigram_features = finder.nbest(bigram_measures.chi_sq, 500)
print(bigram_features[:50])

[('10-year', 'delay'), ('18-year-old', 'mistress'), ('22-year-old', 'girlfriend'), ('4ever', 'sledgehamm'), ('50-someth', 'lovebird'), ('75-minut', 'sampl'), ('7th-centuri', 'oral'), ('91-minut', 'trailer'), ('a.e.w', 'mason'), ('abbott', 'ernest'), ('abdul', 'malik'), ('ace', 'japanim'), ('acerb', 'reparte'), ('acr', 'haut'), ('actorish', 'notat'), ('actuari', 'maelstrom'), ('affluent', 'damsel'), ('agenc', 'bos'), ('ahola', 'thirteen'), ('aircraft', 'carrier'), ('aisl', 'walker'), ('alert', 'street-smart'), ('alexandr', 'desplat'), ('ali', 'graduat'), ('all-night', 'tequila'), ('aloft', 'self-referenti'), ('amir', 'mann'), ('analges', 'balm'), ('analyt', 'venic'), ('annal', 'white-on-black'), ('anne-sophi', 'birot'), ('annual', 'riviera'), ('arcan', 'area'), ('arliss', 'howard'), ('arni', 'musclefest'), ('ash', 'wednesday'), ('asset', 'detriment'), ('atop', 'undercurr'), ('aureli', 'christel'), ('auschwitz', 'ii-birkenau'), ('australian', 'actor\\/director'), ('automat', 'gunfir'), (

In [241]:
# define features that include words as before 
# add the most frequent significant bigrams
# this function takes the list of words in a document as an argument and returns a feature dictionary
# it depends on the variables word_features and bigram_features

def bigram_document_features(document, word_features, bigram_features):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    #for word in word_features:
    #    features['V_{}'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)    
    return features

In [242]:
# use this function to create feature sets for all sentences
bigram_featuresets = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in documents]

In [243]:
# number of features for document 0
print(len(bigram_featuresets[0][0].keys()))

500


In [244]:
# features in document 0
print(bigram_featuresets[0][0])

{'B_10-year_delay': False, 'B_18-year-old_mistress': False, 'B_22-year-old_girlfriend': False, 'B_4ever_sledgehamm': False, 'B_50-someth_lovebird': False, 'B_75-minut_sampl': False, 'B_7th-centuri_oral': False, 'B_91-minut_trailer': False, 'B_a.e.w_mason': False, 'B_abbott_ernest': False, 'B_abdul_malik': False, 'B_ace_japanim': False, 'B_acerb_reparte': False, 'B_acr_haut': False, 'B_actorish_notat': False, 'B_actuari_maelstrom': False, 'B_affluent_damsel': False, 'B_agenc_bos': False, 'B_ahola_thirteen': False, 'B_aircraft_carrier': False, 'B_aisl_walker': False, 'B_alert_street-smart': False, 'B_alexandr_desplat': False, 'B_ali_graduat': False, 'B_all-night_tequila': False, 'B_aloft_self-referenti': False, 'B_amir_mann': False, 'B_analges_balm': False, 'B_analyt_venic': False, 'B_annal_white-on-black': False, 'B_anne-sophi_birot': False, 'B_annual_riviera': False, 'B_arcan_area': False, 'B_arliss_howard': False, 'B_arni_musclefest': False, 'B_ash_wednesday': False, 'B_asset_detrimen

## POS Features

In [24]:
# this function takes a document list of words and returns a feature dictionary
# it runs the default pos tagger (the Stanford tagger) on the document
#   and counts 4 types of pos tags to use as features

def POS_features(document, word_features):
    document_words = set(document)
    tagged_words = nltk.pos_tag(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    numNoun = 0
    numVerb = 0
    numAdj = 0
    numAdverb = 0
    for (word, tag) in tagged_words:
        if tag.startswith('N'): numNoun += 1
        if tag.startswith('V'): numVerb += 1
        if tag.startswith('J'): numAdj += 1
        if tag.startswith('R'): numAdverb += 1
    features['nouns'] = numNoun
    features['verbs'] = numVerb
    features['adjectives'] = numAdj
    features['adverbs'] = numAdverb
    return features

In [25]:
# define feature sets using this function
POS_featuresets = [(POS_features(d, word_features), c) for (d, c) in documents]
# number of features for document 0
print(len(POS_featuresets[0][0].keys()))

1004


In [26]:
# the first sentence
print(documents[0])
# the pos tag features for this sentence
print('num nouns', POS_featuresets[0][0]['nouns'])
print('num verbs', POS_featuresets[0][0]['verbs'])
print('num adjectives', POS_featuresets[0][0]['adjectives'])
print('num adverbs', POS_featuresets[0][0]['adverbs'])



(['phrase', 'fatal', 'script', 'error'], 1)
num nouns 3
num verbs 0
num adjectives 1
num adverbs 0


## LIWC Features

In [245]:
LIWC_path = sentiment_read_LIWC_pos_neg_words.read_words()
LIWC_poslist = LIWC_path[0]
LIWC_neglist = LIWC_path[1]

In [246]:
# define features that include word counts of subjectivity words
# negative feature will have number of weakly negative words +
#    2 * number of strongly negative words
# positive feature has similar definition
#    not counting neutral words

def LIWC_features(document, word_features, poslist, neglist):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    # count variables for the 2 classes of LWIC
    pos = 0
    neg = 0
    for word in document_words:
      if sentiment_read_LIWC_pos_neg_words.isPresent(word,poslist):
        pos += 1
      if sentiment_read_LIWC_pos_neg_words.isPresent(word,neglist):
        neg += 1     
    return features



In [247]:
LIWC_featuresets = [(LIWC_features(d, word_features, LIWC_poslist, LIWC_neglist), c) for (d, c) in documents]
len(LIWC_featuresets)

9998

## Subjectivity Features

In [30]:
####   adding features   ####
# First run the program in the file Subjectivity.py to load the subjectivity lexicon
# copy and paste the definition of the readSubjectivity functions

# create a path to where the subjectivity file resides on your disk
# this example is for my mac
# nancymacpath = "/Users/njmccrac1/AAAdocs/research/subjectivitylexicon/hltemnlp05clues/subjclueslen1-HLTEMNLP05.tff"

# create your own path to the subjclues file
SLpath = "/content/drive/MyDrive/IST 664 Project - DL/subjclueslen1-HLTEMNLP05.tff"

In [31]:
# import the Subjectivity program as a module to use the function

import Subjectivity
SL = Subjectivity.readSubjectivity(SLpath)

In [32]:
# define features that include word counts of subjectivity words
# negative feature will have number of weakly negative words +
#    2 * number of strongly negative words
# positive feature has similar definition
#    not counting neutral words
def SL_features(document, word_features, SL):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    # count variables for the 4 classes of subjectivity
    weakPos = 0
    strongPos = 0
    weakNeg = 0
    strongNeg = 0
    for word in document_words:
        if word in SL:
            strength, posTag, isStemmed, polarity = SL[word]
            if strength == 'weaksubj' and polarity == 'positive':
                weakPos += 1
            if strength == 'strongsubj' and polarity == 'positive':
                strongPos += 1
            if strength == 'weaksubj' and polarity == 'negative':
                weakNeg += 1
            if strength == 'strongsubj' and polarity == 'negative':
                strongNeg += 1
            features['positivecount'] = weakPos + (2 * strongPos)
            features['negativecount'] = weakNeg + (2 * strongNeg)      
    return features

SL_featuresets = [(SL_features(d, word_features, SL), c) for (d, c) in documents]

In [33]:
# this gives the label of document 0
SL_featuresets[0][1]
# number of features for document 0
len(SL_featuresets[0][0].keys())

1002

# Features Experiment Loop

## Loop

In [34]:
### Build a loop that runs each filter through every Featureset
### Run an accuracy score and append to a dataframe

filter_list = [imbd_clean, imbd_filter, imbd_stemma]
filter_names = ['imbd_clean', 'imbd_filter', 'imbd_stemma']

column_headers = ['Model','Feature','Filter','Experiment','Accuracy']
df_accuracys = pd.DataFrame(columns=column_headers) 

for documents, names in zip(filter_list, filter_names):
# for documents in filter_list:

  # Unigram 
  ## Unigram Features
  featuresets = [(document_features(d, word_features), c) for (d, c) in documents]

  ## Train and Test Set
  train_set, test_set = featuresets[500:], featuresets[:500]
  ## NaiveBayesClassifier
  classifier = nltk.NaiveBayesClassifier.train(train_set)

  # Accuracy Score
  a_score = nltk.classify.accuracy(classifier, test_set)

  # Dataframe
  Filter = names
  Model = 'NLTK - Bayes'
  Feature_Name = 'Unigram'
  Experiment = Model + ' ' + Feature_Name + ' ' + Filter

  df = pd.DataFrame([[Model, Feature_Name, Filter, Experiment, a_score]], columns = column_headers)
  df_accuracys = df_accuracys.append(df, ignore_index=True)

  # Random Forest
  classifier = SklearnClassifier(RandomForestClassifier()).train(train_set)
  a_score = nltk.classify.accuracy(classifier, test_set)

  # Dataframe
  Filter = names
  Model = 'Scikit - Random Forest'
  Feature_Name = 'Unigram'
  Experiment = Model + ' ' + Feature_Name + ' ' + Filter

  df = pd.DataFrame([[Model, Feature_Name, Filter, Experiment, a_score]], columns = column_headers)
  df_accuracys = df_accuracys.append(df, ignore_index=True)

  # Bigram
  ## Bigram Features
  bigram_featuresets = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in documents]

  ## Train and Test Set
  train_set, test_set = bigram_featuresets[500:], bigram_featuresets[:500]

  ## NaiveBayesClassifier
  classifier = nltk.NaiveBayesClassifier.train(train_set)

  ## Accuracy Score
  a_score = nltk.classify.accuracy(classifier, test_set)

  # Dataframe
  Filter = names
  Model = 'NLTK - Bayes'
  Feature_Name = 'Bigram'
  Experiment = Model + ' ' + Feature_Name + ' ' + Filter

  df = pd.DataFrame([[Model, Feature_Name, Filter, Experiment, a_score]], columns = column_headers)
  df_accuracys = df_accuracys.append(df, ignore_index=True)  
  
  # Random Forest
  classifier = SklearnClassifier(RandomForestClassifier()).train(train_set)
  a_score = nltk.classify.accuracy(classifier, test_set)

  # Dataframe
  Filter = names
  Model = 'Scikit - Random Forest'
  Feature_Name = 'Bigram'
  Experiment = Model + ' ' + Feature_Name + ' ' + Filter

  df = pd.DataFrame([[Model, Feature_Name, Filter, Experiment, a_score]], columns = column_headers)
  df_accuracys = df_accuracys.append(df, ignore_index=True)

  # POS
  ## POS Features
  # define feature sets using this function
  POS_featuresets = [(POS_features(d, word_features), c) for (d, c) in documents]

  ## Train and Test Set
  train_set, test_set = POS_featuresets[500:], POS_featuresets[:500]

  ## NaiveBayesClassifier
  classifier = nltk.NaiveBayesClassifier.train(train_set)

  ## Accuracy Score
  a_score = nltk.classify.accuracy(classifier, test_set)

  # Dataframe
  Filter = names
  Model = 'NLTK - Bayes'
  Feature_Name = 'POS'
  Experiment = Model + ' ' + Feature_Name + ' ' + Filter

  df = pd.DataFrame([[Model, Feature_Name, Filter, Experiment, a_score]], columns = column_headers)
  df_accuracys = df_accuracys.append(df, ignore_index=True)

  # Random Forest
  classifier = SklearnClassifier(RandomForestClassifier()).train(train_set)
  a_score = nltk.classify.accuracy(classifier, test_set)

  # Dataframe
  Filter = names
  Model = 'Scikit - Random Forest'
  Feature_Name = 'POS'
  Experiment = Model + ' ' + Feature_Name + ' ' + Filter

  df = pd.DataFrame([[Model, Feature_Name, Filter, Experiment, a_score]], columns = column_headers)
  df_accuracys = df_accuracys.append(df, ignore_index=True)

  # LIWC 
  LIWC_featuresets = [(LIWC_features(d, word_features, LIWC_poslist, LIWC_neglist), c) for (d, c) in documents]

  ## Train and Test Set
  train_set, test_set = LIWC_featuresets[500:], LIWC_featuresets[:500]

  ## NaiveBayesClassifier
  classifier = nltk.NaiveBayesClassifier.train(train_set)

  ## Accuracy Score
  a_score = nltk.classify.accuracy(classifier, test_set)

  # Dataframe
  Filter = names
  Model = 'NLTK - Bayes'
  Feature_Name = 'LWIC'
  Experiment = Model + ' ' + Feature_Name + ' ' + Filter

  df = pd.DataFrame([[Model, Feature_Name, Filter, Experiment, a_score]], columns = column_headers)
  df_accuracys = df_accuracys.append(df, ignore_index=True)

  # Random Forest
  classifier = SklearnClassifier(RandomForestClassifier()).train(train_set)
  a_score = nltk.classify.accuracy(classifier, test_set)

  # Dataframe
  Filter = names
  Model = 'Scikit - Random Forest'
  Feature_Name = 'LWIC'
  Experiment = Model + ' ' + Feature_Name + ' ' + Filter

  df = pd.DataFrame([[Model, Feature_Name, Filter, Experiment, a_score]], columns = column_headers)
  df_accuracys = df_accuracys.append(df, ignore_index=True)

  # Subjectivity
  SL_featuresets = [(SL_features(d, word_features, SL), c) for (d, c) in documents]

  ## Train and Test Set
  train_set, test_set = SL_featuresets[500:], SL_featuresets[:500]

  ## NaiveBayesClassifier
  classifier = nltk.NaiveBayesClassifier.train(train_set)

  ## Accuracy Score
  a_score = nltk.classify.accuracy(classifier, test_set)

  # Dataframe
  Filter = names
  Model = 'NLTK - Bayes'
  Feature_Name = 'Subjectivity'
  Experiment = Model + ' ' + Feature_Name + ' ' + Filter

  df = pd.DataFrame([[Model, Feature_Name, Filter, Experiment, a_score]], columns = column_headers)
  df_accuracys = df_accuracys.append(df, ignore_index=True)

  # Random Forest
  classifier = SklearnClassifier(RandomForestClassifier()).train(train_set)
  a_score = nltk.classify.accuracy(classifier, test_set)

  # Dataframe
  Filter = names
  Model = 'Scikit - Random Forest'
  Feature_Name = 'Subjectivity'
  Experiment = Model + ' ' + Feature_Name + ' ' + Filter

  df = pd.DataFrame([[Model, Feature_Name, Filter, Experiment, a_score]], columns = column_headers)
  df_accuracys = df_accuracys.append(df, ignore_index=True)


## Visual

In [201]:
  df_accuracys.sort_values(by=['Accuracy'], inplace = True, ascending= False)

In [40]:
df_accuracys.to_csv( '/content/drive/MyDrive/IST 664 Project - DL/accuracys.csv',index=False)

In [207]:
px.bar(df_accuracys, x = "Experiment", y = "Accuracy", color = "Model", color_discrete_sequence = my_colorlist2, title = "Accuracy by Filter and Feature Engineering",
            labels={
                "word":"Words",
                "frequency":"Frequency"
            }).update_layout(
                barmode='stack', xaxis={'categoryorder':'total descending',
                                        'tickangle':45}
                )

In [221]:
px.bar(df_accuracys, x = "Accuracy", y = "Experiment", color = "Model", orientation='h',
       color_discrete_sequence = my_colorlist2, title = "Accuracy by Filter and Feature Engineering",
            labels={
                "word":"Words",
                "frequency":"Frequency"
            }).update_layout(
                barmode='stack', xaxis={'categoryorder':'total descending'}
                )

In [202]:
top_5 = df_accuracys[df_accuracys["Experiment"].isin(["NLTK - Bayes LWIC imbd_clean","NLTK - Bayes Unigram imbd_clean","NLTK - Bayes POS imbd_clean",
                                                      "NLTK - Bayes Subjectivity imbd_stemma","Scikit - Random Forest Bigram imbd_clean"])]

In [203]:
top_5

Unnamed: 0,Model,Feature,Filter,Experiment,Accuracy
6,NLTK - Bayes,LWIC,imbd_clean,NLTK - Bayes LWIC imbd_clean,0.564
4,NLTK - Bayes,POS,imbd_clean,NLTK - Bayes POS imbd_clean,0.552
2,NLTK - Bayes,Bigram,imbd_clean,NLTK - Bayes Bigram imbd_clean,0.538
3,Scikit - Random Forest,Bigram,imbd_clean,Scikit - Random Forest Bigram imbd_clean,0.538
28,NLTK - Bayes,Subjectivity,imbd_stemma,NLTK - Bayes Subjectivity imbd_stemma,0.538


In [204]:
px.bar(top_5, x = "Experiment", y = "Accuracy", color = "Model", color_discrete_sequence = my_colorlist2, title = "Accuracy by Filter and Feature Engineering",
            labels={
                "word":"Words",
                "frequency":"Frequency"
            }).update_layout(
                barmode='stack', xaxis={'categoryorder':'total descending',
                                        'tickangle':45}
                )

# Experiments - NLTK

## Bayes Classifier - Ngrams

In [65]:
## Training using naive Baysian classifier, training set is 90% of data
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy of the classifier
nltk.classify.accuracy(classifier, test_set)

# the accuracy result may vary since we randomized the documents

0.538

## Bayes Classifier - Bigram

In [66]:
# train a classifier and report accuracy
train_set, test_set = bigram_featuresets[500:], bigram_featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.544

## Bayes Classifier - POS

In [67]:
# train and test the classifier
train_set, test_set = POS_featuresets[500:], POS_featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.546

## Bayes Classifier - Subjectivity Lexicon

In [68]:
# retrain the classifier using these features
train_set, test_set = SL_featuresets[500:], SL_featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.564

## Bayes Classifier - LWIC

In [106]:
# retrain the classifier using these features
train_set, test_set = LIWC_featuresets[500:], LIWC_featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.538

# Experiments - SklearnClassifier

## Random Forest Classifier - POS

In [69]:
# train a classifier and report accuracy
train_set, test_set = POS_featuresets[500:], POS_featuresets[:500]
classifier = SklearnClassifier(RandomForestClassifier()).train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.578

## Support Vector Machine Classifier - POS

In [88]:
# train a classifier and report accuracy
train_set, test_set = POS_featuresets[500:], POS_featuresets[:500]
classifier = SklearnClassifier(LinearSVC()).train(train_set)
nltk.classify.accuracy(classifier, test_set)



Liblinear failed to converge, increase the number of iterations.



0.546

## Droping Features

In [129]:
featuresets = None
bigram_featuresets = None
LIWC_featuresets = None
POS_featuresets = None
SL_featuresets = None

# Cross Validation

### Cross Validation Function

In [69]:
## cross-validation ##
# this function takes the number of folds, the feature sets
# it iterates over the folds, using different sections for training and testing in turn
#   it prints the accuracy for each fold and the average accuracy at the end

def cross_validation_accuracy(num_folds, featuresets):
    subset_size = int(len(featuresets)/num_folds)
    print('Each fold size:', subset_size)
    accuracy_list = []
    # iterate over the folds
    for i in range(num_folds):
        test_this_round = featuresets[(i*subset_size):][:subset_size]
        train_this_round = featuresets[:(i*subset_size)] + featuresets[((i+1)*subset_size):]
        # train using train_this_round
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        # evaluate against test_this_round and save accuracy
        accuracy_this_round = nltk.classify.accuracy(classifier, test_this_round)
        print (i, accuracy_this_round)
        accuracy_list.append(accuracy_this_round)
    # find mean accuracy over all rounds
    print ('mean accuracy', sum(accuracy_list) / num_folds)

In [70]:
len(featuresets)

7044

### Unigram Features

In [71]:
# perform the cross-validation on the featuresets with word features and generate accuracy
num_folds = 3
cross_validation_accuracy(num_folds, featuresets)

Each fold size: 2348
0 0.4889267461669506
1 0.4735945485519591
2 0.479557069846678
mean accuracy 0.48069278818852923


### Bigram Features

In [73]:
# perform the cross-validation on the featuresets with word features and generate accuracy
num_folds = 3
cross_validation_accuracy(num_folds, bigram_featuresets)

Each fold size: 3332
0 0.5195078031212484
1 0.5225090036014406
2 0.5093037214885955
mean accuracy 0.5171068427370948


### POS Features

In [74]:
# perform the cross-validation on the featuresets with word features and generate accuracy
num_folds = 3
cross_validation_accuracy(num_folds, POS_featuresets)

Each fold size: 3332
0 0.5456182472989196
1 0.5312124849939976
2 0.508703481392557
mean accuracy 0.5285114045618248


### Subjectivity Lexicon

In [75]:
# perform the cross-validation on the featuresets with word features and generate accuracy
num_folds = 3
cross_validation_accuracy(num_folds, SL_featuresets)

Each fold size: 3332
0 0.5447178871548619
1 0.53031212484994
2 0.5177070828331333
mean accuracy 0.5309123649459785


## Cross Validation - Final

### Cross Validation New Function

In [145]:
## cross-validation Naive Bayes##

def nltk_cross_validation_accuracy(num_folds, featuresets, experiment):
    subset_size = int(len(featuresets)/num_folds)
    print('Each fold size:', subset_size)
    accuracy_list = []

    column_headers = ['Experiment','Fold','Accuracy']
    df_crossval = pd.DataFrame(columns=column_headers) 

    # iterate over the folds
    for i in range(num_folds):
        test_this_round = featuresets[(i*subset_size):][:subset_size]
        train_this_round = featuresets[:(i*subset_size)] + featuresets[((i+1)*subset_size):]
        # train using train_this_round
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        # evaluate against test_this_round and save accuracy
        accuracy_this_round = nltk.classify.accuracy(classifier, test_this_round)
        print (i, accuracy_this_round)
        accuracy_list.append(accuracy_this_round)

        # Dataframe
        Experiment = experiment
        Fold = i
        a_score = accuracy_this_round

        df2 = pd.DataFrame([[Experiment, Fold, a_score]], columns = column_headers)
        df_crossval = df_crossval.append(df2, ignore_index=True)

    # find mean accuracy over all rounds
    print ('mean accuracy', sum(accuracy_list) / num_folds)
    return(df_crossval)

In [146]:

## cross-validation SciKit Learn##

def scikt_cross_validation_accuracy(num_folds, featuresets, experiment):
    subset_size = int(len(featuresets)/num_folds)
    print('Each fold size:', subset_size)
    accuracy_list = []

    column_headers = ['Experiment','Fold','Accuracy']
    df_crossval = pd.DataFrame(columns=column_headers) 

    # iterate over the folds
    for i in range(num_folds):
        test_this_round = featuresets[(i*subset_size):][:subset_size]
        train_this_round = featuresets[:(i*subset_size)] + featuresets[((i+1)*subset_size):]
        # train using train_this_round
        classifier = SklearnClassifier(RandomForestClassifier()).train(train_this_round)
        
        # evaluate against test_this_round and save accuracy
        accuracy_this_round = nltk.classify.accuracy(classifier, test_this_round)
        print (i, accuracy_this_round)
        accuracy_list.append(accuracy_this_round)

        # Dataframe
        Experiment = experiment
        Fold = i
        a_score = accuracy_this_round

        df2 = pd.DataFrame([[Experiment, Fold, a_score]], columns = column_headers)
        df_crossval = df_crossval.append(df2, ignore_index=True)

    # find mean accuracy over all rounds
    print ('mean accuracy', sum(accuracy_list) / num_folds)
    return(df_crossval)

## Cross Validation Data Frame

### Setting Up Features to Run

In [140]:

# Experiments
## LWIC
lwic_clean = [(LIWC_features(d, word_features, LIWC_poslist, LIWC_neglist), c) for (d, c) in imbd_clean]

## Unigram
unigram_clean = [(document_features(d, word_features), c) for (d, c) in imbd_clean]

## POS
pos_clean = [(POS_features(d, word_features), c) for (d, c) in imbd_clean]

## Bigram
bigram_clean = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in imbd_clean]

## Subjectivity
SL_stemma = [(SL_features(d, word_features, SL), c) for (d, c) in imbd_stemma]



## Cross Val top 5 Accuracy

### LWIC

In [163]:
# perform the cross-validation on the featuresets with word features and generate accuracy
crossval_featuresets = lwic_clean 
random.shuffle(crossval_featuresets)
crossval_featuresets = crossval_featuresets[:3600]

num_folds = 12
df_lwic_cross = nltk_cross_validation_accuracy(num_folds, crossval_featuresets, 'NLTK - Bayes LWIC Clean')

Each fold size: 300
0 0.5233333333333333
1 0.49666666666666665
2 0.5466666666666666
3 0.52
4 0.5
5 0.5133333333333333
6 0.5033333333333333
7 0.48
8 0.5133333333333333
9 0.5466666666666666
10 0.55
11 0.4866666666666667
mean accuracy 0.515


### Unigram

In [164]:
# perform the cross-validation on the featuresets with word features and generate accuracy
crossval_featuresets = unigram_clean 
random.shuffle(crossval_featuresets)
crossval_featuresets = crossval_featuresets[:3600]

num_folds = 12
df_unigram_cross = scikt_cross_validation_accuracy(num_folds, crossval_featuresets, 'Scikit - Random Forest Bigram Clean')

Each fold size: 300
0 0.5333333333333333
1 0.5166666666666667
2 0.48
3 0.4633333333333333
4 0.5166666666666667
5 0.47333333333333333
6 0.53
7 0.47333333333333333
8 0.49666666666666665
9 0.5466666666666666
10 0.5166666666666667
11 0.5033333333333333
mean accuracy 0.5041666666666667


### POS

In [165]:
# perform the cross-validation on the featuresets with word features and generate accuracy
crossval_featuresets = pos_clean 
random.shuffle(crossval_featuresets)
crossval_featuresets = crossval_featuresets[:3600]

num_folds = 12
df_pos_cross = nltk_cross_validation_accuracy(num_folds, crossval_featuresets, 'NLTK - Bayes POS Clean')

Each fold size: 300
0 0.5333333333333333
1 0.54
2 0.53
3 0.52
4 0.5033333333333333
5 0.53
6 0.5333333333333333
7 0.5466666666666666
8 0.5633333333333334
9 0.5266666666666666
10 0.48
11 0.48333333333333334
mean accuracy 0.5241666666666667


### SciKit

In [166]:
# perform the cross-validation on the featuresets with word features and generate accuracy
crossval_featuresets = bigram_clean 
random.shuffle(crossval_featuresets)
crossval_featuresets = crossval_featuresets[:3600]

num_folds = 12
df_scikit_cross = nltk_cross_validation_accuracy(num_folds, crossval_featuresets, 'NLTK - Bayes Bigram Clean')

Each fold size: 300
0 0.45
1 0.5333333333333333
2 0.5166666666666667
3 0.5433333333333333
4 0.5233333333333333
5 0.55
6 0.5333333333333333
7 0.5533333333333333
8 0.5166666666666667
9 0.5633333333333334
10 0.5166666666666667
11 0.48333333333333334
mean accuracy 0.5236111111111111


### Subjectivity

In [167]:
# perform the cross-validation on the featuresets with word features and generate accuracy
crossval_featuresets = SL_stemma 
random.shuffle(crossval_featuresets)
crossval_featuresets = crossval_featuresets[:3600]

num_folds = 12
df_sl_cross = nltk_cross_validation_accuracy(num_folds, crossval_featuresets, 'NLTK - Bayes SL Stemma')

Each fold size: 300
0 0.45666666666666667
1 0.4633333333333333
2 0.45666666666666667
3 0.4533333333333333
4 0.4533333333333333
5 0.4633333333333333
6 0.45
7 0.47
8 0.5066666666666667
9 0.4666666666666667
10 0.4533333333333333
11 0.49666666666666665
mean accuracy 0.4658333333333334


## Final CrossVal

In [168]:
df_full_cross = df_lwic_cross.append(df_unigram_cross, ignore_index=True)
df_full_cross = df_full_cross.append(df_pos_cross, ignore_index=True)
df_full_cross = df_full_cross.append(df_scikit_cross, ignore_index=True)
df_full_cross = df_full_cross.append(df_sl_cross, ignore_index=True)

In [169]:
df_full_cross[:12]

Unnamed: 0,Experiment,Fold,Accuracy
0,NLTK - Bayes LWIC Clean,0,0.523333
1,NLTK - Bayes LWIC Clean,1,0.496667
2,NLTK - Bayes LWIC Clean,2,0.546667
3,NLTK - Bayes LWIC Clean,3,0.52
4,NLTK - Bayes LWIC Clean,4,0.5
5,NLTK - Bayes LWIC Clean,5,0.513333
6,NLTK - Bayes LWIC Clean,6,0.503333
7,NLTK - Bayes LWIC Clean,7,0.48
8,NLTK - Bayes LWIC Clean,8,0.513333
9,NLTK - Bayes LWIC Clean,9,0.546667


In [178]:
df_agg_cross = df_full_cross
df_agg_cross["Avg Accuracy"] = df_agg_cross["Accuracy"]
df_agg_cross["Stdv Accuracy"] = df_agg_cross["Accuracy"]

df_agg_cross = df_agg_cross.groupby(["Experiment"]).agg(
                                                        {
                                                          "Avg Accuracy":np.mean,
                                                          "Stdv Accuracy":np.std
                                                         }
                  ).reset_index()

df_agg_cross

Unnamed: 0,Experiment,Avg Accuracy,Stdv Accuracy
0,NLTK - Bayes Bigram Clean,0.523611,0.031509
1,NLTK - Bayes LWIC Clean,0.515,0.023463
2,NLTK - Bayes POS Clean,0.524167,0.0245
3,NLTK - Bayes SL Stemma,0.465833,0.017929
4,Scikit - Random Forest Bigram Clean,0.504167,0.026973


In [179]:
px.box(df_full_cross, x="Experiment", y="Accuracy")