# File Path Locations

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Path to all the input data
data_path = '/content/drive/MyDrive/IST 664 Project - DL/kagglemoviereviews/corpus/'

# Library

In [None]:
# Stats Packages
import pandas as pd
import numpy as np
import plotly.express as px

# NLP Packages
import os
import sys
import random
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist

## Downloadable
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Reviewing the Data

In [None]:
df_train = pd.read_csv(data_path+'train.tsv', sep='\t')
df_test = pd.read_csv(data_path+'test.tsv', sep='\t')

In [None]:
unique_sentence = len(np.unique(df_train["SentenceId"]))
total_sentence = len(df_train["SentenceId"])

print(unique_sentence)
print(total_sentence)

total_sentence/ unique_sentence

8529
156060


18.297572986282095

In [None]:
df_train['FullSentenceId'] = df_train.sort_values(['PhraseId'], ascending=[True]) \
             .groupby(['SentenceId']) \
             .cumcount() + 1

df_train[:75]


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,FullSentenceId
0,1,1,A series of escapades demonstrating the adage ...,1,1
1,2,1,A series of escapades demonstrating the adage ...,2,2
2,3,1,A series,2,3
3,4,1,A,2,4
4,5,1,series,2,5
...,...,...,...,...,...
70,71,2,introspective and entertaining,3,8
71,72,2,introspective and,3,9
72,73,2,introspective,2,10
73,74,2,and,2,11


In [None]:
parent_df = df_train[df_train["FullSentenceId"]=="1"]

In [None]:
df_test['FullSentenceId'] = df_test.sort_values(['PhraseId'], ascending=[True]) \
             .groupby(['SentenceId']) \
             .cumcount() + 1
df_test[:10]

Unnamed: 0,PhraseId,SentenceId,Phrase,FullSentenceId
0,156061,8545,An intermittently pleasing but mostly routine ...,1
1,156062,8545,An intermittently pleasing but mostly routine ...,2
2,156063,8545,An,3
3,156064,8545,intermittently pleasing but mostly routine effort,4
4,156065,8545,intermittently pleasing but mostly routine,5
5,156066,8545,intermittently pleasing but,6
6,156067,8545,intermittently pleasing,7
7,156068,8545,intermittently,8
8,156069,8545,pleasing,9
9,156070,8545,but,10


In [None]:
imdb_df_hist = df_train.groupby(["SentenceId"]).agg({"PhraseId":len}).reset_index()
imdb_df_box = df_train.groupby(["SentenceId","Sentiment"]).agg({"PhraseId":len}).reset_index()

In [None]:
px.histogram(imdb_df_hist, x = "PhraseId",
             title = "Phrase Histogram",
             labels={"PhraseId":"# Phrases"})

In [None]:
px.box(imdb_df_box, x = "Sentiment", y = "PhraseId", title = "# of Phrases by Sentiment", labels={"PhraseId":"# of Phrases"})

In [None]:
imdb_df_hist["PhraseId"].describe()

count    8529.000000
mean       18.297573
std         9.950209
min         1.000000
25%        11.000000
50%        17.000000
75%        25.000000
max        63.000000
Name: PhraseId, dtype: float64

## Ngrams

In [None]:
#Take the training dataset, tokenize it, and reduce the tokens to lowercase.
# file0 = nltk.corpus.gutenberg.fileids( ) [0]
# emmatext = nltk.corpus.gutenberg.raw(file0)
testtext = ''.join(df_train["Phrase"])
testtokens = nltk.word_tokenize(testtext) 
testwords = [w.lower( ) for w in testtokens] 
# show some of the words
total_words = len(testtokens)
print(total_words)
print(testtokens[ :25])

981478
['A', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally']


Need to get rid of punctuation and apostrophies

This is our Bag of words feature

In [None]:
ndist = FreqDist(testwords)

# print the top 30 tokens by frequency
nitems = ndist.most_common(30)
for item in nitems:
    print (item[0], '\t', item[1]/total_words)

, 	 0.04279464236590122
the 	 0.04151901519952561
of 	 0.028987914145808667
and 	 0.02846625191802567
a 	 0.028286930527225265
to 	 0.019897542278074495
's 	 0.01599220767047249
in 	 0.012034910614399916
that 	 0.010320149814871041
is 	 0.009058786850036374
it 	 0.008362897589146165
as 	 0.007317535390502895
with 	 0.006610438542687661
for 	 0.006450475711121391
its 	 0.00602356853643179
film 	 0.005148357884741176
an 	 0.004837602065456383
movie 	 0.004518695273862481
this 	 0.004342430497677992
on 	 0.00412235424533204
be 	 0.004100957943020628
but 	 0.004079561640709216
n't 	 0.003939976239915719
you 	 0.003904315736063366
-- 	 0.0034967671206078996
by 	 0.0033928422236667558
his 	 0.0033388420321189063
more 	 0.0032451058505641494
or 	 0.0031870301728617453
not 	 0.003169709356704888




### Filtering

In [None]:
#Now we want to remove non-alphabetical
#Regular Expression
import re
#   lower-case characters [^a-z]+
# the beginning ^ and ending $ require the match to begin and end on a word boundary 
pattern = re.compile('^[^a-z]+$')

In [None]:
#function that takes a word and returns true if it consists only
#of non-alphabetic characters  (assumes import re)
def alpha_filter(w):
  #pattern to match word of non-alphabetical characters
  pattern = re.compile('^[^a-z]+$')
  if (pattern.match(w)):
    return True
  else:
    return False

In [None]:
#apply the above function to testwords
alphatestwords = [w for w in testwords if not alpha_filter(w)]
print(alphatestwords[:100])
print(len(alphatestwords))

#10% of the words gone!

['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goosea', 'seriesaseriesof', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'gooseofescapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'gooseescapadesdemonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goosedemonstrating', 'the', 'adagedemonstratingthe', 'adagetheadagethat', 'what', 'is', 'good', 'for', 'the', 'goosethatwhat', 'is', 'good', 'for', 'the', 'goosewhatis', 'good', 'for', 'the', 'gooseisgood', 'for', 'the']
920174


In [None]:
#okay we still have periods, lets try to remove those before including stopwords
#import re
#line = 'Q: Do I write ;/.??? No!!!'
#re.sub('\ |\?|\.|\!|\/|\;|\:', '', line)
#function that takes a word and returns true if it consists only
#of non-alphabetic characters  (assumes import re)
def alpha_filter(w):
  #pattern to match word of non-alphabetical characters
  pattern = re.compile('\ |\?|\.|\!|\/|\;|\:')
  if (pattern.match(w)):
    return True
  else:
    return False

In [None]:
#apply the above function to testwords
alphatestwords = [w for w in alphatestwords if not alpha_filter(w)]
print(alphatestwords[:100])
print(len(alphatestwords))

#10% of the words gone!

['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goosea', 'seriesaseriesof', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'gooseofescapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'gooseescapadesdemonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goosedemonstrating', 'the', 'adagedemonstratingthe', 'adagetheadagethat', 'what', 'is', 'good', 'for', 'the', 'goosethatwhat', 'is', 'good', 'for', 'the', 'goosewhatis', 'good', 'for', 'the', 'gooseisgood', 'for', 'the', 'goosegoodfor']
904479


In [None]:
#get a list of stopwords from nltk
nltkstopwords = nltk.corpus.stopwords.words('english')
print(len(nltkstopwords))
print(nltkstopwords)

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [None]:
stoppedtestwords = [w for w in alphatestwords if not w in nltkstopwords]
print(len(stoppedtestwords))

#a total of 335,889 characters removed after non-alphabetical and stopwords removed

568580


In [None]:
#Test list for a better frequency distribution
testdist = FreqDist(stoppedtestwords)
testitems = testdist.most_common(30)
for item in testitems:
  print(item)

  ##okay so there is still things that need to be removed
  #example "'s" and "-lrb-"

("'s", 15696)
('film', 5053)
('movie', 4435)
("n't", 3867)
('one', 2612)
('like', 2398)
('story', 1737)
('good', 1683)
('much', 1566)
('-rrb-', 1530)
('-lrb-', 1496)
('characters', 1344)
('little', 1340)
('even', 1255)
('funny', 1132)
('time', 1117)
('new', 1112)
('way', 1104)
('make', 1088)
('comedy', 1045)
('love', 1004)
('bad', 994)
('us', 964)
('enough', 954)
('never', 942)
('many', 941)
('life', 937)
('movies', 916)
('best', 897)
('something', 895)


In [None]:
morestopwords = ['could','would','might','must','need','sha','wo','y',"'s","'d","'ll","'t","'m","'re","'ve", "n't", "-rrb-", "-lrb-", "like"]

In [None]:
stopwords = nltkstopwords + morestopwords
print(len(stopwords))
print(stopwords)

198
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [None]:
stoppedtestwords = [w for w in alphatestwords if not w in stopwords]
print(len(stoppedtestwords))

538488


In [None]:
#Test list for a better frequency distribution
testdist = FreqDist(stoppedtestwords)
testitems = testdist.most_common(100)
for item in testitems:
  print(item)

('film', 5053)
('movie', 4435)
('one', 2612)
('story', 1737)
('good', 1683)
('much', 1566)
('characters', 1344)
('little', 1340)
('even', 1255)
('funny', 1132)
('time', 1117)
('new', 1112)
('way', 1104)
('make', 1088)
('comedy', 1045)
('love', 1004)
('bad', 994)
('us', 964)
('enough', 954)
('never', 942)
('many', 941)
('life', 937)
('movies', 916)
('best', 897)
('something', 895)
('two', 829)
('well', 805)
('action', 797)
('director', 796)
('work', 791)
('character', 777)
('made', 771)
('see', 767)
('people', 762)
('really', 756)
('makes', 747)
('better', 731)
('without', 730)
('may', 724)
('plot', 723)
('great', 711)
('first', 709)
('every', 685)
('world', 680)
('big', 672)
('long', 659)
('ever', 654)
('sense', 653)
('look', 649)
('still', 634)
('real', 628)
('human', 625)
('feel', 618)
('audience', 618)
('hollywood', 615)
('get', 606)
('kind', 600)
('films', 584)
('man', 583)
('old', 583)
('another', 571)
('also', 564)
('rather', 564)
('often', 557)
('nothing', 551)
('young', 551)
('

In [125]:
stoppedtestwords[:10]

['series',
 'escapades',
 'demonstrating',
 'adage',
 'good',
 'goose',
 'also',
 'good',
 'gander',
 'occasionally']

In [126]:
# define features (keywords) of a document for a BOW/unigram baseline
# each feature is 'contains(keyword)' and is true or false depending
# on whether that keyword is in the document
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    return features

In [127]:
# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, stoppedtestwords), c) for (d, c) in documents]

NameError: ignored

# Importing .py Files

# NLP

In [129]:
py_path = '/content/drive/MyDrive/IST 664 Project - DL/kagglemoviereviews/'

import sys  
sys.path.insert(0, py_path)

In [140]:
from classifyKaggle import processkaggle
from sentiment_read_subjectivity import readSubjectivity
from sentiment_read_subjectivity import read_subjectivity_three_types
from sentiment_read_LIWC_pos_neg_words import read_words

## Process Kaggle

In [135]:
phrase_data = processkaggle(data_path, 156060)
phrase_data[:3]

Read 156060 phrases, using 156060 random phrases
['in which Adam Sandler will probably ever appear', '2']
['of its fans', '2']
['Crispin', '2']
["you 've never come within a mile of The Longest Yard", '1']
["the George Pal version of H.G. Wells ' ` The Time", '2']
['remarkably original', '4']
['that asks you to not only suspend your disbelief but your intelligence as well', '1']
['a Jerry Bruckheimer', '2']
["can not guess why the cast and crew did n't sign a pact to burn the negative and the script and pretend the whole thing never existed .", '0']
['his fan base', '2']
(['in', 'which', 'Adam', 'Sandler', 'will', 'probably', 'ever', 'appear'], 2)
(['of', 'its', 'fans'], 2)
(['Crispin'], 2)
(['you', "'ve", 'never', 'come', 'within', 'a', 'mile', 'of', 'The', 'Longest', 'Yard'], 1)
(['the', 'George', 'Pal', 'version', 'of', 'H.G', '.', 'Wells', "'", '`', 'The', 'Time'], 2)
(['remarkably', 'original'], 4)
(['that', 'asks', 'you', 'to', 'not', 'only', 'suspend', 'your', 'disbelief', 'but'

[(['in', 'which', 'Adam', 'Sandler', 'will', 'probably', 'ever', 'appear'], 2),
 (['of', 'its', 'fans'], 2),
 (['Crispin'], 2)]

In [145]:
read_words(stoppedtestwords)

TypeError: ignored