## Clean Data

In [1]:
""" CLEAN TEXT FUNCTION """

def clean_text(text):
    import re
    from unicodedata import normalize

    # normalize unicode
    clean = normalize('NFD', text).encode('ascii', 'ignore')
    clean = clean.decode('utf-8')

    # remove punctuation
    number_handler = re.compile(r'(?<=\d),(?=\d)')
    punct_re = re.compile('[{}]'.format(re.escape('!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~-')))

    abreviation = re.compile('[^a-zA-Z0-9-_.]')
    clean = abreviation.sub(' ', clean)

    clean = number_handler.sub('',clean)
    clean = punct_re.sub(' ', clean)

    # remove any double whitespace
    clean = ' '.join(clean.split())

    return clean


In [2]:
import os
from nltk.tokenize import word_tokenize

In [5]:
path  = "./review_polarity/txt_sentoken/"
pos_files = sorted(os.listdir(path + "pos/"), reverse=True)
neg_files = sorted(os.listdir(path + "neg/"), reverse=True)


print(pos_files)
print(neg_files)

['cv999_13106.txt', 'cv998_14111.txt', 'cv997_5046.txt', 'cv996_11592.txt', 'cv995_21821.txt', 'cv994_12270.txt', 'cv993_29737.txt', 'cv992_11962.txt', 'cv991_18645.txt', 'cv990_11591.txt', 'cv989_15824.txt', 'cv988_18740.txt', 'cv987_6965.txt', 'cv986_13527.txt', 'cv985_6359.txt', 'cv984_12767.txt', 'cv983_22928.txt', 'cv982_21103.txt', 'cv981_14989.txt', 'cv980_10953.txt', 'cv979_18921.txt', 'cv978_20929.txt', 'cv977_4938.txt', 'cv976_10267.txt', 'cv975_10981.txt', 'cv974_22941.txt', 'cv973_10066.txt', 'cv972_26417.txt', 'cv971_10874.txt', 'cv970_18450.txt', 'cv969_13250.txt', 'cv968_24218.txt', 'cv967_5788.txt', 'cv966_28832.txt', 'cv965_26071.txt', 'cv964_6021.txt', 'cv963_6895.txt', 'cv962_9803.txt', 'cv961_5682.txt', 'cv960_29007.txt', 'cv959_14611.txt', 'cv958_12162.txt', 'cv957_8737.txt', 'cv956_11609.txt', 'cv955_25001.txt', 'cv954_18628.txt', 'cv953_6836.txt', 'cv952_25240.txt', 'cv951_10926.txt', 'cv950_12350.txt', 'cv949_20112.txt', 'cv948_24606.txt', 'cv947_10601.txt', 'cv

In [6]:
text_data = []
for file in pos_files:
    with open(path + "pos/"+file, 'r') as f:
        text = f.read()
        text = clean_text(text)
        words = word_tokenize(text)
        text_data.append(words)
print(text_data[:5])

[['truman', 'true', 'man', 'burbank', 'is', 'the', 'perfect', 'name', 'for', 'jim', 'carrey', 's', 'character', 'in', 'this', 'film', 'president', 'truman', 'was', 'an', 'unassuming', 'man', 'who', 'became', 'known', 'worldwide', 'in', 'spite', 'of', 'or', 'was', 'it', 'because', 'of', 'his', 'stature', 'truman', 'also', 'recalls', 'an', 'era', 'of', 'plenty', 'following', 'a', 'grim', 'war', 'an', 'era', 'when', 'planned', 'communities', 'built', 'by', 'government', 'scientists', 'promised', 'an', 'idyllic', 'life', 'for', 'americans', 'and', 'burbank', 'california', 'brings', 'to', 'mind', 'the', 'tonight', 'show', 'and', 'the', 'home', 'of', 'nbc', 'if', 'hollywood', 'is', 'the', 'center', 'of', 'the', 'film', 'world', 'burbank', 'is', 'or', 'was', 'the', 'center', 'of', 'tv', 's', 'world', 'the', 'world', 'where', 'our', 'protagonist', 'lives', 'combine', 'all', 'these', 'names', 'and', 'concepts', 'into', 'truman', 'burbank', 'and', 'you', 'get', 'something', 'that', 'well', 'desc

## Let's make it a function 

In [8]:
""" PASS THE FOLDER PATH """

def parse_folder(path):
    import os
    import nltk
    from nltk.tokenize import word_tokenize
    nltk.download('punkt')
    text_data = []
    files = sorted(os.listdir(path), reverse=True)
    for file in files:
        with open(path + file, 'r') as f:
          # read text file
          text = f.read()
          # clean text data
          text = clean_text(text)
          # tokenize text
          words = word_tokenize(text)
          # append to data
          text_data.append(words)
    return text_data

In [9]:
pos_rev = parse_folder(path=path + "pos/")

neg_rev = parse_folder(path=path + "neg/")

[nltk_data] Downloading package punkt to /home/gm0234/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/gm0234/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
neg_rev[0]

['two',
 'party',
 'guys',
 'bob',
 'their',
 'heads',
 'to',
 'haddaway',
 's',
 'dance',
 'hit',
 'what',
 'is',
 'love',
 'while',
 'getting',
 'themselves',
 'into',
 'trouble',
 'in',
 'nightclub',
 'after',
 'nightclub',
 'it',
 's',
 'barely',
 'enough',
 'to',
 'sustain',
 'a',
 'three',
 'minute',
 'saturday',
 'night',
 'live',
 'skit',
 'but',
 'snl',
 'producer',
 'lorne',
 'michaels',
 'clueless',
 'creator',
 'amy',
 'heckerling',
 'and',
 'paramount',
 'pictures',
 'saw',
 'something',
 'in',
 'the',
 'late',
 'night',
 'television',
 'institution',
 's',
 'recurring',
 'roxbury',
 'guys',
 'sketch',
 'that',
 'would',
 'presumably',
 'make',
 'a',
 'good',
 'feature',
 'emphasis',
 'on',
 'the',
 'word',
 'presumably',
 'a',
 'night',
 'at',
 'the',
 'roxbury',
 'takes',
 'an',
 'already',
 'thin',
 'concept',
 'and',
 'tediously',
 'stretches',
 'it',
 'far',
 'beyond',
 'the',
 'breaking',
 'point',
 'and',
 'that',
 'of',
 'viewers',
 'patience',
 'levels',
 'the',
 