# Loading the Data

In [1]:
from os import listdir
from nltk.corpus import stopwords 
import string
import re

In [2]:
def load_docs(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()

In [3]:
# def process_docs(directory):
#     for filename in listdir(directory):
#         if not filename.endswith('.txt'):
#             next
#         path = directory+'/'+filename
#         #text = load_docs(path)
#         #print("Loaded : ", filename)

In [4]:
# directory = 'txt_sentoken/neg'
# process_docs(directory)

# Cleaning the Data

We will initially clean one file, and then expand it to include all the files

In [5]:
def clean_docs(doc):
    """Turn raw document text into a list of cleaned word tokens.

    The text is split on whitespace and punctuation is stripped from every
    token. A token is then dropped if it is a stop word, is not purely
    alphabetic, or is only one character long.

    Args:
        doc: Raw document text.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    # One character class that matches any single punctuation character.
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))

    # NLTK corpus file ids are lowercase; a capitalised id may not resolve
    # on case-sensitive filesystems. A set gives O(1) membership tests.
    stop_words = set(stopwords.words('english'))

    # Punctuation is stripped before the stop-word check, so a contraction
    # such as "didn't" reaches that check as "didnt".
    stripped = (punctuation_re.sub('', word) for word in doc.split())

    return [
        token
        for token in stripped
        if token not in stop_words and token.isalpha() and len(token) > 1
    ]

In [6]:
# Try the cleaning step on a single negative review before running it
# over the whole corpus.
filename = 'txt_sentoken/neg/cv000_29416.txt'
 # load the raw text of that file
text = load_docs(filename)
# reduce the raw text to its list of cleaned tokens
cleaned_text = clean_docs(text)

In [7]:
# Show the cleaned tokens of the sample review.
print(cleaned_text)

['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get', 'accident', 'one', 'guys', 'dies', 'girlfriend', 'continues', 'see', 'life', 'nightmares', 'whats', 'deal', 'watch', 'movie', 'sorta', 'find', 'critique', 'mindfuck', 'movie', 'teen', 'generation', 'touches', 'cool', 'idea', 'presents', 'bad', 'package', 'makes', 'review', 'even', 'harder', 'one', 'write', 'since', 'generally', 'applaud', 'films', 'attempt', 'break', 'mold', 'mess', 'head', 'lost', 'highway', 'memento', 'good', 'bad', 'ways', 'making', 'types', 'films', 'folks', 'didnt', 'snag', 'one', 'correctly', 'seem', 'taken', 'pretty', 'neat', 'concept', 'executed', 'terribly', 'problems', 'movie', 'well', 'main', 'problem', 'simply', 'jumbled', 'starts', 'normal', 'downshifts', 'fantasy', 'world', 'audience', 'member', 'idea', 'whats', 'going', 'dreams', 'characters', 'coming', 'back', 'dead', 'others', 'look', 'like', 'dead', 'strange', 'apparitions', 'disappearances', 'looooot', 'chase', 'scen

# Develop Vocabulary

In [8]:
from collections import Counter

In [9]:
def add_doc_to_vocab(filename, vocab):
    """Load one document, clean it, and count its tokens into ``vocab``.

    Args:
        filename: Path of the document to read.
        vocab: Counter of token frequencies; updated in place.
    """
    # load the file with the given name
    text = load_docs(filename)

    # reduce the raw text to its list of cleaned tokens
    cleaned_text = clean_docs(text)

    # Count into the counter that was passed in, rather than relying on a
    # module-level global that happens to exist at call time.
    vocab.update(cleaned_text)

In [10]:
def process_docs(directory, vocabulary):
    """Add the tokens of every ``.txt`` file in ``directory`` to ``vocabulary``.

    Args:
        directory: Path of the directory holding the review files.
        vocabulary: Counter of token frequencies; updated in place.
    """
    for filename in listdir(directory):
        # Skip anything that is not a text file. A bare `next` here would be
        # a no-op expression and the file would still be processed.
        if not filename.endswith('.txt'):
            continue

        path = directory + '/' + filename

        # Count the cleaned words of this file into the shared counter.
        add_doc_to_vocab(path, vocabulary)
        

In [11]:
# Count the cleaned tokens of both review directories into one Counter.
vocabulary = Counter()
negative_reviews = 'txt_sentoken/neg'
positive_reviews = 'txt_sentoken/pos'

reviews = [negative_reviews, positive_reviews]
# process_docs works purely by updating `vocabulary`, so use a plain loop
# instead of building a throwaway list of None results.
for review_directory in reviews:
    process_docs(review_directory, vocabulary)

# Number of distinct tokens seen across all reviews.
print(len(vocabulary))

46557


In [12]:
# Dump the whole token counter for inspection.
print(vocabulary)



In [13]:
def process_vocabulary(vocabulary, min_occurences):
    """Select the words that occur more than ``min_occurences`` times.

    Args:
        vocabulary: Mapping of word to occurrence count.
        min_occurences: Exclusive lower bound on the count.

    Returns:
        The kept words as a list, in the mapping's iteration order. The
        number of kept words is also printed.
    """
    kept = []
    for word, count in vocabulary.items():
        # Strict comparison: a word seen exactly `min_occurences` times
        # is dropped.
        if count > min_occurences:
            kept.append(word)

    print(len(kept))
    return kept
    

In [14]:
tokens = process_vocabulary(vocabulary, 5)

13058


In [15]:
def save_list(word_list, filename):
    """Write the strings in ``word_list`` to ``filename``, one per line.

    Items are joined with newlines; no newline is written after the last
    item. An existing file is overwritten.

    Args:
        word_list: Iterable of strings to write.
        filename: Path of the output file.
    """
    data = '\n'.join(word_list)
    # The context manager closes the handle even if write() raises.
    with open(filename, 'w') as file:
        file.write(data)

In [16]:
# Persist the filtered vocabulary, one word per line.
save_list(tokens, "vocabulary_9.txt")

# Save Prepared Data

 - We will load a file and process all the words in it (basically, clean them and match them against the vocabulary) to form a list
 - For all such files, we will form a master list to which the list for each document is appended

In [17]:
def doc_to_line(filename, vocabulary):
    """Return one document as a single space-separated line of known words.

    The file is loaded and cleaned, and only the tokens that are present
    in ``vocabulary`` are kept, in their original order.

    Args:
        filename: Path of the document to read.
        vocabulary: Container of allowed words (tested with ``in``).

    Returns:
        The kept tokens joined by single spaces.
    """
    kept_words = []
    for word in clean_docs(load_docs(filename)):
        if word in vocabulary:
            kept_words.append(word)

    return ' '.join(kept_words)

### Modified Process Docs

In [18]:
def process_docs_2(directory, vocabulary):
    """Convert every ``.txt`` file in ``directory`` to a filtered text line.

    Args:
        directory: Path of the directory holding the review files.
        vocabulary: Container of allowed words, passed on to ``doc_to_line``.

    Returns:
        A list with one string per processed file, in ``listdir`` order.
    """
    lines = []
    for filename in listdir(directory):
        # Skip anything that is not a text file. A bare `next` here would be
        # a no-op expression and the file would still be processed.
        if not filename.endswith('.txt'):
            continue

        path = directory + '/' + filename

        # Each review becomes one line of vocabulary-filtered words, which
        # is appended to the master list.
        lines.append(doc_to_line(path, vocabulary))
    return lines
        

In [19]:
# Reload the saved vocabulary file and keep its whitespace-separated words
# as a set.
filename = 'vocabulary_9.txt'
vocabulary = set(load_docs(filename).split())

In [20]:
# prepare negative reviews: one vocabulary-filtered line per review file,
# all written to a single output file
negative_lines = process_docs_2('txt_sentoken/neg', vocabulary) 
save_list(negative_lines, 'negative.txt')
# prepare positive reviews the same way
positive_lines = process_docs_2('txt_sentoken/pos', vocabulary) 
save_list(positive_lines, 'positive.txt')

### Note : The following Steps are taken to prepare the dataset

 - Basically we first use the functions to create a vocabulary

 - Then we use new set of reviews ( if any ) to process them file by file and make two separate files

 - Note that the words in the new reviews are cleaned, and only those words are kept which were in the vocabulary list

 - So for each new file in the new reviews directory, we process the file, which includes cleaning and checking against the vocab list, and then we return a list of words in that file which meet this criteria
 
 - Next we do this for all the files in that directory and append the list to a master list
 
 - Finally we save the master list for an entire directory in a file