In [1]:
import re
import string
from argparse import Namespace
from bs4 import BeautifulSoup
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords

In [2]:
## Define Config
args = Namespace(
    raw_file_path = './data/raw_data/unlabeled.tsv',
    processed_file_path = './data/clean_data/word2vec_training_corpus.txt'
)

In [3]:
reviews = []

with open(args.raw_file_path) as f:
    count = 0
    for i,line in enumerate(f.readlines()):
        if i==0:
            continue
        else:
            id_review = line.split('\t')
            
            if len(id_review)==2:
                _id, review = id_review
                reviews.append(review)
                
print(f"Number of Reviews : {len(reviews)}")
print(f"\nFirst Original Review :\n\n{reviews[0]}")

Number of Reviews : 50000

First Original Review :

"Watching Time Chasers, it obvious that it was made by a bunch of friends. Maybe they were sitting around one day in film school and said, \"Hey, let's pool our money together and make a really bad movie!\" Or something like that. What ever they said, they still ended up making a really bad movie--dull story, bad script, lame acting, poor cinematography, bottom of the barrel stock music, etc. All corners were cut, except the one that would have prevented this film's release. Life's like that."



In [4]:
tokenizer = TreebankWordTokenizer()
stop_words = stopwords.words('english')
remove_words = string.punctuation + '0123456789'

In [5]:
## Clean the data 
def process_review(review, verbose = False):
    
    if verbose:
        print(f"Original Review :\n\n{review}")
        
    ## Convert to string
    review = eval(review)
    
    ## Remove html tags
    clean_review = BeautifulSoup(review).get_text()
    
    ## Remove everything except words
    clean_review = re.sub('^\w+','',review)
    
    ## Convert to lower case and remove stopwords
    clean_review = clean_review.lower().strip()
    
    ## Convert to tokens 
    tokens = tokenizer.tokenize(clean_review)
    
    ## Filter tokens
    tokens = [w for w in tokens if w not in stop_words and w not in remove_words and w.isalpha()]
    
    if verbose:
        print(f"Processed Review :\n\n{' '.join(tokens)}")
        
    return tokens

In [6]:
tokens = process_review(reviews[0], True)

Original Review :

"Watching Time Chasers, it obvious that it was made by a bunch of friends. Maybe they were sitting around one day in film school and said, \"Hey, let's pool our money together and make a really bad movie!\" Or something like that. What ever they said, they still ended up making a really bad movie--dull story, bad script, lame acting, poor cinematography, bottom of the barrel stock music, etc. All corners were cut, except the one that would have prevented this film's release. Life's like that."

Processed Review :

time chasers obvious made bunch maybe sitting around one day film school said hey let pool money together make really bad movie something like ever said still ended making really bad movie dull story bad script lame acting poor cinematography bottom barrel stock music corners cut except one would prevented film life like


In [7]:
## Processed Rview
clean_reviews = list(map(process_review,reviews))
print(clean_reviews[0])

['time', 'chasers', 'obvious', 'made', 'bunch', 'maybe', 'sitting', 'around', 'one', 'day', 'film', 'school', 'said', 'hey', 'let', 'pool', 'money', 'together', 'make', 'really', 'bad', 'movie', 'something', 'like', 'ever', 'said', 'still', 'ended', 'making', 'really', 'bad', 'movie', 'dull', 'story', 'bad', 'script', 'lame', 'acting', 'poor', 'cinematography', 'bottom', 'barrel', 'stock', 'music', 'corners', 'cut', 'except', 'one', 'would', 'prevented', 'film', 'life', 'like']


In [8]:
## Save the data
## Instaed of wrapping str we can use dump function
with open(args.processed_file_path,'w') as f:
    for line in clean_reviews:
        f.write(str(line))
        f.write('\n')