In [1]:
import pandas as pd
import string

In [2]:
df = pd.read_csv('data/raw_reviews.csv')
df.drop(columns=['Unnamed: 0'], inplace = True)

In [5]:
reviews = df['Text'].tolist()

In [16]:
len(reviews)

2459

## Data Preparation

In [7]:
def cleanin_reviews(review):
    # Many reviews have \n characters through my scrape, so I need to replace those with empty spaces
    review = review.replace('\n\n', ' ')
    
    # Splitting on spaces to tokenize
    tokens = review.split()
    
    # Getting rid of punctuation because 'word.' will be considered different from 'word'
    table = str.maketrans('', '', string.punctuation)
    tokens = [word.translate(table) for word in tokens]
    
    # Simplifying my vocabulary to only include words and not smiley faces/other datatypes
    tokens = [word for word in tokens if word.isalpha()]
    
    tokens = [word.lower() for word in tokens]
    return tokens

In [8]:
def tokenizer(data):
    all_tokens = []

    for review in data:
        all_tokens.append(cleanin_reviews(review))
    
    return all_tokens

In [9]:
def flatten(tokens):
    flat_list = [item for sublist in tokens for item in sublist]
    return flat_list

In [10]:
tokenized_reviews = tokenizer(reviews)

In [11]:
# Save cleaned tokenized reviews
with open("data/clean_reviews.txt", "w") as out:
    for review in tokenized_reviews:
        out.write(" ".join(review) + "\n")

# Save cleaned tokenized reviews -> sampled
with open("data/clean_reviews_sample.txt", "w") as out:
    for review in tokenized_reviews[:100]:
        out.write(" ".join(review) + "\n")

In [12]:
flatten_reviews = flatten(tokenized_reviews)
print('Total Tokens: %d' % len(flatten_reviews))
print('Unique Tokens: %d' % len(set(flatten_reviews)))

Total Tokens: 670280
Unique Tokens: 37641


In [13]:
# Generate word sequence from data
def sequence_of_tokens(stars):
    length = 5 + 1
    sequences = list()
    for i in range(length, len(stars)):
        # select sequence of tokens
        seq = stars[i-length:i]
        # convert into a line
        line = ' '.join(seq)
        # store
        sequences.append(line)
    print('Total Sequences: %d' % len(sequences))
    return sequences

In [14]:
sequence_reviews = sequence_of_tokens(flatten_reviews)

Total Sequences: 670274


In [15]:
# Save sequence file
with open("data/sequence_reviews.txt", "w") as out:
    for review in sequence_reviews:
        out.write(review + "\n")