# Preprocessing the Amazon Reviews Dataset

In [1]:
# Mount Google Drive at /content/drive so the dataset files stored there can be
# read and written from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import gzip
import json
import os
import string
import nltk

In [3]:
# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer models
# (needed by nltk.word_tokenize), the stopword lists, and the WordNet corpus
# (needed by the WordNet lemmatizer).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def remove_punctuation(text):
  """Return `text` with every character found in `string.punctuation` removed.

  All other characters (letters, digits, whitespace, newlines) are kept in
  their original order. Removed characters are dropped outright, not replaced
  by a space, so "don't" becomes "dont".
  """
  kept_chars = []
  for char in text:
    if char in string.punctuation:
      continue
    kept_chars.append(char)
  return "".join(kept_chars)

# Remove stopwords
# Only the first 63 entries of NLTK's English stopword list are used; the
# intent is to leave negative modifiers in the text instead of filtering
# them out (presumably because they matter for sentiment).
# NOTE(review): the cutoff 63 depends on the ordering of the installed NLTK
# stopword list -- confirm it still falls before the first negation word.
stopwords = nltk.corpus.stopwords.words('english')[:63]

def remove_stopwords(text):
  """Return the items of `text` that are not in the module-level `stopwords` list.

  `text` is iterated item by item (process_text passes a list of word tokens);
  the surviving items are returned as a new list in their original order.
  """
  kept_tokens = []
  for token in text:
    if token not in stopwords:
      kept_tokens.append(token)
  return kept_tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def process_text(rating, text):
  """Turn one review into a (tokens, sentiment) pair.

  Args:
    rating: Numeric star rating of the review.
    text: Raw review text.

  Returns:
    A tuple (lemmatized_tokens, sentiment). `sentiment` is 0 when rating < 3,
    1 when 3 <= rating < 4, and 2 otherwise. `lemmatized_tokens` is the list of
    WordNet-lemmatized word tokens left after punctuation removal, lowercasing
    and stopword filtering; it may be empty.
  """
  # Bucket the star rating into three sentiment classes.
  sentiment = 0 if rating < 3 else (1 if rating < 4 else 2)

  # Normalize the raw text: strip punctuation, lowercase, flatten newlines.
  # NOTE(review): punctuation (including apostrophes) is stripped before the
  # stopword filter runs, so stopword entries containing an apostrophe
  # presumably can never match -- confirm this is intended.
  cleaned = remove_punctuation(text)
  cleaned = cleaned.lower()
  cleaned = cleaned.replace('\n', ' ')

  # Tokenize, then drop stopwords.
  words = remove_stopwords(nltk.word_tokenize(cleaned))

  # Reduce every remaining token to its WordNet lemma.
  lemmatizer = nltk.stem.WordNetLemmatizer()
  lemmas = []
  for word in words:
    lemmas.append(lemmatizer.lemmatize(word))

  return (lemmas, sentiment)


Next, we define the Google Drive paths of the raw input file and the cleaned output file.

In [5]:
# Google Drive project folder that holds both the raw and the cleaned dataset.
folder = '/content/drive/MyDrive/CS 171/Final Project'
# Gzip-compressed input file read by the preprocessing loop.
filename = 'Office_Products_5.json.gz'
# Output file the cleaned reviews are written to.
cleaned_filename = 'Office_Products_Clean.json'

With all utility functions defined, we can now preprocess the data.

In [6]:
# Build the cleaned dataset: a list of (tokens, sentiment) pairs, one per usable review.
dataset = []

# The input holds one JSON object per line. gzip.open in 'r' mode yields bytes,
# which json.loads accepts directly.
with gzip.open(os.path.join(folder, filename), 'r') as infile:
  # Iterate over the file lazily instead of loading every line with readlines().
  for line in infile:
    review = json.loads(line)

    try:
      rating = review['overall']
      review_text = review['reviewText']
    except KeyError:
      # Skip reviews that lack a rating or review text. `continue` is required
      # here: falling through would re-append the previous review's
      # (text, sentiment), or raise NameError on the very first record.
      continue

    text, sentiment = process_text(rating, review_text)

    # Drop reviews that have no tokens left after cleaning.
    if len(text) != 0:
      dataset.append((text, sentiment))

Finally, we export the cleaned data as a JSON Lines file (one JSON object per line).

In [7]:
# Write the cleaned dataset to disk, one JSON object per line.
with open(os.path.join(folder, cleaned_filename), 'w') as outfile:
  for text, sentiment in dataset:
    # Serialize the record and terminate it with a newline in a single write.
    outfile.write(json.dumps({'text': text, 'sentiment': sentiment}) + '\n')