In [0]:
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [9]:
from google.colab import drive
# Mount Google Drive at /content/drive so the project files can be read and written from this Colab runtime.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [0]:
# Base Drive folder for this notebook's input and output files.
# File names are appended with plain string concatenation, so the trailing '/' is required.
root_path = '/content/drive/My Drive/DSCI 303 Final Project/Data_Trimming_and_TFIDF_Models/'

In [0]:
# Load the trimmed review dump into memory as one big dictionary.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on whatever default encoding the current platform happens to use.
with open(root_path + 'trimmed_review_500k.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [0]:
# Build the working DataFrame from the list of {review, stars} records stored
# under the 'Review:Rating' key of the loaded dictionary.
df = pd.DataFrame(data['Review:Rating'])

In [0]:
def convert_lower_case(data):
    """
    Input: str (or NumPy array of str)
    Output: NumPy string array with every character lower-cased
    First preprocessing step; delegates to np.char.lower.
    """
    lowered = np.char.lower(data)
    return lowered

In [14]:
import nltk
# Fetch the NLTK data packages this notebook relies on: 'stopwords' and 'punkt'.
# The second call is the cell's last expression, so its return value is displayed.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stop_words():
  """Return NLTK's English stop words as a frozenset, loaded once per kernel."""
  return frozenset(stopwords.words('english'))


def remove_stop_words(data):
  """
  Input: str (anything else is converted with str(), e.g. the NumPy string
         values produced by the np.char-based steps)
  Output: str
  Removes stop words like I, me, the, etc. and every token of length 1.
  For preprocessing the data.

  Each surviving token is prefixed with a single space, so a non-empty
  result starts with " " (this matches the original output format).
  """
  # The stop-word list is cached and held in a set: membership checks are
  # O(1) and the corpus is not reloaded for every single review.
  stop_words = _english_stop_words()
  kept = [w for w in word_tokenize(str(data))
          if w not in stop_words and len(w) > 1]
  return "".join(" " + w for w in kept)

In [0]:
def remove_punctuation(data):
  """
  Input: str (or NumPy string array)
  Output: NumPy string array (the result of np.char.replace); call str() on
          it to get a plain Python string
  Further preprocessing: every punctuation symbol and newline is replaced by
  a space, and commas are deleted outright.
  """
  # The backslash is written as "\\" so the literal is a valid escape
  # sequence; the set of characters is unchanged (it still contains "\").
  symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
  for symbol in symbols:
      data = np.char.replace(data, symbol, ' ')
      # Collapse doubled spaces created by the substitution above.
      data = np.char.replace(data, "  ", " ")
  # Commas are removed rather than replaced by a space.
  data = np.char.replace(data, ',', '')
  return data

In [0]:
def remove_apostrophe(data):
  """
  Input: str (or NumPy string array)
  Output: NumPy string array with every apostrophe deleted
  Further preprocessing; contractions are joined, e.g. don't -> dont.
  """
  apostrophe, replacement = "'", ""
  stripped = np.char.replace(data, apostrophe, replacement)
  return stripped

In [0]:
from nltk.stem import PorterStemmer
def stemming(data):
  """
  Input: str (anything else is converted with str())
  Output: str
  Reduces every token to its Porter stem, e.g. worked -> work.
  Lemmatization is not needed for TFIDF.

  Each stem is prefixed with a single space, so a non-empty result
  starts with " ".
  """
  porter = PorterStemmer()
  stems = [porter.stem(token) for token in word_tokenize(str(data))]
  return "".join(" " + stem for stem in stems)

In [19]:
!pip install num2words



In [0]:
from num2words import num2words
def convert_numbers(data):
    """
    Input: str (anything else is converted with str())
    Output: NumPy string array (the result of np.char.replace)
    Spells out integer tokens with num2words, e.g. 8 -> eight. Tokens that
    cannot be converted are kept unchanged. Hyphens in the final text are
    replaced by spaces.

    Each token is prefixed with a single space, so a non-empty result
    starts with " ".
    """
    pieces = []
    for token in word_tokenize(str(data)):
        try:
            token = num2words(int(token))
        except Exception:
            # Best effort: int() raises ValueError for non-numeric tokens,
            # and any value num2words rejects is likewise left untouched.
            # Catching Exception instead of a bare except lets
            # KeyboardInterrupt/SystemExit stop a long-running apply().
            pass
        pieces.append(token)
    new_text = "".join(" " + piece for piece in pieces)
    return np.char.replace(new_text, "-", " ")

**Pipeline:** the cells below preprocess the review text in our dataframe using the functions defined above.

In [0]:
# Convert each review to lowercase.
df['review'] = df['review'].apply(convert_lower_case)

In [0]:
# Remove punctuation from each review.
df['review'] = df['review'].apply(remove_punctuation)

In [0]:
# Remove apostrophes from each review.
df['review'] = df['review'].apply(remove_apostrophe)

In [0]:
# Remove stop words from each review.
# NOTE(review): apostrophes have already been stripped at this point, so
# contractions (e.g. "dont", "ill") may no longer match apostrophe-containing
# stop words - confirm this ordering is intended.
df['review'] = df['review'].apply(remove_stop_words)

In [0]:
# Convert numerics to their string (spelled-out) equivalents.
df['review'] = df['review'].apply(convert_numbers)

In [0]:
# Stem all the words of each review.
df['review'] = df['review'].apply(stemming)

In [0]:
# Second punctuation pass, in case punctuation was reintroduced.
df['review'] = df['review'].apply(remove_punctuation)

In [0]:
# Second number pass, in case more numbers were reintroduced.
df['review'] = df['review'].apply(convert_numbers)

In [0]:
# Second stemming pass, in case spelled-out numbers need to be stemmed.
df['review'] = df['review'].apply(stemming)

In [0]:
# Repeated because num2words does give some hyphens and commas.
df['review'] = df['review'].apply(remove_punctuation)

In [0]:
# Repeated because num2words does give stop words.
df['review'] = df['review'].apply(remove_stop_words)

In [32]:
# Spot-check the first six preprocessed reviews together with their star ratings.
df.head(6)

Unnamed: 0,review,stars
0,total bill horribl servic 8g crook actual ner...,1.0
1,ador travi hard rock new kelli cardena salon ...,5.0
2,say offic realli togeth organ friendli dr phi...,5.0
3,went lunch steak sandwich delici caesar salad...,5.0
4,today second three session paid although firs...,1.0
5,ill first admit excit go la tavolta food snob...,4.0


In [0]:
# Serialise the preprocessed DataFrame to a JSON string holding one object per row.
# Note that this rebinds `data`, which previously held the raw loaded dictionary.
data = df.to_json(orient='records')

In [0]:
# Persist the preprocessed reviews next to the input data.
# NOTE(review): `data` is already a JSON string (produced by df.to_json), so
# encoding it again stores one JSON *string literal* in the file; readers must
# decode twice (json.load, then json.loads / pd.read_json). Confirm what the
# downstream notebooks expect before changing this format.
with open(root_path + "preprocessed_500k_reviews.json", mode='w') as outfile:
    outfile.write(json.dumps(data))