In [15]:
# Imports
import csv
import json  # Strict parser for the line-delimited review file
import math  # Finite-number check for numeric tokens

import ijson  # Parser for large json files
import nltk  # Import nltk module
import numpy as np
import pandas as pd
from nltk.corpus import stopwords  # Stop word dictionary
from nltk.stem import PorterStemmer  # Stems words
from nltk.tokenize import word_tokenize  # Tokenizer
from num2words import num2words  # Converting numbers to word form

# Download stopwords dictionary. The preprocessing function loads it with
# stopwords.words('english'); when the package is already present locally,
# nltk just reports it as up-to-date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/eitan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Relative path to the review dump. The export cell reads this file line by
# line and treats every line as one standalone JSON object.
file_name = "yelp_academic_dataset_review.json"

# Example of a json object in the review file:
# {"review_id":"btHrA_nXUceLqZRvymvXng","user_id":"amaOELCfgLup2MwE2j8PfA","business_id":"_v3DcLatG70adfYzWTd-CQ","stars":5.0,"useful":2,"funny":2,"cool":1,"text":"I love this store!","date":"2015-03-18 21:09:07"}

In [17]:
# Word cleaning/preprocessing function

# Resources shared by every call to text_preprocess. They are built lazily on
# first use: loading the stop-word list reads the corpus from disk, so doing it
# once instead of once per review matters when processing a large review file.
_STOP_WORDS = None
_STEMMER = None


def _preprocess_resources():
    '''Return the (stop-word set, stemmer) pair, creating each on first use.'''
    global _STOP_WORDS, _STEMMER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _STEMMER is None:
        _STEMMER = PorterStemmer()
    return _STOP_WORDS, _STEMMER


def text_preprocess(text):
    '''
    Function: Takes the text of a review and preprocesses it by:
    1. splitting it on whitespace and converting each token to lowercase
    2. removing stop words (is, it, a, but etc)
    3. converting finite numbers to their word form (1 -> one)
       NOTE: Unclear if this is necessary, but it was suggested online
    4. stemming every remaining token (programming -> program)

    Tokens are split on whitespace only, so punctuation stays attached to the
    word it touches (e.g. "it," is not recognised as the stop word "it").

    The token 'infinity' is kept verbatim. Other tokens that float() accepts
    but that are not finite numbers ('inf', 'nan', '-infinity', '1e999', ...)
    are treated as ordinary words and stemmed instead of being handed to
    num2words, which cannot spell them out.

    Input: review text (str); None or an empty string yields ''
    Output: preprocessed string, tokens joined by single spaces
    '''
    if not text:
        return ''

    stop_words, ps = _preprocess_resources()

    # Define array to store processed words:
    processed_words = []

    for word in text.split():
        # Convert word to lowercase
        word = word.lower()

        # Skip stop words entirely
        if word in stop_words:
            continue

        # Edge case: keep 'infinity' as-is, num2words does not handle it
        if word == 'infinity':
            processed_words.append(word)
            continue

        # float() also accepts 'inf'/'nan' spellings and overflows silently to
        # inf, so only finite values count as numbers here.
        try:
            is_number = math.isfinite(float(word))
        except ValueError:
            is_number = False

        if is_number:
            try:
                word = num2words(word)
            except (ValueError, ArithmeticError):
                # num2words could not spell this value out (e.g. out of its
                # supported range); fall back to treating it as a plain word.
                word = ps.stem(word)
        else:
            word = ps.stem(word)

        processed_words.append(word)

    # Join the processed words to form a clean text
    return ' '.join(processed_words)


In [18]:

# Stream the review file (one JSON object per line) and write one CSV row per
# review containing its star rating and its preprocessed text.
input_file = file_name  # Replace with your actual file path
output_file = 'processed_reviews.csv'

with open(input_file, 'r', encoding='utf-8') as json_file, \
        open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Header must match the columns actually written for each review below.
    # NOTE: The other review fields (review_id, user_id, business_id, date,
    # useful, funny, cool) are not used at this time. Only stars and text
    # remain (eitan)
    csv_writer.writerow(['stars', 'text'])

    # Parse each line in the JSON file as a separate JSON object
    for line_number, line in enumerate(json_file, start=1):
        line = line.strip()
        if not line:
            continue  # Tolerate blank lines (e.g. a trailing newline)

        # json.loads instead of eval: eval would execute arbitrary code found
        # in the data file and rejects JSON literals such as true/false/null.
        try:
            review = json.loads(line)
        except json.JSONDecodeError as exc:
            raise ValueError(
                f"{input_file}: invalid JSON on line {line_number}"
            ) from exc

        if not isinstance(review, dict):  # Ensure it's a JSON object
            continue

        stars = review['stars']
        processed_text = text_preprocess(review['text'])

        # Write the data to the CSV file
        csv_writer.writerow([stars, processed_text])


KeyboardInterrupt: 