In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tqdm import tqdm
import json 
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import make_pipeline
import random

from transformers import BertTokenizer
# clean_text() relies on two NLTK resources: the English stop-word list and the
# punkt tokenizer models. Fetch the stop words as well so the notebook runs from
# a fresh environment; the punkt call stays last so the cell output is unchanged.
nltk.download('stopwords', quiet=True)
nltk.download('punkt_tab')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to /Users/chi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Data Cleaning

In [19]:
# Populated on first use. Loading the stop-word corpus from disk and creating a
# stemmer for every single review is pure overhead when the function is applied
# to hundreds of thousands of rows.
_CLEANING_RESOURCES = {}


def _get_cleaning_resources():
    """Return the shared ``(stop_words, stemmer)`` pair, building it on first call."""
    if not _CLEANING_RESOURCES:
        _CLEANING_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEANING_RESOURCES['stemmer'] = PorterStemmer()
    return _CLEANING_RESOURCES['stop_words'], _CLEANING_RESOURCES['stemmer']


def clean_text(text):
    """Normalize a raw review into a space-separated string of stemmed tokens.

    Steps, in order:
      1. strip tokens starting with ``http`` or ``www`` and tokens containing ``@``;
      2. delete every character that is not an ASCII letter or whitespace;
      3. collapse whitespace runs and lowercase;
      4. tokenize with ``nltk.word_tokenize``;
      5. drop English stop words and Porter-stem what is left.

    Note that step 2 *deletes* characters instead of replacing them with a
    space, so ``"don't"`` becomes ``"dont"`` and ``"good,bad"`` becomes
    ``"goodbad"``.

    Args:
        text: The raw review text. Anything that is not a ``str`` (e.g. the
            ``None``/``NaN`` pandas uses for missing values) is treated as empty.

    Returns:
        The cleaned text; an empty string when nothing survives cleaning.
    """
    if not isinstance(text, str):
        # Missing values would otherwise raise TypeError inside re.sub.
        return ''

    stop_words, stemmer = _get_cleaning_resources()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)

    # Remove emails (any token containing '@', plus one trailing whitespace char)
    text = re.sub(r'\S*@\S*\s?', '', text)

    # Remove everything except ASCII letters and whitespace (digits, punctuation, ...)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    # Lowercase before the stop-word lookup, since the stop-word list is lowercase
    text = text.lower()

    # Tokenize text
    tokens = nltk.word_tokenize(text)

    # Remove stopwords and apply stemming
    filtered_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]

    # Re-create text from filtered tokens
    return ' '.join(filtered_tokens)

In [2]:
# Input and output locations used throughout the notebook.
# Adjust them to match your own directory layout.
file_path = 'data/IMDB_reviews.json'                  # raw reviews that get loaded
cleaned_file_path = "data/preprocessed.json"          # where the cleaned frame is saved
sampled_file_path = "data/sampled_preprocessed.json"  # where the sampled subset is saved

In [None]:
# Load the raw reviews from `file_path` into a DataFrame

# Parse the raw file one line at a time: each non-empty line is a JSON object.
# The encoding is given explicitly so the result does not depend on the
# platform's default, and whitespace-only lines are skipped instead of making
# json.loads raise.
with open(file_path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# One row per parsed record.
df = pd.DataFrame(data)

In [21]:
# Enable tqdm's pandas integration so `progress_apply` is available;
# `desc` is the label shown next to the progress bar.
tqdm.pandas(desc="Cleaning Text")

# Run clean_text on every review and store the result in a new column,
# leaving the original 'review_text' column untouched.
df['cleaned_review_text'] = df['review_text'].progress_apply(clean_text)

Cleaning Text: 100%|██████████| 573913/573913 [11:31<00:00, 829.70it/s] 


In [None]:

# Persist the cleaned frame as JSON Lines: one record (row) per line.
df.to_json(path_or_buf=cleaned_file_path, orient='records', lines=True)

# Confirm where the file was written.
print(f"DataFrame saved successfully to {cleaned_file_path}.")

DataFrame saved successfully to data/preprocessed.json.


In [4]:
# Sample roughly 1/20 of the cleaned data to speed up the rest of the workflow.
# Each record is kept independently with probability `sampling_ratio`, so the
# sample size is only approximately 5% of the input.

sampling_ratio = 0.05  # 5% of the data
sampling_seed = 42     # fixed so a re-run selects exactly the same records

# A dedicated generator makes the draw reproducible without altering the global
# `random` state that other cells may use.
sampling_rng = random.Random(sampling_seed)

sampled_data = []
with open(cleaned_file_path, "r", encoding="utf-8") as f:
    for line in f:  # stream line by line; the full file is never held in memory
        if sampling_rng.random() < sampling_ratio:
            sampled_data.append(json.loads(line))

# Unlike the input (one JSON object per line), the sample is written as a single
# indented JSON array.
with open(sampled_file_path, "w", encoding="utf-8") as f:
    json.dump(sampled_data, f, indent = 1)

print(f"Sampled {len(sampled_data)} records and saved to {sampled_file_path}.")

Sampled 28583 records and saved to data/sampled_preprocessed.json.


## Data Preprocessing