In [27]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tqdm import tqdm
import json 
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import make_pipeline

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /Users/chi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Data Cleaning

In [19]:
def clean_text(text):
    stop_words = set(stopwords.words('english'))

    # Initialize the Porter Stemmer
    stemmer = PorterStemmer()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)
    
    # Remove emails
    text = re.sub(r'\S*@\S*\s?', '', text)
    
    # Remove all non-word characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Normalize whitespaces
    text = re.sub(r'\s+', ' ', text)
    
    # Convert text to lowercase
    text = text.lower()
    
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords and apply stemming
    filtered_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    
    # Re-create text from filtered tokens
    text = ' '.join(filtered_tokens)
    return text

In [20]:
file_path = "data/IMDB_reviews.json"

data = []
with open(file_path, 'r') as file:
    for line in file:
        data.append(json.loads(line))

df = pd.DataFrame(data)

In [21]:
tqdm.pandas(desc="Cleaning Text")

# Apply the cleaning function with a progress bar
df['cleaned_review_text'] = df['review_text'].progress_apply(clean_text)

Cleaning Text: 100%|██████████| 573913/573913 [11:31<00:00, 829.70it/s] 


In [22]:
json_file_path = 'data/preprocessed.json'  # Change this to your desired file path

# Save the DataFrame to a JSON file
df.to_json(json_file_path, orient='records', lines=True)

print(f"DataFrame saved successfully to {json_file_path}.")

DataFrame saved successfully to data/preprocessed.json.


## Data Preprocessing

In [32]:
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['is_spoiler'])

# Split the data before applying SMOTE
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Calculate original class distribution
original_class_distribution = train_df['label'].value_counts()



# Vectorization and SMOTE
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(train_df['review_text'])  # Convert text to TF-IDF
y_train = train_df['label']



smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_tfidf, y_train)

# Check the resampled data
print("Resampled dataset shape:", X_resampled.shape)

train_df_resampled = pd.DataFrame(X_resampled.todense(), columns=vectorizer.get_feature_names_out())
train_df_resampled['label'] = y_resampled

# 5. Re-split the temporary set into validation and test sets (50% each)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

print("Train (Resampled) Shape:", train_df_resampled.shape)
print("Validation Shape:", val_df.shape)
print("Test Shape:", test_df.shape)
# Tokenization
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Resampled dataset shape: (676782, 256335)


: 