# In [None]:
import os
import re
import pandas as pd
import nltk
import joblib
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout, Masking
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adamax

# ============================
# Configuration & Setup
# ============================

# NOTE: TensorFlow's MLIR graph optimizer is deliberately not enabled here.
# `tf.config.experimental.enable_mlir_graph_optimization` is a function, so
# assigning `True` to it never switched anything on and only overwrote the API
# attribute. TensorFlow documents the feature as development/testing only, so
# it is not called either.

# Global Keras dtype policy: layers compute in float16 and keep their
# variables in float32 unless a layer overrides its own dtype.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Directory scanned for input CSV files.
folder_path = "/Users/darshan__6122__/data_cleaning_project/Parth/original_data/"
# Artefacts written relative to the current working directory.
log_file = "rnn_model_log.csv"
rnn_model_path = "rnn_model.h5"
tokenizer_path = "tokenizer.pkl"

# Fetch the NLTK corpora backing the stop-word list and the lemmatizer below.
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# ============================
# Utility Functions
# ============================

def clean_text(text):
    """Return ``text`` with links removed, odd characters blanked, lowercased.

    Anything starting with ``http`` or ``www.`` is dropped up to the next
    whitespace. Every character other than ASCII letters, digits and the
    punctuation ``' , . ! ?`` is then replaced by a single space (runs of
    spaces are not collapsed). The result is lowercased and stripped of
    leading/trailing whitespace.
    """
    without_links = re.sub(r"http\S+|www\.\S+", "", text)
    only_allowed = re.sub(r"[^a-zA-Z0-9',.!?]", " ", without_links)
    lowered = only_allowed.lower()
    return lowered.strip()

def build_rnn_model(vocab_size, max_length):
    """Build and compile the SimpleRNN binary classifier.

    Layers do not pin their own dtype, so they follow the global Keras dtype
    policy that is active when the model is built. Only the output layer is
    forced to float32.

    Args:
        vocab_size: Number of token ids covered by the embedding table.
        max_length: Length of the padded input sequences.

    Returns:
        A compiled ``Sequential`` model ending in one sigmoid unit, using
        binary cross-entropy loss, the Adamax optimizer and accuracy metric.
    """
    model = Sequential([
        # mask_zero=True marks token id 0 (the padding value) as a masked
        # timestep. A Masking layer placed after the embedding would compare
        # the embedded vectors with 0 instead of the token ids, so padding
        # would not be recognised.
        Embedding(vocab_size, 32, input_length=max_length, mask_zero=True),
        SimpleRNN(64, activation="relu"),
        Dropout(0.3),
        Dense(32, activation="relu"),
        # Keep the sigmoid output in float32 for numerical stability when a
        # float16 compute policy is in effect.
        Dense(1, activation="sigmoid", dtype="float32"),
    ])
    model.compile(
        loss="binary_crossentropy",
        optimizer=Adamax(learning_rate=0.0001),
        metrics=["accuracy"],
    )
    return model

# ============================
# Main Processing Loop
# ============================

# Tokenization / training hyperparameters are the same for every dataset, so
# they are defined once instead of on each loop iteration.
vocab_size = 5000
max_length = 80
batch_size = 64

# Process every CSV in the input folder in name order. The model file is
# shared across files, so each dataset continues training the same model.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".csv"):
        continue

    data_path = os.path.join(folder_path, filename)
    print(f"\nProcessing file: {filename}...")

    # Load and preprocess dataset (files are read as header-less
    # polarity/title/text rows; only polarity and text are kept).
    data = pd.read_csv(
        data_path,
        names=["polarity", "title", "text"],
        dtype={"polarity": "int8", "title": "string", "text": "string"},
        usecols=["polarity", "text"],
        low_memory=False
    )

    # Remap labels 1/2 to 0/1 so they match the sigmoid output.
    data["polarity"] = data["polarity"].map({1: 0, 2: 1}).astype("int8")
    num_positive = (data["polarity"] == 1).sum()
    num_negative = (data["polarity"] == 0).sum()

    data["text"] = data["text"].astype(str).map(clean_text)

    X_train, X_test, y_train, y_test = train_test_split(
        data["text"], data["polarity"], test_size=0.2, random_state=42
    )

    # Tokenization and Padding
    # The saved model's embedding rows are tied to the word index of the
    # tokenizer it was trained with. When that model is about to be resumed,
    # reuse its tokenizer instead of fitting a fresh one; otherwise the same
    # token id would stand for different words from one file to the next.
    resume_training = os.path.exists(rnn_model_path)
    if resume_training and os.path.exists(tokenizer_path):
        print("Loading existing tokenizer...")
        tokenizer = joblib.load(tokenizer_path)
    else:
        if resume_training:
            print("Warning: saved model found without its tokenizer; fitting a new tokenizer.")
        tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
        tokenizer.fit_on_texts(X_train)
        joblib.dump(tokenizer, tokenizer_path)

    X_train_padded = pad_sequences(
        tokenizer.texts_to_sequences(X_train),
        maxlen=max_length, padding="post", truncating="post",
    )
    X_test_padded = pad_sequences(
        tokenizer.texts_to_sequences(X_test),
        maxlen=max_length, padding="post", truncating="post",
    )

    # Train or load the RNN model
    if resume_training:
        print("Loading existing RNN model...")
        rnn_model = load_model(rnn_model_path)
        rnn_model.compile(loss="binary_crossentropy", optimizer=Adamax(learning_rate=0.0001), metrics=["accuracy"])
        rnn_model.fit(X_train_padded, y_train, epochs=3, batch_size=batch_size, validation_data=(X_test_padded, y_test), verbose=1)
    else:
        print("Training new RNN model...")
        rnn_model = build_rnn_model(vocab_size, max_length)
        rnn_model.fit(X_train_padded, y_train, epochs=5, batch_size=batch_size, validation_data=(X_test_padded, y_test), verbose=1)

    rnn_model.save(rnn_model_path)

    # Evaluate and log performance
    print("Evaluating model performance...")
    probabilities = rnn_model.predict(X_test_padded, batch_size=batch_size)
    # Threshold at 0.5 and flatten so the sklearn metrics receive 1-D labels
    # rather than a column vector.
    y_pred = (probabilities > 0.5).astype(int).ravel()

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Accuracy : {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall   : {recall:.4f}")
    print(f"F1-score : {f1:.4f}")

    log_entry = pd.DataFrame([{
        "Model": "RNN",
        "Dataset": filename,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-score": f1,
        "Positive Reviews": num_positive,
        "Negative Reviews": num_negative
    }])

    # Append to the CSV log; write the header only when the file is new.
    log_entry.to_csv(log_file, mode='a', header=not os.path.exists(log_file), index=False)
    print(f"Performance logged in '{log_file}'")

print("\nAll datasets processed successfully.")
