In [None]:
import os
import re
import pandas as pd
import nltk
import joblib
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adamax

# ============================
# Configuration & Setup
# ============================

# Run every tf.function eagerly (op by op) rather than as a traced graph.
# NOTE(review): this is usually a debugging aid and tends to slow training —
# confirm it is still required.
tf.config.run_functions_eagerly(True)

# Output log for the evaluation rows, and the input dataset to read.
log_file = "model_comparison_log.csv"
data_path = "/Users/darshan__6122__/data_cleaning_project/Parth/AmazonReviews/train.csv"

# Stop immediately when the dataset is absent, before any loading work begins.
if os.path.exists(data_path):
    print("Dataset located. Loading...")
else:
    raise FileNotFoundError(f"Dataset not found at {data_path}.")

# ============================
# Load and Preprocess Dataset
# ============================

# Column names are supplied explicitly; "title" is named so the columns line
# up, but only "polarity" and "text" are actually loaded.
data = pd.read_csv(
    data_path,
    low_memory=False,
    usecols=["polarity", "text"],
    names=["polarity", "title", "text"],
    dtype={"polarity": "int8", "title": "string", "text": "string"},
)

# Recode the labels 1 -> 0 and 2 -> 1 so they can serve as binary targets.
polarity_codes = {1: 0, 2: 1}
data["polarity"] = data["polarity"].map(polarity_codes).astype("int8")

# Fetch the NLTK corpora needed for the stop-word list and the lemmatizer.
for corpus_name in ("stopwords", "wordnet"):
    nltk.download(corpus_name, quiet=True)

# NOTE(review): neither object is referenced by the cleaning step visible in
# this file — confirm whether they are still needed.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Compiled once at import time because clean_text is mapped over every review.
# URL schemes and host names are case-insensitive, so "HTTP://..." and
# "WWW...." must be matched too; otherwise they would survive until the
# lowercasing step and leak link fragments into the vocabulary.
_URL_PATTERN = re.compile(r"http\S+|www\.\S+", flags=re.IGNORECASE)
_DISALLOWED_CHARS = re.compile(r"[^a-zA-Z0-9',.!?]")


def clean_text(text: str) -> str:
    """Normalize one review for vectorization.

    Processing order:
      1. Remove URLs (anything starting with ``http`` or ``www.``, in any
         letter case, up to the next whitespace).
      2. Replace every character other than ASCII letters, digits and the
         punctuation ``' , . ! ?`` with a single space (one space per
         replaced character; runs of spaces are not collapsed).
      3. Lowercase the result.

    Args:
        text: Raw review text. Must be a ``str``.

    Returns:
        The cleaned, lowercased text.

    Raises:
        TypeError: If ``text`` is not a string (raised by ``re``).
    """
    text = _URL_PATTERN.sub("", text)
    text = _DISALLOWED_CHARS.sub(" ", text)
    return text.lower()

# The "text" column is read with the nullable "string" dtype, so a missing
# review is pd.NA. Casting that with astype(str) yields the literal "<NA>",
# which clean_text would reduce to the spurious token "na". Replace missing
# values with an empty string first so they contribute no tokens at all.
data["text"] = data["text"].fillna("").astype(str).map(clean_text)

# ============================
# Feature Extraction
# ============================

# Size limits used by the sequence tokenizer, the padding step and the
# embedding layers of the neural models.
vocab_size = 5000
max_length = 80

# Hold out 20% of the reviews; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["polarity"],
    test_size=0.2,
    random_state=42,
)

# Bag-of-words view (for Logistic Regression): the vocabulary is learned on
# the training split only and then applied to the test split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Sequence view (for the neural models): the word index is learned on the
# training split only; unknown words map to the "<OOV>" token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)


def _encode_and_pad(texts):
    """Turn texts into integer sequences padded at the end to max_length."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_length, padding="post")


X_train_padded = _encode_and_pad(X_train)
X_test_padded = _encode_and_pad(X_test)

# ============================
# Model Training & Loading
# ============================

# Registry of fitted models, keyed by the display name used in the report.
models = {}

# Logistic Regression
lr_model_path = "logistic_regression.pkl"
lr_needs_saving = not os.path.exists(lr_model_path)
if lr_needs_saving:
    print("Training Logistic Regression model...")
    lr_model = LogisticRegression()
else:
    print("Loading Logistic Regression model...")
    # joblib.load unpickles the file; only load pickles from a trusted source.
    lr_model = joblib.load(lr_model_path)

# The estimator is fitted on the current TF-IDF features in both cases.
# NOTE(review): unless the stored estimator was created with warm_start=True,
# fit() presumably starts from scratch and discards the loaded coefficients —
# confirm whether loading is meant to skip this step.
lr_model.fit(X_train_tfidf, y_train)

# Only a model created in this run is written to disk; a model that was
# loaded and refitted is not saved back.
if lr_needs_saving:
    joblib.dump(lr_model, lr_model_path)
models["Logistic Regression"] = lr_model

# LSTM
lstm_model_path = "lstm_model.h5"
lstm_loaded_from_disk = os.path.exists(lstm_model_path)
if lstm_loaded_from_disk:
    print("Loading LSTM model...")
    lstm_model = load_model(lstm_model_path)
    lstm_epochs = 3  # shorter run on top of the stored weights
else:
    print("Training LSTM model...")
    lstm_layers = [
        Embedding(vocab_size, 32, input_length=max_length),
        Bidirectional(LSTM(32)),
        Dropout(0.3),
        Dense(32, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
    lstm_model = Sequential(lstm_layers)
    lstm_epochs = 5  # full run for a freshly built network

# Both paths compile with the same loss/optimizer and then train; the test
# split is passed as validation data for per-epoch monitoring.
lstm_model.compile(
    loss="binary_crossentropy",
    optimizer=Adamax(learning_rate=0.0001),
    metrics=["accuracy"],
)
lstm_model.fit(
    X_train_padded,
    y_train,
    epochs=lstm_epochs,
    batch_size=32,
    validation_data=(X_test_padded, y_test),
    verbose=1,
)

# Only a freshly built network is saved.
# NOTE(review): weights obtained by continuing training on a loaded model are
# not written back to disk — confirm that this is intended.
if not lstm_loaded_from_disk:
    lstm_model.save(lstm_model_path)
models["LSTM"] = lstm_model

# CNN+LSTM
cnn_lstm_model_path = "cnn_lstm_model.h5"
cnn_lstm_loaded_from_disk = os.path.exists(cnn_lstm_model_path)
if cnn_lstm_loaded_from_disk:
    print("Loading CNN+LSTM model...")
    cnn_lstm_model = load_model(cnn_lstm_model_path)
    cnn_lstm_epochs = 3  # shorter run on top of the stored weights
else:
    print("Training CNN+LSTM model...")
    cnn_lstm_layers = [
        Embedding(vocab_size, 32, input_length=max_length),
        Conv1D(64, kernel_size=3, activation="relu"),
        MaxPooling1D(pool_size=2),
        Bidirectional(LSTM(32)),
        Dropout(0.3),
        Dense(32, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
    cnn_lstm_model = Sequential(cnn_lstm_layers)
    cnn_lstm_epochs = 5  # full run for a freshly built network

# Both paths compile with the same loss/optimizer and then train; the test
# split is passed as validation data for per-epoch monitoring.
cnn_lstm_model.compile(
    loss="binary_crossentropy",
    optimizer=Adamax(learning_rate=0.0001),
    metrics=["accuracy"],
)
cnn_lstm_model.fit(
    X_train_padded,
    y_train,
    epochs=cnn_lstm_epochs,
    batch_size=32,
    validation_data=(X_test_padded, y_test),
    verbose=1,
)

# Only a freshly built network is saved.
# NOTE(review): weights obtained by continuing training on a loaded model are
# not written back to disk — confirm that this is intended.
if not cnn_lstm_loaded_from_disk:
    cnn_lstm_model.save(cnn_lstm_model_path)
models["CNN+LSTM"] = cnn_lstm_model

# ============================
# Evaluation and Logging
# ============================

# One row per model; each row is tagged with the dataset file name.
dataset_name = os.path.basename(data_path)
results = []

for name, model in models.items():
    print(f"\nEvaluating {name}...")

    # The neural models consume padded sequences and their outputs are
    # thresholded at 0.5; Logistic Regression consumes TF-IDF features and
    # returns class labels directly.
    if name != "Logistic Regression":
        y_pred = (model.predict(X_test_padded) > 0.5).astype(int)
    else:
        y_pred = model.predict(X_test_tfidf)

    accuracy, precision, recall, f1 = (
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
    )

    print(f"{name} Results — Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

    row = {"Model": name, "Dataset": dataset_name}
    row["Accuracy"] = accuracy
    row["Precision"] = precision
    row["Recall"] = recall
    row["F1-score"] = f1
    results.append(row)

# Append to the running log; the header row is written only when the log file
# does not exist yet.
df_results = pd.DataFrame(results)
write_header = not os.path.exists(log_file)
df_results.to_csv(log_file, mode='a', header=write_header, index=False)

print(f"\nModel comparison results saved to '{log_file}'.")
