In [None]:
import os
import re
import pandas as pd
import nltk
import joblib
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ============================
# Setup and Configuration
# ============================

# Directory scanned for the input ".csv" files. This is a machine-specific
# absolute path; it must be adjusted when the script runs elsewhere.
folder_path = "/Users/darshan__6122__/data_cleaning_project/Parth/AmazonReviews/"
# CSV file to which one row of evaluation metrics is appended per dataset.
log_file = "gradient_boosting_log.csv"
# Locations of the persisted model and TF-IDF vectorizer. All three file names
# are relative, so they resolve against the current working directory.
gb_model_path = "gradient_boosting.pkl"
vectorizer_path = "vectorizer.pkl"

# Fetch the NLTK resources used to build the two objects below; quiet=True
# suppresses the downloader's progress output.
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)

# NOTE(review): neither `stop_words` nor `lemmatizer` is referenced by the
# code visible in this file (clean_text does not use them) - confirm whether
# stop-word removal / lemmatization was meant to be part of preprocessing.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# ============================
# Text Preprocessing
# ============================

# Patterns are compiled once at import time instead of on every call.
_URL_PATTERN = re.compile(r"http\S+|www\.\S+")
_DISALLOWED_CHARS = re.compile(r"[^a-zA-Z0-9',.!?]")


def clean_text(text):
    """Normalize a raw review string for vectorization.

    Steps, in order:
      1. Remove URL-like tokens (anything starting with "http" or "www.").
      2. Replace every character that is not an ASCII letter, a digit or
         one of ' , . ! ? with a single space.
      3. Lower-case the result and trim leading/trailing whitespace.

    Interior runs of spaces are left as they are.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    without_urls = _URL_PATTERN.sub("", text)
    ascii_only = _DISALLOWED_CHARS.sub(" ", without_urls)
    lowered = ascii_only.lower()
    return lowered.strip()

def extract_number(filename):
    """Return the first run of digits in *filename* as an integer sort key.

    Args:
        filename: File name to inspect, e.g. "output_12.csv".

    Returns:
        The first group of consecutive digits converted to ``int``, or
        ``float("inf")`` when the name contains no digit, so that such
        names sort after every numbered one.
    """
    found = re.search(r"(\d+)", filename)
    if found is None:
        return float("inf")
    return int(found.group(1))

# ============================
# Process Dataset Files
# ============================

# Visit the CSV shards in numeric order of the first number in their file
# name (see extract_number); names without any digit are processed last.
csv_files = sorted(
    (f for f in os.listdir(folder_path) if f.endswith(".csv")),
    key=extract_number,
)

# Raw polarity labels in the files (1 = negative, 2 = positive) mapped to the
# binary targets used for training.
label_map = {1: 0, 2: 1}

# Number of boosting stages fitted on each file. The first file trains this
# many trees; every later file adds this many on top of the saved ensemble.
trees_per_file = 100

if not csv_files:
    print(f"No CSV files found in '{folder_path}'.")

for filename in csv_files:
    data_path = os.path.join(folder_path, filename)
    print(f"\nProcessing file: {filename}...")

    # Load Dataset: the files have no header row; only the label and the
    # review body are kept, the title column is skipped.
    data = pd.read_csv(
        data_path,
        names=["polarity", "title", "text"],
        dtype={"polarity": "int8", "title": "string", "text": "string"},
        usecols=["polarity", "text"],
        low_memory=False
    )

    # Convert labels to binary. Values outside label_map become NaN, which
    # would make the int8 cast fail, so those rows are dropped explicitly.
    labels = data["polarity"].map(label_map)
    unknown = labels.isna()
    if unknown.any():
        print(f"Skipping {int(unknown.sum())} rows with unexpected polarity labels.")
        data = data.loc[~unknown].copy()
        labels = labels.loc[~unknown]
    data["polarity"] = labels.astype("int8")

    num_positive = (data["polarity"] == 1).sum()
    num_negative = (data["polarity"] == 0).sum()

    # A classifier cannot be fitted or scored without both classes present
    # (this also covers an empty file).
    if data["polarity"].nunique() < 2:
        print(f"Skipping '{filename}': it does not contain both classes.")
        continue

    # Missing reviews are turned into empty strings first; otherwise
    # astype(str) would inject the literal token "<NA>" into the corpus.
    data["text"] = data["text"].fillna("").astype(str).map(clean_text)

    X_train, X_test, y_train, y_test = train_test_split(
        data["text"], data["polarity"], test_size=0.2, random_state=42
    )

    # TF-IDF Vectorization: the vocabulary is learned once (on the first file
    # processed) and reused afterwards so that the feature space stays the
    # same for every file the model sees.
    if os.path.exists(vectorizer_path):
        print("Loading pre-trained TF-IDF vectorizer...")
        vectorizer = joblib.load(vectorizer_path)
    else:
        print("Training new TF-IDF vectorizer...")
        vectorizer = TfidfVectorizer(max_features=5000)
        vectorizer.fit(X_train)
        joblib.dump(vectorizer, vectorizer_path)

    X_train_tfidf = vectorizer.transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    n_features = X_train_tfidf.shape[1]

    # Train or Load Gradient Boosting Model
    gb_model = None
    if os.path.exists(gb_model_path):
        print("Loading existing Gradient Boosting model...")
        # NOTE: joblib.load unpickles arbitrary objects; only load model files
        # produced by this script.
        saved_model = joblib.load(gb_model_path)
        if getattr(saved_model, "n_features_in_", n_features) == n_features:
            gb_model = saved_model
            # A plain fit() would throw the loaded trees away and retrain from
            # scratch. warm_start keeps them, and raising n_estimators makes
            # fit() add new stages trained on the current file.
            gb_model.set_params(
                warm_start=True,
                n_estimators=gb_model.n_estimators + trees_per_file,
            )
        else:
            # The saved model was built for a different vocabulary size, so it
            # cannot be extended with the current features.
            print("Saved model does not match the TF-IDF features; retraining.")

    if gb_model is None:
        print("Training Gradient Boosting model from scratch...")
        gb_model = GradientBoostingClassifier(
            n_estimators=trees_per_file, learning_rate=0.1, max_depth=3, random_state=42
        )

    gb_model.fit(X_train_tfidf, y_train)

    # Save Model
    joblib.dump(gb_model, gb_model_path)

    # Evaluate Model on the held-out 20% of the current file.
    print("Evaluating Gradient Boosting model...")
    y_pred = gb_model.predict(X_test_tfidf)

    # zero_division=0 yields 0.0 (the library's fallback value) without a
    # warning when a metric is undefined, e.g. no positive predictions.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"Positive Reviews: {num_positive} | Negative Reviews: {num_negative}")
    print(f"Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1-score: {f1:.4f}")

    # Log Results: one row per file; the header is written only when the log
    # file does not exist yet.
    log_entry = pd.DataFrame([{
        "Model": "Gradient Boosting",
        "Dataset": filename,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-score": f1,
        "Positive Reviews": num_positive,
        "Negative Reviews": num_negative
    }])

    log_entry.to_csv(log_file, mode='a', header=not os.path.exists(log_file), index=False)
    print(f"Performance metrics logged in '{log_file}'")

print("\nAll datasets processed successfully.")



📂 Processing file: output_1.csv...
✅ Loading pre-trained TF-IDF vectorizer...
🚀 Training Gradient Boosting model from scratch...
🔍 Evaluating Gradient Boosting model...

📊 Model performance logged for `output_1.csv` in `gradient_boosting_log.csv`!

📂 Processing file: output_2.csv...
✅ Loading pre-trained TF-IDF vectorizer...
✅ Loading pre-trained Gradient Boosting model...
🔍 Evaluating Gradient Boosting model...

📊 Model performance logged for `output_2.csv` in `gradient_boosting_log.csv`!

📂 Processing file: output_3.csv...
✅ Loading pre-trained TF-IDF vectorizer...
✅ Loading pre-trained Gradient Boosting model...
🔍 Evaluating Gradient Boosting model...

📊 Model performance logged for `output_3.csv` in `gradient_boosting_log.csv`!

📂 Processing file: output_4.csv...
✅ Loading pre-trained TF-IDF vectorizer...
✅ Loading pre-trained Gradient Boosting model...
🔍 Evaluating Gradient Boosting model...

📊 Model performance logged for `output_4.csv` in `gradient_boosting_log.csv`!

📂 Process