In [1]:
import pandas as pd
import os
import pickle
import joblib
import logging
from surprise import Dataset, Reader, SVD, NMF, KNNBasic, accuracy
from surprise.model_selection import train_test_split

In [None]:
# Ratings CSV read by load_data() and re-read with pd.read_csv() before recommending.
data_path = "data/processed/reviews.csv"
# Destination of the pickled recommender written by save_model() and read by load_model().
output_path = "models/mf_model.pkl"
# Algorithm name handed to train_model(); it accepts "SVD", "NMF" or "KNNBasic".
algorithm = "SVD"
# Optional hyper-parameter dict for train_model(); None selects its built-in defaults.
custom_params = None
# Resource files handed to TextProcessor (teencode, stopwords and phrase mapping).
teencode_path = "resources/teencode.csv"
stopwords_path = "resources/stopwords.txt"
phrases_path = "resources/phrase_mapping.csv"
# CSV where load_data() stores the reviews it removes when filter_mismatch=True.
invalid_output_path = "log/invalid_reviews.csv"

In [6]:
# Logging setup: INFO-and-above records go to a UTF-8 log file and to a stream handler.
log_dir = "log"
log_file = os.path.join(log_dir, "train_mf_model.log")
os.makedirs(log_dir, exist_ok=True)

# Both handlers are created up front so the basicConfig call stays short.
log_handlers = [
    logging.FileHandler(log_file, encoding="utf-8"),
    logging.StreamHandler(),
]
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    handlers=log_handlers,
)

In [None]:
def load_data(
    file_path: str,
    sentiment_model=None,
    text_processor=None,
    filter_mismatch: bool = False,
    invalid_output_path: str = 'reviews_invalid.csv',
):
    """
    Load rating data and optionally drop rows whose comment sentiment contradicts the rating.

    Rows without a comment (missing or whitespace-only) are always kept. A commented
    row is removed only when its rating is >= 4 and the predicted sentiment is "neg",
    or its rating is <= 2 and the predicted sentiment is "pos".

    Parameters:
        file_path (str): Path to the ratings CSV file. It must contain the columns
            "userId", "productId" and "rating", plus "comment" when filtering.
        sentiment_model: Object exposing ``predict(texts)``; required when
            ``filter_mismatch`` is True.
        text_processor: Object exposing ``preprocess(text)``; required when
            ``filter_mismatch`` is True.
        filter_mismatch (bool): Whether to remove mismatched sentiment-rating pairs.
        invalid_output_path (str): CSV path where removed rows are stored. The file is
            written whenever filtering runs, even if no row was removed.

    Returns:
        Dataset: surprise.Dataset built from the userId/productId/rating columns
        with a rating scale of 1 to 5.

    Raises:
        ValueError: If filtering is requested without a sentiment model or processor.
    """
    logging.info(f"Loading data from {file_path}")
    df = pd.read_csv(file_path)

    total_before = len(df)

    if filter_mismatch:
        if sentiment_model is None or text_processor is None:
            raise ValueError("Sentiment model and processor are required for filtering.")

        logging.info("Filtering mismatched sentiment and rating...")

        # Identify rows with a comment to process. The column is cast to str before
        # using the .str accessor, which raises on a non-string column (for example
        # a column that pandas parsed as all-NaN floats).
        comment_text = df["comment"].astype(str)
        comment_mask = df["comment"].notna() & comment_text.str.strip().ne("")

        df_comment = df[comment_mask].copy()
        df_comment["processed"] = df_comment["comment"].astype(str).apply(text_processor.preprocess)
        if df_comment.empty:
            # Nothing to classify: skip the model call instead of passing it zero
            # samples (assumption: the model may reject empty input).
            df_comment["predicted_sentiment"] = pd.Series(dtype=object)
        else:
            df_comment["predicted_sentiment"] = sentiment_model.predict(df_comment["processed"])

        # Valid unless a high rating comes with negative text or a low rating with
        # positive text; a rating of 3 is never treated as a mismatch.
        df_comment["is_valid"] = ~(
            ((df_comment["rating"] >= 4) & (df_comment["predicted_sentiment"] == "neg")) |
            ((df_comment["rating"] <= 2) & (df_comment["predicted_sentiment"] == "pos"))
        )

        # Combine: keep valid comments + all no-comment rows
        mismatched = df_comment[~df_comment["is_valid"]]
        valid_comments = df_comment[df_comment["is_valid"]]
        df_nocomment = df[~comment_mask]
        df_final = pd.concat([valid_comments, df_nocomment], ignore_index=True)

        # Logging
        logging.info(f"Total reviews before filtering: {total_before}")
        logging.info(f"Reviews with comment: {len(df_comment)}")
        logging.info(f"Reviews removed due to mismatch: {len(mismatched)}")
        logging.info(f"Remaining reviews after filtering: {len(df_final)}")

        # Save mismatched reviews, creating the target directory when one is given.
        parent_dir = os.path.dirname(invalid_output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        mismatched.to_csv(invalid_output_path, index=False, encoding="utf-8")
        logging.info(f"Mismatched reviews saved to {invalid_output_path}")

        df = df_final

    # Only the three rating columns are handed to Surprise; helper columns added
    # during filtering are dropped by this selection.
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[["userId", "productId", "rating"]], reader)
    return data


def train_model(data, algorithm="SVD", params=None, test_size=0.2, random_state=None):
    """
    Split the data, fit a Surprise model on the train part and score it on the test part.

    Parameters:
        data: surprise.Dataset to split and train on.
        algorithm (str): One of "SVD", "NMF" or "KNNBasic".
        params (dict | None): Keyword arguments for the model constructor. When
            truthy it replaces the built-in defaults entirely (no merging); when
            None or empty the defaults for the chosen algorithm are used.
        test_size: Forwarded to train_test_split; defaults to the previously
            hard-coded 0.2.
        random_state: Forwarded to train_test_split so the split can be made
            repeatable. It does not seed the model itself; pass a seed through
            ``params`` for that if the algorithm supports one.

    Returns:
        tuple: (fitted model, RMSE on the test split, MAE on the test split).

    Raises:
        ValueError: If ``algorithm`` is not one of the supported names.
    """
    logging.info(f"Training model using algorithm: {algorithm}")

    default_params_by_algorithm = {
        "SVD": {
            "n_factors": 20,
            "n_epochs": 50,
            "lr_all": 0.005,
            "reg_all": 0.01,
        },
        "NMF": {"n_factors": 50, "n_epochs": 50},
        "KNNBasic": {
            "k": 40,
            "sim_options": {"name": "cosine", "user_based": False},
        },
    }

    # Reject unknown names before doing any work on the dataset.
    if algorithm not in default_params_by_algorithm:
        raise ValueError(f"Unsupported algorithm: {algorithm}")

    trainset, testset = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    # Classes are looked up only after validation so the name check above does
    # not depend on them.
    model_classes = {"SVD": SVD, "NMF": NMF, "KNNBasic": KNNBasic}
    model = model_classes[algorithm](**(params or default_params_by_algorithm[algorithm]))

    model.fit(trainset)

    predictions = model.test(testset)

    rmse = accuracy.rmse(predictions, verbose=False)
    mae = accuracy.mae(predictions, verbose=False)

    logging.info(f"RMSE: {rmse:.4f}, MAE: {mae:.4f}")

    return model, rmse, mae


def save_model(model, output_path="model.pkl"):
    """
    Pickle a model to disk, creating the destination directory when needed.

    Parameters:
        model: Object to serialise with pickle.
        output_path (str): File path to write the pickle to.
    """
    # open() does not create missing directories, so make the parent first.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "wb") as f:
        pickle.dump(model, f)
    logging.info(f"Model saved to {output_path}")



def load_model(model_path="model.pkl"):
    """
    Unpickle a model from disk on a best-effort basis.

    Parameters:
        model_path (str): Path of the pickle file to read.

    Returns:
        The unpickled object, or None when reading or unpickling fails (the
        failure is logged as an error rather than raised).
    """
    loaded = None
    try:
        # NOTE: pickle can execute arbitrary code while loading; only point this
        # at files you trust.
        with open(model_path, "rb") as stream:
            loaded = pickle.load(stream)
        logging.info(f"Model loaded from {model_path}")
    except Exception as err:
        logging.error(f"Failed to load model: {err}")
        loaded = None
    return loaded


def load_sentiment_model(model_path):
    """
    Load a trained sentiment analysis model from file.

    Parameters:
        model_path (str): Path of the file to load with joblib.

    Returns:
        The object returned by joblib.load.

    Raises:
        SystemExit: With exit code 1 when the model cannot be loaded; the
            underlying error is logged first.
    """
    try:
        model = joblib.load(model_path)
        logging.info(f"Model loaded from {model_path}")
        return model
    except Exception as e:
        logging.error(f"Failed to load model: {e}")
        # Raise SystemExit directly instead of calling the `exit` helper: that
        # helper is meant for interactive sessions, is not guaranteed to be
        # defined, and interactive shells may replace it with their own object.
        raise SystemExit(1) from e


def get_user_recommendations(model, df, userId, top_k=10, exclude_purchased=True):
    """
    Rank products for a user by the model's estimated rating.

    Parameters:
        model: Object exposing ``predict(userId, productId)`` whose result has
            an ``est`` attribute.
        df: DataFrame with "userId" and "productId" columns.
        userId: User to recommend for.
        top_k (int): Number of product ids to return.
        exclude_purchased (bool): Skip products that already appear in ``df``
            for this user.

    Returns:
        list: Product ids ordered from highest to lowest estimate, truncated
        to ``top_k`` entries.
    """
    already_seen = set(df.loc[df["userId"] == userId, "productId"].values)
    candidates = df["productId"].unique()

    scored = [
        (item, model.predict(userId, item).est)
        for item in candidates
        if not (exclude_purchased and item in already_seen)
    ]

    # sorted() is stable, so ties keep the order in which products first appear.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return [item for item, _ in ranked[:top_k]]
  

In [20]:
# Baseline run: train on every review, with no sentiment-based filtering.
logging.info("Start training pipeline")
data = load_data(data_path)
model, rmse, mae = train_model(data, algorithm, custom_params)
# Persist the fitted model to output_path.
save_model(model, output_path)

# Echo the hold-out metrics at full precision (the log line rounds to 4 decimals).
print("✅ Training complete")
print("RMSE:", rmse)
print("MAE:", mae)
print("Model saved to:", output_path)

2025-06-21 23:03:07,978 - INFO - Start training pipeline
2025-06-21 23:03:07,980 - INFO - Loading data from data/processed/reviews.csv
2025-06-21 23:03:08,664 - INFO - Training model using algorithm: SVD
2025-06-21 23:03:14,830 - INFO - RMSE: 0.5547, MAE: 0.2968
2025-06-21 23:03:15,194 - INFO - Model saved to models/mf_model.pkl


✅ Training complete
RMSE: 0.5547209462162024
MAE: 0.29682866027151567
Model saved to: models/mf_model.pkl


In [8]:
# Project-local import. NOTE(review): consider moving it to the import cell at the
# top so all dependencies are visible in one place.
from preprocessing.text_processor import TextProcessor

# Initialize the text processor from the resource paths set in the config cell;
# it is later passed to load_data() as `text_processor`.
processor = TextProcessor(
    teencode_path=teencode_path,
    stopword_path=stopwords_path,
    phrase_mapping_path=phrases_path,
)

In [9]:
# Filtered run: retrain after dropping reviews whose comment sentiment contradicts
# the rating.
logging.info("Start training pipeline")
# Load the sentiment classifier, then load the data with mismatch filtering enabled;
# removed rows are written to invalid_output_path.
sentiment_model = load_sentiment_model("models/20250621_svm_model.pkl")
data = load_data(data_path, sentiment_model, processor, filter_mismatch=True, invalid_output_path=invalid_output_path)
model, rmse, mae = train_model(data, algorithm, custom_params)
# NOTE(review): this writes to the same output_path as the unfiltered run, so it
# overwrites that model — confirm this is intended or use a separate file name.
save_model(model, output_path)

# Echo the hold-out metrics at full precision (the log line rounds to 4 decimals).
print("✅ Training complete")
print("RMSE:", rmse)
print("MAE:", mae)
print("Model saved to:", output_path)

2025-06-21 23:47:12,594 - INFO - Start training pipeline
2025-06-21 23:47:12,723 - INFO - Model loaded from models/20250621_svm_model.pkl
2025-06-21 23:47:12,725 - INFO - Loading data from data/processed/reviews.csv
2025-06-21 23:47:13,249 - INFO - Filtering mismatched sentiment and rating...
2025-06-21 23:50:03,380 - INFO - Total reviews before filtering: 99520
2025-06-21 23:50:03,381 - INFO - Reviews removed due to mismatch: 17031
2025-06-21 23:50:03,382 - INFO - Remaining reviews after filtering: 82489
2025-06-21 23:50:03,621 - INFO - Mismatched reviews saved to log/invalid_reviews.csv
2025-06-21 23:50:03,776 - INFO - Training model using algorithm: SVD
2025-06-21 23:50:05,445 - INFO - RMSE: 0.8708, MAE: 0.5462
2025-06-21 23:50:05,554 - INFO - Model saved to models/mf_model.pkl


✅ Training complete
RMSE: 0.870793846548323
MAE: 0.546169456816573
Model saved to: models/mf_model.pkl


In [21]:
# Reload the raw reviews and the pickled recommender for the recommendation demo.
# NOTE(review): the recorded execution counts are out of order, so which training
# run produced the file loaded here depends on the order the cells were run in.
df = pd.read_csv(data_path)
# load_model() returns None on failure, so check mf_model before using it.
mf_model = load_model(output_path)

2025-06-21 23:03:16,184 - INFO - Model loaded from models/mf_model.pkl


In [22]:
# Top-10 (default top_k) product ids for one user, skipping products that user already has in df.
get_user_recommendations(mf_model, df, 21665899)

[138934620,
 140120577,
 1667493,
 194130726,
 216090625,
 251928260,
 263980370,
 276107155,
 276256755,
 277024165]