## Sentiment analysis

### Step 1: Prepare the data

In [54]:
import pandas as pd

In [55]:
reviews = pd.read_csv('../02.Dataset/labeled/reviews.csv')

In [56]:
reviews.head()

Unnamed: 0,ProductID,CustomerID,Rating,Comment,Label
0,74021317,7991785,5,Một quyển sách hay,pos
1,187827003,18150739,5,"Mình đã từng đọc sơ nội dung sách, rất hay, rấ...",pos
2,271380890,497788,5,"Quyển sách đẹp về hình thức, nội dung mới đọc ...",pos
3,74021317,19165924,5,"Sách đẹp, hài lòng",pos
4,105483727,10170816,5,"sách đóng gói cẩn thận, giao hành nhanh",pos


In [57]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4380 entries, 0 to 4379
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ProductID   4380 non-null   int64 
 1   CustomerID  4380 non-null   int64 
 2   Rating      4380 non-null   int64 
 3   Comment     4380 non-null   object
 4   Label       4380 non-null   object
dtypes: int64(3), object(2)
memory usage: 171.2+ KB


In [58]:
reviews['Label'].value_counts()

Label
neg    2101
pos    1923
neu     356
Name: count, dtype: int64

In [59]:
reviews.dropna(inplace=True)

In [60]:
print(reviews['Label'].unique())

['pos' 'neu' 'neg']


### Step 2: Data preprocessing

In [61]:
import string
import emoji
import re


# Clean icons
def clean_icons(text):
    text = emoji.replace_emoji(text, replace="")
    text = re.sub(r"[:;][-~]?[)D(/\\|pP]", "", text)
    text = text.replace("_x000D_", " ")
    return text

def lower(text):
    return text.lower().strip()

def remove_links(text):
    return re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)

# Convert comment to a full sentence
def convert_teencode_to_vietnamese(sentence, dictionary):
    words = sentence.split()
    converted_words = []
    for word in words:
        if word in dictionary:
            converted_words.append(dictionary[word])
            continue

        punctuation = ""
        temp_word = word
        while temp_word and temp_word[-1] in string.punctuation:
            punctuation = temp_word[-1] + punctuation
            temp_word = temp_word[:-1]
        if temp_word in dictionary:
            converted_words.append(dictionary[temp_word] + punctuation)
            continue

        leading_punctuation = ""
        temp_word = word
        while temp_word and temp_word[0] in string.punctuation:
            leading_punctuation += temp_word[0]
            temp_word = temp_word[1:]
        if temp_word in dictionary:
            converted_words.append(leading_punctuation + dictionary[temp_word])
            continue

        converted_words.append(word)

    return " ".join(converted_words)

# Remove stopwords
vietnamese_stopwords = ["là", "thì", "và", "nhưng", "các", "một", "những", "với", "cho", "này", "đã", "ở", "được", "rất", "có"]


def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in vietnamese_stopwords])

In [62]:
from pyvi import ViTokenizer

def word_segmentation(text):
  if pd.isna(text):
      return ""
  return ViTokenizer.tokenize(text)

In [63]:
import csv

dictionary = {}
with open(
    "../02.Dataset/teencode.csv",
    mode="r",
    encoding="utf-8",
) as file:
    reader = csv.DictReader(file)
    for row in reader:
        dictionary[row["Teencode"]] = row["Meaning"]

reviews["Comment"] = reviews["Comment"].apply(clean_icons)
reviews["Comment"] = reviews["Comment"].apply(lower)
reviews["Comment"] = reviews["Comment"].apply(remove_links)
reviews["Comment"] = reviews["Comment"].apply(
    lambda x: convert_teencode_to_vietnamese(x, dictionary)
)

# Word segmentation
reviews["Comment"] = reviews["Comment"].apply(word_segmentation)

reviews.dropna(subset=["Comment"], inplace=True)
reviews.reset_index(drop=True, inplace=True)

KeyboardInterrupt: 

In [None]:
# # Word segmentation
# from underthesea import word_tokenize

# text = "Sản phẩm rất tốt và giao hàng nhanh"
# print(word_tokenize(text))

In [None]:
reviews.head(100)

Unnamed: 0,ProductID,CustomerID,Rating,Comment,Label
0,74021317,7991785,5,một quyển sách hay,pos
1,187827003,18150739,5,"mình đã từng đọc sơ nội_dung sách , rất hay , ...",pos
2,271380890,497788,5,"quyển sách đẹp về hình_thức , nội_dung mới đọc...",pos
3,74021317,19165924,5,"sách đẹp , hài_lòng",pos
4,105483727,10170816,5,"sách đóng_gói cẩn_thận , giao hành nhanh",pos
...,...,...,...,...,...
95,136340700,16493140,5,"đặt sau 1 tiếng có hàng rồi , quá nhanh quá yê...",pos
96,54614797,10456907,5,"bìa đẹp , giao hàng siêu nhanh , đóng_gói cẩn_...",pos
97,113530805,18051370,5,sách viết về các trải nghiệm của tác_giả . 1 c...,pos
98,8886007,16290383,5,"ngay sau khi đọc xong người đua diều , mình đã...",pos


### Step 3: Split the dataset to train set and test set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X = reviews["Comment"]

y = reviews["Label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

### Step 4: Train and fit the model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from sklearn.svm import LinearSVC
text_clf_svm = Pipeline([("tfidf", TfidfVectorizer(max_features=3000, min_df=5, max_df=0.8, sublinear_tf=True)), ("clf", LinearSVC())])
text_clf_svm.fit(X_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
text_clf_log = Pipeline([("tfidf", TfidfVectorizer(max_features=3000, min_df=5, max_df=0.8, sublinear_tf=True)), ("clf", LogisticRegression())])
text_clf_log.fit(X_train, y_train)

### Step 5: Run predictions and analyze the results

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

predictions_svm = text_clf_svm.predict(X_test)

print("-----------------SVM-----------------")
print("Confusion matrix:\n", confusion_matrix(y_test, predictions_svm))
print()
print(f"Accuracy score: {accuracy_score(y_test, predictions_svm)}")
print()
print("Classification report:\n", classification_report(y_test, predictions_svm))

-----------------SVM-----------------
Confusion matrix:
 [[567  52]
 [ 48 541]]

Accuracy score: 0.9172185430463576

Classification report:
               precision    recall  f1-score   support

         neg       0.92      0.92      0.92       619
         pos       0.91      0.92      0.92       589

    accuracy                           0.92      1208
   macro avg       0.92      0.92      0.92      1208
weighted avg       0.92      0.92      0.92      1208



In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

predictions_log = text_clf_log.predict(X_test)

print("-----------------LogisticRegression-----------------")
print("Confusion matrix:\n", confusion_matrix(y_test, predictions_log))
print()
print(f"Accuracy score: {accuracy_score(y_test, predictions_log)}")
print()
print("Classification report:\n", classification_report(y_test, predictions_log))

-----------------LogisticRegression-----------------
Confusion matrix:
 [[567  52]
 [ 48 541]]

Accuracy score: 0.9172185430463576

Classification report:
               precision    recall  f1-score   support

         neg       0.92      0.92      0.92       619
         pos       0.91      0.92      0.92       589

    accuracy                           0.92      1208
   macro avg       0.92      0.92      0.92      1208
weighted avg       0.92      0.92      0.92      1208



In [None]:
text_clf_log.predict(["Sách hay quá", "Sách dở quá", "Sách bình thường", "Sách không hay lắm", "Sách dính các trang vào nhau", "Đã nhận hơi trễ  ! Sớm hơn về sao dùm"])

array(['pos', 'neg', 'pos', 'neg', 'neg', 'neg'], dtype=object)

### Save the model

In [None]:
%pip install joblib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import joblib

In [None]:
joblib.dump(text_clf_svm, "../05.Models/text_clf_svm.pkl")
joblib.dump(text_clf_log, "../05.Models/text_clf_log.pkl")

['../05.Models/text_clf_log.pkl']

## Adjust ratings

In [None]:
import joblib

text_clf_svm = joblib.load("../05.Models/text_clf_svm.pkl")

In [None]:
import pandas as pd
reviews = pd.read_csv('../02.Dataset/reviews_300k.csv') 

In [None]:
reviews.dropna(inplace=True)
print(f'Total number of reviews: {len(reviews)}')

Total number of reviews: 296836


In [None]:
reviews_with_comment = reviews[reviews['comment'].notna()].copy()
reviews_with_comment.reset_index(drop=True, inplace=True)
print(f'Total number of reviews with comment: {len(reviews_with_comment)}')

Total number of reviews with comment: 99520


In [None]:
reviews_with_comment.head()

Unnamed: 0,userId,productId,rating,comment
0,1394862,102440084,5,Tác phẩm có giá trị về tuyên truyền hơn là giá...
1,14051602,102440084,5,Tác phẩm theo mình đánh giá là một trong những...
2,11443500,102440084,4,Ok
3,11441614,102440084,4,Tốt
4,48078,105794407,4,"Trải qua bao binh lửa, nhìn thấy sách của cụ L..."


In [None]:
reviews_with_comment['sentiment'] = text_clf_svm.predict(reviews_with_comment['comment'].tolist())

In [None]:
reviews_with_comment.head()

Unnamed: 0,userId,productId,rating,comment,sentiment
0,1394862,102440084,5,Tác phẩm có giá trị về tuyên truyền hơn là giá...,pos
1,14051602,102440084,5,Tác phẩm theo mình đánh giá là một trong những...,pos
2,11443500,102440084,4,Ok,pos
3,11441614,102440084,4,Tốt,pos
4,48078,105794407,4,"Trải qua bao binh lửa, nhìn thấy sách của cụ L...",neg


In [None]:
def advanced_rating_adjustment(original_rating, sentiment, comment=None, alpha=0.5):
    """
    Adjusts rating based on sentiment and the length/detail of the comment.

    Parameters:
        original_rating: int - the original user rating (1-5)
        sentiment: str - 'pos' or 'neg'
        comment: str - the textual review content
        alpha: float - balance factor (0 to 1) between rating and comment confidence

    Returns:
        float - adjusted rating
    """
    # Convert sentiment to a baseline score
    base_sentiment_score = 4.5 if sentiment == "pos" else 1.5

    # Analyze sentiment confidence based on comment length/detail
    comment_reliability = 0.5  # Default confidence

    if comment:
        comment_length = len(comment.strip())
        if comment_length < 5:  # Very short (e.g., "Ok", "Tốt")
            comment_reliability = 0.3
        elif comment_length < 20:  # Short
            comment_reliability = 0.5
        elif comment_length < 100:  # Moderate
            comment_reliability = 0.7
        else:  # Long and detailed comment
            comment_reliability = 0.9

    # Detect inconsistency between sentiment and rating
    inconsistency_level = 0

    if sentiment == "pos":
        if original_rating <= 2:
            inconsistency_level = 1.0  # Highly inconsistent
        elif original_rating == 3:
            inconsistency_level = 0.5  # Mildly inconsistent
    elif sentiment == "neg":
        if original_rating >= 4:
            inconsistency_level = 1.0
        elif original_rating == 3:
            inconsistency_level = 0.5

    # Compute adjustment weight
    adjustment_strength = inconsistency_level * comment_reliability

    # Adjust rating
    if inconsistency_level > 0:
        adjusted = (
            1 - adjustment_strength
        ) * original_rating + adjustment_strength * base_sentiment_score
    else:
        # If consistent, apply minor adjustment
        adjusted = 0.9 * original_rating + 0.1 * base_sentiment_score

    return round(adjusted, 1)


def evaluate_adjustment(df):
    """
    Đánh giá thuật toán điều chỉnh

    Tham số:
        df: DataFrame - chứa thông tin rating gốc, cảm xúc và rating điều chỉnh
    """
    # Tính số lượng điều chỉnh
    df["significant_change"] = abs(df["rating"] - df["adjusted_rating"]) >= 0.5

    # Thống kê
    total_reviews = len(df)
    adjusted_reviews = df["significant_change"].sum()

    # Các trường hợp điều chỉnh
    pos_low_rating = ((df["sentiment"] == "pos") & (df["rating"] <= 3)).sum()
    neg_high_rating = ((df["sentiment"] == "neg") & (df["rating"] >= 4)).sum()

    print(f"Tổng số đánh giá: {total_reviews}")
    print(
        f"Số đánh giá được điều chỉnh đáng kể: {adjusted_reviews} ({adjusted_reviews/total_reviews*100:.1f}%)"
    )
    print(f"Đánh giá tích cực nhưng rating thấp: {pos_low_rating}")
    print(f"Đánh giá tiêu cực nhưng rating cao: {neg_high_rating}")

In [None]:
reviews_with_comment['adjusted_rating'] = reviews_with_comment.apply(
    lambda row: advanced_rating_adjustment(
        row["rating"], row["sentiment"], row["comment"]
    ),
    axis=1,
)

TypeError: <lambda>() got an unexpected keyword argument 'axis'

In [None]:
reviews_with_comment.head()

## Matrix factorization

### Tiki 45k reviews

In [None]:
import datetime
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Read data from CSV file
data = pd.read_csv("../02.Dataset/reviews_45k.csv", encoding="utf-8")

# Get necessary columns
ratings = data[
    ["ProductID", "CustomerID", "Rating"]
].copy()  # Create a copy to avoid warnings

# Convert ProductID and CustomerID to integer indices
user_ids = ratings["CustomerID"].unique()
product_ids = ratings["ProductID"].unique()

user_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}
product_to_index = {product_id: idx for idx, product_id in enumerate(product_ids)}

ratings["user_idx"] = ratings["CustomerID"].map(user_to_index)
ratings["product_idx"] = ratings["ProductID"].map(product_to_index)

train_data = []
test_data = []

for user_id, group in ratings.groupby("CustomerID"):
    # Shuffle the ratings of each user and split into train and test
    shuffled_group = group.sample(frac=1, random_state=42)  # Shuffle the group
    split_index = int(len(shuffled_group) * 0.8)  # 80% for training, 20% for testing
    train_data.append(shuffled_group[:split_index])
    test_data.append(shuffled_group[split_index:])

# Combine the train and test data into separate DataFrames
train_data = pd.concat(train_data).reset_index(drop=True)
test_data = pd.concat(test_data).reset_index(drop=True)

# Split data into training and test sets
# train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

# Initialize parameters
n_users = len(user_ids)
n_products = len(product_ids)
n_factors = 20  # Number of latent factors
learning_rate = 0.005  # Reduced learning rate to prevent overflow
n_epochs = 50  # Number of iterations
reg = 0.01  # Regularization parameter

# Initialize latent factor matrices for users and products
# Use smaller initial values to prevent overflow
P = np.random.normal(0, 0.01, (n_users, n_factors))  # User matrix
Q = np.random.normal(0, 0.01, (n_products, n_factors))  # Product matrix


# Function to calculate RMSE
def rmse(predictions, actual):
    # Filter out any NaN values
    valid_indices = ~np.isnan(predictions)
    if not np.any(valid_indices):
        return float("nan")
    return np.sqrt(np.mean((predictions[valid_indices] - actual[valid_indices]) ** 2))


# Function to clip values to prevent overflow
def clip_value(value, min_val=-5.0, max_val=5.0):
    return max(min_val, min(max_val, value))


# Train the model with SGD
train_rmse_history = []
test_rmse_history = []

for epoch in range(n_epochs):
    # Train on training set
    for _, row in train_data.iterrows():
        u = int(row["user_idx"])
        i = int(row["product_idx"])
        r_ui = row["Rating"]

        # Predict rating
        prediction = np.dot(P[u, :], Q[i, :])
        # Clip prediction to prevent extreme values
        prediction = clip_value(prediction)
        error = r_ui - prediction

        # Update P and Q using SGD with gradient clipping
        for f in range(n_factors):
            p_update = learning_rate * (error * Q[i, f] - reg * P[u, f])
            q_update = learning_rate * (error * P[u, f] - reg * Q[i, f])

            # Clip updates to prevent overflow
            p_update = clip_value(p_update, -0.5, 0.5)
            q_update = clip_value(q_update, -0.5, 0.5)

            P[u, f] += p_update
            Q[i, f] += q_update

    # Calculate RMSE on training set
    train_predictions = np.array(
        [
            clip_value(
                np.dot(P[int(row["user_idx"]), :], Q[int(row["product_idx"]), :])
            )
            for _, row in train_data.iterrows()
        ]
    )
    train_rmse = rmse(train_predictions, train_data["Rating"].values)
    train_rmse_history.append(train_rmse)

    # Calculate RMSE on test set
    test_predictions = np.array(
        [
            clip_value(
                np.dot(P[int(row["user_idx"]), :], Q[int(row["product_idx"]), :])
            )
            for _, row in test_data.iterrows()
        ]
    )
    test_rmse = rmse(test_predictions, test_data["Rating"].values)
    test_rmse_history.append(test_rmse)

    print(
        f"Epoch {epoch + 1}/{n_epochs} - Train RMSE: {train_rmse:.4f} - Test RMSE: {test_rmse:.4f}"
    )

# Save the model
model_data = {
    "P": P,
    "Q": Q,
    "user_to_index": user_to_index,
    "product_to_index": product_to_index,
    "train_rmse_history": train_rmse_history,
    "test_rmse_history": test_rmse_history,
}

with open("../05.Models/mf_45k_model.pkl", "wb") as f:
    pickle.dump(model_data, f)
print("Model saved to mf_model.pkl")

# Save log
log_df = pd.DataFrame(
    {
        "epoch": range(1, n_epochs + 1),
        "train_rmse": train_rmse_history,
        "test_rmse": test_rmse_history,
    }
)
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
log_df.to_csv(f"../06.Log/train_mf_45k_log_{timestamp}.csv", index=False, encoding="utf-8")
print("Training log saved to train_mf_log_{timestamp}.csv")


# Function to predict rating for a user-product pair
def predict_rating(user_id, product_id):
    if user_id not in user_to_index or product_id not in product_to_index:
        return None  # If user or product is not in the data
    u = user_to_index[user_id]
    i = product_to_index[product_id]
    # Clip prediction to avoid extreme values
    u = user_to_index[user_id]
    i = product_to_index[product_id]
    prediction = np.dot(P[u, :], Q[i, :])
    return clip_value(prediction)


# Example prediction
user_id_example = 18387707  # An example CustomerID
product_id_example = 192733741  # An example ProductID
predicted_rating = predict_rating(user_id_example, product_id_example)
if predicted_rating is not None:
    print(
        f"Predicted rating for User {user_id_example} and Product {product_id_example}: {predicted_rating:.2f}"
    )
else:
    print(
        f"Unable to predict for User {user_id_example} and Product {product_id_example} (not in training data)"
    )

# Plot RMSE graph
plt.figure(figsize=(10, 6))
plt.plot(range(1, n_epochs + 1), train_rmse_history, label="Train RMSE")
plt.plot(range(1, n_epochs + 1), test_rmse_history, label="Test RMSE")
plt.xlabel("Epoch")
plt.ylabel("RMSE")
plt.title("Matrix Factorization Learning Curve")
plt.legend()
plt.grid(True)
plt.show()

### Tiki 300k reviews

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Read data from CSV file
data = pd.read_csv("../02.Dataset/reviews_300k.csv", encoding="utf-8")

# Get necessary columns
ratings = data[
    ["userId", "productId", "rating"]
].copy()  # Create a copy to avoid warnings

# Convert ProductID and CustomerID to integer indices
user_ids = ratings["userId"].unique()
product_ids = ratings["productId"].unique()

user_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}
product_to_index = {product_id: idx for idx, product_id in enumerate(product_ids)}

ratings["user_idx"] = ratings["userId"].map(user_to_index)
ratings["product_idx"] = ratings["productId"].map(product_to_index)

train_data = []
test_data = []

for user_id, group in ratings.groupby("userId"):
    # Shuffle the ratings of each user and split into train and test
    shuffled_group = group.sample(frac=1, random_state=42)  # Shuffle the group
    split_index = int(len(shuffled_group) * 0.8)  # 80% for training, 20% for testing
    train_data.append(shuffled_group[:split_index])
    test_data.append(shuffled_group[split_index:])

# Combine the train and test data into separate DataFrames
train_data = pd.concat(train_data).reset_index(drop=True)
test_data = pd.concat(test_data).reset_index(drop=True)

# Split data into training and test sets
# train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

# Initialize parameters
n_users = len(user_ids)
n_products = len(product_ids)
n_factors = 20  # Number of latent factors
learning_rate = 0.005  # Reduced learning rate to prevent overflow
n_epochs = 50  # Number of iterations
reg = 0.01  # Regularization parameter

# Initialize latent factor matrices for users and products
# Use smaller initial values to prevent overflow
P = np.random.normal(0, 0.01, (n_users, n_factors))  # User matrix
Q = np.random.normal(0, 0.01, (n_products, n_factors))  # Product matrix


# Function to calculate RMSE
def rmse(predictions, actual):
    # Filter out any NaN values
    valid_indices = ~np.isnan(predictions)
    if not np.any(valid_indices):
        return float("nan")
    return np.sqrt(np.mean((predictions[valid_indices] - actual[valid_indices]) ** 2))


# Function to clip values to prevent overflow
def clip_value(value, min_val=-5.0, max_val=5.0):
    return max(min_val, min(max_val, value))


# Train the model with SGD
train_rmse_history = []
test_rmse_history = []

for epoch in range(n_epochs):
    # Train on training set
    for _, row in train_data.iterrows():
        u = int(row["user_idx"])
        i = int(row["product_idx"])
        r_ui = row["rating"]

        # Predict rating
        prediction = np.dot(P[u, :], Q[i, :])
        # Clip prediction to prevent extreme values
        prediction = clip_value(prediction)
        error = r_ui - prediction

        # Update P and Q using SGD with gradient clipping
        for f in range(n_factors):
            p_update = learning_rate * (error * Q[i, f] - reg * P[u, f])
            q_update = learning_rate * (error * P[u, f] - reg * Q[i, f])

            # Clip updates to prevent overflow
            p_update = clip_value(p_update, -0.5, 0.5)
            q_update = clip_value(q_update, -0.5, 0.5)

            P[u, f] += p_update
            Q[i, f] += q_update

    # Calculate RMSE on training set
    train_predictions = np.array(
        [
            clip_value(
                np.dot(P[int(row["user_idx"]), :], Q[int(row["product_idx"]), :])
            )
            for _, row in train_data.iterrows()
        ]
    )
    train_rmse = rmse(train_predictions, train_data["rating"].values)
    train_rmse_history.append(train_rmse)

    # Calculate RMSE on test set
    test_predictions = np.array(
        [
            clip_value(
                np.dot(P[int(row["user_idx"]), :], Q[int(row["product_idx"]), :])
            )
            for _, row in test_data.iterrows()
        ]
    )
    test_rmse = rmse(test_predictions, test_data["rating"].values)
    test_rmse_history.append(test_rmse)

    print(
        f"Epoch {epoch + 1}/{n_epochs} - Train RMSE: {train_rmse:.4f} - Test RMSE: {test_rmse:.4f}"
    )

# Save the model
model_data = {
    "P": P,
    "Q": Q,
    "user_to_index": user_to_index,
    "product_to_index": product_to_index,
    "train_rmse_history": train_rmse_history,
    "test_rmse_history": test_rmse_history,
}

with open("../05.Models/mf_300k_model.pkl", "wb") as f:
    pickle.dump(model_data, f)
print("Model saved to mf_300k_model.pkl")

# Save log
log_df = pd.DataFrame(
    {
        "epoch": range(1, n_epochs + 1),
        "train_rmse": train_rmse_history,
        "test_rmse": test_rmse_history,
    }
)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_df.to_csv(
    f"../06.Log/train_mf_300k_log_{timestamp}.csv", index=False, encoding="utf-8"
)
print("Training log saved to train_mf_300k_log_{timestamp}.csv")

# Function to predict rating for a user-product pair
def predict_rating(user_id, product_id):
    if user_id not in user_to_index or product_id not in product_to_index:
        return None  # If user or product is not in the data
    u = user_to_index[user_id]
    i = product_to_index[product_id]
    # Clip prediction to avoid extreme values
    return clip_value(np.dot(P[u, :], Q[i, :]))


# Example prediction
user_id_example = 18387707  # An example CustomerID
product_id_example = 192733741  # An example ProductID
predicted_rating = predict_rating(user_id_example, product_id_example)
if predicted_rating is not None:
    print(
        f"Predicted rating for User {user_id_example} and Product {product_id_example}: {predicted_rating:.2f}"
    )
else:
    print(
        f"Unable to predict for User {user_id_example} and Product {product_id_example} (not in training data)"
    )

# Plot RMSE graph
plt.figure(figsize=(10, 6))
plt.plot(range(1, n_epochs + 1), train_rmse_history, label="Train RMSE")
plt.plot(range(1, n_epochs + 1), test_rmse_history, label="Test RMSE")
plt.xlabel("Epoch")
plt.ylabel("RMSE")
plt.title("Matrix Factorization Learning Curve")
plt.legend()
plt.grid(True)
plt.show()

### Movielens 1M

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Read data from CSV file
data = pd.read_csv(
    "../02.Dataset/movielens/ratings.dat",
    sep="::",
    engine="python",
    names=["UserID", "MovieID", "Rating", "Timestamp"],
)

# Get necessary columns
ratings = data[
    ["UserID", "MovieID", "Rating"]
].copy()  # Create a copy to avoid warnings

# Convert MovieID and CustomerID to integer indices
user_ids = ratings["UserID"].unique()
product_ids = ratings["MovieID"].unique()

user_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}
product_to_index = {product_id: idx for idx, product_id in enumerate(product_ids)}

ratings["user_idx"] = ratings["UserID"].map(user_to_index)
ratings["product_idx"] = ratings["MovieID"].map(product_to_index)

train_data = []
test_data = []

for user_id, group in ratings.groupby("UserID"):
    # Shuffle the ratings of each user and split into train and test
    shuffled_group = group.sample(frac=1, random_state=42)  # Shuffle the group
    split_index = int(len(shuffled_group) * 0.8)  # 80% for training, 20% for testing
    train_data.append(shuffled_group[:split_index])
    test_data.append(shuffled_group[split_index:])

# Combine the train and test data into separate DataFrames
train_data = pd.concat(train_data).reset_index(drop=True)
test_data = pd.concat(test_data).reset_index(drop=True)

# Split data into training and test sets
# train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

# Initialize parameters
n_users = len(user_ids)
n_products = len(product_ids)
n_factors = 20  # Number of latent factors
learning_rate = 0.005  # Reduced learning rate to prevent overflow
n_epochs = 50  # Number of iterations
reg = 0.01  # Regularization parameter

# Initialize latent factor matrices for users and products
# Use smaller initial values to prevent overflow
P = np.random.normal(0, 0.01, (n_users, n_factors))  # User matrix
Q = np.random.normal(0, 0.01, (n_products, n_factors))  # Product matrix


# Function to calculate RMSE
def rmse(predictions, actual):
    # Filter out any NaN values
    valid_indices = ~np.isnan(predictions)
    if not np.any(valid_indices):
        return float("nan")
    return np.sqrt(np.mean((predictions[valid_indices] - actual[valid_indices]) ** 2))


# Function to clip values to prevent overflow
def clip_value(value, min_val=-5.0, max_val=5.0):
    return max(min_val, min(max_val, value))


# Train the model with SGD
train_rmse_history = []
test_rmse_history = []

for epoch in range(n_epochs):
    # Train on training set
    for _, row in train_data.iterrows():
        u = int(row["user_idx"])
        i = int(row["product_idx"])
        r_ui = row["Rating"]

        # Predict rating
        prediction = np.dot(P[u, :], Q[i, :])
        # Clip prediction to prevent extreme values
        prediction = clip_value(prediction)
        error = r_ui - prediction

        # Update P and Q using SGD with gradient clipping
        for f in range(n_factors):
            p_update = learning_rate * (error * Q[i, f] - reg * P[u, f])
            q_update = learning_rate * (error * P[u, f] - reg * Q[i, f])

            # Clip updates to prevent overflow
            p_update = clip_value(p_update, -0.5, 0.5)
            q_update = clip_value(q_update, -0.5, 0.5)

            P[u, f] += p_update
            Q[i, f] += q_update

    # Calculate RMSE on training set
    train_predictions = np.array(
        [
            clip_value(
                np.dot(P[int(row["user_idx"]), :], Q[int(row["product_idx"]), :])
            )
            for _, row in train_data.iterrows()
        ]
    )
    train_rmse = rmse(train_predictions, train_data["Rating"].values)
    train_rmse_history.append(train_rmse)

    # Calculate RMSE on test set
    test_predictions = np.array(
        [
            clip_value(
                np.dot(P[int(row["user_idx"]), :], Q[int(row["product_idx"]), :])
            )
            for _, row in test_data.iterrows()
        ]
    )
    test_rmse = rmse(test_predictions, test_data["Rating"].values)
    test_rmse_history.append(test_rmse)

    print(
        f"Epoch {epoch + 1}/{n_epochs} - Train RMSE: {train_rmse:.4f} - Test RMSE: {test_rmse:.4f}"
    )

# Save the model
model_data = {
    "P": P,
    "Q": Q,
    "user_to_index": user_to_index,
    "product_to_index": product_to_index,
    "train_rmse_history": train_rmse_history,
    "test_rmse_history": test_rmse_history,
}

with open("../05.Models/mf_movielens_1m_model.pkl", "wb") as f:
    pickle.dump(model_data, f)
print("Model saved to mf_movielens_1m_model.pkl")

# Save log
log_df = pd.DataFrame(
    {
        "epoch": range(1, n_epochs + 1),
        "train_rmse": train_rmse_history,
        "test_rmse": test_rmse_history,
    }
)
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
log_df.to_csv(
    f"../06.Log/train_mf_movielens_log_{timestamp}.csv", index=False, encoding="utf-8"
)
print("Training log saved to train_mf_movielens_1m_log_{timestamp}.csv")


# Function to predict rating for a user-product pair
def predict_rating(user_id, product_id):
    if user_id not in user_to_index or product_id not in product_to_index:
        return None  # If user or product is not in the data
    u = user_to_index[user_id]
    i = product_to_index[product_id]
    # Clip prediction to avoid extreme values
    return clip_value(np.dot(P[u, :], Q[i, :]))

# Plot RMSE graph
plt.figure(figsize=(10, 6))
plt.plot(range(1, n_epochs + 1), train_rmse_history, label="Train RMSE")
plt.plot(range(1, n_epochs + 1), test_rmse_history, label="Test RMSE")
plt.xlabel("Epoch")
plt.ylabel("RMSE")
plt.title("Matrix Factorization Learning Curve")
plt.legend()
plt.grid(True)
plt.show()