# Training Machine Learning with Some Models
Hybrid Collaborative Filtering and Content-Based Filtering for Improved Recommender System

## LightGCN
Collaborative filtering

In [None]:
pip install recommenders pandera



In [None]:
import sys
import os
import pandas as pd
import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.models.deeprec.deeprec_utils import prepare_hparams
from recommenders.utils.notebook_utils import store_metadata

print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"Tensorflow version: {tf.__version__}")

System version: 3.10.12 (main, Mar 22 2024, 16:50:05) [GCC 11.4.0]
Pandas version: 2.0.3
Tensorflow version: 2.15.0


In [None]:
# import essential libraries
import pandas as pd
import numpy as np

# read dataset
data = pd.read_csv('./dataset.csv', encoding='utf-8', dtype={'userID': str}, sep=",")
data.head()

Unnamed: 0,userID,birthday,gender,itemID,itemName,itemPrice,categoryID,rating,timestamp
0,577980460,1988-12-17,Female,22,Lẩu nấm thập cẩm,157000,6,5,1365811200
1,517030417,2006-12-25,Female,29,Rau thêm,10000,8,5,1341100800
2,265129175,2014-01-24,Female,2,Chả giò,80000,1,1,1367193600
3,631007419,2000-07-23,Male,2,Chả giò,80000,1,3,1374451200
4,484119129,1977-05-22,Female,2,Chả giò,80000,1,1,1334707200


In [None]:
from datetime import datetime

def calculate_age(birthday):
    today = datetime.today()
    birth_date = pd.to_datetime(birthday)
    age = today.year - birth_date.year - ((today.month, today.day) < (birth_date.month, birth_date.day))
    return age

# Add age column into DataFrame
data['age'] = data['birthday'].apply(calculate_age)

# Remove birthday column
data = data.drop(columns=['birthday'])

data.head()

Unnamed: 0,userID,gender,itemID,itemName,itemPrice,categoryID,rating,timestamp,age
0,577980460,Female,22,Lẩu nấm thập cẩm,157000,6,5,1365811200,35
1,517030417,Female,29,Rau thêm,10000,8,5,1341100800,17
2,265129175,Female,2,Chả giò,80000,1,1,1367193600,10
3,631007419,Male,2,Chả giò,80000,1,3,1374451200,24
4,484119129,Female,2,Chả giò,80000,1,1,1334707200,47


In [None]:
featuresTrain = ['userID', 'itemID', 'gender', 'age', 'itemPrice', 'rating', 'categoryID', 'timestamp']
data = data[featuresTrain]
data.head()

Unnamed: 0,userID,itemID,gender,age,itemPrice,rating,categoryID,timestamp
0,577980460,22,Female,35,157000,5,6,1365811200
1,517030417,29,Female,17,10000,5,8,1341100800
2,265129175,2,Female,10,80000,1,1,1367193600
3,631007419,2,Male,24,80000,3,1,1374451200
4,484119129,2,Female,47,80000,1,1,1334707200


In [None]:
train, test = python_stratified_split(data, ratio=0.8)
print(train)

          userID  itemID  gender  age  itemPrice  rating  categoryID  \
141   0100130890      14  Female   13      45000       2           4   
860   0100657606      12  Female   51      70000       5           3   
1918  0101089548      37  Female   43      30000       3          11   
566   0101089548      12  Female   43      70000       5           3   
776   0101872029      12    Male   41      70000       5           3   
...          ...     ...     ...  ...        ...     ...         ...   
850   0999258356      12    Male   23      70000       5           3   
861   0999258356      12    Male   23      70000       5           3   
792   0999750397      12    Male   49      70000       4           3   
1253  0999823778      34    Male   26      15000       1          10   
1138  0999823778      12    Male   26      70000       5           3   

       timestamp  
141   1382832000  
860   1387670400  
1918  1293667200  
566   1404604800  
776   1353542400  
...          ...  
85

In [None]:
SEED = DEFAULT_SEED  # Set None for non-deterministic results

data1 = ImplicitCF(train=train, test=test, seed=SEED)

In [None]:
yaml_file = "./lightgcn.yaml"
# top k items to recommend
TOP_K = 10

# Model parameters
EPOCHS = 50
# BATCH_SIZE = 1024
BATCH_SIZE = 2048

# hparams = prepare_hparams(yaml_file,
#                           n_layers=3,
#                           batch_size=BATCH_SIZE,
#                           epochs=EPOCHS,
#                           learning_rate=0.005,
#                           eval_epoch=5,
#                           top_k=TOP_K,
#                          )
hparams = prepare_hparams(yaml_file,
                          n_layers=4,
                          batch_size=BATCH_SIZE,
                          epochs=EPOCHS,
                          learning_rate=0.01,
                          eval_epoch=5,
                          top_k=TOP_K,
                         )

In [None]:
modelLightGCN = LightGCN(hparams, data1, seed=SEED)

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


In [None]:
with Timer() as train_time:
    modelLightGCN.fit()

print("Took {} seconds for training.".format(train_time.interval))

Epoch 1 (train)0.1s: train loss = 0.68582 = (mf)0.68568 + (embed)0.00013
Epoch 2 (train)0.0s: train loss = 0.66683 = (mf)0.66668 + (embed)0.00015
Epoch 3 (train)0.0s: train loss = 0.63447 = (mf)0.63431 + (embed)0.00016
Epoch 4 (train)0.0s: train loss = 0.58787 = (mf)0.58769 + (embed)0.00017
Epoch 5 (train)0.0s + (eval)0.1s: train loss = 0.52914 = (mf)0.52895 + (embed)0.00019, recall = 0.37063, ndcg = 0.31172, precision = 0.03706, map = 0.29280
Epoch 6 (train)0.0s: train loss = 0.46358 = (mf)0.46338 + (embed)0.00020
Epoch 7 (train)0.0s: train loss = 0.39826 = (mf)0.39804 + (embed)0.00022
Epoch 8 (train)0.0s: train loss = 0.33339 = (mf)0.33315 + (embed)0.00024
Epoch 9 (train)0.0s: train loss = 0.27119 = (mf)0.27093 + (embed)0.00026
Epoch 10 (train)0.0s + (eval)0.1s: train loss = 0.22347 = (mf)0.22319 + (embed)0.00028, recall = 0.37762, ndcg = 0.31480, precision = 0.03776, map = 0.29498
Epoch 11 (train)0.0s: train loss = 0.19158 = (mf)0.19128 + (embed)0.00030
Epoch 12 (train)0.0s: train l

In [None]:
topk_scores = modelLightGCN.recommend_k_items(test, top_k=TOP_K, remove_seen=False) # Remove seen is param to decide hidden item which user have seen
topk_scores.head()
print(topk_scores)

          userID  itemID  prediction
0     0105289402      12   11.243765
1     0105289402      37    7.435783
2     0105289402       6    5.420693
3     0105289402       9    3.457200
4     0105289402       4    1.497637
...          ...     ...         ...
2855  0999258356      25    1.009144
2856  0999258356       7    0.863402
2857  0999258356      19    0.748565
2858  0999258356      17    0.636532
2859  0999258356      10    0.434554

[2860 rows x 3 columns]


In [None]:
eval_map = map(test, topk_scores, k=TOP_K)
eval_ndcg = ndcg_at_k(test, topk_scores, k=TOP_K)
eval_precision = precision_at_k(test, topk_scores, k=TOP_K)
eval_recall = recall_at_k(test, topk_scores, k=TOP_K)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.646284
NDCG:	0.706381
Precision@K:	0.088811
Recall@K:	0.888112


In [None]:
# Record results for tests - ignore this cell
store_metadata("map", eval_map)
store_metadata("ndcg", eval_ndcg)
store_metadata("precision", eval_precision)
store_metadata("recall", eval_recall)

In [None]:
def recommend_items (user_id):
  return topk_scores[topk_scores['userID'] == user_id]['itemID'].values # Return type is array

recommend_items_for_user = recommend_items('0105289402')
print(recommend_items_for_user)
# print(recommend_items_for_user[0])

def predict_LightGCN (user_id):
  result = topk_scores[topk_scores['userID'] == user_id]
  return result.drop(columns=['userID'])

[12 37  6  9  4 25  7 17 19 34]


## LightGBM
Content based filtering

In [None]:
import os
import sys
import numpy as np
import lightgbm as lgb
import pandas as pd
import category_encoders as ce
from tempfile import TemporaryDirectory
from sklearn.metrics import roc_auc_score, log_loss

import recommenders.datasets.criteo as criteo
import recommenders.models.lightgbm.lightgbm_utils as lgb_utils
from recommenders.utils.notebook_utils import store_metadata

print("System version: {}".format(sys.version))
print("LightGBM version: {}".format(lgb.__version__))

System version: 3.10.12 (main, Mar 22 2024, 16:50:05) [GCC 11.4.0]
LightGBM version: 4.1.0


In [None]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# read dataset
data = pd.read_csv('./dataset.csv', encoding='utf-8', dtype={'userID': str}, sep=",")
from datetime import datetime

def calculate_age(birthday):
    today = datetime.today()
    birth_date = pd.to_datetime(birthday)
    age = today.year - birth_date.year - ((today.month, today.day) < (birth_date.month, birth_date.day))
    return age

# Add age column into DataFrame
data['age'] = data['birthday'].apply(calculate_age)

# Remove birthday column
data = data.drop(columns=['birthday'])

featuresTrain = ['userID', 'itemID', 'gender', 'age', 'itemPrice', 'rating', 'categoryID']
data = data[featuresTrain]
# data.head()

# user_features = data[['userID', 'gender', 'age']]
# item_features = data[['itemID', 'itemPrice', 'categoryID']]
# interactions = data[['userID', 'itemID', 'rating']]
# # print(interactions)

# # Merge user and item features
# interactions = interactions.merge(user_features, on='userID')
# interactions = interactions.merge(item_features, on='itemID')

# # print(interactions)

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
print(train_data)

          userID  itemID  gender  age  itemPrice  rating  categoryID
968   0435321829      12  Female   40      70000       5           3
240   0497848626      12  Female   41      70000       5           3
819   0597552837      12    Male   39      70000       5           3
692   0969163172      12    Male   53      70000       3           3
420   0580121827      12  Female   37      70000       5           3
...          ...     ...     ...  ...        ...     ...         ...
1130  0316499354      12    Male   40      70000       5           3
1294  0320477835      25    Male   30      82000       5           7
860   0100657606      12  Female   51      70000       5           3
1459  0871237272      37  Female   22      30000       4          11
1126  0259395934      12  Female   16      70000       5           3

[1600 rows x 7 columns]


In [None]:
# Tạo các feature và label cho mô hình
X_train = train_data.drop(['rating'], axis=1)
y_train = train_data['rating']
X_test = test_data.drop(['rating'], axis=1)
y_test = test_data['rating']

# Chuyển đổi cột gender trong tập huấn luyện
X_train['gender'] = pd.factorize(X_train['gender'])[0]
X_train['userID'] = X_train['userID'].astype(int)

# Chuyển đổi cột gender trong tập test
X_test['gender'] = pd.factorize(X_test['gender'])[0]
X_test['userID'] = X_test['userID'].astype(int)

# Tạo dataset cho LightGBM
train_dataset = lgb.Dataset(X_train, label=y_train)
test_dataset = lgb.Dataset(X_test, label=y_test, reference=train_dataset)

# Định nghĩa các tham số của mô hình
params = {
    'objective': 'regression',
    'metric': 'rmse'
}
# MAX_LEAF = 64
# MIN_DATA = 20
# NUM_OF_TREES = 100
# TREE_LEARNING_RATE = 0.15
# EARLY_STOPPING_ROUNDS = 20
# METRIC = "auc"
# SIZE = "sample"
# params = {
#     "task": "train",
#     "boosting_type": "gbdt",
#     "num_class": 1,
#     "objective": "binary",
#     "metric": METRIC,
#     "num_leaves": MAX_LEAF,
#     "min_data": MIN_DATA,
#     "boost_from_average": True,
#     # set it according to your cpu cores.
#     "num_threads": 20,
#     "feature_fraction": 0.8,
#     "learning_rate": TREE_LEARNING_RATE,
# }

# Huấn luyện mô hình
modelLightGBM = lgb.train(params, train_dataset, valid_sets=[test_dataset], callbacks=[lgb.early_stopping(10)])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.094394 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 367
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 6
[LightGBM] [Info] Start training from score 4.151250
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[9]	valid_0's rmse: 1.30004


In [None]:
# Dự đoán trên tập test
y_pred = modelLightGBM.predict(X_test)
# print(y_pred)

# Đánh giá mô hình
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')

# pred = model.predict('')

RMSE: 1.3000404196818385


In [276]:
def predict_LightGBM (user_id_predict, user_gender, user_age_predict):
  user_id_predict = int(user_id_predict)
  user_gender_predict = 0 if user_gender == 'Female' else 1

  df_item_predict = pd.read_json('./data_item.json').drop_duplicates(subset='_id')
  df_item_predict = df_item_predict[['_id', 'price', 'category_id']]
  df_item_predict.columns = ['itemID', 'itemPrice', 'categoryID']
  df_item_predict_length = df_item_predict.shape[0]

  df_user_predict = pd.DataFrame({
      'userID': [user_id_predict] * df_item_predict_length,  # Tạo 1000 hàng với cùng một user
      'gender': [user_gender_predict] * df_item_predict_length,  # Thông tin về user
      'age': [user_age_predict] * df_item_predict_length,  # Thông tin về user
  })

  predict_data = pd.concat([df_user_predict, df_item_predict], axis=1)

  # Dự đoán điểm số cho các item
  predict_data_result = modelLightGBM.predict(predict_data)

  # Thêm cột dự đoán vào dataframe
  predict_data['prediction_lightgbm'] = predict_data_result

  recommended_items = predict_data.sort_values(by='prediction_lightgbm', ascending=False)
  TOP = 10
  top_N_recommendations = recommended_items.head(TOP)
  return top_N_recommendations[['itemID', 'prediction_lightgbm']]

## Combine Two Models

In [279]:
# Gọi đến hàm của 2 mô hình trên và training
# Kết hợp 2 mô hình

user_id_predict = '0105289402' #0435321829
user_gender = 'Female'
user_age_predict = 25

recommend_items_LightGBM = predict_LightGBM(user_id_predict, user_gender, user_age_predict)
recommend_items_LightGCN = predict_LightGCN(user_id_predict)

# print(recommend_items_LightGBM)
# print(recommend_items_LightGCN)

df_combined = pd.merge(recommend_items_LightGCN, recommend_items_LightGBM, on=['itemID'], how='outer')
df_combined['prediction'].fillna(0, inplace=True)
df_combined['prediction_lightgbm'].fillna(0, inplace=True)
# print(df_combined)

df_combined['predicted_score_avg'] = (df_combined['prediction_lightgbm'] + df_combined['prediction']) / 2
print(df_combined)

    itemID  prediction  prediction_lightgbm  predicted_score_avg
0       12   11.243765             4.281925             7.762845
1       37    7.435783             0.000000             3.717892
2        6    5.420693             3.979540             4.700116
3        9    3.457200             0.000000             1.728600
4        4    1.497637             4.228589             2.863113
5       25    1.360184             4.176715             2.768450
6        7    1.172407             0.000000             0.586204
7       17    0.899385             0.000000             0.449693
8       19    0.736516             0.000000             0.368258
9       34    0.669944             0.000000             0.334972
10      23    0.000000             4.176715             2.088358
11      22    0.000000             4.176715             2.088358
12      18    0.000000             4.143888             2.071944
13       1    0.000000             3.979540             1.989770
14      11    0.000000   

In [281]:
# Sắp xếp theo điểm số dự đoán
df_combined_sorted = df_combined.sort_values(by='predicted_score_avg', ascending=False)

# Chọn top-N item để gợi ý cho người dùng
top_n_items = df_combined_sorted.head(10)
print(top_n_items)

    itemID  prediction  prediction_lightgbm  predicted_score_avg
0       12   11.243765             4.281925             7.762845
2        6    5.420693             3.979540             4.700116
1       37    7.435783             0.000000             3.717892
4        4    1.497637             4.228589             2.863113
5       25    1.360184             4.176715             2.768450
10      23    0.000000             4.176715             2.088358
11      22    0.000000             4.176715             2.088358
12      18    0.000000             4.143888             2.071944
13       1    0.000000             3.979540             1.989770
14      11    0.000000             3.979540             1.989770
