# Training Machine Learning with Some Models
Hybrid Collaborative Filtering and Content-Based Filtering for Improved Recommender System

## LightGCN
Collaborative filtering

In [1]:
pip install recommenders pandera

Collecting recommenders
  Downloading recommenders-1.2.0-py3-none-any.whl (356 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m356.0/356.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandera
  Downloading pandera-0.20.3-py3-none-any.whl (255 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.6/255.6 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting category-encoders<3,>=2.6.0 (from recommenders)
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cornac<2,>=1.15.2 (from recommenders)
  Downloading cornac-1.18.0-cp310-cp310-manylinux1_x86_64.whl (21.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m
Collecting lightfm<2,>=1.17 (from recommenders)
  Downloading lightfm-1.17.tar.gz (316 kB)


In [2]:
import sys
import os
import pandas as pd
import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.models.deeprec.deeprec_utils import prepare_hparams
from recommenders.utils.notebook_utils import store_metadata

print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"Tensorflow version: {tf.__version__}")

System version: 3.10.12 (main, Mar 22 2024, 16:50:05) [GCC 11.4.0]
Pandas version: 2.0.3
Tensorflow version: 2.15.0


In [3]:
# import essential libraries
import pandas as pd
import numpy as np

# read dataset
data = pd.read_csv('./dataset.csv', encoding='utf-8', dtype={'userID': str}, sep=",")
data.head()

Unnamed: 0,userID,birthday,gender,itemID,itemName,itemPrice,categoryID,rating,timestamp
0,577980460,1988-12-17,Female,22,Lẩu nấm thập cẩm,157000,6,5,1365811200
1,517030417,2006-12-25,Female,29,Rau thêm,10000,8,5,1341100800
2,265129175,2014-01-24,Female,2,Chả giò,80000,1,1,1367193600
3,631007419,2000-07-23,Male,2,Chả giò,80000,1,3,1374451200
4,484119129,1977-05-22,Female,2,Chả giò,80000,1,1,1334707200


In [4]:
from datetime import datetime

def calculate_age(birthday):
    today = datetime.today()
    birth_date = pd.to_datetime(birthday)
    age = today.year - birth_date.year - ((today.month, today.day) < (birth_date.month, birth_date.day))
    return age

# Add age column into DataFrame
data['age'] = data['birthday'].apply(calculate_age)

# Remove birthday column
data = data.drop(columns=['birthday'])

data.head()

Unnamed: 0,userID,gender,itemID,itemName,itemPrice,categoryID,rating,timestamp,age
0,577980460,Female,22,Lẩu nấm thập cẩm,157000,6,5,1365811200,35
1,517030417,Female,29,Rau thêm,10000,8,5,1341100800,17
2,265129175,Female,2,Chả giò,80000,1,1,1367193600,10
3,631007419,Male,2,Chả giò,80000,1,3,1374451200,24
4,484119129,Female,2,Chả giò,80000,1,1,1334707200,47


In [5]:
featuresTrain = ['userID', 'itemID', 'gender', 'age', 'itemPrice', 'rating', 'categoryID', 'timestamp']
data = data[featuresTrain]
data.head()

Unnamed: 0,userID,itemID,gender,age,itemPrice,rating,categoryID,timestamp
0,577980460,22,Female,35,157000,5,6,1365811200
1,517030417,29,Female,17,10000,5,8,1341100800
2,265129175,2,Female,10,80000,1,1,1367193600
3,631007419,2,Male,24,80000,3,1,1374451200
4,484119129,2,Female,47,80000,1,1,1334707200


In [6]:
train, test = python_stratified_split(data, ratio=0.8)
print(train)

          userID  itemID  gender  age  itemPrice  rating  categoryID  \
141   0100130890      14  Female   13      45000       2           4   
860   0100657606      12  Female   51      70000       5           3   
1918  0101089548      37  Female   43      30000       3          11   
566   0101089548      12  Female   43      70000       5           3   
776   0101872029      12    Male   41      70000       5           3   
...          ...     ...     ...  ...        ...     ...         ...   
850   0999258356      12    Male   23      70000       5           3   
861   0999258356      12    Male   23      70000       5           3   
792   0999750397      12    Male   49      70000       4           3   
1253  0999823778      34    Male   26      15000       1          10   
1138  0999823778      12    Male   26      70000       5           3   

       timestamp  
141   1382832000  
860   1387670400  
1918  1293667200  
566   1404604800  
776   1353542400  
...          ...  
85

In [7]:
SEED = DEFAULT_SEED  # Set None for non-deterministic results

data1 = ImplicitCF(train=train, test=test, seed=SEED)

In [8]:
yaml_file = "./lightgcn.yaml"
# top k items to recommend
TOP_K = 10

# Model parameters
EPOCHS = 50
# BATCH_SIZE = 1024
BATCH_SIZE = 2048

# hparams = prepare_hparams(yaml_file,
#                           n_layers=3,
#                           batch_size=BATCH_SIZE,
#                           epochs=EPOCHS,
#                           learning_rate=0.005,
#                           eval_epoch=5,
#                           top_k=TOP_K,
#                          )
hparams = prepare_hparams(yaml_file,
                          n_layers=4,
                          batch_size=BATCH_SIZE,
                          epochs=EPOCHS,
                          learning_rate=0.01,
                          eval_epoch=5,
                          top_k=TOP_K,
                         )

In [9]:
model = LightGCN(hparams, data1, seed=SEED)

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


In [19]:
with Timer() as train_time:
    model.fit()

print("Took {} seconds for training.".format(train_time.interval))

Epoch 1 (train)0.0s: train loss = 0.04113 = (mf)0.04007 + (embed)0.00106
Epoch 2 (train)0.0s: train loss = 0.04334 = (mf)0.04227 + (embed)0.00107
Epoch 3 (train)0.0s: train loss = 0.03529 = (mf)0.03419 + (embed)0.00110
Epoch 4 (train)0.0s: train loss = 0.04341 = (mf)0.04232 + (embed)0.00110
Epoch 5 (train)0.0s + (eval)0.1s: train loss = 0.02997 = (mf)0.02886 + (embed)0.00111, recall = 0.37413, ndcg = 0.29052, precision = 0.03741, map = 0.26454
Epoch 6 (train)0.0s: train loss = 0.03110 = (mf)0.02996 + (embed)0.00114
Epoch 7 (train)0.0s: train loss = 0.02990 = (mf)0.02875 + (embed)0.00114
Epoch 8 (train)0.0s: train loss = 0.02740 = (mf)0.02624 + (embed)0.00116
Epoch 9 (train)0.0s: train loss = 0.03635 = (mf)0.03519 + (embed)0.00117
Epoch 10 (train)0.0s + (eval)0.1s: train loss = 0.02309 = (mf)0.02190 + (embed)0.00118, recall = 0.37063, ndcg = 0.28596, precision = 0.03706, map = 0.25988
Epoch 11 (train)0.0s: train loss = 0.03016 = (mf)0.02899 + (embed)0.00117
Epoch 12 (train)0.0s: train l

In [11]:
topk_scores = model.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

topk_scores.head(10)

Unnamed: 0,userID,itemID,prediction
0,105289402,9,3.875465
1,105289402,4,1.751545
2,105289402,25,1.407773
3,105289402,7,1.132137
4,105289402,34,0.943301
5,105289402,17,0.930095
6,105289402,19,0.747171
7,105289402,10,0.493352
8,105289402,11,0.481887
9,105289402,13,0.3806


In [20]:
eval_map = map(test, topk_scores, k=TOP_K)
eval_ndcg = ndcg_at_k(test, topk_scores, k=TOP_K)
eval_precision = precision_at_k(test, topk_scores, k=TOP_K)
eval_recall = recall_at_k(test, topk_scores, k=TOP_K)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.275654
NDCG:	0.299983
Precision@K:	0.037762
Recall@K:	0.377622


In [None]:
# Record results for tests - ignore this cell
store_metadata("map", eval_map)
store_metadata("ndcg", eval_ndcg)
store_metadata("precision", eval_precision)
store_metadata("recall", eval_recall)

In [18]:
def recommend_items (user_id):
  return topk_scores[topk_scores['userID'] == user_id]['itemID'].values # Return type is array

recommend_items_for_user = recommend_items('0105289402')
print(recommend_items_for_user)
print(recommend_items_for_user[0])

[ 9  4 25  7 34 17 19 10 11 13]
9


## LightGBM
Content based filtering

In [None]:
pip install

In [21]:
import os
import sys
import numpy as np
import lightgbm as lgb
import pandas as pd
import category_encoders as ce
from tempfile import TemporaryDirectory
from sklearn.metrics import roc_auc_score, log_loss

import recommenders.datasets.criteo as criteo
import recommenders.models.lightgbm.lightgbm_utils as lgb_utils
from recommenders.utils.notebook_utils import store_metadata

print("System version: {}".format(sys.version))
print("LightGBM version: {}".format(lgb.__version__))

System version: 3.10.12 (main, Mar 22 2024, 16:50:05) [GCC 11.4.0]
LightGBM version: 4.1.0


In [111]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# read dataset
data = pd.read_csv('./dataset.csv', encoding='utf-8', dtype={'userID': str}, sep=",")
from datetime import datetime

def calculate_age(birthday):
    today = datetime.today()
    birth_date = pd.to_datetime(birthday)
    age = today.year - birth_date.year - ((today.month, today.day) < (birth_date.month, birth_date.day))
    return age

# Add age column into DataFrame
data['age'] = data['birthday'].apply(calculate_age)

# Remove birthday column
data = data.drop(columns=['birthday'])

featuresTrain = ['userID', 'itemID', 'gender', 'age', 'itemPrice', 'rating', 'categoryID']
data = data[featuresTrain]
# data.head()

# user_features = data[['userID', 'gender', 'age']]
# item_features = data[['itemID', 'itemPrice', 'categoryID']]
# interactions = data[['userID', 'itemID', 'rating']]
# # print(interactions)

# # Merge user and item features
# interactions = interactions.merge(user_features, on='userID')
# interactions = interactions.merge(item_features, on='itemID')

# # print(interactions)

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
print(train_data)

          userID  itemID  gender  age  itemPrice  rating  categoryID
968   0435321829      12  Female   40      70000       5           3
240   0497848626      12  Female   41      70000       5           3
819   0597552837      12    Male   39      70000       5           3
692   0969163172      12    Male   53      70000       3           3
420   0580121827      12  Female   37      70000       5           3
...          ...     ...     ...  ...        ...     ...         ...
1130  0316499354      12    Male   40      70000       5           3
1294  0320477835      25    Male   30      82000       5           7
860   0100657606      12  Female   51      70000       5           3
1459  0871237272      37  Female   22      30000       4          11
1126  0259395934      12  Female   16      70000       5           3

[1600 rows x 7 columns]


In [112]:
# Tạo các feature và label cho mô hình
X_train = train_data.drop(['rating'], axis=1)
y_train = train_data['rating']
X_test = test_data.drop(['rating'], axis=1)
y_test = test_data['rating']

# Chuyển đổi cột gender trong tập huấn luyện
X_train['gender'] = pd.factorize(X_train['gender'])[0]
X_train['userID'] = X_train['userID'].astype(int)

# Chuyển đổi cột gender trong tập test
X_test['gender'] = pd.factorize(X_test['gender'])[0]
X_test['userID'] = X_test['userID'].astype(int)

# Tạo dataset cho LightGBM
train_dataset = lgb.Dataset(X_train, label=y_train)
test_dataset = lgb.Dataset(X_test, label=y_test, reference=train_dataset)

# Định nghĩa các tham số của mô hình
params = {
    'objective': 'regression',
    'metric': 'rmse'
}
# MAX_LEAF = 64
# MIN_DATA = 20
# NUM_OF_TREES = 100
# TREE_LEARNING_RATE = 0.15
# EARLY_STOPPING_ROUNDS = 20
# METRIC = "auc"
# SIZE = "sample"
# params = {
#     "task": "train",
#     "boosting_type": "gbdt",
#     "num_class": 1,
#     "objective": "binary",
#     "metric": METRIC,
#     "num_leaves": MAX_LEAF,
#     "min_data": MIN_DATA,
#     "boost_from_average": True,
#     # set it according to your cpu cores.
#     "num_threads": 20,
#     "feature_fraction": 0.8,
#     "learning_rate": TREE_LEARNING_RATE,
# }

# Huấn luyện mô hình
model = lgb.train(params, train_dataset, valid_sets=[test_dataset], callbacks=[lgb.early_stopping(10)])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000140 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 367
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 6
[LightGBM] [Info] Start training from score 4.151250
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[9]	valid_0's rmse: 1.30004


In [113]:
# Dự đoán trên tập test
y_pred = model.predict(X_test)
# print(y_pred)

# Đánh giá mô hình
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')

# pred = model.predict('')

RMSE: 1.3000404196818385


In [117]:
user_id_predict = '0435321829'
user_gender = 'Female'
user_age_predict = 25

user_id_predict = int(user_id_predict)
user_gender_predict = 0 if user_gender == 'Female' else 1

df_item_predict = pd.read_json('./data_item.json').drop_duplicates(subset='_id')
df_item_predict = df_item_predict[['_id', 'price', 'category_id']]
df_item_predict.columns = ['itemID', 'itemPrice', 'categoryID']
df_item_predict_length = df_item_predict.shape[0]

df_user_predict = pd.DataFrame({
    'userID': [user_id_predict] * df_item_predict_length,  # Tạo 1000 hàng với cùng một user
    'gender': [user_gender_predict] * df_item_predict_length,  # Thông tin về user
    'age': [user_age_predict] * df_item_predict_length,  # Thông tin về user
})

predict_data = pd.concat([df_user_predict, df_item_predict], axis=1)

# Dự đoán điểm số cho các item
predict_data_result = model.predict(predict_data)

# Thêm cột dự đoán vào dataframe
predict_data['predicted_score'] = predict_data_result

recommended_items = predict_data.sort_values(by='predicted_score', ascending=False)
TOP = 10
top_N_recommendations = recommended_items.head(TOP)
print(top_N_recommendations[['itemID', 'predicted_score']])

    itemID  predicted_score
28      29         4.217958
27      28         4.217958
26      27         4.217958
25      26         4.217958
24      25         4.205858
21      22         4.205858
22      23         4.205858
11      12         4.177599
19      20         4.162915
18      19         4.162915
itemID  predicted_score
12      4.177599           1
19      4.162915           1
20      4.162915           1
22      4.205858           1
23      4.205858           1
25      4.205858           1
26      4.217958           1
27      4.217958           1
28      4.217958           1
29      4.217958           1
Name: count, dtype: int64
