# Training Machine Learning with Some Models
Hybrid Recommend Technique with Collaborative Filtering and Content-Based Filtering for Improved Recommender System

## Install and import essential libraries

In [1]:
pip install recommenders pandera Flask

Collecting recommenders
  Downloading recommenders-1.2.0-py3-none-any.whl.metadata (13 kB)
Collecting pandera
  Downloading pandera-0.20.4-py3-none-any.whl.metadata (15 kB)
Collecting category-encoders<3,>=2.6.0 (from recommenders)
  Downloading category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting cornac<2,>=1.15.2 (from recommenders)
  Downloading cornac-1.18.0-cp310-cp310-manylinux1_x86_64.whl.metadata (23 kB)
Collecting lightfm<2,>=1.17 (from recommenders)
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting locust<3,>=2.12.2 (from recommenders)
  Downloading locust-2.31.8-py3-none-any.whl.metadata (7.7 kB)
Collecting memory-profiler<1,>=0.61.0 (from recommenders)
  Downloading memory_profiler-0.61.0-py3-none-any.whl.metadata (20 kB)
Collecting notebook<8,>=7.0.0 (from recommenders)


In [2]:
# import essential libraries
import sys
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import lightgbm as lgb
import category_encoders as ce
import recommenders.models.lightgbm.lightgbm_utils as lgb_utils
import lightgbm as lgb
import math
tf.get_logger().setLevel('ERROR') # only show error messages
from sklearn.metrics import mean_squared_error
from datetime import datetime
from tempfile import TemporaryDirectory
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import train_test_split
from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.models.deeprec.deeprec_utils import prepare_hparams

print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print("LightGBM version: {}".format(lgb.__version__))
print(f"Tensorflow version: {tf.__version__}")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



System version: 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0]
Pandas version: 2.2.2
LightGBM version: 4.5.0
Tensorflow version: 2.17.0


## Read dataset

In [3]:
dataset = pd.read_csv('./dataset.csv', encoding='utf-8', dtype={'userID': str}, sep=",")
dataset.head()

Unnamed: 0,userID,birthday,gender,itemID,itemName,itemPrice,categoryID,rating,timestamp
0,577980460,1988-12-17,Female,22,Lẩu nấm thập cẩm,157000,6,5.0,1365811200
1,517030417,,Female,29,Rau thêm,10000,8,,1341100800
2,265129175,2014-01-24,Female,2,Chả giò,80000,1,,1367193600
3,631007419,2000-07-23,Male,2,Chả giò,80000,1,3.0,1374451200
4,484119129,1977-05-22,Female,2,Chả giò,80000,1,1.0,1334707200


## Data Preprocessing

In [4]:
def calculate_age(birthday):
    today = datetime.today()
    birth_date = pd.to_datetime(birthday)
    age = today.year - birth_date.year - ((today.month, today.day) < (birth_date.month, birth_date.day))
    return age

# Add age column into DataFrame
if 'birthday' in dataset.columns:
  dataset['age'] = dataset['birthday'].apply(calculate_age)

# Remove birthday column
dataset = dataset.drop(columns=['birthday'], errors='ignore')

dataset.head()

Unnamed: 0,userID,gender,itemID,itemName,itemPrice,categoryID,rating,timestamp,age
0,577980460,Female,22,Lẩu nấm thập cẩm,157000,6,5.0,1365811200,35.0
1,517030417,Female,29,Rau thêm,10000,8,,1341100800,
2,265129175,Female,2,Chả giò,80000,1,,1367193600,10.0
3,631007419,Male,2,Chả giò,80000,1,3.0,1374451200,24.0
4,484119129,Female,2,Chả giò,80000,1,1.0,1334707200,47.0


In [5]:
# Encode Categorical Data
dataset['gender'] = dataset['gender'].map({'Female': 0, 'Male': 1})

In [6]:
# Data missing handler
dataset['age'].fillna(round(dataset['age'].mean()), inplace=True)
dataset['age'] = dataset['age'].astype(int) # Convert integer type

# Not every user will rate the dish after order, so it is necessary to handle missing rating data

num_missing_ratings = dataset['rating'].isnull().sum()
if num_missing_ratings > 0:
  from sklearn.impute import KNNImputer
  # Replace the missing value with the average value of that dishes
  feature_user_item = dataset[['userID', 'itemID', 'age', 'gender', 'categoryID', 'itemPrice', 'rating']]
  # For each missing value, KNNImputer finds the k nearest neighbors and calculates a replacement value based on the values ​​of those neighbors. If all user and item features are included, KNNImputer will rely on these characteristics to calculate the distance and fill in missing values.
  imputer = KNNImputer(n_neighbors=5) # Create a KNNImputer object with k = 5
  data_imputed = imputer.fit_transform(feature_user_item) # Apply KNNImputer
  dataset['rating'] = pd.DataFrame(data_imputed)[[6]] # 6 is column's name (as well as index column)
  dataset['rating'] = dataset['rating'].astype(int) # Convert integer type

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['age'].fillna(round(dataset['age'].mean()), inplace=True)


In [7]:
# Feature selection
featuresUsed = ['userID', 'itemID', 'gender', 'age', 'itemPrice', 'rating', 'categoryID', 'timestamp']
dataset = dataset[featuresUsed]
dataset.head()
#Feature scaling: no need

Unnamed: 0,userID,itemID,gender,age,itemPrice,rating,categoryID,timestamp
0,577980460,22,0,35,157000,5,6,1365811200
1,517030417,29,0,31,10000,3,8,1341100800
2,265129175,2,0,10,80000,3,1,1367193600
3,631007419,2,1,24,80000,3,1,1374451200
4,484119129,2,0,47,80000,1,1,1334707200


In [8]:
# Split dataset
train, test = python_stratified_split(dataset, ratio=0.8)
# print(train)

## Training LightGCN Model

In [16]:
TOP_K = 10 # top k items to recommend

def Setup_LightGCN (train, test):
  SEED = DEFAULT_SEED  # Set None for non-deterministic results
  data = ImplicitCF(train=train, test=test, seed=SEED)
  yaml_file = "./lightgcn.yaml"
  # Model parameters
  EPOCHS = 50
  BATCH_SIZE = 2048
  hparams = prepare_hparams(yaml_file,
                          n_layers=4,
                          batch_size=BATCH_SIZE,
                          epochs=EPOCHS,
                          learning_rate=0.01,
                          eval_epoch=5,
                          top_k=TOP_K,
                         )
  modelLightGCN = LightGCN(hparams, data, seed=SEED)
  return modelLightGCN

def Training_LightGCN(modelLightGCN):
  with Timer() as train_time:
    modelLightGCN.fit()
  print("Took {} seconds for training.".format(train_time.interval))
  topk_scores = modelLightGCN.recommend_k_items(test, top_k=TOP_K, remove_seen=False) # Remove seen is param to decide hidden item which user have seen
  # print(topk_scores)
  return topk_scores, modelLightGCN

def predict_LightGCN (user_id_predict): # for combine 2 models
  try:
    predict_data = pd.DataFrame({
      'userID': [user_id_predict]
    })
    # predict score for items
    result = modelLightGCN.recommend_k_items(predict_data, top_k=TOP_K, remove_seen=False)
    return result.drop(columns=['userID']) # Drop column to have the same size columns with LightGBM
  except Exception as e:
    print(str(e))
    # return pd.DataFrame(columns = ['itemID', 'prediction'])
    top_10_items = dataset.groupby('itemID')['rating'].mean().nlargest(10).reset_index()
    top_10_items.columns = ['itemID', 'prediction']
    return top_10_items

# Call train, evaluate functions
modelLightGCN = Setup_LightGCN(train, test)
topk_scores, modelLightGCN = Training_LightGCN(modelLightGCN)
#print(predict_LightGCN ('0105289402')) #0435321829 #0111111

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.
Epoch 1 (train)0.1s: train loss = 0.68599 = (mf)0.68585 + (embed)0.00014
Epoch 2 (train)0.0s: train loss = 0.66762 = (mf)0.66747 + (embed)0.00015
Epoch 3 (train)0.0s: train loss = 0.63605 = (mf)0.63589 + (embed)0.00016
Epoch 4 (train)0.0s: train loss = 0.59065 = (mf)0.59047 + (embed)0.00018


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


Epoch 5 (train)0.0s + (eval)0.2s: train loss = 0.53327 = (mf)0.53308 + (embed)0.00019, recall = 0.39860, ndcg = 0.32126, precision = 0.03986, map = 0.29727
Epoch 6 (train)0.0s: train loss = 0.46883 = (mf)0.46862 + (embed)0.00020
Epoch 7 (train)0.0s: train loss = 0.40443 = (mf)0.40421 + (embed)0.00022
Epoch 8 (train)0.0s: train loss = 0.33921 = (mf)0.33897 + (embed)0.00024
Epoch 9 (train)0.0s: train loss = 0.27993 = (mf)0.27967 + (embed)0.00026
Epoch 10 (train)0.0s + (eval)0.1s: train loss = 0.24131 = (mf)0.24103 + (embed)0.00028, recall = 0.39860, ndcg = 0.32360, precision = 0.03986, map = 0.30005
Epoch 11 (train)0.0s: train loss = 0.19983 = (mf)0.19953 + (embed)0.00030
Epoch 12 (train)0.0s: train loss = 0.17293 = (mf)0.17260 + (embed)0.00033
Epoch 13 (train)0.0s: train loss = 0.15571 = (mf)0.15536 + (embed)0.00035
Epoch 14 (train)0.0s: train loss = 0.14834 = (mf)0.14796 + (embed)0.00038


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


Epoch 15 (train)0.0s + (eval)0.1s: train loss = 0.12322 = (mf)0.12282 + (embed)0.00040, recall = 0.40210, ndcg = 0.32460, precision = 0.04021, map = 0.30042
Epoch 16 (train)0.0s: train loss = 0.10951 = (mf)0.10909 + (embed)0.00043
Epoch 17 (train)0.0s: train loss = 0.11272 = (mf)0.11226 + (embed)0.00045
Epoch 18 (train)0.0s: train loss = 0.11214 = (mf)0.11166 + (embed)0.00048
Epoch 19 (train)0.0s: train loss = 0.11652 = (mf)0.11601 + (embed)0.00050
Epoch 20 (train)0.0s + (eval)0.1s: train loss = 0.11679 = (mf)0.11626 + (embed)0.00053, recall = 0.40210, ndcg = 0.32455, precision = 0.04021, map = 0.30038
Epoch 21 (train)0.0s: train loss = 0.09676 = (mf)0.09621 + (embed)0.00055
Epoch 22 (train)0.0s: train loss = 0.08030 = (mf)0.07972 + (embed)0.00058
Epoch 23 (train)0.0s: train loss = 0.09795 = (mf)0.09735 + (embed)0.00060
Epoch 24 (train)0.0s: train loss = 0.11035 = (mf)0.10973 + (embed)0.00063


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


Epoch 25 (train)0.0s + (eval)0.1s: train loss = 0.09752 = (mf)0.09688 + (embed)0.00064, recall = 0.40210, ndcg = 0.32324, precision = 0.04021, map = 0.29862
Epoch 26 (train)0.0s: train loss = 0.08586 = (mf)0.08519 + (embed)0.00067
Epoch 27 (train)0.0s: train loss = 0.11335 = (mf)0.11266 + (embed)0.00069
Epoch 28 (train)0.0s: train loss = 0.08143 = (mf)0.08072 + (embed)0.00071
Epoch 29 (train)0.0s: train loss = 0.09189 = (mf)0.09116 + (embed)0.00073
Epoch 30 (train)0.0s + (eval)0.0s: train loss = 0.09767 = (mf)0.09692 + (embed)0.00075, recall = 0.39510, ndcg = 0.31592, precision = 0.03951, map = 0.29119
Epoch 31 (train)0.0s: train loss = 0.05380 = (mf)0.05303 + (embed)0.00077
Epoch 32 (train)0.0s: train loss = 0.06910 = (mf)0.06832 + (embed)0.00079
Epoch 33 (train)0.0s: train loss = 0.06809 = (mf)0.06728 + (embed)0.00081
Epoch 34 (train)0.0s: train loss = 0.06438 = (mf)0.06356 + (embed)0.00083


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


Epoch 35 (train)0.0s + (eval)0.1s: train loss = 0.07312 = (mf)0.07228 + (embed)0.00084, recall = 0.39161, ndcg = 0.31324, precision = 0.03916, map = 0.28896
Epoch 36 (train)0.0s: train loss = 0.07829 = (mf)0.07743 + (embed)0.00086
Epoch 37 (train)0.0s: train loss = 0.06520 = (mf)0.06432 + (embed)0.00088
Epoch 38 (train)0.0s: train loss = 0.06999 = (mf)0.06910 + (embed)0.00089
Epoch 39 (train)0.0s: train loss = 0.07028 = (mf)0.06936 + (embed)0.00092
Epoch 40 (train)0.0s + (eval)0.1s: train loss = 0.05349 = (mf)0.05257 + (embed)0.00092, recall = 0.38462, ndcg = 0.30761, precision = 0.03846, map = 0.28346
Epoch 41 (train)0.0s: train loss = 0.04953 = (mf)0.04859 + (embed)0.00095
Epoch 42 (train)0.0s: train loss = 0.05174 = (mf)0.05079 + (embed)0.00095
Epoch 43 (train)0.0s: train loss = 0.05387 = (mf)0.05291 + (embed)0.00097
Epoch 44 (train)0.0s: train loss = 0.06462 = (mf)0.06364 + (embed)0.00099
Epoch 45 (train)0.0s + (eval)0.1s: train loss = 0.04660 = (mf)0.04558 + (embed)0.00101, recall

  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


In [None]:
def Evaluate_LightGCN(topk_scores):
  eval_map = map(test, topk_scores, k=TOP_K)
  eval_ndcg = ndcg_at_k(test, topk_scores, k=TOP_K)
  eval_precision = precision_at_k(test, topk_scores, k=TOP_K)
  eval_recall = recall_at_k(test, topk_scores, k=TOP_K)
  print("MAP:\t%f" % eval_map,
        "NDCG:\t%f" % eval_ndcg,
        "Precision@K:\t%f" % eval_precision,
        "Recall@K:\t%f" % eval_recall, sep='\n')

Evaluate_LightGCN(topk_scores)

## Training LightGBM Model

In [17]:
if 'timestamp' in train.columns:
  train.drop(columns=['timestamp'], inplace=True) # Not essential in this model, isn't feature of user or item
if 'timestamp' in test.columns:
  test.drop(columns=['timestamp'], inplace=True) # Not essential in this model, isn't feature of user or item
# dataset.head()
# print(train)
# print(test)

In [18]:
# Create feature and label for model
def Create_Feature_Label (train, test):
  X_train = train.drop(['rating'], axis=1)
  y_train = train['rating']
  X_test = test.drop(['rating'], axis=1)
  y_test = test['rating']

  # The model only accepts numbers, not strings or objects, so the data needs to be converted
  # Convert gender, userID columns in train data into number or float
  X_train['userID'] = X_train['userID'].astype(int)

  # Convert gender, userID columns in test data into number or float
  X_test['userID'] = X_test['userID'].astype(int)

  # Create dataset for LightGBM
  train_dataset = lgb.Dataset(X_train, label=y_train)
  test_dataset = lgb.Dataset(X_test, label=y_test, reference=train_dataset)
  return train_dataset, test_dataset, X_test, y_test

train_dataset, test_dataset, X_test, y_test = Create_Feature_Label(train, test)

In [19]:
# print(train_dataset)
# print(test_dataset)

In [20]:
# Training LightGBM Model
params = {
    'objective': 'regression',
    'metric': 'rmse'
}

modelLightGBM = lgb.train(params, train_dataset, valid_sets=[test_dataset], callbacks=[lgb.early_stopping(10)])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000165 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 1714, number of used features: 6
[LightGBM] [Info] Start training from score 4.151692
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[9]	valid_0's rmse: 1.35464


In [21]:
# Evaluation
def Evaluate_LightGBM(X_test, y_test):
  # predict on test data
  y_pred = modelLightGBM.predict(X_test)

  # Evaluatate model
  rmse = np.sqrt(mean_squared_error(y_test, y_pred))
  print(f'RMSE: {rmse}')

Evaluate_LightGBM(X_test, y_test)

RMSE: 1.354642829624231


In [22]:
# For combie two models
def predict_LightGBM (user_id_predict, user_gender, user_age_predict):
  user_id_predict = int(user_id_predict)
  user_gender_predict = 0 if user_gender == 'Female' else 1

  df_item_predict = pd.read_json('./data_item.json').drop_duplicates(subset='_id')
  df_item_predict = df_item_predict[['_id', 'price', 'category_id']]
  df_item_predict.columns = ['itemID', 'itemPrice', 'categoryID']
  df_item_predict_length = df_item_predict.shape[0]

  df_user_predict = pd.DataFrame({
      'userID': [user_id_predict] * df_item_predict_length,  # Create n rows with the same user
      'gender': [user_gender_predict] * df_item_predict_length,  # user info
      'age': [user_age_predict] * df_item_predict_length,  # user info
  })

  predict_data = pd.concat([df_user_predict, df_item_predict], axis=1)

  # predict score for items
  predict_data_result = modelLightGBM.predict(predict_data)

  # add prediction columns into df
  predict_data['prediction_lightgbm'] = predict_data_result

  recommended_items = predict_data.sort_values(by='prediction_lightgbm', ascending=False)
  top_N_recommendations = recommended_items.head(TOP_K)
  return top_N_recommendations[['itemID', 'prediction_lightgbm']]

## Combine two models and API integration

In [23]:
def recommend_items(user_id_predict, user_gender, user_age_predict):
  recommend_items_LightGBM = predict_LightGBM(user_id_predict, user_gender, user_age_predict) # print(recommend_items_LightGBM)
  recommend_items_LightGCN = predict_LightGCN(user_id_predict) # print(recommend_items_LightGCN)
  df_combined = pd.merge(recommend_items_LightGCN, recommend_items_LightGBM, on=['itemID'], how='outer')
  df_combined['prediction'].fillna(0, inplace=True)
  df_combined['prediction_lightgbm'].fillna(0, inplace=True)
  df_combined['predicted_score_avg'] = (df_combined['prediction_lightgbm'] + df_combined['prediction']) / 2
  print(df_combined)
  df_combined_sorted = df_combined.sort_values(by='predicted_score_avg', ascending=False) # Sort by predict_score
  # print(df_combined_sorted)
  top_n_items = df_combined_sorted.head(TOP_K) # Choose top-N item to recommend
  return top_n_items['itemID'].values.tolist()

# Testing
# print(recommend_items('0105289402', 'Female', 25)) # 0435321829

In [25]:
# API integration
from flask import Flask, jsonify, request
import urllib.parse
import json
import random

app = Flask(__name__)

def generate_random_phone_number():
    return f'0{random.randint(100000000, 999999999)}'

def retrain_models():
    dataset = pd.read_csv('./dataset.csv', encoding='utf-8', dtype={'userID': str}, sep=",")
    # Add age column into DataFrame
    if 'birthday' in dataset.columns:
      dataset['age'] = dataset['birthday'].apply(calculate_age)
    # Encode Categorical Data
    dataset['gender'] = dataset['gender'].map({'Female': 0, 'Male': 1})
    # Data missing handler
    dataset['age'].fillna(round(dataset['age'].mean()), inplace=True)
    dataset['age'] = dataset['age'].astype(int) # Convert integer type

    # Not every user will rate the dish after order, so it is necessary to handle missing rating data

    num_missing_ratings = dataset['rating'].isnull().sum()
    if num_missing_ratings > 0:
      from sklearn.impute import KNNImputer
      # Replace the missing value with the average value of that dishes
      feature_user_item = dataset[['userID', 'itemID', 'age', 'gender', 'categoryID', 'itemPrice', 'rating']]
      # For each missing value, KNNImputer finds the k nearest neighbors and calculates a replacement value based on the values ​​of those neighbors. If all user and item features are included, KNNImputer will rely on these characteristics to calculate the distance and fill in missing values.
      imputer = KNNImputer(n_neighbors=5) # Create a KNNImputer object with k = 5
      data_imputed = imputer.fit_transform(feature_user_item) # Apply KNNImputer
      dataset['rating'] = pd.DataFrame(data_imputed)[[6]] # 6 is column's name (as well as index column)
      dataset['rating'] = dataset['rating'].astype(int) # Convert integer type

    # Remove birthday column
    dataset = dataset.drop(columns=['birthday'], errors='ignore')
    # Feature selection
    featuresUsed = ['userID', 'itemID', 'gender', 'age', 'itemPrice', 'rating', 'categoryID', 'timestamp']
    dataset = dataset[featuresUsed]
    # Split dataset
    train, test = python_stratified_split(dataset, ratio=0.8)
    # Call train, evaluate functions
    modelLightGCN = Setup_LightGCN(train, test)
    topk_scores, modelLightGCN = Training_LightGCN(modelLightGCN)
    #print(predict_LightGCN ('0105289402')) #0435321829 #0111111
    if 'timestamp' in train.columns:
      train.drop(columns=['timestamp'], inplace=True) # Not essential in this model, isn't feature of user or item
    if 'timestamp' in test.columns:
      test.drop(columns=['timestamp'], inplace=True) # Not essential in this model, isn't feature of user or item
    train_dataset, test_dataset, X_test, y_test = Create_Feature_Label(train, test)
    # Training LightGBM Model
    params = {
        'objective': 'regression',
        'metric': 'rmse'
    }

    modelLightGBM = lgb.train(params, train_dataset, valid_sets=[test_dataset], callbacks=[lgb.early_stopping(10)])

@app.route('/retrain', methods=['POST'])
def upload_csv_json():
  try:
    required_fields = ['menus', 'reviews', 'users']
    for field_name in required_fields:
      if field_name not in request.files:
        return jsonify({"error": f"Can not find '{field_name}' in request"}), 400

    menus_file = request.files['menus']
    menus = json.load(menus_file.stream)
    users_file = request.files['users']
    users = pd.read_csv(users_file.stream)
    reviews_file = request.files['reviews']
    reviews = pd.read_csv(reviews_file.stream)
    menus_df = pd.DataFrame(menus) #Rename menus list
    users.rename(columns={"_id": "userID", "birthday": "birthday", "gender": "gender"}, inplace=True) #Rename columns
    reviews.rename(columns={"user": "userID", "item": "itemID"}, inplace=True)
    merged_df = reviews.merge(users, on='userID', how='left').merge(menus_df, on='itemID', how='left') #Merge dataframes to create dataset
    final_df = merged_df[['userID', 'birthday', 'gender', 'itemID', 'name', 'price', 'category_id', 'rating', 'timestamp']] #Select essential columns
    final_df.rename(columns={"name": "itemName", "price": "itemPrice", "category_id": "categoryID"}, inplace=True) #Rename column
    final_df.to_csv('dataset.csv', index=False) #Export CSV
    retrain_models()
    return jsonify({"message": "Retrain successfully"}), 200
  except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route('/recommend/menu', methods=['GET'])
def handler_recommend_api():
  try:
    item = ''
    header_info = request.headers.get('user-infor-header')
    if header_info is None:
      items = recommend_items(generate_random_phone_number(), random.choice(['Male', 'Female']), random.randint(5, 50))
    else:
      decoded_info = urllib.parse.unquote(header_info)
      user = json.loads(decoded_info)
      if 'birthday' in user:
        age = calculate_age(user['birthday'])
        items = recommend_items(user['_id'], user['gender'], age)
      else:
        items = recommend_items(user['_id'], user['gender'], random.randint(5, 50))
    return jsonify({
        'status': 'success',
        'message': 'Lấy danh sách đề xuất món ăn thành công',
        'data': json.dumps(items)
    }), 200
  except Exception as e:
    return jsonify({
        'status': 'error',
        'message': str(e)
    }), 500

@app.errorhandler(404)
def page_not_found(e):
  return jsonify({
    'status': 'error',
    'message': 'Không tìm thấy đường dẫn'
  }), 404

# Run server
if __name__ == '__main__':
    app.run(host='0.0.0.0', debug=True, port=5090, use_reloader=False)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5090
 * Running on http://172.28.0.12:5090
INFO:werkzeug:[33mPress CTRL+C to quit[0m
