In [1]:
import pandas as pd
from sklearn.metrics import ndcg_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from src.utils import train_test_split
from xgboost import XGBRanker

Let's load the datasets with users info, movies info and users' ratings for movies.

Then we split it to training/test subsets by the timestamp.

In [2]:
users = pd.read_table("../data/users.dat", sep="::", names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], engine='python')

movies = pd.read_table("../data/movies.dat", sep="::", names=['MovieID', 'Title', 'Genres'], engine='python', encoding='latin1')

ratings = pd.read_table("../data/ratings.dat", sep="::", names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')
ratings['Timestamp'] = pd.to_datetime(ratings['Timestamp'], unit='s')

data_df = pd.merge(ratings, users, on='UserID')
data_df = pd.merge(data_df, movies, on='MovieID')

train_data, test_data = train_test_split(data_df, 'Timestamp')

X_train = train_data[['UserID', 'Gender', 'Age', 'Occupation', 'Genres', 'MovieID']]
X_test = test_data[['UserID', 'Gender', 'Age', 'Occupation', 'Genres', 'MovieID']]
y_train = train_data['Rating']
y_test = test_data['Rating']

Then let's do some data preparation.

In [3]:
categorical_features = ['Gender', 'Occupation', 'Genres']
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ], remainder='passthrough')

X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

group_train = X_train.groupby('UserID').size().to_numpy()
group_test = X_test.groupby('UserID').size().to_numpy()

Now, we are going to train our learning-to-rate xgboost model with NDCG loss function, which predicts ratings based on the movie features.

In [4]:
ranker = XGBRanker(
    objective='rank:ndcg',
    booster='gbtree',
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
    subsample=0.8
)

ranker.fit(X_train_transformed, y_train, group=group_train)

y_pred = ranker.predict(X_test_transformed)

As we boost NDCG metric, let's check this score provided by our model.

In [5]:
ndcg = ndcg_score([y_test], [y_pred], k=10)

print(f'NDCG Score: {ndcg}')

NDCG Score: 0.7196519721439243


NDCG measures the quality of the recommendations by considering the position of the relevant items in the list, with higher-ranked items contributing more to the score.

So, here we get quite high NDCG score, higher than all previous ones.