In [1]:
import pandas as pd
from sklearn.metrics import ndcg_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from src.utils import train_test_split
from src.models.ndcg_boost import NDCGLossGradientBoostingRegressor

Let's load the datasets containing user information, movie information, and users' ratings of movies.

Then we merge them and split the result into training and test subsets by timestamp.

In [2]:
# Load the three "::"-delimited tables. The multi-character separator is why
# the slower engine='python' parser is requested explicitly.
users = pd.read_table(
    "../data/users.dat",
    sep="::",
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
    engine='python',
)

movies = pd.read_table(
    "../data/movies.dat",
    sep="::",
    names=['MovieID', 'Title', 'Genres'],
    engine='python',
    encoding='latin1',
)

# Timestamp is parsed as seconds (unit='s') and converted to datetimes.
ratings = pd.read_table(
    "../data/ratings.dat",
    sep="::",
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    engine='python',
).assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp'], unit='s'))

# One row per rating, enriched with the rater's profile and the movie's metadata.
data_df = ratings.merge(users, on='UserID').merge(movies, on='MovieID')

# Split on the 'Timestamp' column; the exact rule lives in src.utils.train_test_split.
train_data, test_data = train_test_split(data_df, 'Timestamp')

# Same feature columns and target for both subsets.
X_train, X_test = (
    subset[['Gender', 'Age', 'Occupation', 'Genres']]
    for subset in (train_data, test_data)
)
y_train, y_test = (subset['Rating'] for subset in (train_data, test_data))

Then let's do some data preparation.

In [3]:
# Columns to one-hot encode. 'Age' is the only remaining feature column and is
# forwarded unchanged via remainder='passthrough'.
categorical_features = ['Gender', 'Occupation', 'Genres']

preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': categories absent from the training data
        # do not raise an error at transform time.
        (
            'cat',
            OneHotEncoder(handle_unknown='ignore'),
            categorical_features,
        ),
    ],
    remainder='passthrough',
)

# Fit the encoder on the training features only, then reuse it for the test set.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

Now we are going to train our learning-to-rank gradient boosting model with an NDCG loss function, which predicts ratings based on the user and movie features.

In [4]:
# Project-local gradient boosting regressor (src.models.ndcg_boost).
# NOTE(review): hyperparameter semantics are assumed to mirror the usual
# gradient-boosting conventions — confirm against the class definition.
model = NDCGLossGradientBoostingRegressor(
    learning_rate=0.05,
    n_estimators=100,
    max_depth=3,
)
model.fit(X_train_transformed, y_train)

# Predicted scores for the held-out test rows.
y_pred = model.predict(X_test_transformed)

Since the model is trained to optimize NDCG, let's check the NDCG@10 score it achieves on the test set.

In [5]:
# Each argument is wrapped in a one-element list, so the entire test set is
# scored as a single ranking; k=10 limits the metric to the 10 highest-scored rows.
# NOTE(review): a per-user NDCG averaged over users may be what is intended — confirm.
ndcg = ndcg_score(
    y_true=[y_test],
    y_score=[y_pred],
    k=10,
)

print(f'NDCG Score: {ndcg}')

NDCG Score: 0.8320000000000001


NDCG measures the quality of the recommendations by considering the position of the relevant items in the list, with higher-ranked items contributing more to the score.

So here we obtain the highest NDCG score among all the previous models, even higher than the one achieved by XGBoost.