In [None]:
pip install lightgbm pandas scikit-learn



In [5]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

In [6]:
# Step 1: Load the MovieLens dataset
# You can download the dataset from https://grouplens.org/datasets/movielens/
# Here we use the smaller dataset "ml-latest-small" as an example
ratings = pd.read_csv("ratings.csv")  # Replace with the actual path to your data
movies = pd.read_csv("movies.csv")  # Replace with the actual path to your data

In [7]:
# Merge datasets to include movie titles (optional)
data = pd.merge(ratings, movies, on="movieId")

In [8]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [9]:
# Step 2: Feature engineering
# Create user and movie-specific features
data['user_mean_rating'] = data.groupby('userId')['rating'].transform('mean')
data['movie_mean_rating'] = data.groupby('movieId')['rating'].transform('mean')
data['user_movie_count'] = data.groupby(['userId', 'movieId'])['rating'].transform('count')

In [10]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,user_mean_rating,movie_mean_rating,user_movie_count
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.366379,3.92093,1
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,4.366379,3.259615,1
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,4.366379,3.946078,1
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,4.366379,3.975369,1
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,4.366379,4.237745,1


In [11]:
# Map float ratings to discrete relevance levels
def map_rating_to_relevance(rating):
    if rating >= 4.5:
        return 3  # Most relevant
    elif rating >= 3.0:
        return 2  # Moderately relevant
    else:
        return 1  # Least relevant

data['relevance'] = data['rating'].apply(map_rating_to_relevance)

In [12]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,user_mean_rating,movie_mean_rating,user_movie_count,relevance
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.366379,3.92093,1,2
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,4.366379,3.259615,1,2
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,4.366379,3.946078,1,2
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,4.366379,3.975369,1,3
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,4.366379,4.237745,1,3


In [13]:
# Select features for the model
features = ['user_mean_rating', 'movie_mean_rating', 'user_movie_count']

In [14]:
# Step 3: Prepare training data
X = data[features]
y = data['rating']  # Use ratings as relevance scores
groups = data.groupby('userId').size().tolist()  # Group data by user

In [15]:
# Split data while preserving groups
user_ids = data['userId'].unique()
train_users, test_users = train_test_split(user_ids, test_size=0.3, random_state=42)

In [16]:
train_data = data[data['userId'].isin(train_users)]
test_data = data[data['userId'].isin(test_users)]

In [17]:
# Prepare LightGBM datasets
X_train = train_data[features]
y_train = train_data['relevance']
groups_train = train_data.groupby('userId').size().tolist()

In [18]:
X_test = test_data[features]
y_test = test_data['relevance']
groups_test = test_data.groupby('userId').size().tolist()

In [19]:
lgb_train = lgb.Dataset(X_train, label=y_train, group=groups_train)
lgb_test = lgb.Dataset(X_test, label=y_test, group=groups_test, reference=lgb_train)

In [20]:
# Train LightGBM model
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'label_gain': [0, 1, 2, 3],  # Define gains for each relevance level
    'learning_rate': 0.1,
    'num_leaves': 31,
    'min_data_in_leaf': 20,
}

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_test],
    valid_names=['train', 'valid'],
    num_boost_round=100,
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.log_evaluation(period=10),
    ],
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003976 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 496
[LightGBM] [Info] Number of data points in the train set: 64241, number of used features: 2
Training until validation scores don't improve for 10 rounds
[10]	train's ndcg@1: 0.904762	train's ndcg@2: 0.896023	train's ndcg@3: 0.884627	train's ndcg@4: 0.882377	train's ndcg@5: 0.880661	valid's ndcg@1: 0.887067	valid's ndcg@2: 0.870882	valid's ndcg@3: 0.866126	valid's ndcg@4: 0.861754	valid's ndcg@5: 0.859591
Early stopping, best iteration is:
[7]	train's ndcg@1: 0.900859	train's ndcg@2: 0.891515	train's ndcg@3: 0.88154	train's ndcg@4: 0.879257	train's ndcg@5: 0.877468	valid's ndcg@1: 0.887067	valid's ndcg@2: 0.872996	valid's ndcg@3: 0.866071	valid's ndcg@4: 0.861991	valid's ndcg@5: 0.860288


In [21]:
# Evaluate the model
y_pred = model.predict(X_test)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG Score: {ndcg:.4f}")

NDCG Score: 0.9864


In [22]:
# Recommend for a specific user
user_id = 1
user_data = test_data[test_data['userId'] == user_id]
user_features = user_data[features]

In [23]:
# Predict relevance scores
user_data['predicted_relevance'] = model.predict(user_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_data['predicted_relevance'] = model.predict(user_features)


In [24]:
# Sort recommendations
recommendations = user_data.sort_values('predicted_relevance', ascending=False)

In [25]:
# Display top recommendations
print(recommendations[['movieId', 'title', 'predicted_relevance']].head())

     movieId                                  title  predicted_relevance
114     1927  All Quiet on the Western Front (1930)             0.597185
192     2959                      Fight Club (1999)             0.436685
36       608                           Fargo (1996)             0.350804
203     3147                 Green Mile, The (1999)             0.321890
20       356                    Forrest Gump (1994)             0.321890
