# Cargar datos

In [None]:
import os

file_name = 'ml-100k.zip'
if not os.path.exists(file_name):
    !wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
else:
    print(f"{file_name} already exists.")

--2025-10-02 18:21:09--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.96.204
Connecting to files.grouplens.org (files.grouplens.org)|128.101.96.204|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://files.grouplens.org/datasets/movielens/ml-100k.zip [following]
--2025-10-02 18:21:10--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Connecting to files.grouplens.org (files.grouplens.org)|128.101.96.204|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2025-10-02 18:21:12 (2.73 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]



In [None]:
import zipfile

# Unpack every member of the downloaded archive into the current working directory.
zip_file_path = 'ml-100k.zip'

with zipfile.ZipFile(file=zip_file_path, mode='r') as archive:
    archive.extractall(path=None)

In [None]:
import pandas as pd

# u.data is tab-separated and is read without a header row, so the column
# names are supplied explicitly.
ratings_df = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    header=None,
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)
ratings_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


## Cargar géneros

In [None]:
# Column layout used to read u.item: five metadata fields followed by one
# indicator column per genre.
movie_cols = ['item_id', 'movie title', 'release date', 'video release date', 'IMDb URL',
              'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
              'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
              'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

movies_df = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1', names=movie_cols)

# Everything after the first five metadata columns is a genre indicator.
genre_cols = movie_cols[5:]

# item_id -> list of genre names whose indicator equals 1.
# Iterating over the zipped columns avoids building one Series per row, which
# is what DataFrame.iterrows() does.
genre_flags = movies_df[genre_cols].to_numpy()
item_genre_dict = {
    item_id: [genre for genre, flag in zip(genre_cols, flags) if flag == 1]
    for item_id, flags in zip(movies_df['item_id'], genre_flags)
}


## Crear split de entrenamiento, validación y test

In [None]:
from sklearn.model_selection import train_test_split

# Keep only the (user, item, rating) triples; the timestamp column is dropped.
uir_df = ratings_df[['user_id', 'item_id', 'rating']]

# Two-stage split with a fixed seed: hold out 20% for test, then 25% of the
# remaining 80% for validation, i.e. a 60/20/20 partition overall.
train_df, test_df = train_test_split(uir_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)

for split_label, split_df in (("Training set size:", train_df),
                              ("Validation set size:", val_df),
                              ("Test set size:", test_df)):
    print(split_label, len(split_df))

Training set size: 60000
Validation set size: 20000
Test set size: 20000


# Instalar librerías

In [None]:
!pip3 install implicit

Collecting implicit
  Downloading implicit-0.7.2.tar.gz (70 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: implicit
  Building wheel for implicit (pyproject.toml) ... [?25l[?25hdone
  Created wheel for implicit: filename=implicit-0.7.2-cp312-cp312-linux_x86_64.whl size=10855814 sha256=bc1cdf0b917e9adb20e4aa4c99f4efabf47e89d6f456ba79a1f4aaa192007385
  Stored in directory: /root/.cache/pip/wheels/b2/00/4f/9ff8af07a0a53ac6007ea5d739da19cfe147a2df542b6899f8
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.7.2


In [None]:
import implicit
import time

# Crear datos de relevancia e interacción


In [None]:
# Binarised copies of each split: a rating of 3 or more becomes 1, anything
# lower becomes 0. The original split frames are left untouched.
train_df_relevance = train_df.copy()
val_df_relevance = val_df.copy()
test_df_relevance = test_df.copy()

for relevance_df in (train_df_relevance, val_df_relevance, test_df_relevance):
    relevance_df['rating'] = relevance_df['rating'].ge(3).astype('int64')

## Mapeo de índices

In [None]:
from scipy.sparse import csr_matrix

# Distinct users and items seen in the training split, in order of first appearance.
user_ids = train_df['user_id'].unique()
item_ids = train_df['item_id'].unique()

# Original id -> dense zero-based index (the position in the arrays above).
user_id_map = {original_id: index for index, original_id in enumerate(user_ids)}
item_id_map = {original_id: index for index, original_id in enumerate(item_ids)}

# assign() returns a new frame instead of writing columns into the object
# produced by train_test_split; in-place column assignment on such a frame can
# trigger pandas' SettingWithCopyWarning. Re-running this cell is idempotent.
train_df = train_df.assign(
    user_index=train_df['user_id'].map(user_id_map),
    item_index=train_df['item_id'].map(item_id_map),
)

# Users x items sparse matrix holding the raw training ratings. The shape is
# stated explicitly so it always matches the two id maps instead of being
# inferred from the largest index present.
user_item_matrix = csr_matrix(
    (train_df['rating'], (train_df['user_index'], train_df['item_index'])),
    shape=(len(user_ids), len(item_ids)),
)


## Mapeo de users: [items]

In [None]:
# For each split: user_id -> set of item_ids that user has a rating for in that split.
user_item_train = {
    user_id: set(items) for user_id, items in train_df.groupby('user_id')['item_id']
}
user_item_val = {
    user_id: set(items) for user_id, items in val_df.groupby('user_id')['item_id']
}
user_item_test = {
    user_id: set(items) for user_id, items in test_df.groupby('user_id')['item_id']
}

## Obtener popularidad de los items

In [None]:
# Number of ratings per item; value_counts() orders the result from most to
# least rated.
# NOTE(review): this is computed over all of ratings_df, so it also counts the
# validation and test interactions — confirm that is intended for the baselines.
total_ratings = len(ratings_df)
item_popularity = ratings_df['item_id'].value_counts()

# item_id -> share of all ratings that the item received.
item_popularity_dict = item_popularity.div(total_ratings).to_dict()

# Métricas base

In [None]:
import numpy as np

## AP@K

In [None]:


def average_precision_at_k(relevance_vector, k):
  """Average precision over the first ``k`` positions of a ranked list.

  Args:
    relevance_vector: Sequence of 0/1 (or bool) flags, one per recommended
      item in ranking order; 1 means the item is relevant.
    k: Cut-off rank. If it exceeds the length of ``relevance_vector`` only the
      available positions are scored.

  Returns:
    The sum of precision@i over the relevant positions i <= k, divided by
    ``min(k, number of relevant entries in relevance_vector)``. Returns 0.0
    when ``k < 1`` or when the vector contains no relevant entry.
  """
  relevance_vector = np.asarray(relevance_vector)
  n_relevant = relevance_vector.sum()
  if k < 1 or n_relevant == 0:
    return 0.0

  # Never index past the end of the list when fewer than k items were recommended.
  cutoff = min(k, relevance_vector.size)

  hits = 0
  total_sum = 0.0
  for rank, is_relevant in enumerate(relevance_vector[:cutoff], start=1):
    hits += is_relevant
    # hits / rank is precision@rank; it only contributes at relevant positions.
    total_sum += (hits / rank) * is_relevant
  return total_sum / min(k, n_relevant)

## NDCG@K


In [None]:
def ndcg_at_k(relevance_vector, k):
  """Normalised discounted cumulative gain over the first ``k`` positions.

  Args:
    relevance_vector: Sequence of relevance gains (0/1 or bool in this
      notebook), one per recommended item in ranking order. Any sequence is
      accepted; it is converted to a NumPy array.
    k: Cut-off rank.

  Returns:
    DCG@k of the given ordering divided by DCG@k of the same values sorted in
    descending order. Returns 0.0 when ``k < 1`` or when all gains sum to zero.
  """
  relevance_vector = np.asarray(relevance_vector)
  if k < 1 or relevance_vector.sum() == 0:
    return 0.0

  def dcg_at_k(ranking, k):
    # Position i is zero-based, so its rank is i + 1 and the discount for
    # rank r is log2(r + 1) = log2(i + 2).
    top_k = np.asarray(ranking)[:k]
    return sum(gain / np.log2(position + 2) for position, gain in enumerate(top_k))

  dcg = dcg_at_k(relevance_vector, k)
  # Ideal ordering: the same gains, largest first.
  idcg = dcg_at_k(np.sort(relevance_vector)[::-1], k)
  return dcg / idcg

## Diversity


In [None]:
def diversity(recomended_items):
  """Count the distinct genres covered by a list of recommended items.

  Args:
    recomended_items: Iterable of item ids.

  Returns:
    The number of different genre names found in ``item_genre_dict`` for the
    given items. Ids that are not keys of ``item_genre_dict`` contribute nothing.
  """
  covered_genres = set()
  for item_id in recomended_items:
    # Unknown ids fall back to an empty tuple and therefore add no genre.
    covered_genres.update(item_genre_dict.get(item_id, ()))
  return len(covered_genres)

## Novelty

In [None]:
from numpy import log2

def novelty_val(recomended_items):
  """Mean self-information of a list of recommended items.

  Each item found in ``item_popularity_dict`` contributes ``log2(1 / p)``,
  where ``p`` is the value stored for it there. Items missing from the dict
  contribute nothing, but they still count in the denominator because the sum
  is divided by the full length of ``recomended_items``.

  Args:
    recomended_items: Sized iterable of item ids.

  Returns:
    The averaged novelty, or 0.0 for an empty list (previously a
    ZeroDivisionError).
  """
  if len(recomended_items) == 0:
    return 0.0

  novelty = 0
  for item_id in recomended_items:
    # Items without a popularity entry are skipped.
    if item_id not in item_popularity_dict:
      continue

    novelty += log2(1/ item_popularity_dict[item_id])
  return novelty / len(recomended_items)



# Funciones de recomendación

In [None]:
def get_rel_vector(model, n, user_id, library_or_type, val = False):
  """Build the relevance vector of a top-``n`` recommendation list for one user.

  Args:
    model: For ``'implicit'``, an object whose ``recommend(user_row, user_items, n)``
      returns a tuple with the recommended item indices first. For
      ``'random'`` / ``'popular'``, an object exposing ``reccomend(type, n)``
      that returns item ids.
    n: Number of items to recommend.
    user_id: Original (unmapped) user id.
    library_or_type: ``'implicit'``, ``'random'`` or ``'popular'``.
    val: When True the user's validation items are the ground truth,
      otherwise the user's test items.

  Returns:
    Boolean NumPy array, one entry per recommended item in ranking order,
    True where the item is in the user's ground-truth set.

  Raises:
    ValueError: If ``library_or_type`` is not one of the supported values.
  """
  if library_or_type == 'implicit':
    user_row = user_id_map[user_id]

    recommended_item_indices = model.recommend(user_row, user_item_matrix[user_row], n)[0]
    # The model works on matrix column indices; translate them back to original ids.
    index_to_item_id = {index: original_id for original_id, index in item_id_map.items()}
    rec_items = [index_to_item_id[index] for index in recommended_item_indices]

  elif library_or_type in ('random', 'popular'):
    rec_items = model.reccomend(library_or_type, n)

  else:
    raise ValueError(f"Unsupported recommender type: {library_or_type!r}")

  relevant_items = user_item_val[user_id] if val else user_item_test[user_id]

  return np.isin(rec_items, list(relevant_items))

## Métricas base promedio

In [None]:
def get_base_metrics(model, n, library_or_type, val = False):
  """Average AP@n, NDCG@n, diversity and novelty over all evaluated users.

  The recommendation list is generated once per user and every metric is
  computed on that same list. This matters for the random recommender, where
  a second call would return a different list.

  Args:
    model: For ``'implicit'``, an object whose ``recommend(user_row, user_items, n)``
      returns a tuple with the recommended item indices first. For
      ``'random'`` / ``'popular'``, an object exposing ``reccomend(type, n)``.
    n: Length of each recommendation list and cut-off of the ranking metrics.
    library_or_type: ``'implicit'``, ``'random'`` or ``'popular'``.
    val: When True evaluate against ``user_item_val``, otherwise against
      ``user_item_test``.

  Returns:
    Tuple ``(mean_ap, mean_ndcg, mean_diversity, mean_novelty)``. All zeros
    when there is no user to evaluate.

  Raises:
    ValueError: If ``library_or_type`` is not one of the supported values.
  """
  if library_or_type not in ('implicit', 'random', 'popular'):
    raise ValueError(f"Unsupported recommender type: {library_or_type!r}")

  relevant_by_user = user_item_val if val else user_item_test
  users_to_evaluate = relevant_by_user.keys()
  n_users = len(users_to_evaluate)
  if n_users == 0:
    return 0.0, 0.0, 0.0, 0.0

  uses_implicit = library_or_type == 'implicit'
  if uses_implicit:
    # Matrix column index -> original item id; built once instead of per user.
    index_to_item_id = {index: original_id for original_id, index in item_id_map.items()}

  mean_ap = 0
  mean_ndcg = 0
  mean_diversity = 0
  mean_novelty = 0

  for user_id in users_to_evaluate:
    if uses_implicit:
      user_row = user_id_map[user_id]
      recommended_item_indices = model.recommend(user_row, user_item_matrix[user_row], n)[0]
      recommended_item_ids = [index_to_item_id[index] for index in recommended_item_indices]
    else:
      recommended_item_ids = model.reccomend(library_or_type, n)

    # True where a recommended item is in this user's ground-truth set.
    rel_vector = np.isin(recommended_item_ids, list(relevant_by_user[user_id]))

    mean_ap += average_precision_at_k(rel_vector, n)
    mean_ndcg += ndcg_at_k(rel_vector, n)
    mean_diversity += diversity(recommended_item_ids)
    mean_novelty += novelty_val(recommended_item_ids)

  mean_ap /= n_users
  mean_ndcg /= n_users
  mean_diversity /= n_users
  mean_novelty /= n_users
  return mean_ap, mean_ndcg, mean_diversity, mean_novelty

# Recomendador base

In [None]:
import random

class RecomendadorBase:
  """Non-personalised baseline recommender (random or most-popular items)."""

  def __init__(self):
    pass

  def reccomend(self,reccomender_type, n):
    """Return ``n`` item ids according to the chosen baseline strategy.

    Args:
      reccomender_type: ``"random"`` draws ``n`` distinct ids from the keys of
        ``item_id_map``; ``"popular"`` takes the first ``n`` entries of the
        ``item_popularity`` index.
      n: Number of item ids to return.

    Returns:
      List of item ids.

    Raises:
      ValueError: If ``reccomender_type`` is neither ``"random"`` nor
        ``"popular"`` (previously this surfaced as an UnboundLocalError).
    """
    if reccomender_type == "random":
      return random.sample(list(item_id_map.keys()), n)
    if reccomender_type == "popular":
      return list(item_popularity.index[:n])
    raise ValueError(f"Unsupported recommender type: {reccomender_type!r}")

# Baseline popular

In [None]:

# Score the most-popular baseline with top-10 lists (val defaults to False,
# so the test split is the ground truth).
popular_metrics = get_base_metrics(RecomendadorBase(), 10, "popular")
mean_ap, mean_ndcg, mean_diversity, mean_novelty = popular_metrics

metric_labels = ("Mean AP:", "Mean NDCG:", "Mean Diversity:", "Mean Novelty:")
for metric_label, metric_value in zip(metric_labels, popular_metrics):
    print(metric_label, metric_value)

Mean AP: 0.2436667018462225
Mean NDCG: 0.35208053976451936
Mean Diversity: 12.0
Mean Novelty: 7.689384651348583


# Baseline random

In [None]:
# Seed the RNG used by RecomendadorBase so the averaged random baseline is
# the same on every Restart & Run All.
random.seed(42)

# Number of independent evaluations averaged to smooth out sampling noise.
N_RANDOM_RUNS = 100

# One row per run: (mean AP, mean NDCG, mean diversity, mean novelty).
random_runs = np.array([
    get_base_metrics(RecomendadorBase(), 10, "random")
    for _ in range(N_RANDOM_RUNS)
])

(mean_ap_random,
 mean_ndcg_random,
 mean_diversity_random,
 mean_novelty_random) = (float(value) for value in random_runs.mean(axis=0))

print("Mean AP:", mean_ap_random)
print("Mean NDCG:", mean_ndcg_random)
print("Mean Diversity:", mean_diversity_random)
print("Mean Novelty:", mean_novelty_random)

Mean AP: 0.03669826283350221
Mean NDCG: 0.056717732321690535
Mean Diversity: 8.773872340425529
Mean Novelty: 11.983464197544722


# Métrica CO2E

In [None]:
# Data obtained from Google Cloud: one carbon-intensity figure per region.
# NOTE(review): presumably grid carbon intensity in gCO2eq/kWh — confirm
# against the source table.
carbon_intensity_dict = {
    'africa-south1': 657,
    'asia-east1': 439,
    'asia-east2': 505,
    'asia-northeast1': 453,
    'asia-northeast2': 296,
    'asia-northeast3': 357,
    'asia-south1': 679,
    'asia-south2': 532,
    'asia-southeast1': 367,
    'asia-southeast2': 561,
    'australia-southeast1': 498,
    'australia-southeast2': 454,
    'europe-central2': 643,
    'europe-north1': 39,
    'europe-north2': 3,
    'europe-southwest1': 89,
    'europe-west1': 103,
    'europe-west2': 106,
    'europe-west3': 276,
    'europe-west4': 209,
    'europe-west6': 15,
    'europe-west8': 202,
    'europe-west9': 16,
    'europe-west10': 276,
    'europe-west12': 202,
    'me-central2': 382,
    'me-west1': 434,
    'northamerica-northeast1': 5,
    'northamerica-northeast2': 59,
    'northamerica-south1': 305,
    'southamerica-east1': 67,
    'southamerica-west1': 238,
    'us-central1': 413,
    'us-east1': 576,
    'us-east2': 340,
    'us-east4': 323,
    'us-south1': 303,
    'us-west1': 79,
    'us-west2': 169,
    'us-west3': 555,
}

# Unweighted average across all listed regions.
region_intensities = carbon_intensity_dict.values()
mean_carbon_intensity = sum(region_intensities) / len(carbon_intensity_dict)
print(mean_carbon_intensity)

305.625


In [None]:
energy_consumption_t4 = 70 #Watts
# Random-baseline scores hard-coded from a previous run of the random
# baseline. These literals overwrite any mean_ap_random / mean_ndcg_random
# already defined in the kernel.
mean_ap_random = 0.03669826283350221
mean_ndcg_random = 0.056717732321690535

SECONDS_PER_HOUR = 3600


def get_co2e(time, carbon_intensity=None, power_watts=None):
  """Estimate the emissions of running a device for ``time`` seconds.

  The energy is converted to kWh (watts -> kW, seconds -> hours) before it is
  multiplied by the carbon intensity. The previous version skipped the
  seconds-to-hours step, which inflated the result by a factor of 3600.

  Args:
    time: Elapsed time in seconds (callers pass a time.time() /
      time.perf_counter() difference). The name shadows the ``time`` module
      inside this function only; it is kept for backward compatibility.
    carbon_intensity: Emissions per kWh. Defaults to the module-level
      ``mean_carbon_intensity``. NOTE(review): assumed to be gCO2eq/kWh —
      confirm against the data source.
    power_watts: Device power draw in watts. Defaults to
      ``energy_consumption_t4``.

  Returns:
    ``carbon_intensity * energy_in_kWh``, i.e. grams of CO2e when the
    intensity is expressed in g/kWh.
  """
  if carbon_intensity is None:
    carbon_intensity = mean_carbon_intensity
  if power_watts is None:
    power_watts = energy_consumption_t4

  energy_kwh = (power_watts / 1000) * (time / SECONDS_PER_HOUR)
  return carbon_intensity * energy_kwh

def get_co2e_metrics(map, ndcg, time):
  """Relate the gain over the random baseline to the emissions of a run.

  Args:
    map: Mean average precision of the model (the parameter name shadows the
      ``map`` builtin inside this function; kept for backward compatibility).
    ndcg: Mean NDCG of the model.
    time: Elapsed time in seconds, forwarded to ``get_co2e``.

  Returns:
    Tuple ``(map_c, ndcg_c, co2e)`` where ``map_c`` and ``ndcg_c`` are the
    improvements over ``mean_ap_random`` / ``mean_ndcg_random`` divided by
    ``co2e`` and multiplied by 100.
  """
  co2e = get_co2e(time)
  map_c = (map- mean_ap_random)/ co2e *100
  ndcg_c = (ndcg- mean_ndcg_random)/ co2e *100
  return map_c, ndcg_c, co2e

# Métrica base con modelo ALS con hiperparámetros por defecto

Hiperparámetros por defecto: factors = 100, regularization = 0.01 (https://benfred.github.io/implicit/api/models/cpu/als.html)

*Usar T4*

In [None]:

# ALS with the library's default hyperparameters, trained on the GPU.
# random_state fixes the factor initialisation so the reported metrics can be
# reproduced on a fresh run.
default_als = implicit.als.AlternatingLeastSquares(use_gpu=True, random_state=42)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() can jump if the system clock is adjusted.
training_start = time.perf_counter()
default_als.fit(user_item_matrix)
training_end = time.perf_counter()
als_default_time = training_end - training_start

print("Training time:", als_default_time)

  0%|          | 0/15 [00:00<?, ?it/s]

Training time: 0.10803484916687012


In [None]:
# Score the default ALS model with top-10 lists and relate its gain over the
# random baseline to the emissions estimated from its training time.
als_metrics = get_base_metrics(default_als, 10, "implicit")
mean_ap, mean_ndcg, mean_diversity, mean_novelty = als_metrics
map_c, ndcg_c, co2e = get_co2e_metrics(mean_ap, mean_ndcg, als_default_time)

als_report = (
    ("Mean AP:", mean_ap),
    ("Mean NDCG:", mean_ndcg),
    ("Mean Diversity:", mean_diversity),
    ("Mean Novelty:", mean_novelty),
    ("CO2E (gCO2):", co2e),
    ("MAP-C (%):", map_c),
    ("NDCG-C (%):", ndcg_c),
)
for report_label, report_value in als_report:
    print(report_label, report_value)

Mean AP: 0.3401664807175821
Mean NDCG: 0.46611771178604927
Mean Diversity: 9.720212765957447
Mean Novelty: 9.052735890923314
CO2E (gCO2): 2.3112705543637277
MAP-C (%): 13.129930518568045
NDCG-C (%): 17.713200157004678
