In [None]:
import random 
import numpy as np 
import pandas as pd

from surprise import Dataset, Reader, NormalPredictor
from surprise.model_selection import train_test_split
from utils.transformacion import *
from utils.recomendaciones import recomendaciones_top_n_surprise
from utils.modelos import RecommenderSystemSurprise

VAR_SEED = 42
VAR_TESTSET_SIZE = 0.20
VAR_DIR_DATA_CLEAN = '../data/cleaning'

random.seed(VAR_SEED)
np.random.seed(VAR_SEED)

df_dataset = pd.read_csv(f"{VAR_DIR_DATA_CLEAN}/mf_dataset.csv", sep=",", encoding="latin1")

df_catalogo = pd.read_csv(f"{VAR_DIR_DATA_CLEAN}/catalogo.csv", sep=",", encoding="latin1")
df_catalogo = calcular_ratio_interacciones(df_dataset, df_catalogo)
df_catalogo = calcular_puntuacion_dataset(
    dataframe=df_catalogo, 
    caracteristicas={"hito": (1, 4), "skill": (0, 15), "knowledge": (1, 15), "ratio_interaccion": (0, 1)}, 
    pesos={"hito": 0.4, "skill": 0.25, "knowledge": 0.25, "ratio_interaccion": 0.1}, 
    nueva_columna='score'
)
df_catalogo = df_catalogo[['id_ejercicio', 'nombre', 'hito', 'skill', 'knowledge', 'complexity', 'ratio_interaccion', 'score']]

df_ratings = factorizacion_a_calificaciones(df_catalogo, df_dataset, 'id_estudiante')

In [None]:
import pandas as pd
from pandas import DataFrame
from surprise import SVD, Reader, Dataset, accuracy
from surprise.model_selection import cross_validate
from joblib import dump, load
from typing import Tuple, Optional

class RecommenderSystemSurprise:
    def __init__(self, algorithm=SVD(), model_path: str = 'svd_model.joblib'):
        self.model = algorithm
        self.model_path = model_path
        self.data = None
        self.trainset = None

    def load_data(self, dataframe: DataFrame, rating_scale: Tuple[float, float]):
        reader = Reader(rating_scale=rating_scale)
        self.data = Dataset.load_from_df(dataframe, reader)

    def train(self):
        if not self.data:
            raise ValueError("Data not loaded. Use 'load_data' to load a DataFrame first.")
        self.trainset = self.data.build_full_trainset()
        self.model.fit(self.trainset)
        print("[+] Model trained.")

    def evaluate(self, cv: int = 5):
        if not self.data:
            raise ValueError("Data not loaded. Use 'load_data' to load a DataFrame first.")
        return cross_validate(self.model, self.data, measures=['RMSE', 'MAE'], cv=cv, verbose=True)

    def predict(self, user_id, item_id):
        if not self.trainset:
            raise ValueError("Model not trained. Use 'train' to train the model first.")
        return self.model.predict(user_id, item_id).est

    def get_recommendations(self, user_id: int, user_column: str = 'user_id', item_column: str = 'item_id', n_recommendations: int = 10):
        if not self.trainset:
            raise ValueError("Model not trained. Use 'train' to train the model first.")
        
        df = self.data.df
        items_interacted = df[df[user_column] == user_id][item_column].unique()
        all_items = df[item_column].unique()
        item_pairs = [(user_id, item, 0) for item in set(all_items) - set(items_interacted)]
        
        recommendations = sorted(self.model.test(item_pairs), key=lambda x: x.est, reverse=True)[:n_recommendations]
        return [int(pred.iid) for pred in recommendations]

    def save_model(self):
        dump(self.model, self.model_path)
        print(f"[+] Model saved to {self.model_path}.")

    def load_model(self):
        self.model = load(self.model_path)
        print(f"[+] Model loaded from {self.model_path}.")

    def update_and_retrain(self, new_dataframe: DataFrame, rating_scale: Tuple[float, float]):
        if not self.data:
            raise ValueError("Data not loaded. Use 'load_data' to load an initial DataFrame first.")
        
        updated_df = pd.concat([self.data.df, new_dataframe]).drop_duplicates()
        self.load_data(updated_df, rating_scale)
        self.train()
        print("[+] Model updated and retrained with new data.")

    def evaluate_recommendations(self, n_recommendations: int = 10) -> Tuple[float, float, float]:
        if not self.trainset:
            raise ValueError("Model not trained. Use 'train' to train the model first.")
        
        testset = self.trainset.build_anti_testset()
        predictions = self.model.test(testset)
        
        user_est_true = {}
        for pred in predictions:
            user_est_true.setdefault(pred.uid, []).append((pred.iid, pred.est, pred.r_ui))
        
        precisions, recalls, avg_precisions = [], [], []
        for ratings in user_est_true.values():
            ratings.sort(key=lambda x: x[1], reverse=True)
            relevant_items = sum((1 for (_, _, true_r) in ratings if true_r > 0))
            if relevant_items == 0:
                continue

            recommended_items = ratings[:n_recommendations]
            n_relevant_recommended = sum((1 for (_, _, true_r) in recommended_items if true_r > 0))
            
            precisions.append(n_relevant_recommended / n_recommendations)
            recalls.append(n_relevant_recommended / relevant_items)
            
            hits, avg_precision = 0, 0.0
            for i, (_, _, true_r) in enumerate(recommended_items):
                if true_r > 0:
                    hits += 1
                    avg_precision += hits / (i + 1)
            avg_precisions.append(avg_precision / min(relevant_items, n_recommendations))

        precision = sum(precisions) / len(precisions) if precisions else 0
        recall = sum(recalls) / len(recalls) if recalls else 0
        map_score = sum(avg_precisions) / len(avg_precisions) if avg_precisions else 0

        return precision, recall, map_score


In [13]:
recommender = RecommenderSystemSurprise(algorithm=NormalPredictor(), model_path='dumps/RecommenderSystemSurprise.joblib')
recommender.load_data(dataframe=df_ratings[['id_estudiante', 'id_ejercicio', 'score']], rating_scale=(0, 1))
recommender.train()
results = recommender.evaluate()

[+] Model trained.
Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0814  0.0850  0.0845  0.0840  0.0831  0.0836  0.0013  
MAE (testset)     0.0646  0.0669  0.0665  0.0665  0.0656  0.0660  0.0008  
Fit time          0.02    0.01    0.01    0.01    0.01    0.01    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


In [14]:
results

{'test_rmse': array([0.08142641, 0.08503462, 0.08452267, 0.08399566, 0.0831398 ]),
 'test_mae': array([0.06458156, 0.06690754, 0.06647707, 0.06645095, 0.06558055]),
 'fit_time': (0.019783735275268555,
  0.008750200271606445,
  0.007593393325805664,
  0.011216878890991211,
  0.0070705413818359375),
 'test_time': (0.007931232452392578,
  0.00843501091003418,
  0.00542902946472168,
  0.005909442901611328,
  0.009302616119384766)}

In [15]:
recommender.predict(user_id=0, item_id=1)

0.12978040437936825

In [16]:
recomendaciones = recommender.get_recommendations(user_id=0, user_column='id_estudiante', item_column='id_ejercicio')

In [17]:
df_catalogo[df_catalogo['id_ejercicio'].isin(recomendaciones)]

Unnamed: 0,id_ejercicio,nombre,hito,skill,knowledge,complexity,ratio_interaccion,score
7,7,Resolver un sistema de ecuaciones,1,0,1,1,0.254569,0.025457
8,8,Distancia Levenshtein,2,3,7,55,0.027415,0.293218
9,9,Validar Secuencias de ADN,2,2,7,39,0.05483,0.279293
11,11,Adivina la palabra,2,2,7,39,0.049608,0.27877
13,13,Cajero AutomÃ¡tico Nivel 2,1,1,3,19,0.304178,0.082799
21,21,El antipoema,2,2,7,39,0.048303,0.27864
31,31,Carro de Compras,2,2,7,39,0.005222,0.274332
33,33,Alineamiento de Secuencias,2,4,7,71,0.01436,0.308579
37,37,Jerigonzo,2,2,7,39,0.240209,0.29783
48,48,Sopa de Letras,2,5,7,87,0.005222,0.324332


In [18]:
recommender.save_model()

[+] Model saved to dumps/RecommenderSystemSurprise.joblib.


In [19]:
recommender.load_model()

[+] Model loaded from dumps/RecommenderSystemSurprise.joblib.


In [20]:
recommender.update_and_retrain(new_dataframe=df_ratings[['id_estudiante', 'id_ejercicio', 'score']], rating_scale=(0, 1))

[+] Model trained.
[+] Model updated and retrained with new data.


In [21]:
metricas = recommender.evaluate_recommendations()

In [22]:
metricas

(0.982483660130719, 0.5959026017865332, 1.0)