In [None]:
import random 
import numpy as np 
import pandas as pd

from surprise import Dataset, Reader, NormalPredictor
from surprise.model_selection import train_test_split
from utils.transformacion import *
from utils.recomendaciones import recomendaciones_top_n_surprise

VAR_SEED = 42
VAR_TESTSET_SIZE = 0.20
VAR_DIR_DATA_CLEAN = '../data/cleaning'

random.seed(VAR_SEED)
np.random.seed(VAR_SEED)

In [2]:
df_dataset = pd.read_csv(f"{VAR_DIR_DATA_CLEAN}/mf_dataset.csv", sep=",", encoding="latin1")

df_catalogo = pd.read_csv(f"{VAR_DIR_DATA_CLEAN}/catalogo.csv", sep=",", encoding="latin1")
df_catalogo = calcular_ratio_interacciones(df_dataset, df_catalogo)
df_catalogo = calcular_puntuacion_dataset(
    dataframe=df_catalogo, 
    caracteristicas={"hito": (1, 4), "skill": (0, 15), "knowledge": (1, 15), "interaction_ratio": (0.0, 1.0)}, 
    pesos={"hito": 0.4, "skill": 0.25, "knowledge": 0.25, "interaction_ratio": 0.1}, 
    nueva_columna='score'
)
df_catalogo = df_catalogo[['id_ejercicio', 'nombre', 'hito', 'skill', 'knowledge', 'complexity', 'complexity12', 'score']]

df_ratings = factorizacion_a_calificaciones(df_catalogo, df_dataset, 'id_estudiante')

In [3]:
# ESCALAS DE RANKING 
escala_01 = Reader(rating_scale=(0, 1))
escala_02 = Reader(rating_scale=(1, 255))
escala_03 = Reader(rating_scale=(257, 1279))

# CARGA DE DATOS
data_formula = Dataset.load_from_df(df_ratings[["id_estudiante", "id_ejercicio", "score"]], escala_01)
data_complexity = Dataset.load_from_df(df_ratings[["id_estudiante", "id_ejercicio", "complexity"]], escala_02)
data_complexity12 = Dataset.load_from_df(df_ratings[["id_estudiante", "id_ejercicio", "complexity12"]], escala_03)

# MODELOS
benchmark_formula = evaluar_algoritmos(data_formula)
benchmark_complexity = evaluar_algoritmos(data_complexity)
benchmark_complexity12 = evaluar_algoritmos(data_complexity12)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Don

In [4]:
benchmark_formula

Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time,Puntaje de Referencia
Algoritmo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KNNBasic,0.001563,5.3e-05,0.097681,0.473819,639.852772
KNNBaseline,0.002715,0.001553,0.163426,0.724151,368.358671
BaselineOnly,0.011869,0.00384,0.010549,0.005332,84.25085
SVDpp,0.014795,0.008974,0.23316,0.0552,67.591908
KNNWithMeans,0.03062,0.0236,0.110686,0.490077,32.657942
SVD,0.036783,0.02483,0.099286,0.009914,27.186431
KNNWithZScore,0.039521,0.024978,0.147959,0.518372,25.303123
NMF,0.058026,0.039264,0.175097,0.008774,17.233628
SlopeOne,0.073788,0.053771,0.012463,0.033519,13.552275
CoClustering,0.096807,0.060656,0.150814,0.008269,10.329791


In [5]:
benchmark_complexity

Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time,Puntaje de Referencia
Algoritmo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KNNBasic,0.174525,0.007214,0.111034,0.538729,5.729829
SlopeOne,0.366903,0.01961,0.014821,0.025817,2.725513
KNNBaseline,0.477673,0.249096,0.10255,0.469287,2.093483
SVDpp,1.061456,0.805739,0.231534,0.050506,0.942102
SVD,1.729161,0.465078,0.100897,0.012616,0.578315
BaselineOnly,2.329557,0.614496,0.010193,0.004584,0.429266
CoClustering,3.670393,2.378375,0.153698,0.009563,0.27245
KNNWithMeans,4.097729,2.876826,0.108942,0.463959,0.244038
KNNWithZScore,5.84885,3.524999,0.13208,0.485883,0.170974
NMF,7.955726,4.920195,0.140372,0.009053,0.125696


In [6]:
benchmark_complexity12

Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time,Puntaje de Referencia
Algoritmo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KNNBasic,1.795403,0.066414,0.116891,0.584809,0.556978
SlopeOne,2.219657,0.086497,0.015493,0.022998,0.45052
KNNBaseline,3.119317,1.798025,0.115625,0.606859,0.320583
BaselineOnly,13.643369,4.710061,0.012201,0.005177,0.073296
CoClustering,32.355204,19.313516,0.19603,0.008117,0.030907
KNNWithMeans,36.991171,23.92069,0.118367,0.499011,0.027033
KNNWithZScore,58.304219,29.400281,0.13702,0.501577,0.017151
NMF,111.180922,47.721683,0.149936,0.00885,0.008994
NormalPredictor,126.622107,89.720437,0.007825,0.006057,0.007898
SVD,979.494162,974.319834,0.10079,0.011923,0.001021


In [7]:
train_formula, test_formula = train_test_split(data_formula, test_size=VAR_TESTSET_SIZE, random_state=VAR_SEED)
train_complexity, test_complexity = train_test_split(data_complexity, test_size=VAR_TESTSET_SIZE, random_state=VAR_SEED)
train_complexity12, test_complexity12 = train_test_split(data_complexity12, test_size=VAR_TESTSET_SIZE, random_state=VAR_SEED)

In [8]:
modelo_1 = crear_y_evaluar_modelo_surprise(train_formula, test_formula, NormalPredictor())
modelo_2 = crear_y_evaluar_modelo_surprise(train_complexity, test_complexity, NormalPredictor())
modelo_3 = crear_y_evaluar_modelo_surprise(train_complexity12, test_complexity12, NormalPredictor())

RMSE: 0.0964
MSE: 0.0093
MAE:  0.0726

RMSE: 14.5489
MSE: 211.6699
MAE:  10.8991

RMSE: 128.0208
MSE: 16389.3357
MAE:  90.9478



# Predicciones

In [9]:
# id => 0 | 1305
id_estudiante = 0
print(f'Ejercicios realizados por el usuario [ {id_estudiante} ]')
df_catalogo[df_catalogo['id_ejercicio'].isin(df_ratings[df_ratings['id_estudiante'] == id_estudiante]['id_ejercicio'].values)].sort_values(by=["complexity"], ascending=True)

Ejercicios realizados por el usuario [ 0 ]


Unnamed: 0,id_ejercicio,nombre,hito,skill,knowledge,complexity,complexity12,score
4,4,Nota Final,1,0,1,1,257,0.0
25,25,Ordenar tres nÃºmeros,1,0,1,1,257,0.0
17,17,Suma de los N primeros nÃºmeros naturales,1,0,2,2,258,0.017857
23,23,Conversor de Decimal a Binario,1,0,2,2,258,0.017857
3,3,NÃºmeros Primos,1,0,3,3,259,0.035714
10,10,Descomponer un nÃºmero,1,0,3,3,259,0.035714
18,18,Juego Adivina mi nÃºmero,1,0,3,3,259,0.035714
26,26,Contestador AutomÃ¡tico,1,0,3,3,259,0.035714
29,29,AprobaciÃ³n de CrÃ©ditos,1,0,3,3,259,0.035714
42,42,NÃºmeros Amigos,1,0,3,3,259,0.035714


In [None]:
print(f'Estudiante: {id_estudiante}\nModelo 1')
recomendaciones_modelo_1 = recomendaciones_top_n_surprise(modelo_1, df_ratings, columna_usuarios='id_estudiante', columna_items='id_ejercicio', id_usuario=id_estudiante, n_recomendaciones=10)
df_catalogo[df_catalogo['id_ejercicio'].isin(recomendaciones_modelo_1)]

Estudiante: 0
Modelo 1


Unnamed: 0,id_ejercicio,nombre,hito,skill,knowledge,complexity,complexity12,score
8,8,Distancia Levenshtein,2,3,7,55,567,0.290476
9,9,Validar Secuencias de ADN,2,2,7,39,551,0.27381
12,12,FunciÃ³n buscarTodas,2,2,7,39,551,0.27381
21,21,El antipoema,2,2,7,39,551,0.27381
28,28,Encriptador ROT13,2,2,7,39,551,0.27381
32,32,Cajero AutomÃ¡tico Nivel 1,1,0,3,3,259,0.035714
33,33,Alineamiento de Secuencias,2,4,7,71,583,0.307143
37,37,Jerigonzo,2,2,7,39,551,0.27381
38,38,Decodificador,2,2,7,39,551,0.27381
48,48,Sopa de Letras,2,5,7,87,599,0.32381


In [None]:
print(f'Estudiante: {id_estudiante}\nModelo 2')
recomendaciones_modelo_2 = recomendaciones_top_n_surprise(modelo_2, df_ratings, columna_usuarios='id_estudiante', columna_items='id_ejercicio', id_usuario=id_estudiante, n_recomendaciones=10)
df_catalogo[df_catalogo['id_ejercicio'].isin(recomendaciones_modelo_2)]

Estudiante: 0
Modelo 2


Unnamed: 0,id_ejercicio,nombre,hito,skill,knowledge,complexity,complexity12,score
8,8,Distancia Levenshtein,2,3,7,55,567,0.290476
9,9,Validar Secuencias de ADN,2,2,7,39,551,0.27381
11,11,Adivina la palabra,2,2,7,39,551,0.27381
12,12,FunciÃ³n buscarTodas,2,2,7,39,551,0.27381
21,21,El antipoema,2,2,7,39,551,0.27381
28,28,Encriptador ROT13,2,2,7,39,551,0.27381
32,32,Cajero AutomÃ¡tico Nivel 1,1,0,3,3,259,0.035714
35,35,Factores Primos,1,1,3,19,275,0.052381
37,37,Jerigonzo,2,2,7,39,551,0.27381
38,38,Decodificador,2,2,7,39,551,0.27381


In [None]:
print(f'Estudiante: {id_estudiante}\nModelo 3')
recomendaciones_modelo_3 = recomendaciones_top_n_surprise(modelo_3, df_ratings, columna_usuarios='id_estudiante', columna_items='id_ejercicio', id_usuario=id_estudiante, n_recomendaciones=10)
df_catalogo[df_catalogo['id_ejercicio'].isin(recomendaciones_modelo_3)]

Estudiante: 0
Modelo 3


Unnamed: 0,id_ejercicio,nombre,hito,skill,knowledge,complexity,complexity12,score
7,7,Resolver un sistema de ecuaciones,1,0,1,1,257,0.0
8,8,Distancia Levenshtein,2,3,7,55,567,0.290476
9,9,Validar Secuencias de ADN,2,2,7,39,551,0.27381
11,11,Adivina la palabra,2,2,7,39,551,0.27381
13,13,Cajero AutomÃ¡tico Nivel 2,1,1,3,19,275,0.052381
21,21,El antipoema,2,2,7,39,551,0.27381
33,33,Alineamiento de Secuencias,2,4,7,71,583,0.307143
35,35,Factores Primos,1,1,3,19,275,0.052381
37,37,Jerigonzo,2,2,7,39,551,0.27381
48,48,Sopa de Letras,2,5,7,87,599,0.32381
