In [1]:
# LIBRERIAS
import random 
import numpy as np 
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from surprise import Dataset, Reader, accuracy, BaselineOnly, SVD
from surprise.model_selection import cross_validate, train_test_split

# SEMILLA
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)

In [2]:
def factorization_to_ratings(df_exercises: pd.DataFrame, df_matrix: pd.DataFrame)-> pd.DataFrame:
    rows = []
    for _, row_matrix in df_matrix.iterrows():
        user_id = row_matrix['rut']
        for exercise in df_matrix.columns[1:]:
            if row_matrix[exercise] == 1:
                row_exercise = df_exercises.iloc[int(exercise.lstrip('e'))]
                new_row = {'rut': user_id}
                new_row.update(row_exercise.to_dict())
                rows.append(new_row)
    df = pd.DataFrame(rows)      
    return df


def suprise_get_top_n_recommendations(df, model, feature_users: str = 'rut', feature_items: str = 'oid', user_id: int = 100, n_recommenders: int = 10):
    """
    df: dataframe de los registros de los ejercicios hechos por los usuarios.
    model: modelo que se usara para predecir.
    user_id: rut del usuario al que se quiere generar las recomendaciones.
    n_recommenders: cantidad de las recomendaciones que se le quiere dar al usuario.
    """

    items_interacted = df[df[feature_users] == user_id][feature_items].unique()
    all_items = df[feature_items].unique()

    items_pairs = [(user_id, ejer_oid, 0) for ejer_oid in list(set(all_items) - set(items_interacted))]

    predictions = model.test(items_pairs)

    top_n_recommendations = sorted(predictions, key = lambda x: x.est, reverse=True)[:n_recommenders]
    user_recommendation = [str(pred.iid) for pred in top_n_recommendations]
    
    return user_recommendation


In [3]:
# RUTA 
DIR_DATA_LIMPIA = '../datos/data-limpia'
FACTORIZATION = 'matriz-de-factorizacion.csv'
EJERCICIOS = 'catalogo-de-ejercicios.csv'

# IMPORTAR DATA
ejercicios = pd.read_csv(f"{DIR_DATA_LIMPIA}/{EJERCICIOS}", sep=",", encoding="latin1")
matrix_factorization = pd.read_csv(f"{DIR_DATA_LIMPIA}/{FACTORIZATION}", sep=",", encoding="latin1")
df_ejercicios = ejercicios.drop(labels=['nombre', 'h4','h3','h2','h1','s4','s3','s2','s1','k4','k3','k2','k1'], axis=1)

In [4]:
ejercicios.head()

Unnamed: 0,oid,nombre,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score
0,0973dae0e1b74ab8baa8d94339ee3ae6,CÃ¡lculo del dÃ­gito verificador del rut,0,0,0,1,0,0,0,1,0,1,1,1,279
1,16f619db31204ded9418136c4587ddd8,Calculadora GeomÃ©trica,0,0,1,0,0,0,0,1,0,0,0,1,529
2,17022c9ceac94ec5b2e7bc934c7b2d6f,Subsecuencias de ADN,0,0,1,0,0,0,1,1,0,1,1,1,567
3,171b5e86d4fb47268f2692587fbec073,NÃºmeros Primos,0,0,0,1,0,0,0,0,0,0,1,1,259
4,2437df93d3f44a87b00834072aeb1ab0,Nota Final,0,0,0,1,0,0,0,0,0,0,0,1,257


In [5]:
df_ejercicios.head()

Unnamed: 0,oid,score
0,0973dae0e1b74ab8baa8d94339ee3ae6,279
1,16f619db31204ded9418136c4587ddd8,529
2,17022c9ceac94ec5b2e7bc934c7b2d6f,567
3,171b5e86d4fb47268f2692587fbec073,259
4,2437df93d3f44a87b00834072aeb1ab0,257


In [6]:
exercises_users = factorization_to_ratings(df_ejercicios, matrix_factorization)

In [7]:
exercises_users

Unnamed: 0,rut,oid,score
0,0,0973dae0e1b74ab8baa8d94339ee3ae6,279
1,0,16f619db31204ded9418136c4587ddd8,529
2,0,171b5e86d4fb47268f2692587fbec073,259
3,0,2437df93d3f44a87b00834072aeb1ab0,257
4,0,29f15ef8dc32426f945f64e28c910a57,531
...,...,...,...
9842,1304,ac7382763e484d37908da54c076f7577,259
9843,1304,baf2f8e0167a4e089d2cec16582c9ae9,275
9844,1304,d8395f43e4a1454d90346ac5a1ba561a,259
9845,1305,80c61dae74fa4915bf272ab17dfa62ff,258


In [8]:
normalizador = MinMaxScaler()
exercises_users['score_normalized'] = normalizador.fit_transform(exercises_users[['score']])

In [9]:
print(exercises_users.shape)
exercises_users.head()

(9847, 4)


Unnamed: 0,rut,oid,score,score_normalized
0,0,0973dae0e1b74ab8baa8d94339ee3ae6,279,0.064327
1,0,16f619db31204ded9418136c4587ddd8,529,0.795322
2,0,171b5e86d4fb47268f2692587fbec073,259,0.005848
3,0,2437df93d3f44a87b00834072aeb1ab0,257,0.0
4,0,29f15ef8dc32426f945f64e28c910a57,531,0.80117


In [10]:
reader_normal = Reader(rating_scale=(0, 1))
reader_normalized = Reader(rating_scale=(257, 2247))

In [11]:
data_normal = Dataset.load_from_df(exercises_users[["rut", "oid", "score"]], reader_normal)
data_normalized = Dataset.load_from_df(exercises_users[["rut", "oid", "score_normalized"]], reader_normalized)

In [12]:
trainset_normal, testset_normal = train_test_split(data_normal, test_size=0.20, random_state=42)
trainset_normalized, testset_normalized = train_test_split(data_normalized, test_size=0.20, random_state=42)

In [13]:
# MODEL NORMAL
model_normal = SVD()
model_normal.fit(trainset_normal)
predictions_normal = model_normal.test(testset_normal)
accuracy.rmse(predictions_normal)
accuracy.mse(predictions_normal)
accuracy.mae(predictions_normal)
print()

RMSE: 317.6669
MSE: 100912.2731
MAE:  301.9269



In [14]:
results_normal = cross_validate(BaselineOnly(), data_normal, measures=["RMSE", "MAE"], cv=5, verbose=True)
results_normal = pd.DataFrame(results_normal)
results_normal

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    320.3049317.5000321.2849318.8192319.0418319.39021.2997  
MAE (testset)     303.8822301.9076304.6938302.8939302.9797303.27140.9470  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


Unnamed: 0,test_rmse,test_mae,fit_time,test_time
0,320.304894,303.882234,0.003478,0.002958
1,317.500038,301.907614,0.004851,0.002592
2,321.284876,304.693753,0.004814,0.003497
3,318.819184,302.893855,0.003569,0.002672
4,319.041777,302.979685,0.004766,0.002903


In [15]:
# MODEL NORMALIZED
model_normalized = SVD()
model_normalized.fit(trainset_normalized)
predictions_normalized = model_normalized.test(testset_normalized)
accuracy.rmse(predictions_normalized)
accuracy.mse(predictions_normalized)
accuracy.mae(predictions_normalized)
print()

RMSE: 256.8659
MSE: 65980.0768
MAE:  256.8657



In [16]:
results_normalized = cross_validate(BaselineOnly(), data_normalized, measures=["RMSE", "MAE"], cv=5, verbose=True)
results_normalized = pd.DataFrame(results_normalized)
results_normalized

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    256.8628256.8588256.8536256.8722256.8624256.86190.0061  
MAE (testset)     256.8626256.8586256.8534256.8720256.8623256.86180.0061  
Fit time          0.00    0.01    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


Unnamed: 0,test_rmse,test_mae,fit_time,test_time
0,256.862782,256.862616,0.003418,0.003463
1,256.858778,256.858607,0.005809,0.00289
2,256.853578,256.853402,0.004115,0.00335
3,256.872157,256.872001,0.004072,0.002522
4,256.862439,256.862273,0.004985,0.002628


# PREDICCIONES

In [17]:
# Random User
random_user_id = random.randint(1, exercises_users['rut'].unique().max())
print(f'User: {random_user_id}')

User: 229


In [18]:
# Recommendations
recommendations_normal = suprise_get_top_n_recommendations(df=exercises_users, model=model_normal, user_id=random_user_id)
recommendations_normalized = suprise_get_top_n_recommendations(df=exercises_users, model=model_normalized, user_id=random_user_id)

In [19]:
print(f'Ejercicios realizados por el usuario [ {random_user_id} ]')
ejercicios[ejercicios['oid'].isin(exercises_users[exercises_users['rut'] == random_user_id]['oid'].values)].sort_values(by=["score"], ascending=True)

Ejercicios realizados por el usuario [ 229 ]


Unnamed: 0,oid,nombre,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score
4,2437df93d3f44a87b00834072aeb1ab0,Nota Final,0,0,0,1,0,0,0,0,0,0,0,1,257
25,89f44e7f5842479fb283e43c52ce067b,Ordenar tres nÃºmeros,0,0,0,1,0,0,0,0,0,0,0,1,257
23,80c61dae74fa4915bf272ab17dfa62ff,Conversor de Decimal a Binario,0,0,0,1,0,0,0,0,0,0,1,0,258
17,718578451f3f4eca87437cadfe98d688,Suma de los N primeros nÃºmeros naturales,0,0,0,1,0,0,0,0,0,0,1,0,258
3,171b5e86d4fb47268f2692587fbec073,NÃºmeros Primos,0,0,0,1,0,0,0,0,0,0,1,1,259
32,ac7382763e484d37908da54c076f7577,Cajero AutomÃ¡tico Nivel 1,0,0,0,1,0,0,0,0,0,0,1,1,259
10,46850a246d48484b8f104f8aab5679b6,Descomponer un nÃºmero,0,0,0,1,0,0,0,0,0,0,1,1,259
29,a3963220090f4e50a266ce53d33b9841,AprobaciÃ³n de CrÃ©ditos,0,0,0,1,0,0,0,0,0,0,1,1,259
26,8f24397e36034cccb71e9d578975c33d,Contestador AutomÃ¡tico,0,0,0,1,0,0,0,0,0,0,1,1,259
22,7f60644b0a1b484681ae5c8e36166c58,Signo del ZodÃ­aco,0,0,0,1,0,0,0,1,0,0,0,1,273


In [20]:
print(f'Modelo 1 (Normal)\nUsuario: {random_user_id}')
ejercicios[ejercicios['oid'].isin(recommendations_normal)].sort_values(by=["score"], ascending=True)

Modelo 1 (Normal)
Usuario: 229


Unnamed: 0,oid,nombre,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score
7,31ea1c1b12174428b5a67a6576627de9,Resolver un sistema de ecuaciones,0,0,0,1,0,0,0,0,0,0,0,1,257
42,d8395f43e4a1454d90346ac5a1ba561a,NÃºmeros Amigos,0,0,0,1,0,0,0,0,0,0,1,1,259
13,52620b0c858a4c59bc324b65278d28bd,Cajero AutomÃ¡tico Nivel 2,0,0,0,1,0,0,0,1,0,0,1,1,275
11,4d51d13b9a2848a2803f7d1143fde6d4,Adivina la palabra,0,0,1,0,0,0,1,0,0,1,1,1,551
12,4d5ed23727c04a8790f2e4ab3f09767e,FunciÃ³n buscarTodas,0,0,1,0,0,0,1,0,0,1,1,1,551
31,abc052e584734d0f8121d5e5ca659f82,Carro de Compras,0,0,1,0,0,0,1,0,0,1,1,1,551
38,c0d9a3b2f7be46dab1f792c7fbd365b9,Decodificador,0,0,1,0,0,0,1,0,0,1,1,1,551
2,17022c9ceac94ec5b2e7bc934c7b2d6f,Subsecuencias de ADN,0,0,1,0,0,0,1,1,0,1,1,1,567
8,37e4f4a1e8174e9496d21b00d67fc8f1,Distancia Levenshtein,0,0,1,0,0,0,1,1,0,1,1,1,567
48,e72db1cb2e9f400990cfa3b464d0391a,Sopa de Letras,0,0,1,0,0,1,0,1,0,1,1,1,599


In [21]:
print(f'Modelo 1 (Normal)\nUsuario: {random_user_id}')
ejercicios[ejercicios['oid'].isin(recommendations_normalized)].sort_values(by=["score"], ascending=True)

Modelo 1 (Normal)
Usuario: 229


Unnamed: 0,oid,nombre,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score
7,31ea1c1b12174428b5a67a6576627de9,Resolver un sistema de ecuaciones,0,0,0,1,0,0,0,0,0,0,0,1,257
42,d8395f43e4a1454d90346ac5a1ba561a,NÃºmeros Amigos,0,0,0,1,0,0,0,0,0,0,1,1,259
13,52620b0c858a4c59bc324b65278d28bd,Cajero AutomÃ¡tico Nivel 2,0,0,0,1,0,0,0,1,0,0,1,1,275
11,4d51d13b9a2848a2803f7d1143fde6d4,Adivina la palabra,0,0,1,0,0,0,1,0,0,1,1,1,551
12,4d5ed23727c04a8790f2e4ab3f09767e,FunciÃ³n buscarTodas,0,0,1,0,0,0,1,0,0,1,1,1,551
31,abc052e584734d0f8121d5e5ca659f82,Carro de Compras,0,0,1,0,0,0,1,0,0,1,1,1,551
38,c0d9a3b2f7be46dab1f792c7fbd365b9,Decodificador,0,0,1,0,0,0,1,0,0,1,1,1,551
2,17022c9ceac94ec5b2e7bc934c7b2d6f,Subsecuencias de ADN,0,0,1,0,0,0,1,1,0,1,1,1,567
8,37e4f4a1e8174e9496d21b00d67fc8f1,Distancia Levenshtein,0,0,1,0,0,0,1,1,0,1,1,1,567
48,e72db1cb2e9f400990cfa3b464d0391a,Sopa de Letras,0,0,1,0,0,1,0,1,0,1,1,1,599
