In [1]:
import random 
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import defaultdict
from surprise import Dataset, Reader, accuracy, SVD, KNNBasic, KNNBaseline, BaselineOnly, NormalPredictor
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV, KFold, PredefinedKFold

from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)

In [3]:
def maxtrix_fact_to_ratings(df_exercises, df_matrix):
        rows = []
        for _, row_matrix in df_matrix.iterrows():
                user_id = row_matrix['rut']
                for exercise in df_matrix.columns[1:]:
                        if row_matrix[exercise] == 1:
                                row_exercise = df_exercises.iloc[int(exercise.lstrip('e'))]
                                new_row = {'rut': user_id}
                                new_row.update(row_exercise.to_dict())
                                rows.append(new_row)
        df = pd.DataFrame(rows)      
        return df

def get_top_n_recommendations(model, df, user_id, n_recommenders=10):
  users_ejers = df[df['rut'] == user_id]['oid'].unique()
  all_ejers = df['oid'].unique()

  ejers_to_predict = list(set(all_ejers) - set(users_ejers))
  ejers_pairs = [(user_id, ejer_oid, 0) for ejer_oid in ejers_to_predict]
  predictions_cf = model.test(ejers_pairs)

  top_n_recommendations = sorted(predictions_cf, key = lambda x: x.est, reverse=True)[:n_recommenders]
  top_n_ejers_ids = [str(pred.iid) for pred in top_n_recommendations]
  return top_n_ejers_ids

In [4]:
# RUTA 
DIR_DATA_LIMPIA = '../../datos/data-limpia'

# IMPORTAR DATA

EJERCICIOS = 'catalogo-de-ejercicios-onehotencoded.csv'
ejercicios = pd.read_csv(f"{DIR_DATA_LIMPIA}/{EJERCICIOS}", sep=",", encoding="latin1")

M1 = 'matriz-ejercicios-1.csv'
M2021 = 'matriz-ejercicios-2021.csv'
df_matrix_e1 = pd.read_csv(f"{DIR_DATA_LIMPIA}/{M1}", sep=",", encoding="latin1")
df_matrix_e2 = pd.read_csv(f"{DIR_DATA_LIMPIA}/{M2021}", sep=",", encoding="latin1")

In [5]:
ejercicios.head()

Unnamed: 0,idx,oid,nombre,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1
0,4,2437df93d3f44a87b00834072aeb1ab0,Nota Final,0,0,0,1,0,0,0,0,0,0,0,1
1,7,31ea1c1b12174428b5a67a6576627de9,Resolver un sistema de ecuaciones,0,0,0,1,0,0,0,0,0,0,0,1
2,25,89f44e7f5842479fb283e43c52ce067b,Ordenar tres nÃºmeros,0,0,0,1,0,0,0,0,0,0,0,1
3,17,718578451f3f4eca87437cadfe98d688,Suma de los N primeros nÃºmeros naturales,0,0,0,1,0,0,0,0,0,0,1,0
4,23,80c61dae74fa4915bf272ab17dfa62ff,Conversor de Decimal a Binario,0,0,0,1,0,0,0,0,0,0,1,0


In [6]:
ejercicios['score'] = ejercicios.iloc[:, 3:15].apply(lambda x: int(''.join(map(str, x)), 2), axis=1)

df_ejercicios = ejercicios.drop(labels=['idx', 'nombre', 'h4','h3','h2','h1','s4','s3','s2','s1','k4','k3','k2','k1'], axis=1)

df_ejercicios.head()

Unnamed: 0,oid,score
0,2437df93d3f44a87b00834072aeb1ab0,257
1,31ea1c1b12174428b5a67a6576627de9,257
2,89f44e7f5842479fb283e43c52ce067b,257
3,718578451f3f4eca87437cadfe98d688,258
4,80c61dae74fa4915bf272ab17dfa62ff,258


In [7]:
df1 = maxtrix_fact_to_ratings(df_ejercicios, df_matrix_e1)
df2 = maxtrix_fact_to_ratings(df_ejercicios, df_matrix_e2)

In [8]:
normalizador = MinMaxScaler()
df1['score_normalized'] = normalizador.fit_transform(df1[['score']])
df2['score_normalized'] = normalizador.fit_transform(df2[['score']])

In [9]:
print(df1.shape)
df1.head()

(6487, 4)


Unnamed: 0,rut,oid,score,score_normalized
0,0,2437df93d3f44a87b00834072aeb1ab0,257,0.0
1,0,31ea1c1b12174428b5a67a6576627de9,257,0.0
2,0,89f44e7f5842479fb283e43c52ce067b,257,0.0
3,0,718578451f3f4eca87437cadfe98d688,258,0.002924
4,0,80c61dae74fa4915bf272ab17dfa62ff,258,0.002924


In [10]:
print(df2.shape)
df2.head()

(6238, 4)


Unnamed: 0,rut,oid,score,score_normalized
0,1,2437df93d3f44a87b00834072aeb1ab0,257,0.0
1,1,89f44e7f5842479fb283e43c52ce067b,257,0.0
2,1,718578451f3f4eca87437cadfe98d688,258,0.003401
3,1,80c61dae74fa4915bf272ab17dfa62ff,258,0.003401
4,1,171b5e86d4fb47268f2692587fbec073,259,0.006803


In [11]:
reader_normal = Reader(rating_scale=(0, 1))
reader_normalized = Reader(rating_scale=(257, 2247))

# DATAFRAME 1 (df1)

In [12]:
# Dataframe 1
data1_normal = Dataset.load_from_df(df1[["rut", "oid", "score"]], reader_normal)
data1_normalized = Dataset.load_from_df(df1[["rut", "oid", "score_normalized"]], reader_normalized)

trainset_1_normal, testset_1_normal = train_test_split(data1_normal, test_size=0.20, random_state=42)
trainset_1_normalized, testset_1_normalized = train_test_split(data1_normalized, test_size=0.20, random_state=42)

In [13]:
# MODEL 1 (NORMAL)
model_1_svd_normal = SVD()
model_1_svd_normal.fit(trainset_1_normal)
predictions_1_normal = model_1_svd_normal.test(testset_1_normal)
accuracy.rmse(predictions_1_normal)
results1_normal = cross_validate(BaselineOnly(), data1_normal, measures=["RMSE", "MAE"], cv=5, verbose=True)
results1_normal = pd.DataFrame(results1_normal)
results1_normal

RMSE: 345.0079
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    346.0988348.8059341.0810348.6754350.4407347.02043.2787  
MAE (testset)     325.4653327.8097321.1997327.7571329.2575326.29792.8234  
Fit time          0.00    0.00    0.01    0.01    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


Unnamed: 0,test_rmse,test_mae,fit_time,test_time
0,346.098791,325.465331,0.004315,0.002998
1,348.805888,327.809707,0.004705,0.002535
2,341.081038,321.199692,0.005253,0.002498
3,348.675444,327.757132,0.005209,0.004299
4,350.440702,329.257517,0.004439,0.002514


In [14]:
# MODEL 2 (NORMALIZED)
model_1_svd_normalized = SVD()
model_1_svd_normalized.fit(trainset_1_normalized)
predictions_1_normalized = model_1_svd_normalized.test(testset_1_normalized)
accuracy.rmse(predictions_1_normalized)
results1_normalized = cross_validate(BaselineOnly(), data1_normalized, verbose=True)
results1_normalized = pd.DataFrame(results1_normalized)
results1_normalized

RMSE: 256.7996
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    256.8006256.7931256.7858256.8037256.7903256.79470.0066  
MAE (testset)     256.8003256.7928256.7856256.8034256.7901256.79440.0066  
Fit time          0.00    0.00    0.00    0.00    0.01    0.00    0.00    
Test time         0.01    0.01    0.00    0.00    0.01    0.00    0.00    


Unnamed: 0,test_rmse,test_mae,fit_time,test_time
0,256.800578,256.80035,0.003923,0.00655
1,256.793061,256.792828,0.004678,0.005385
2,256.785805,256.785565,0.004789,0.003039
3,256.80366,256.803435,0.004469,0.002536
4,256.790307,256.790071,0.006969,0.005491


In [15]:
# PREDICCIONES

# random user
user_id = random.randint(1, df1['rut'].unique().max())

# recommendations
recommendations1 = get_top_n_recommendations(model_1_svd_normal, df1, user_id)
recommendations2 = get_top_n_recommendations(model_1_svd_normalized, df1, user_id)

print(f'Ejercicios realizados por el usuario [ {user_id} ]')
ejercicios[ejercicios['oid'].isin(df1[df1['rut'] == user_id]['oid'].values)]

Ejercicios realizados por el usuario [ 328 ]


Unnamed: 0,idx,oid,nombre,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score


In [16]:
print(f'Modelo 1 (Normal)\nUsuario: {user_id}')
ejercicios[ejercicios['oid'].isin(recommendations1)]

Modelo 1 (Normal)
Usuario: 328


Unnamed: 0,idx,oid,nombre,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score
4,23,80c61dae74fa4915bf272ab17dfa62ff,Conversor de Decimal a Binario,0,0,0,1,0,0,0,0,0,0,1,0,258
5,3,171b5e86d4fb47268f2692587fbec073,NÃºmeros Primos,0,0,0,1,0,0,0,0,0,0,1,1,259
6,10,46850a246d48484b8f104f8aab5679b6,Descomponer un nÃºmero,0,0,0,1,0,0,0,0,0,0,1,1,259
7,18,729d37da8f2d46f3af2d891df04949ef,Juego Adivina mi nÃºmero,0,0,0,1,0,0,0,0,0,0,1,1,259
11,42,d8395f43e4a1454d90346ac5a1ba561a,NÃºmeros Amigos,0,0,0,1,0,0,0,0,0,0,1,1,259
13,13,52620b0c858a4c59bc324b65278d28bd,Cajero AutomÃ¡tico Nivel 2,0,0,0,1,0,0,0,1,0,0,1,1,275
18,44,db7987d040dc469a9c247d54dd72939a,NÃºmeros Perfectos,0,0,1,0,0,0,0,1,0,0,1,1,531
22,21,7da6cedd04c44a15b5e421440253acff,El antipoema,0,0,1,0,0,0,1,0,0,1,1,1,551
24,31,abc052e584734d0f8121d5e5ca659f82,Carro de Compras,0,0,1,0,0,0,1,0,0,1,1,1,551
25,37,c0a2cd8059d44afb822a031066678092,Jerigonzo,0,0,1,0,0,0,1,0,0,1,1,1,551


In [17]:
print(f'Modelo 1 (Normal)\nUsuario: {user_id}')
ejercicios[ejercicios['oid'].isin(recommendations2)]

Modelo 1 (Normal)
Usuario: 328


Unnamed: 0,idx,oid,nombre,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score
4,23,80c61dae74fa4915bf272ab17dfa62ff,Conversor de Decimal a Binario,0,0,0,1,0,0,0,0,0,0,1,0,258
5,3,171b5e86d4fb47268f2692587fbec073,NÃºmeros Primos,0,0,0,1,0,0,0,0,0,0,1,1,259
6,10,46850a246d48484b8f104f8aab5679b6,Descomponer un nÃºmero,0,0,0,1,0,0,0,0,0,0,1,1,259
7,18,729d37da8f2d46f3af2d891df04949ef,Juego Adivina mi nÃºmero,0,0,0,1,0,0,0,0,0,0,1,1,259
11,42,d8395f43e4a1454d90346ac5a1ba561a,NÃºmeros Amigos,0,0,0,1,0,0,0,0,0,0,1,1,259
13,13,52620b0c858a4c59bc324b65278d28bd,Cajero AutomÃ¡tico Nivel 2,0,0,0,1,0,0,0,1,0,0,1,1,275
18,44,db7987d040dc469a9c247d54dd72939a,NÃºmeros Perfectos,0,0,1,0,0,0,0,1,0,0,1,1,531
22,21,7da6cedd04c44a15b5e421440253acff,El antipoema,0,0,1,0,0,0,1,0,0,1,1,1,551
24,31,abc052e584734d0f8121d5e5ca659f82,Carro de Compras,0,0,1,0,0,0,1,0,0,1,1,1,551
25,37,c0a2cd8059d44afb822a031066678092,Jerigonzo,0,0,1,0,0,0,1,0,0,1,1,1,551


# DATAFRAME 2 (df2)

In [18]:
# Dataframe 2
data2_normal = Dataset.load_from_df(df2[["rut", "oid", "score"]], reader_normal)
data2_normalized = Dataset.load_from_df(df2[["rut", "oid", "score_normalized"]], reader_normalized)

trainset_2_normal, testset_2_normal = train_test_split(data2_normal, test_size=0.20, random_state=42)
trainset_2_normalized, testset_2_normalized = train_test_split(data2_normalized, test_size=0.20, random_state=42)

In [19]:
# MODEL 2 (NORMAL)
model_2_svd_normal = SVD()
model_2_svd_normal.fit(trainset_2_normal)
predictions_2_normal = model_2_svd_normal.test(testset_2_normal)
accuracy.rmse(predictions_2_normal)
results2_normal = cross_validate(BaselineOnly(), data2_normal, measures=["RMSE", "MAE"], cv=5, verbose=True)
results2_normal = pd.DataFrame(results2_normal)
results2_normal

RMSE: 263.3301
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    264.6023265.3117265.3445263.4723264.3440264.61490.6924  
MAE (testset)     263.1939263.6034263.7516262.5188263.1957263.25270.4283  
Fit time          0.00    0.00    0.01    0.01    0.01    0.01    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


Unnamed: 0,test_rmse,test_mae,fit_time,test_time
0,264.602257,263.19391,0.004833,0.002559
1,265.311698,263.603365,0.004927,0.00244
2,265.344515,263.751603,0.006428,0.002795
3,263.47226,262.518845,0.007193,0.002518
4,264.343982,263.19567,0.007054,0.002492


In [20]:
# MODEL 2 (NORMALIZED)
model_2_svd_normalized = SVD()
model_2_svd_normalized.fit(trainset_2_normalized)
predictions_2_normalized = model_2_svd_normalized.test(testset_2_normalized)
accuracy.rmse(predictions_2_normalized)
results2_normalized = cross_validate(BaselineOnly(), data2_normalized, verbose=True)
results2_normalized = pd.DataFrame(results2_normalized)
results2_normalized

RMSE: 256.9784
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    256.9748256.9766256.9729256.9762256.9762256.97530.0014  
MAE (testset)     256.9748256.9766256.9729256.9762256.9762256.97530.0014  
Fit time          0.01    0.01    0.01    0.02    0.01    0.01    0.00    
Test time         0.00    0.00    0.01    0.00    0.01    0.00    0.00    


Unnamed: 0,test_rmse,test_mae,fit_time,test_time
0,256.974845,256.974828,0.009096,0.002943
1,256.976636,256.976621,0.00914,0.003704
2,256.972882,256.972863,0.011431,0.005637
3,256.976166,256.976152,0.018749,0.004739
4,256.976206,256.97619,0.007658,0.007494


In [21]:
# PREDICCIONES

# random user
user_id = random.randint(1, df2['rut'].unique().max())

# recommendations
recommendations1 = get_top_n_recommendations(model_2_svd_normal, df2, user_id)
recommendations2 = get_top_n_recommendations(model_2_svd_normalized, df2, user_id)

print(f'Ejercicios realizados por el usuario [ {user_id} ]')
ejercicios[ejercicios['oid'].isin(df2[df2['rut'] == user_id]['oid'].values)]

Ejercicios realizados por el usuario [ 115 ]


Unnamed: 0,idx,oid,nombre,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score
0,4,2437df93d3f44a87b00834072aeb1ab0,Nota Final,0,0,0,1,0,0,0,0,0,0,0,1,257
2,25,89f44e7f5842479fb283e43c52ce067b,Ordenar tres nÃºmeros,0,0,0,1,0,0,0,0,0,0,0,1,257
3,17,718578451f3f4eca87437cadfe98d688,Suma de los N primeros nÃºmeros naturales,0,0,0,1,0,0,0,0,0,0,1,0,258
8,26,8f24397e36034cccb71e9d578975c33d,Contestador AutomÃ¡tico,0,0,0,1,0,0,0,0,0,0,1,1,259
9,29,a3963220090f4e50a266ce53d33b9841,AprobaciÃ³n de CrÃ©ditos,0,0,0,1,0,0,0,0,0,0,1,1,259


In [22]:
print(f'Modelo 1 (Normal)\nUsuario: {user_id}')
ejercicios[ejercicios['oid'].isin(recommendations1)]

Modelo 1 (Normal)
Usuario: 115


Unnamed: 0,idx,oid,nombre,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score
4,23,80c61dae74fa4915bf272ab17dfa62ff,Conversor de Decimal a Binario,0,0,0,1,0,0,0,0,0,0,1,0,258
5,3,171b5e86d4fb47268f2692587fbec073,NÃºmeros Primos,0,0,0,1,0,0,0,0,0,0,1,1,259
6,10,46850a246d48484b8f104f8aab5679b6,Descomponer un nÃºmero,0,0,0,1,0,0,0,0,0,0,1,1,259
7,18,729d37da8f2d46f3af2d891df04949ef,Juego Adivina mi nÃºmero,0,0,0,1,0,0,0,0,0,0,1,1,259
11,42,d8395f43e4a1454d90346ac5a1ba561a,NÃºmeros Amigos,0,0,0,1,0,0,0,0,0,0,1,1,259
13,13,52620b0c858a4c59bc324b65278d28bd,Cajero AutomÃ¡tico Nivel 2,0,0,0,1,0,0,0,1,0,0,1,1,275
15,0,0973dae0e1b74ab8baa8d94339ee3ae6,CÃ¡lculo del dÃ­gito verificador del rut,0,0,0,1,0,0,0,1,0,1,1,1,279
18,44,db7987d040dc469a9c247d54dd72939a,NÃºmeros Perfectos,0,0,1,0,0,0,0,1,0,0,1,1,531
22,21,7da6cedd04c44a15b5e421440253acff,El antipoema,0,0,1,0,0,0,1,0,0,1,1,1,551
25,37,c0a2cd8059d44afb822a031066678092,Jerigonzo,0,0,1,0,0,0,1,0,0,1,1,1,551


In [23]:
print(f'Modelo 1 (Normal)\nUsuario: {user_id}')
ejercicios[ejercicios['oid'].isin(recommendations2)]

Modelo 1 (Normal)
Usuario: 115


Unnamed: 0,idx,oid,nombre,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score
4,23,80c61dae74fa4915bf272ab17dfa62ff,Conversor de Decimal a Binario,0,0,0,1,0,0,0,0,0,0,1,0,258
5,3,171b5e86d4fb47268f2692587fbec073,NÃºmeros Primos,0,0,0,1,0,0,0,0,0,0,1,1,259
6,10,46850a246d48484b8f104f8aab5679b6,Descomponer un nÃºmero,0,0,0,1,0,0,0,0,0,0,1,1,259
7,18,729d37da8f2d46f3af2d891df04949ef,Juego Adivina mi nÃºmero,0,0,0,1,0,0,0,0,0,0,1,1,259
11,42,d8395f43e4a1454d90346ac5a1ba561a,NÃºmeros Amigos,0,0,0,1,0,0,0,0,0,0,1,1,259
13,13,52620b0c858a4c59bc324b65278d28bd,Cajero AutomÃ¡tico Nivel 2,0,0,0,1,0,0,0,1,0,0,1,1,275
15,0,0973dae0e1b74ab8baa8d94339ee3ae6,CÃ¡lculo del dÃ­gito verificador del rut,0,0,0,1,0,0,0,1,0,1,1,1,279
18,44,db7987d040dc469a9c247d54dd72939a,NÃºmeros Perfectos,0,0,1,0,0,0,0,1,0,0,1,1,531
22,21,7da6cedd04c44a15b5e421440253acff,El antipoema,0,0,1,0,0,0,1,0,0,1,1,1,551
25,37,c0a2cd8059d44afb822a031066678092,Jerigonzo,0,0,1,0,0,0,1,0,0,1,1,1,551
