In [1]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

# SEMILLA
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)

2024-07-10 01:29:49.243495: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-10 01:29:49.256595: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-10 01:29:49.273495: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-10 01:29:49.273528: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-10 01:29:49.284887: I tensorflow/core/platform/cpu_feature_gua

In [2]:
# RUTA 
DIR_DATA_LIMPIA = '../datos/data-limpia'
FACTORIZATION = 'matriz-de-factorizacion.csv'
EJERCICIOS = 'catalogo-de-ejercicios.csv'

# IMPORTAR DATA
ejercicios = pd.read_csv(f"{DIR_DATA_LIMPIA}/{EJERCICIOS}", sep=",", encoding="latin1")
matrix_factorization = pd.read_csv(f"{DIR_DATA_LIMPIA}/{FACTORIZATION}", sep=",", encoding="latin1")

In [3]:
# FUNCIONES
def factorization_to_ratings(df_exercises: pd.DataFrame, df_matrix: pd.DataFrame)-> pd.DataFrame:
    rows = []
    for _, row_matrix in df_matrix.iterrows():
        user_id = row_matrix['rut']
        for exercise in df_matrix.columns[1:]:
            if row_matrix[exercise] == 1:
                row_exercise = df_exercises.iloc[int(exercise.lstrip('e'))]
                new_row = {'rut': user_id}
                new_row.update(row_exercise.to_dict())
                rows.append(new_row)
    df = pd.DataFrame(rows)      
    return df

In [4]:
ejercicios.head()

Unnamed: 0,oid,nombre,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score
0,0973dae0e1b74ab8baa8d94339ee3ae6,CÃ¡lculo del dÃ­gito verificador del rut,0,0,0,1,0,0,0,1,0,1,1,1,279
1,16f619db31204ded9418136c4587ddd8,Calculadora GeomÃ©trica,0,0,1,0,0,0,0,1,0,0,0,1,529
2,17022c9ceac94ec5b2e7bc934c7b2d6f,Subsecuencias de ADN,0,0,1,0,0,0,1,1,0,1,1,1,567
3,171b5e86d4fb47268f2692587fbec073,NÃºmeros Primos,0,0,0,1,0,0,0,0,0,0,1,1,259
4,2437df93d3f44a87b00834072aeb1ab0,Nota Final,0,0,0,1,0,0,0,0,0,0,0,1,257


In [5]:
df_ejercicios = ejercicios.drop(labels=['nombre'], axis=1)
df_ejercicios.head()

Unnamed: 0,oid,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score
0,0973dae0e1b74ab8baa8d94339ee3ae6,0,0,0,1,0,0,0,1,0,1,1,1,279
1,16f619db31204ded9418136c4587ddd8,0,0,1,0,0,0,0,1,0,0,0,1,529
2,17022c9ceac94ec5b2e7bc934c7b2d6f,0,0,1,0,0,0,1,1,0,1,1,1,567
3,171b5e86d4fb47268f2692587fbec073,0,0,0,1,0,0,0,0,0,0,1,1,259
4,2437df93d3f44a87b00834072aeb1ab0,0,0,0,1,0,0,0,0,0,0,0,1,257


In [6]:
exercises_users = factorization_to_ratings(df_ejercicios, matrix_factorization)
exercises_users.head(10)

Unnamed: 0,rut,oid,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score
0,0,0973dae0e1b74ab8baa8d94339ee3ae6,0,0,0,1,0,0,0,1,0,1,1,1,279
1,0,16f619db31204ded9418136c4587ddd8,0,0,1,0,0,0,0,1,0,0,0,1,529
2,0,171b5e86d4fb47268f2692587fbec073,0,0,0,1,0,0,0,0,0,0,1,1,259
3,0,2437df93d3f44a87b00834072aeb1ab0,0,0,0,1,0,0,0,0,0,0,0,1,257
4,0,29f15ef8dc32426f945f64e28c910a57,0,0,1,0,0,0,0,1,0,0,1,1,531
5,0,46850a246d48484b8f104f8aab5679b6,0,0,0,1,0,0,0,0,0,0,1,1,259
6,0,718578451f3f4eca87437cadfe98d688,0,0,0,1,0,0,0,0,0,0,1,0,258
7,0,729d37da8f2d46f3af2d891df04949ef,0,0,0,1,0,0,0,0,0,0,1,1,259
8,0,7f60644b0a1b484681ae5c8e36166c58,0,0,0,1,0,0,0,1,0,0,0,1,273
9,0,80c61dae74fa4915bf272ab17dfa62ff,0,0,0,1,0,0,0,0,0,0,1,0,258


In [7]:
normalizador = MinMaxScaler()
exercises_users['score_normalized'] = normalizador.fit_transform(exercises_users[['score']])
exercises_users.drop(labels=['score'], axis=1, inplace=True)
exercises_users.head(10)

Unnamed: 0,rut,oid,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score_normalized
0,0,0973dae0e1b74ab8baa8d94339ee3ae6,0,0,0,1,0,0,0,1,0,1,1,1,0.064327
1,0,16f619db31204ded9418136c4587ddd8,0,0,1,0,0,0,0,1,0,0,0,1,0.795322
2,0,171b5e86d4fb47268f2692587fbec073,0,0,0,1,0,0,0,0,0,0,1,1,0.005848
3,0,2437df93d3f44a87b00834072aeb1ab0,0,0,0,1,0,0,0,0,0,0,0,1,0.0
4,0,29f15ef8dc32426f945f64e28c910a57,0,0,1,0,0,0,0,1,0,0,1,1,0.80117
5,0,46850a246d48484b8f104f8aab5679b6,0,0,0,1,0,0,0,0,0,0,1,1,0.005848
6,0,718578451f3f4eca87437cadfe98d688,0,0,0,1,0,0,0,0,0,0,1,0,0.002924
7,0,729d37da8f2d46f3af2d891df04949ef,0,0,0,1,0,0,0,0,0,0,1,1,0.005848
8,0,7f60644b0a1b484681ae5c8e36166c58,0,0,0,1,0,0,0,1,0,0,0,1,0.046784
9,0,80c61dae74fa4915bf272ab17dfa62ff,0,0,0,1,0,0,0,0,0,0,1,0,0.002924


In [8]:
# Dividir en conjuntos de entrenamiento y prueba
train_data, test_data = train_test_split(exercises_users, test_size=0.2, random_state=42)
train_data.head(10)

Unnamed: 0,rut,oid,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score_normalized
3519,273,718578451f3f4eca87437cadfe98d688,0,0,0,1,0,0,0,0,0,0,1,0,0.002924
6041,457,46850a246d48484b8f104f8aab5679b6,0,0,0,1,0,0,0,0,0,0,1,1,0.005848
8933,1209,52620b0c858a4c59bc324b65278d28bd,0,0,0,1,0,0,0,1,0,0,1,1,0.052632
8991,1213,171b5e86d4fb47268f2692587fbec073,0,0,0,1,0,0,0,0,0,0,1,1,0.005848
6426,949,7f60644b0a1b484681ae5c8e36166c58,0,0,0,1,0,0,0,1,0,0,0,1,0.046784
111,13,2437df93d3f44a87b00834072aeb1ab0,0,0,0,1,0,0,0,0,0,0,0,1,0.0
2692,209,0973dae0e1b74ab8baa8d94339ee3ae6,0,0,0,1,0,0,0,1,0,1,1,1,0.064327
9375,1253,80c61dae74fa4915bf272ab17dfa62ff,0,0,0,1,0,0,0,0,0,0,1,0,0.002924
2715,210,89f44e7f5842479fb283e43c52ce067b,0,0,0,1,0,0,0,0,0,0,0,1,0.0
7023,1015,171b5e86d4fb47268f2692587fbec073,0,0,0,1,0,0,0,0,0,0,1,1,0.005848


In [9]:
# Preparar datos de entrada y salida
X_train = train_data.drop(['rut', 'oid'], axis=1).values
Y_train = train_data['rut'].values

X_test = test_data.drop(['rut', 'oid'], axis=1).values
Y_test = test_data['rut'].values

In [10]:
# Definir el modelo
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='linear')  # salida de regresión
])

# Compilar el modelo
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')

# Resumen del modelo
model.summary()

2024-07-10 01:29:51.470453: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-10 01:29:51.503250: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-10 01:29:51.503311: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-10 01:29:51.505703: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-10 01:29:51.505782: I external/local_xla/xla/stream_executor

In [11]:
# Entrenar el modelo
history = model.fit(X_train, Y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/50


I0000 00:00:1720589392.460202  141961 service.cc:145] XLA service 0x7f303401fc00 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1720589392.460238  141961 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2024-07-10 01:29:52.479311: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-07-10 01:29:52.574591: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8902


[1m124/198[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 1ms/step - loss: 515113.7500

I0000 00:00:1720589393.972929  141961 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - loss: 513392.5938 - val_loss: 388944.2500
Epoch 2/50
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 329072.9062 - val_loss: 197448.3750
Epoch 3/50
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 208869.7344 - val_loss: 185429.8750
Epoch 4/50
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 202562.3438 - val_loss: 182539.1875
Epoch 5/50
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 194977.7969 - val_loss: 181346.8594
Epoch 6/50
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 187761.0781 - val_loss: 180640.6562
Epoch 7/50
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 191541.4844 - val_loss: 180240.9375
Epoch 8/50
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 195986

In [12]:
# Evaluación del modelo
loss = model.evaluate(X_test, Y_test)
print(f'Loss en el conjunto de prueba: {loss}')

# Utilizar el modelo para hacer predicciones
predictions = model.predict(X_test)

[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 175390.2344
Loss en el conjunto de prueba: 177644.046875
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


# PREDICCIONES

In [13]:
# Random User
random_user_id = random.randint(1, exercises_users['rut'].unique().max())
print(f'User: {random_user_id}')

User: 502


In [14]:
print(f'Ejercicios realizados por el usuario [ {random_user_id} ]')
items_interacted = exercises_users[exercises_users['rut'] == random_user_id]['oid'].values
df_ejercicios[df_ejercicios['oid'].isin(items_interacted)].sort_values(by='score', ascending=True)

Ejercicios realizados por el usuario [ 502 ]


Unnamed: 0,oid,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score


In [15]:
# Obtener ítems no interactuados
items_not_interacted = df_ejercicios[~df_ejercicios['oid'].isin(items_interacted)]

# Normalizar características numéricas
normalizar = MinMaxScaler()
items_not_interacted.loc[:, 'score'] = normalizar.fit_transform(items_not_interacted[['score']]).astype('float64')

In [16]:
# Predecir preferencias para ítems no interactuados
X_recommend = items_not_interacted.drop('oid', axis=1).values
predicted_preferences = model.predict(X_recommend).flatten()

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [17]:
# Ordenar ítems por preferencia predicha y recomendar los mejores
recommendations = items_not_interacted.copy()
recommendations['predicted_preference'] = predicted_preferences
recommendations = recommendations.sort_values(by='predicted_preference', ascending=False).head(10)

In [18]:
print('Recomendaciones para el usuario', random_user_id)
print(recommendations[['oid', 'predicted_preference']])

Recomendaciones para el usuario 502
                                 oid  predicted_preference
0   0973dae0e1b74ab8baa8d94339ee3ae6            651.049255
4   2437df93d3f44a87b00834072aeb1ab0            644.856018
25  89f44e7f5842479fb283e43c52ce067b            644.856018
7   31ea1c1b12174428b5a67a6576627de9            644.856018
17  718578451f3f4eca87437cadfe98d688            636.984558
23  80c61dae74fa4915bf272ab17dfa62ff            636.984558
22  7f60644b0a1b484681ae5c8e36166c58            635.807068
42  d8395f43e4a1454d90346ac5a1ba561a            597.986938
32  ac7382763e484d37908da54c076f7577            597.986938
29  a3963220090f4e50a266ce53d33b9841            597.986938


In [19]:
df_ejercicios[df_ejercicios['oid'].isin(recommendations['oid'].values)].sort_values(by='score', ascending=True)

Unnamed: 0,oid,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score
4,2437df93d3f44a87b00834072aeb1ab0,0,0,0,1,0,0,0,0,0,0,0,1,257
7,31ea1c1b12174428b5a67a6576627de9,0,0,0,1,0,0,0,0,0,0,0,1,257
25,89f44e7f5842479fb283e43c52ce067b,0,0,0,1,0,0,0,0,0,0,0,1,257
17,718578451f3f4eca87437cadfe98d688,0,0,0,1,0,0,0,0,0,0,1,0,258
23,80c61dae74fa4915bf272ab17dfa62ff,0,0,0,1,0,0,0,0,0,0,1,0,258
29,a3963220090f4e50a266ce53d33b9841,0,0,0,1,0,0,0,0,0,0,1,1,259
32,ac7382763e484d37908da54c076f7577,0,0,0,1,0,0,0,0,0,0,1,1,259
42,d8395f43e4a1454d90346ac5a1ba561a,0,0,0,1,0,0,0,0,0,0,1,1,259
22,7f60644b0a1b484681ae5c8e36166c58,0,0,0,1,0,0,0,1,0,0,0,1,273
0,0973dae0e1b74ab8baa8d94339ee3ae6,0,0,0,1,0,0,0,1,0,1,1,1,279
