In [1]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler


import warnings
warnings.filterwarnings('ignore')


# SEMILLA
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)

2024-07-06 02:43:29.571694: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-06 02:43:29.583638: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-06 02:43:29.597941: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-06 02:43:29.597974: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-06 02:43:29.606746: I tensorflow/core/platform/cpu_feature_gua

In [2]:
# RUTA 
DIR_DATA_LIMPIA = '../datos/data-limpia'
FACTORIZATION = 'matriz-de-factorizacion.csv'
EJERCICIOS = 'catalogo-de-ejercicios.csv'

# IMPORTAR DATA
ejercicios = pd.read_csv(f"{DIR_DATA_LIMPIA}/{EJERCICIOS}", sep=",", encoding="latin1")
matrix_factorization = pd.read_csv(f"{DIR_DATA_LIMPIA}/{FACTORIZATION}", sep=",", encoding="latin1")

In [3]:
def factorization_to_ratings(df_exercises: pd.DataFrame, df_matrix: pd.DataFrame)-> pd.DataFrame:
    rows = []
    for _, row_matrix in df_matrix.iterrows():
        user_id = row_matrix['rut']
        for exercise in df_matrix.columns[1:]:
            if row_matrix[exercise] == 1:
                row_exercise = df_exercises.iloc[int(exercise.lstrip('e'))]
                new_row = {'rut': user_id}
                new_row.update(row_exercise.to_dict())
                rows.append(new_row)
    df = pd.DataFrame(rows)      
    return df

In [4]:
ejercicios['score'] = ejercicios.iloc[:, 2:14].apply(lambda x: int(''.join(map(str, x)), 2), axis=1)
df_ejercicios = ejercicios.drop(labels=['nombre'], axis=1)
df_ejercicios

Unnamed: 0,oid,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score
0,0973dae0e1b74ab8baa8d94339ee3ae6,0,0,0,1,0,0,0,1,0,1,1,1,279
1,16f619db31204ded9418136c4587ddd8,0,0,1,0,0,0,0,1,0,0,0,1,529
2,17022c9ceac94ec5b2e7bc934c7b2d6f,0,0,1,0,0,0,1,1,0,1,1,1,567
3,171b5e86d4fb47268f2692587fbec073,0,0,0,1,0,0,0,0,0,0,1,1,259
4,2437df93d3f44a87b00834072aeb1ab0,0,0,0,1,0,0,0,0,0,0,0,1,257
5,24b87304a1eb4e95856da7f574a66fe8,1,0,0,0,1,0,0,0,0,1,1,1,2183
6,29f15ef8dc32426f945f64e28c910a57,0,0,1,0,0,0,0,1,0,0,1,1,531
7,31ea1c1b12174428b5a67a6576627de9,0,0,0,1,0,0,0,0,0,0,0,1,257
8,37e4f4a1e8174e9496d21b00d67fc8f1,0,0,1,0,0,0,1,1,0,1,1,1,567
9,3d7d9e64ac0846caadc08dccf2538f55,0,0,1,0,0,0,1,0,0,1,1,1,551


In [5]:
exercises_users = factorization_to_ratings(df_ejercicios, matrix_factorization)
exercises_users

Unnamed: 0,rut,oid,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score
0,0,0973dae0e1b74ab8baa8d94339ee3ae6,0,0,0,1,0,0,0,1,0,1,1,1,279
1,0,16f619db31204ded9418136c4587ddd8,0,0,1,0,0,0,0,1,0,0,0,1,529
2,0,171b5e86d4fb47268f2692587fbec073,0,0,0,1,0,0,0,0,0,0,1,1,259
3,0,2437df93d3f44a87b00834072aeb1ab0,0,0,0,1,0,0,0,0,0,0,0,1,257
4,0,29f15ef8dc32426f945f64e28c910a57,0,0,1,0,0,0,0,1,0,0,1,1,531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12720,1304,ac7382763e484d37908da54c076f7577,0,0,0,1,0,0,0,0,0,0,1,1,259
12721,1304,baf2f8e0167a4e089d2cec16582c9ae9,0,0,0,1,0,0,0,1,0,0,1,1,275
12722,1304,d8395f43e4a1454d90346ac5a1ba561a,0,0,0,1,0,0,0,0,0,0,1,1,259
12723,1305,80c61dae74fa4915bf272ab17dfa62ff,0,0,0,1,0,0,0,0,0,0,1,0,258


In [6]:
normalizador = MinMaxScaler()
exercises_users['score_normalized'] = normalizador.fit_transform(exercises_users[['score']])
exercises_users.drop(labels=['score'], axis=1, inplace=True)
exercises_users

Unnamed: 0,rut,oid,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score_normalized
0,0,0973dae0e1b74ab8baa8d94339ee3ae6,0,0,0,1,0,0,0,1,0,1,1,1,0.064327
1,0,16f619db31204ded9418136c4587ddd8,0,0,1,0,0,0,0,1,0,0,0,1,0.795322
2,0,171b5e86d4fb47268f2692587fbec073,0,0,0,1,0,0,0,0,0,0,1,1,0.005848
3,0,2437df93d3f44a87b00834072aeb1ab0,0,0,0,1,0,0,0,0,0,0,0,1,0.000000
4,0,29f15ef8dc32426f945f64e28c910a57,0,0,1,0,0,0,0,1,0,0,1,1,0.801170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12720,1304,ac7382763e484d37908da54c076f7577,0,0,0,1,0,0,0,0,0,0,1,1,0.005848
12721,1304,baf2f8e0167a4e089d2cec16582c9ae9,0,0,0,1,0,0,0,1,0,0,1,1,0.052632
12722,1304,d8395f43e4a1454d90346ac5a1ba561a,0,0,0,1,0,0,0,0,0,0,1,1,0.005848
12723,1305,80c61dae74fa4915bf272ab17dfa62ff,0,0,0,1,0,0,0,0,0,0,1,0,0.002924


In [7]:
# Dividir en conjuntos de entrenamiento y prueba
train_data, test_data = train_test_split(exercises_users, test_size=0.2, random_state=42)
train_data

Unnamed: 0,rut,oid,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score_normalized
9129,930,171b5e86d4fb47268f2692587fbec073,0,0,0,1,0,0,0,0,0,0,1,1,0.005848
9500,975,718578451f3f4eca87437cadfe98d688,0,0,0,1,0,0,0,0,0,0,1,0,0.002924
4700,352,0973dae0e1b74ab8baa8d94339ee3ae6,0,0,0,1,0,0,0,1,0,1,1,1,0.064327
3035,228,db7987d040dc469a9c247d54dd72939a,0,0,1,0,0,0,0,1,0,0,1,1,0.801170
1011,78,46850a246d48484b8f104f8aab5679b6,0,0,0,1,0,0,0,0,0,0,1,1,0.005848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11964,1225,718578451f3f4eca87437cadfe98d688,0,0,0,1,0,0,0,0,0,0,1,0,0.002924
5191,385,8f24397e36034cccb71e9d578975c33d,0,0,0,1,0,0,0,0,0,0,1,1,0.005848
5390,398,0973dae0e1b74ab8baa8d94339ee3ae6,0,0,0,1,0,0,0,1,0,1,1,1,0.064327
860,70,9e02c76901da440ba1b70f0a75d8ae89,0,0,1,0,0,0,1,0,0,1,1,1,0.859649


In [8]:
# Preparar datos de entrada y salida
X_train = train_data.drop(['rut', 'oid'], axis=1).values
Y_train = train_data['rut'].values

X_test = test_data.drop(['rut', 'oid'], axis=1).values
Y_test = test_data['rut'].values

In [11]:
# Definir el modelo
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='linear')  # salida de regresión
])

# Compilar el modelo
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')

# Resumen del modelo
model.summary()

In [12]:
# Entrenar el modelo
history = model.fit(X_train, Y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/50


I0000 00:00:1720248340.481931  130334 service.cc:145] XLA service 0x7f0fcc003970 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1720248340.481969  130334 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2024-07-06 02:45:40.498322: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-07-06 02:45:40.574506: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8902


[1m144/255[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 1ms/step - loss: 508728.2188

I0000 00:00:1720248342.197391  130334 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - loss: 492270.5625 - val_loss: 250389.5000
Epoch 2/50
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 202411.1250 - val_loss: 160530.0469
Epoch 3/50
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 168095.0781 - val_loss: 155704.7812
Epoch 4/50
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 160849.3281 - val_loss: 154273.0156
Epoch 5/50
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 161592.2812 - val_loss: 153331.7969
Epoch 6/50
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 162396.0156 - val_loss: 152631.5156
Epoch 7/50
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 159528.0156 - val_loss: 151910.0938
Epoch 8/50
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 161757

In [13]:
# Evaluación del modelo
loss = model.evaluate(X_test, Y_test)
print(f'Loss en el conjunto de prueba: {loss}')

# Utilizar el modelo para hacer predicciones
predictions = model.predict(X_test)

[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 144914.0156
Loss en el conjunto de prueba: 146298.4375
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


# PREDICCIONES

In [14]:
# Random User
random_user_id = random.randint(1, exercises_users['rut'].unique().max())
print(f'User: {random_user_id}')

User: 1117


In [15]:
print(f'Ejercicios realizados por el usuario [ {random_user_id} ]')
items_interacted = exercises_users[exercises_users['rut'] == random_user_id]['oid'].values
df_ejercicios[df_ejercicios['oid'].isin(items_interacted)]

Ejercicios realizados por el usuario [ 1117 ]


Unnamed: 0,oid,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score


In [16]:
# Obtener ítems no interactuados
items_not_interacted = df_ejercicios[~df_ejercicios['oid'].isin(items_interacted)]

# Normalizar características numéricas
normalizar = MinMaxScaler()
items_not_interacted.loc[:, 'score'] = normalizar.fit_transform(items_not_interacted[['score']]).astype('float64')

In [17]:
# Predecir preferencias para ítems no interactuados
X_recommend = items_not_interacted.drop('oid', axis=1).values
predicted_preferences = model.predict(X_recommend).flatten()

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 338ms/step


In [18]:
# Ordenar ítems por preferencia predicha y recomendar los mejores
recommendations = items_not_interacted.copy()
recommendations['predicted_preference'] = predicted_preferences
recommendations = recommendations.sort_values(by='predicted_preference', ascending=False).head(10)

In [19]:
print('Recomendaciones para el usuario', random_user_id)
print(recommendations[['oid', 'predicted_preference']])

Recomendaciones para el usuario 1117
                                 oid  predicted_preference
4   2437df93d3f44a87b00834072aeb1ab0            650.973511
25  89f44e7f5842479fb283e43c52ce067b            650.973511
7   31ea1c1b12174428b5a67a6576627de9            650.973511
17  718578451f3f4eca87437cadfe98d688            646.950500
23  80c61dae74fa4915bf272ab17dfa62ff            646.950500
22  7f60644b0a1b484681ae5c8e36166c58            635.105103
0   0973dae0e1b74ab8baa8d94339ee3ae6            634.031250
42  d8395f43e4a1454d90346ac5a1ba561a            592.121948
32  ac7382763e484d37908da54c076f7577            592.121948
29  a3963220090f4e50a266ce53d33b9841            592.121948


In [20]:
df_ejercicios[df_ejercicios['oid'].isin(recommendations['oid'].values)]

Unnamed: 0,oid,h4,h3,h2,h1,s4,s3,s2,s1,k4,k3,k2,k1,score
0,0973dae0e1b74ab8baa8d94339ee3ae6,0,0,0,1,0,0,0,1,0,1,1,1,279
4,2437df93d3f44a87b00834072aeb1ab0,0,0,0,1,0,0,0,0,0,0,0,1,257
7,31ea1c1b12174428b5a67a6576627de9,0,0,0,1,0,0,0,0,0,0,0,1,257
17,718578451f3f4eca87437cadfe98d688,0,0,0,1,0,0,0,0,0,0,1,0,258
22,7f60644b0a1b484681ae5c8e36166c58,0,0,0,1,0,0,0,1,0,0,0,1,273
23,80c61dae74fa4915bf272ab17dfa62ff,0,0,0,1,0,0,0,0,0,0,1,0,258
25,89f44e7f5842479fb283e43c52ce067b,0,0,0,1,0,0,0,0,0,0,0,1,257
29,a3963220090f4e50a266ce53d33b9841,0,0,0,1,0,0,0,0,0,0,1,1,259
32,ac7382763e484d37908da54c076f7577,0,0,0,1,0,0,0,0,0,0,1,1,259
42,d8395f43e4a1454d90346ac5a1ba561a,0,0,0,1,0,0,0,0,0,0,1,1,259
