<a href="https://colab.research.google.com/github/emarod/Tesis/blob/main/tesis_yoochoose_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import networkx as nx
import numpy as np

In [None]:
# Run configuration, kept in a plain dict that the later cells index by key.
opt = {}
opt["dataset"] = "yoochoose_1_64_weights_pandas"  # sub-folder of the data directory
opt["batchSize"] = 100      # mini-batch size handed to Data.generate_batch
opt["hiddenSize"] = 100     # embedding / hidden state width
opt["epoch"] = 30           # maximum number of training epochs
opt["lr"] = 0.001           # Adam learning rate
opt["lr_dc"] = 0.1          # StepLR gamma (learning-rate decay factor)
opt["lr_dc_step"] = 3       # StepLR step_size (epochs between decays)
opt["l2"] = 1e-5            # Adam weight_decay
opt["step"] = 1             # number of GNN propagation steps
opt["patience"] = 10        # non-improving epochs tolerated before stopping
# This used to be the string "store_true" (an argparse *action* name pasted as
# a value). A non-empty string is truthy, so the model always ran in
# non-hybrid mode; the explicit boolean keeps that behaviour and makes it
# visible. Set to False to enable the `linear_transform` (hybrid) branch.
opt["nonhybrid"] = True
opt["validation"] = False   # True: evaluate on a split of the training data
opt["valid_portion"] = 0.1  # fraction of training samples held out when validating
opt["seed"] = 42            # random seed value

In [None]:
# Display the dataset name currently selected in the configuration dict.
opt["dataset"]

'yoochoose_25_weights_pandas_whole_dataset'

In [None]:
def data_masks(all_usr_pois, item_tail):
    """Right-pad every sequence to the longest length and build 0/1 masks.

    Args:
        all_usr_pois: list of sequences (lists) to pad.
        item_tail: one-element list holding the padding value, e.g. ``[0]``;
            it is repeated and appended to each short sequence.

    Returns:
        A tuple ``(us_pois, us_msks, len_max)``: the padded sequences, masks
        with 1 on real positions and 0 on padding, and the padded length.
        An empty ``all_usr_pois`` yields ``([], [], 0)``.
    """
    us_lens = [len(upois) for upois in all_usr_pois]
    # max() raises ValueError on an empty list; treat "no sequences" as length 0.
    len_max = max(us_lens, default=0)
    us_pois = [upois + item_tail * (len_max - le) for upois, le in zip(all_usr_pois, us_lens)]
    us_msks = [[1] * le + [0] * (len_max - le) for le in us_lens]
    return us_pois, us_msks, len_max

In [None]:
def split_validation(train_set, valid_portion):
    """Randomly split parallel sequences into a training and a validation part.

    The original version unpacked exactly two sequences ``(x, y)``. The
    ``Data`` class in this notebook also reads ``data[2]``, so a third
    sequence could never pass through the split. Any number of equally long
    parallel sequences is now accepted and split with the same permutation.

    Args:
        train_set: tuple/list of parallel sequences; the first one defines
            the number of samples.
        valid_portion: fraction of samples assigned to the validation part.

    Returns:
        ``(train_part, valid_part)``, each a tuple with one list per input
        sequence, in the same order as ``train_set``.
    """
    columns = tuple(train_set)
    n_samples = len(columns[0])
    sidx = np.arange(n_samples, dtype='int32')
    np.random.shuffle(sidx)
    n_train = int(np.round(n_samples * (1. - valid_portion)))
    train_idx, valid_idx = sidx[:n_train], sidx[n_train:]
    # Index every column with the same shuffled positions to keep rows aligned.
    train_part = tuple([column[s] for s in train_idx] for column in columns)
    valid_part = tuple([column[s] for s in valid_idx] for column in columns)
    return train_part, valid_part

In [None]:
class Data():
    """Padded session data plus helpers to batch it and build session graphs.

    ``data[0]`` holds the input sequences, ``data[1]`` the targets and
    ``data[2]`` a per-position time value for every input sequence (the dwell
    time of each click, going by the original comments).
    """

    def __init__(self, data, shuffle=False, graph=None):
        """Pad inputs and times with 0 and store everything as numpy arrays."""
        padded_times, _, _ = data_masks(data[2], [0])
        padded_inputs, mask, len_max = data_masks(data[0], [0])  # padding value is item id 0
        self.times = np.asarray(padded_times)
        self.inputs = np.asarray(padded_inputs)
        self.mask = np.asarray(mask)
        self.len_max = len_max
        self.targets = np.asarray(data[1])
        self.length = len(padded_inputs)
        self.shuffle = shuffle
        self.graph = graph

    def generate_batch(self, batch_size):
        """Return a list of index arrays, one per mini-batch.

        When ``self.shuffle`` is set, the stored arrays are permuted in place
        (all with the same permutation) before the indices are produced.
        The last batch is shortened when ``length`` is not a multiple of
        ``batch_size``.
        """
        if self.shuffle:
            order = np.arange(self.length)
            np.random.shuffle(order)
            self.inputs = self.inputs[order]
            self.mask = self.mask[order]
            self.targets = self.targets[order]
            self.times = self.times[order]  # keep times aligned with inputs
        n_batch, leftover = divmod(self.length, batch_size)
        if leftover != 0:
            n_batch += 1
        slices = np.split(np.arange(n_batch * batch_size), n_batch)
        # Drop the indices of the final batch that run past the data length.
        last_size = self.length - batch_size * (n_batch - 1)
        slices[-1] = slices[-1][:last_size]
        return slices

    def get_slice(self, i):
        """Build the graph inputs for the rows selected by ``i``.

        Returns ``(alias_inputs, A, items, mask, targets)`` where, per row:
        ``items`` are the sorted unique ids padded with 0 to a common size,
        ``alias_inputs`` maps each position to its index in ``items``, and
        ``A`` is the column-normalised incoming and row-normalised outgoing
        weighted adjacency, concatenated and transposed.
        """
        inputs = self.inputs[i]
        mask = self.mask[i]
        targets = self.targets[i]
        times = self.times[i]

        # All graphs in the batch are padded to the largest unique-id count.
        max_n_node = np.max([len(np.unique(session)) for session in inputs])

        items, A, alias_inputs = [], [], []
        for session, session_times in zip(inputs, times):
            node = np.unique(session)

            # Time value attached to each unique item (padding id 0 skipped).
            # When an item repeats, the value of its last occurrence wins,
            # because the adjacency matrix is built over unique nodes.
            time_of_item = {
                item_id: time_val
                for item_id, time_val in zip(session, session_times)
                if item_id != 0
            }

            items.append(node.tolist() + [0] * (max_n_node - len(node)))

            adjacency = np.zeros((max_n_node, max_n_node))
            for pos in range(len(session) - 1):
                if session[pos + 1] == 0:
                    break  # reached padding: no more transitions
                src = np.where(node == session[pos])[0][0]
                dst = np.where(node == session[pos + 1])[0][0]
                # Edge weight is the time value of the source item.
                adjacency[src][dst] = time_of_item[session[pos]]

            # Normalise by column sums (incoming) and row sums (outgoing);
            # zero sums are replaced by 1 to avoid dividing by zero.
            in_totals = np.sum(adjacency, 0)
            in_totals[np.where(in_totals == 0)] = 1
            out_totals = np.sum(adjacency, 1)
            out_totals[np.where(out_totals == 0)] = 1
            adjacency_in = np.divide(adjacency, in_totals)
            adjacency_out = np.divide(adjacency.transpose(), out_totals)
            A.append(np.concatenate([adjacency_in, adjacency_out]).transpose())

            alias_inputs.append([np.where(node == item_id)[0][0] for item_id in session])
        return alias_inputs, A, items, mask, targets

In [None]:
import datetime
import math
import numpy as np
import torch
from torch import nn
from torch.nn import Module, Parameter
import torch.nn.functional as F

In [None]:
class GNN(Module):
    """Gated graph propagation layer applied ``step`` times to node states.

    The raw parameters are created uninitialised; in this notebook they are
    filled by ``SessionGraph.reset_parameters``.
    """

    def __init__(self, hidden_size, step=1):
        super().__init__()
        self.step = step
        self.hidden_size = hidden_size
        self.input_size = 2 * hidden_size   # incoming and outgoing messages concatenated
        self.gate_size = 3 * hidden_size    # reset, input and new gate stacked

        # Registration order is kept as-is: it fixes the order in which
        # parameters() yields them to any initialiser or optimizer.
        self.w_ih = Parameter(torch.Tensor(self.gate_size, self.input_size))
        self.w_hh = Parameter(torch.Tensor(self.gate_size, self.hidden_size))
        self.b_ih = Parameter(torch.Tensor(self.gate_size))
        self.b_hh = Parameter(torch.Tensor(self.gate_size))
        self.b_iah = Parameter(torch.Tensor(self.hidden_size))
        self.b_oah = Parameter(torch.Tensor(self.hidden_size))

        self.linear_edge_in = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
        self.linear_edge_out = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
        self.linear_edge_f = nn.Linear(self.hidden_size, self.hidden_size, bias=True)

    def GNNCell(self, A, hidden):
        """Run one propagation step and return the updated node states.

        ``A[:, :, :n]`` weights the incoming messages and ``A[:, :, n:2n]``
        the outgoing ones, with ``n = A.shape[1]``.
        """
        n = A.shape[1]
        adjacency_in = A[:, :, :n]
        adjacency_out = A[:, :, n: 2 * n]
        input_in = torch.matmul(adjacency_in, self.linear_edge_in(hidden)) + self.b_iah
        input_out = torch.matmul(adjacency_out, self.linear_edge_out(hidden)) + self.b_oah
        messages = torch.cat([input_in, input_out], 2)

        gi = F.linear(messages, self.w_ih, self.b_ih)
        gh = F.linear(hidden, self.w_hh, self.b_hh)
        i_r, i_i, i_n = gi.chunk(3, 2)
        h_r, h_i, h_n = gh.chunk(3, 2)

        resetgate = torch.sigmoid(i_r + h_r)
        inputgate = torch.sigmoid(i_i + h_i)
        newgate = torch.tanh(i_n + resetgate * h_n)
        return newgate + inputgate * (hidden - newgate)

    def forward(self, A, hidden):
        """Apply ``GNNCell`` ``self.step`` times."""
        for _ in range(self.step):
            hidden = self.GNNCell(A, hidden)
        return hidden


class SessionGraph(Module):
    """Session model: item embedding, GNN propagation and attention scoring.

    The instance also owns its loss function, Adam optimizer and StepLR
    scheduler, all configured from the ``opt`` dict.
    """

    def __init__(self, opt, n_node):
        super().__init__()
        self.hidden_size = opt["hiddenSize"]
        self.n_node = n_node
        self.batch_size = opt["batchSize"]
        self.nonhybrid = opt["nonhybrid"]

        # Sub-module creation order is preserved; it determines the order
        # in which parameters() yields them.
        self.embedding = nn.Embedding(self.n_node, self.hidden_size)
        self.gnn = GNN(self.hidden_size, step=opt["step"])
        self.linear_one = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
        self.linear_two = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
        self.linear_three = nn.Linear(self.hidden_size, 1, bias=False)
        self.linear_transform = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=True)

        self.loss_function = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(
            self.parameters(), lr=opt["lr"], weight_decay=opt["l2"]
        )
        self.scheduler = torch.optim.lr_scheduler.StepLR(
            self.optimizer, step_size=opt["lr_dc_step"], gamma=opt["lr_dc"]
        )
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw every parameter uniformly from ``[-1/sqrt(h), 1/sqrt(h)]``."""
        bound = 1.0 / math.sqrt(self.hidden_size)
        for param in self.parameters():
            param.data.uniform_(-bound, bound)

    def compute_scores(self, hidden, mask):
        """Score every item except embedding row 0 for each sequence.

        ``hidden`` holds per-position states and ``mask`` the matching 0/1
        mask. The state at the last unmasked position is the attention
        query; the masked, attention-weighted sum of states is the session
        vector. When ``self.nonhybrid`` is falsy, that vector is combined
        with the last state through ``linear_transform``.
        """
        row_index = torch.arange(mask.shape[0]).long()
        last_position = torch.sum(mask, 1) - 1
        ht = hidden[row_index, last_position]  # batch_size x latent_size

        q1 = self.linear_one(ht).view(ht.shape[0], 1, ht.shape[1])  # batch_size x 1 x latent_size
        q2 = self.linear_two(hidden)  # batch_size x seq_length x latent_size
        alpha = self.linear_three(torch.sigmoid(q1 + q2))

        valid = mask.view(mask.shape[0], -1, 1).float()
        a = torch.sum(alpha * hidden * valid, 1)
        if not self.nonhybrid:
            a = self.linear_transform(torch.cat([a, ht], 1))

        candidates = self.embedding.weight[1:]  # n_nodes x latent_size
        return torch.matmul(a, candidates.transpose(1, 0))

    def forward(self, inputs, A):
        """Embed ``inputs`` and propagate the embeddings through the GNN."""
        embedded = self.embedding(inputs)
        return self.gnn(A, embedded)


def trans_to_cuda(variable):
    """Return ``variable.cuda()`` when CUDA is available, else ``variable`` unchanged."""
    return variable.cuda() if torch.cuda.is_available() else variable


def trans_to_cpu(variable):
    """Return ``variable.cpu()`` when CUDA is available, else ``variable`` unchanged."""
    return variable.cpu() if torch.cuda.is_available() else variable


def forward(model, i, data):
    """Run ``model`` on the batch selected by index array ``i``.

    Args:
        model: module called as ``model(items, A)`` that also provides
            ``compute_scores(seq_hidden, mask)``.
        i: index array of the rows to take from ``data``.
        data: object whose ``get_slice(i)`` returns
            ``(alias_inputs, A, items, mask, targets)``.

    Returns:
        ``(targets, scores)``: the targets exactly as returned by
        ``get_slice`` and the score tensor from ``compute_scores``.
    """
    alias_inputs, A, items, mask, targets = data.get_slice(i)
    # Stack into a single ndarray before handing over to torch: building a
    # tensor from a Python list of ndarrays is very slow and emits a warning.
    # Integer inputs are created directly as int64 rather than going through
    # a float32 tensor followed by .long().
    alias_inputs = trans_to_cuda(torch.as_tensor(np.asarray(alias_inputs), dtype=torch.long))
    items = trans_to_cuda(torch.as_tensor(np.asarray(items), dtype=torch.long))
    A = trans_to_cuda(torch.as_tensor(np.asarray(A), dtype=torch.float))
    mask = trans_to_cuda(torch.as_tensor(np.asarray(mask), dtype=torch.long))
    hidden = model(items, A)
    # seq_hidden[b, t] = hidden[b, alias_inputs[b, t]], gathered in one
    # indexing operation instead of a per-row Python loop plus torch.stack.
    batch_rows = torch.arange(alias_inputs.shape[0], device=alias_inputs.device).unsqueeze(1)
    seq_hidden = hidden[batch_rows, alias_inputs]
    return targets, model.compute_scores(seq_hidden, mask)


def train_test(model, train_data, test_data):
    """Train ``model`` for one epoch, then evaluate it on ``test_data``.

    Args:
        model: module exposing ``optimizer``, ``scheduler``,
            ``loss_function`` and ``batch_size``.
        train_data: batch provider used for the training pass.
        test_data: batch provider used for the evaluation pass.

    Returns:
        ``(hit, mrr)``: the percentage of samples whose target is among the
        top-20 scores, and the mean reciprocal rank of the target within
        those top-20 (0 when absent), also as a percentage.
    """
    print('start training: ', datetime.datetime.now())
    model.train()
    total_loss = 0.0
    slices = train_data.generate_batch(model.batch_size)
    log_every = int(len(slices) / 5 + 1)
    for j, i in enumerate(slices):
        model.optimizer.zero_grad()
        targets, scores = forward(model, i, train_data)
        targets = trans_to_cuda(torch.as_tensor(np.asarray(targets), dtype=torch.long))
        # Targets are shifted by one because scores skip embedding row 0.
        loss = model.loss_function(scores, targets - 1)
        loss.backward()
        model.optimizer.step()
        # Accumulate a Python float: adding the tensor itself keeps autograd
        # history alive and triggers a warning when it is formatted below.
        total_loss += loss.item()
        if j % log_every == 0:
            print('[%d/%d] Loss: %.4f' % (j, len(slices), loss.item()))
    print('\tLoss:\t%.3f' % total_loss)
    # Step the LR scheduler after the epoch's optimizer steps. Stepping it
    # first (as before) makes the decay kick in one epoch too early.
    model.scheduler.step()

    print('start predicting: ', datetime.datetime.now())
    model.eval()
    hit, mrr = [], []
    slices = test_data.generate_batch(model.batch_size)
    with torch.no_grad():  # evaluation needs no gradient bookkeeping
        for i in slices:
            targets, scores = forward(model, i, test_data)
            sub_scores = scores.topk(20)[1]
            sub_scores = trans_to_cpu(sub_scores).detach().numpy()
            for score, target in zip(sub_scores, targets):
                hit.append(np.isin(target - 1, score))
                ranks = np.where(score == target - 1)[0]
                if len(ranks) == 0:
                    mrr.append(0)
                else:
                    mrr.append(1 / (ranks[0] + 1))
    hit = np.mean(hit) * 100
    mrr = np.mean(mrr) * 100
    return hit, mrr

In [None]:
import pickle
import time

In [None]:
# Mount Google Drive at /content/drive (google.colab is only importable
# inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Después de montar Google Drive, puedes navegar por tus archivos. La siguiente celda lista el contenido de la carpeta del dataset dentro de tu Drive para comprobar que los archivos necesarios están disponibles:

In [None]:
import os
# List the files of the dataset folder selected in `opt`. The folder name is
# taken from the configuration instead of being hard-coded, so this check
# always inspects the dataset that will actually be loaded.
sorted(os.listdir(os.path.join('/content/drive/My Drive/Tesis/data', opt["dataset"])))

['all_train_seq.txt', 'test.txt', 'train.txt']

Una vez que hayas localizado los archivos del dataset (por ejemplo, `train.txt` y `test.txt`), define la ruta base de la carpeta de datos en tu Drive; más adelante los archivos se cargan con `pickle` a partir de esa ruta:

In [None]:
# Base folder on the mounted Drive; the dataset name from `opt` is appended
# to it when the pickled files are opened.
data_path = "/content/drive/My Drive/Tesis/data/"

In [None]:
# Report whether a CUDA device is available.
# NOTE(review): `device` is not referenced by the other visible cells, which
# move tensors with trans_to_cuda / trans_to_cpu instead.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device", device)

Device cuda


In [None]:
# Apply the configured seed before anything random happens (batch shuffling
# uses np.random, parameter initialisation uses torch) so runs are repeatable.
np.random.seed(opt["seed"])
torch.manual_seed(opt["seed"])

dataset_dir = data_path + opt["dataset"]

# NOTE: pickle.load can execute arbitrary code; only open files you created.
# The files are opened with `with` so the handles are closed after loading.
with open(dataset_dir + '/train.txt', 'rb') as f:
    train_data = pickle.load(f)
if opt["validation"]:
    # Evaluate on a held-out part of the training data.
    train_data, valid_data = split_validation(train_data, opt["valid_portion"])
    test_data = valid_data
else:
    with open(dataset_dir + '/test.txt', 'rb') as f:
        test_data = pickle.load(f)

train_data = Data(train_data, shuffle=True)
test_data = Data(test_data, shuffle=False)

# Size of the embedding table for each known dataset name.
if opt["dataset"] in ('diginetica_25', 'diginetica_25_weights'):
    n_node = 43098
elif opt["dataset"] in ('yoochoose_25_weights', 'yoochoose1_4', "yoochoose_25_weights_pandas_whole_dataset"):
    n_node = 37484
else:
    # The fallback used to assign 310 and immediately overwrite it with
    # 43098; only the value that actually took effect is kept.
    n_node = 43098

model = trans_to_cuda(SessionGraph(opt, n_node))

In [None]:
# Epoch loop: train and evaluate once per epoch, remember the best Recall@20
# and MRR@20 seen so far, and stop once the number of epochs that improved
# neither metric reaches the configured patience.
print("Starting Trainning")
start = time.time()
best_result = [0, 0]   # [best hit, best mrr]
best_epoch = [0, 0]    # epoch at which each best value was reached
bad_counter = 0        # epochs without any improvement (never reset)
separator = '-------------------------------------------------------'
for epoch in range(opt["epoch"]):
    print(separator)
    print('epoch: ', epoch)
    hit, mrr = train_test(model, train_data, test_data)
    flag = 0
    # Slot 0 tracks hit, slot 1 tracks mrr; ties count as an improvement.
    for slot, metric in enumerate((hit, mrr)):
        if metric >= best_result[slot]:
            best_result[slot] = metric
            best_epoch[slot] = epoch
            flag = 1
    print('Best Result:')
    print('\tRecall@20:\t%.4f\tMMR@20:\t%.4f\tEpoch:\t%d /,\t%d'% (best_result[0], best_result[1], epoch, opt["epoch"]))
    if not flag:
        bad_counter += 1
    if bad_counter >= opt["patience"]:
        print("Program stopped by patiente parameter")
        break
print(separator)
end = time.time()
print("Run time: %f s" % (end - start))

Starting Trainning
-------------------------------------------------------
epoch:  0
start training:  2025-12-03 17:55:00.224618


  A = trans_to_cuda(torch.Tensor(A).float())


[0/3699] Loss: 10.6727
[740/3699] Loss: 5.4994
[1480/3699] Loss: 5.0951
[2220/3699] Loss: 5.2759
[2960/3699] Loss: 4.6868


Consider using tensor.detach() first. (Triggered internally at /pytorch/torch/csrc/autograd/generated/python_variable_methods.cpp:836.)
  print('\tLoss:\t%.3f' % total_loss)


	Loss:	20388.629
start predicting:  2025-12-03 18:04:04.966533
Best Result:
	Recall@20:	68.2529	MMR@20:	28.0152	Epoch:	0 /,	30
-------------------------------------------------------
epoch:  1
start training:  2025-12-03 18:04:34.329346
[0/3699] Loss: 4.1727
[740/3699] Loss: 4.2363
[1480/3699] Loss: 4.5280
[2220/3699] Loss: 4.5006
[2960/3699] Loss: 4.3483
	Loss:	16379.955
start predicting:  2025-12-03 18:13:38.685412
Best Result:
	Recall@20:	69.3477	MMR@20:	29.0453	Epoch:	1 /,	30
-------------------------------------------------------
epoch:  2
start training:  2025-12-03 18:14:07.496721
[0/3699] Loss: 4.6472
[740/3699] Loss: 3.6813
[1480/3699] Loss: 3.9340
[2220/3699] Loss: 3.8919
[2960/3699] Loss: 3.8106
	Loss:	14518.952
start predicting:  2025-12-03 18:23:13.088065
Best Result:
	Recall@20:	70.4981	MMR@20:	29.7066	Epoch:	2 /,	30
-------------------------------------------------------
epoch:  3
start training:  2025-12-03 18:23:41.867592
[0/3699] Loss: 4.5071
[740/3699] Loss: 3.8002
[