In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

plt.style.use('dark_background')

In [2]:
D = np.load('D_matrix.npy', mmap_mode='r')
D.shape

(27445, 18)

In [3]:
import torch
from abc import abstractmethod


class BaseLabelPropagation:
    """Base class for label propagation models.
    
    Parameters
    ----------
    adj_matrix: torch.FloatTensor
        Adjacency matrix of the graph.
    """
    def __init__(self, adj_matrix):
        self.norm_adj_matrix = self._normalize(adj_matrix)
        self.n_nodes = adj_matrix.size(0)
        self.one_hot_labels = None 
        self.n_classes = None
        self.labeled_mask = None
        self.predictions = None

    @staticmethod
    @abstractmethod
    def _normalize(adj_matrix):
        raise NotImplementedError("_normalize must be implemented")

    @abstractmethod
    def _propagate(self):
        raise NotImplementedError("_propagate must be implemented")

    def _one_hot_encode(self, labels):        
        # Get the number of classes
        classes = torch.arange(0, 11)
        classes = classes[classes != -1]
        self.n_classes = classes.size(0)

        # One-hot encode labeled data instances and zero rows corresponding to unlabeled instances
        unlabeled_mask = (labels == -1)
        labels = labels.clone()  # defensive copying
        labels[unlabeled_mask] = 0
        self.one_hot_labels = torch.zeros((self.n_nodes, self.n_classes), dtype=torch.float)
        
        self.one_hot_labels = self.one_hot_labels.scatter(1, labels.unsqueeze(1), 1)
        self.one_hot_labels[unlabeled_mask, 0] = 0

        self.labeled_mask = ~unlabeled_mask

    def fit(self, labels, max_iter, tol):
        """Fits a semi-supervised learning label propagation model.
        
        labels: torch.LongTensor
            Tensor of size n_nodes indicating the class number of each node.
            Unlabeled nodes are denoted with -1.
        max_iter: int
            Maximum number of iterations allowed.
        tol: float
            Convergence tolerance: threshold to consider the system at steady state.
        """
        self._one_hot_encode(labels)

        self.predictions = self.one_hot_labels.clone()
        prev_predictions = torch.zeros((self.n_nodes, self.n_classes), dtype=torch.float)

        for i in range(max_iter):
            # Stop iterations if the system is considered at a steady state
            variation = torch.abs(self.predictions - prev_predictions).sum().item()
            
            if variation < tol:
                print(f"The method stopped after {i} iterations, variation={variation:.4f}.")
                break

            prev_predictions = self.predictions
            self._propagate()

    def predict(self):
        return self.predictions

    def predict_classes(self):
        return self.predictions.max(dim=1).indices

In [4]:
class LabelPropagation(BaseLabelPropagation):
    def __init__(self, adj_matrix):
        super().__init__(adj_matrix)

    @staticmethod
    def _normalize(adj_matrix):
        """Computes D^-1 * W"""
        degs = adj_matrix.sum(dim=1)
        degs[degs == 0] = 1  # avoid division by 0 error
        return adj_matrix / degs[:, None]

    def _propagate(self):
        self.predictions = torch.matmul(self.norm_adj_matrix, self.predictions)

        # Put back already known labels
        self.predictions[self.labeled_mask] = self.one_hot_labels[self.labeled_mask]

    def fit(self, labels, max_iter=1000, tol=1e-3):
        super().fit(labels, max_iter, tol)

In [5]:
class LabelSpreading(BaseLabelPropagation):
    def __init__(self, adj_matrix):
        super().__init__(adj_matrix)
        self.alpha = None

    @staticmethod
    def _normalize(adj_matrix):
        """Computes D^-1/2 * W * D^-1/2"""
        degs = adj_matrix.sum(dim=1)
        norm = torch.pow(degs, -0.5)
        norm[torch.isinf(norm)] = 1
        return adj_matrix * norm[:, None] * norm[None, :]

    def _propagate(self):
        self.predictions = (
            self.alpha * torch.matmul(self.norm_adj_matrix, self.predictions)
            + (1 - self.alpha) * self.one_hot_labels
        )
        #self.predictions[self.labeled_mask] = self.one_hot_labels[self.labeled_mask]
    
    def fit(self, labels, max_iter=1000, tol=1e-3, alpha=0.5):
        """
        Parameters
        ----------
        alpha: float
            Clamping factor.
        """
        self.alpha = alpha
        super().fit(labels, max_iter, tol)

In [6]:
num_users = int(0.05 * D.shape[0])
num_users

1372

In [7]:
import networkx as nx

def initialize_population(n, p=0.1, seed=42):
    population = []
    
    for i in range(n):
        G = nx.erdos_renyi_graph(n=num_users, p=p, directed=False, seed=seed+i)
        W = nx.to_numpy_array(G, dtype=np.float32)
        W = W / W.sum(axis=1, keepdims=True)
        W = np.maximum(W, W.T)
        np.fill_diagonal(W, 0.)
        population.append(W)
    
    return population

In [9]:
population = initialize_population(n=50)
population

[array([[0.        , 0.        , 0.00775194, ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.00775194, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.0078125 ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.0078125 , 0.        ,
         0.        ]], dtype=float32),
 array([[0.        , 0.00746269, 0.        , ..., 0.        , 0.        ,
         0.00793651],
        [0.00746269, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.00793651],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        

In [None]:
def get_fitness(movie_data, W, test_size=0.2):
    def get_test_indices(mevie, test_size):
        test_indices = []

        for movie in movie_data:
            non_zero_indices = np.nonzero(movie)[0]
            
            test_size = int(test_size * len(non_zero_indices))
            
            if test_size > 0:
                random_indices = np.random.choice(non_zero_indices, size=test_size, replace=False)
            else:
                random_indices = []
                
            test_indices.append(random_indices)        
        
        return test_indices
    
    test_indices = get_test_indices(D, test_size)
    adj_matrix_t = torch.FloatTensor(W)

    for i in range(movie_data.shape[1]):
        labels_t = torch.LongTensor( original_data[i, :W.shape[0]] )
        
        if len(test_indices[i]):
            labels_t[test_indices[i]] = 0
        
        labels_t[labels_t == 0] = -1
            
        label_spreading = LabelSpreading(adj_matrix_t)
        label_spreading.fit(labels_t, alpha=0.8)
        label_spreading_output_labels = label_spreading.predict_classes()

        movie_ratings[i, :] = label_spreading_output_labels
    
    

In [None]:
for i in range(len(population)):
    original_data = D.copy()[:population[i].shape[0], :].T
    movie_ratings = np.empty_like(original_data)
    fitness = get_fitness(original_data, population[i])