In [1]:
import time
import torch
import numpy as np
from numba import jit
import networkx as nx
from matplotlib import pyplot as plt

plt.style.use('dark_background')

In [2]:
D = np.load('D_matrix.npy', mmap_mode='r')
D.shape

(27445, 18)

In [3]:
from abc import abstractmethod

class BaseLabelPropagation:
    """Base class for label propagation models.
    
    Parameters
    ----------
    adj_matrix: torch.FloatTensor
        Adjacency matrix of the graph.
    """
    def __init__(self, adj_matrix):
        self.norm_adj_matrix = self._normalize(adj_matrix)
        self.n_nodes = adj_matrix.size(0)
        self.one_hot_labels = None 
        self.n_classes = None
        self.labeled_mask = None
        self.predictions = None

    @staticmethod
    @abstractmethod
    def _normalize(adj_matrix):
        raise NotImplementedError("_normalize must be implemented")

    @abstractmethod
    def _propagate(self):
        raise NotImplementedError("_propagate must be implemented")

    def _one_hot_encode(self, labels):        
        # Get the number of classes
        classes = torch.arange(0, 11, device=labels.device)
        classes = classes[classes != -1]
        self.n_classes = classes.size(0)

        # One-hot encode labeled data instances and zero rows corresponding to unlabeled instances
        unlabeled_mask = (labels == -1)
        #labels = labels.clone()  # defensive copying
        labels[unlabeled_mask] = 0
        self.one_hot_labels = torch.zeros((self.n_nodes, self.n_classes), dtype=torch.float, device=labels.device)
        
        self.one_hot_labels = self.one_hot_labels.scatter(1, labels.unsqueeze(1), 1)
        self.one_hot_labels[unlabeled_mask, 0] = 0

        self.labeled_mask = ~unlabeled_mask

    def fit(self, labels, max_iter, tol, verbose=True):
        """Fits a semi-supervised learning label propagation model.
        
        labels: torch.LongTensor
            Tensor of size n_nodes indicating the class number of each node.
            Unlabeled nodes are denoted with -1.
        max_iter: int
            Maximum number of iterations allowed.
        tol: float
            Convergence tolerance: threshold to consider the system at steady state.
        """
        self._one_hot_encode(labels)

        self.predictions = self.one_hot_labels.clone()
        prev_predictions = torch.zeros((self.n_nodes, self.n_classes), dtype=torch.float, device=labels.device)

        for i in range(max_iter):
            # Stop iterations if the system is considered at a steady state
            variation = torch.abs(self.predictions - prev_predictions).sum().item()
            
            if variation < tol:
                if verbose:
                    print(f"The method stopped after {i} iterations, variation={variation:.4f}.")
                break

            prev_predictions = self.predictions.clone()
            self._propagate()

    def predict(self):
        return self.predictions

    def predict_classes(self):
        return self.predictions.max(dim=1).indices

In [4]:
class LabelSpreading(BaseLabelPropagation):
    def __init__(self, adj_matrix):
        super().__init__(adj_matrix)
        self.alpha = None

    @staticmethod
    def _normalize(adj_matrix):
        """Computes D^-1/2 * W * D^-1/2"""
        degs = adj_matrix.sum(dim=1)
        norm = torch.pow(degs, -0.5)
        norm[torch.isinf(norm)] = 1
        return adj_matrix * norm[:, None] * norm[None, :]

    def _propagate(self):
        self.predictions = (
            self.alpha * torch.matmul(self.norm_adj_matrix, self.predictions)
            + (1 - self.alpha) * self.one_hot_labels
        )
        #self.predictions[self.labeled_mask] = self.one_hot_labels[self.labeled_mask]
    
    def fit(self, labels, max_iter=1000, tol=1e-3, alpha=0.5, verbose=True):
        """
        Parameters
        ----------
        alpha: float
            Clamping factor.
        """
        self.alpha = alpha
        super().fit(labels, max_iter, tol, verbose)

In [5]:
num_users = int(0.25 * D.shape[0])
num_users

6861

In [6]:
from joblib import Parallel, delayed

@jit(nopython=True)
def process_array(W):
    row_sums = W.sum(axis=1)
    W /= row_sums[:, np.newaxis]

    n = W.shape[0]
    for i in range(n):
        for j in range(n):
            if i != j:
                W[i, j] = max(W[i, j], W[j, i])
        W[i, i] = 0.0

    return W

def generate_weight_vector(seed, n, p):
    G = nx.erdos_renyi_graph(n=n, p=p, directed=False, seed=seed)
    W = nx.to_numpy_array(G, dtype=np.float32)
    W = process_array(W)
    return W

def initialize_population(population_len, n, p=0.1, seed=42, n_jobs=-1):
    population = Parallel(n_jobs=n_jobs)(
        delayed(generate_weight_vector)(seed + i, n, p) for i in range(population_len)
    )
    return population

In [7]:
start = time.time()
population = initialize_population(population_len=10, n=num_users)
print(f'elapsed time for population init: {time.time() - start}')

elapsed time for population init: 26.614319562911987


In [8]:
for W in population[:5]:
    row_sums = np.sum(W, axis=1)
    max_sum = np.max(row_sums)
    min_sum = np.min(row_sums)
    print(f'max row sum: {max_sum}\t min row sum: {min_sum}\t is symetric: {np.array_equal(W, W.T)}', end='\t')
    print(f'non-zero percentage: {round(np.count_nonzero(W) / W.size * 100, 3)}')

max row sum: 1.1426074504852295	 min row sum: 0.9999999403953552	 is symetric: True	non-zero percentage: 10.007
max row sum: 1.1446914672851562	 min row sum: 0.9999999403953552	 is symetric: True	non-zero percentage: 9.991
max row sum: 1.1418321132659912	 min row sum: 0.9999999403953552	 is symetric: True	non-zero percentage: 9.999
max row sum: 1.1467807292938232	 min row sum: 0.9999998807907104	 is symetric: True	non-zero percentage: 10.014
max row sum: 1.136523723602295	 min row sum: 0.9999998807907104	 is symetric: True	non-zero percentage: 10.006


In [9]:
original_data = D.copy()[:num_users, :].T
original_data.shape

(18, 6861)

In [16]:
from sklearn.metrics import mean_absolute_error


def get_test_indices(data, test_size=0.3):
    test_indices = []

    for movie in data:
        non_zero_indices = np.nonzero(movie)[0]

        test_sample_size = int(test_size * len(non_zero_indices))

        if test_sample_size > 0:
            random_indices = np.random.choice(non_zero_indices, size=test_sample_size, replace=False)
        else:
            random_indices = []
        
        test_indices.append(random_indices)
    
    return test_indices


def run_label_spreading(W, original_data, predicted_data, test_indices, alpha=0.8, verbose=True):
    adj_matrix_t = torch.FloatTensor(W)

    for i in range(original_data.shape[0]):
        labels_t = torch.LongTensor( original_data[i, :W.shape[0]] )
        
        if len(test_indices):
            labels_t[test_indices] = 0
        
        labels_t[labels_t == 0] = -1
            
        label_spreading = LabelSpreading(adj_matrix_t)
        label_spreading.fit(labels_t, alpha=alpha, verbose=verbose)
        label_spreading_output_labels = label_spreading.predict_classes()

        predicted_data[i, :] = label_spreading_output_labels


def get_loss(original_data, predicted_data, test_indices):
    e = []

    for i in range(original_data.shape[0]):
        if len(test_indices):
            y_true = original_data[i, test_indices].astype(np.int8)
            y_pred = predicted_data[i, test_indices].astype(np.int8)
            mae = mean_absolute_error(y_true, y_pred)
            e.append(mae)
    
    return np.sum(e) / len(e) if len(e) > 0 else np.finfo(float).max


def evaluate_population(population, original_data, verbose=True):
    test_indices = get_test_indices(original_data)
    scores = [ [i, None] for i in range(len(population)) ] 

    for i, W in enumerate(population):
        print(f'individual: {i}')
        predicted_data = np.empty_like(original_data)
        run_label_spreading(W, original_data, predicted_data, test_indices[i], verbose=verbose)
        scores[i][1] = get_loss(original_data, predicted_data, test_indices[i])
    
    scores = sorted(scores, key=lambda x: x[1])
    return scores

In [11]:
start = time.time()

scores = evaluate_population(population, original_data, verbose=True)

print(f'elapsed time for evaluating population: {time.time() - start}')
print(scores)

0
The method stopped after 6 iterations, variation=0.0002.
The method stopped after 6 iterations, variation=0.0001.
The method stopped after 6 iterations, variation=0.0002.
The method stopped after 6 iterations, variation=0.0002.
The method stopped after 0 iterations, variation=0.0000.
The method stopped after 6 iterations, variation=0.0002.
The method stopped after 6 iterations, variation=0.0002.
The method stopped after 6 iterations, variation=0.0001.
The method stopped after 6 iterations, variation=0.0002.
The method stopped after 6 iterations, variation=0.0002.
The method stopped after 5 iterations, variation=0.0009.
The method stopped after 5 iterations, variation=0.0009.
The method stopped after 5 iterations, variation=0.0006.
The method stopped after 6 iterations, variation=0.0002.
The method stopped after 6 iterations, variation=0.0001.
The method stopped after 6 iterations, variation=0.0002.
The method stopped after 6 iterations, variation=0.0001.
The method stopped after 6 it

In [14]:
valid_scores = [x for x in scores if x[1] != np.finfo(float).max]
average_loss = sum(x[1] for x in valid_scores) / len(valid_scores)
average_loss

6.694760102829769