In [1]:
import numpy as np
import pandas as pd

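Function to generate the array of clique sizes (one value per graph in the batch) during training and validation.

Properties:
- Each clique size in the batch is sampled from the allowed values of K, according to a probability assigned to each value;
- When a new instance of training starts, the smallest allowed K value (expected to be the last element of the array) is extracted with a probability of at least 0.25; the remaining probability is split equally among the other (easier) values.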

In [13]:
def generate_batch_clique_sizes(
    allowed_clique_sizes,
    batch_size,
    min_prob_smallest=0.25,
    rng=None,
    verbose=True,
):
    """
    Generate the clique sizes for each graph in the batch (based on the allowed clique size values).

    The smallest allowed value (which must be the last element of the array) is
    sampled with probability at least ``min_prob_smallest``. When a uniform
    distribution would already give it that much, all values are sampled
    uniformly; otherwise the smallest value gets exactly ``min_prob_smallest``
    and the remaining probability is split equally among the other values.

    Parameters:
    allowed_clique_sizes (np.ndarray): Non-empty 1-D array of allowed clique size
        values. If it holds more than one value, the last one must be the smallest.
    batch_size (int): Size of the batch to generate (Python or NumPy integer, >= 0).
    min_prob_smallest (float): Minimum sampling probability of the smallest clique
        size, in [0, 1]. Defaults to 0.25.
    rng (np.random.Generator | np.random.RandomState | None): Random number
        generator used for sampling. If None, the global ``np.random`` state is
        used (so ``np.random.seed`` still controls the result).
    verbose (bool): If True (default), print how many times each clique size was
        sampled.

    Returns:
    np.ndarray: Array of generated clique sizes for each graph in the batch.

    Raises:
    ValueError: If any input has the wrong type, shape or range, or if the last
        allowed clique size is not the smallest one.
    """
    # INPUT VALIDATION:
    if not isinstance(allowed_clique_sizes, np.ndarray):
        raise ValueError("allowed_clique_sizes must be a numpy array")
    # an empty array would otherwise fail later with an obscure ZeroDivisionError
    if allowed_clique_sizes.ndim != 1 or allowed_clique_sizes.size == 0:
        raise ValueError("allowed_clique_sizes must be a non-empty 1-D numpy array")
    # accept NumPy integers too (e.g. values read from an array or a config)
    if not isinstance(batch_size, (int, np.integer)):
        raise ValueError("batch_size must be an integer")
    if batch_size < 0:
        raise ValueError("batch_size must be non-negative")
    if not 0 <= min_prob_smallest <= 1:
        raise ValueError("min_prob_smallest must be between 0 and 1")

    n_sizes = allowed_clique_sizes.size
    # if more than one clique size value is allowed, the last value must be the smallest one:
    if n_sizes > 1 and allowed_clique_sizes.min() != allowed_clique_sizes[-1]:
        raise ValueError(
            "the last provided clique size value is not the smallest one: something might be wrong with the curriculum training procedure"
        )
    # END INPUT VALIDATION

    uniform_prob = 1 / n_sizes
    if uniform_prob < min_prob_smallest:
        # A uniform draw would under-sample the smallest value: pin it to the
        # minimum probability and share the rest equally among the easier values.
        # (n_sizes > 1 is guaranteed here because min_prob_smallest <= 1.)
        prob_easier = (1 - min_prob_smallest) / (n_sizes - 1)
        allowed_clique_sizes_probs = np.full(n_sizes, prob_easier)
        allowed_clique_sizes_probs[-1] = min_prob_smallest
    else:
        # A uniform draw already satisfies the minimum probability.
        allowed_clique_sizes_probs = np.full(n_sizes, uniform_prob)

    # Normalize the probabilities to ensure they sum to 1 (in case of rounding errors)
    allowed_clique_sizes_probs /= np.sum(allowed_clique_sizes_probs)

    # Generate the clique size array (global NumPy state unless an rng is supplied)
    sampler = np.random if rng is None else rng
    batch_clique_sizes = sampler.choice(
        allowed_clique_sizes, batch_size, p=allowed_clique_sizes_probs
    )

    if verbose:
        # Count the occurrences of each allowed clique size in the generated batch
        occurrences = pd.Series(batch_clique_sizes).value_counts()
        print("occurrences:", occurrences)

    return batch_clique_sizes

In [14]:
# Seed the global NumPy RNG so the sampled occurrences below are reproducible
# when the notebook is re-run from a fresh kernel.
np.random.seed(0)

batch_size = 1000
clique_size_values = np.linspace(20, 0, 10).astype(int)
print("Full list of clique size values:", clique_size_values)
print("===============================================")

# Simulate the curriculum: at step i the first i+1 values of K are allowed.
# The full list is decreasing, so the last allowed value is always the smallest.
for i in range(len(clique_size_values)):

    # define current list of allowed K values:
    current_clique_size_list = clique_size_values[:i+1]
    print(current_clique_size_list)
    print("Data type of current_clique_size_list:", type(current_clique_size_list))

    # generate clique size array:
    clique_size_array = generate_batch_clique_sizes(current_clique_size_list, batch_size)

    print("-------------------------------------------------")

Full list of clique size values: [20 17 15 13 11  8  6  4  2  0]
[20]
Data type of current_clique_size_list: <class 'numpy.ndarray'>
occurrences: 20    1000
Name: count, dtype: int64
-------------------------------------------------
[20 17]
Data type of current_clique_size_list: <class 'numpy.ndarray'>
occurrences: 17    515
20    485
Name: count, dtype: int64
-------------------------------------------------
[20 17 15]
Data type of current_clique_size_list: <class 'numpy.ndarray'>
occurrences: 17    351
15    339
20    310
Name: count, dtype: int64
-------------------------------------------------
[20 17 15 13]
Data type of current_clique_size_list: <class 'numpy.ndarray'>
occurrences: 20    278
15    263
17    232
13    227
Name: count, dtype: int64
-------------------------------------------------
[20 17 15 13 11]
Data type of current_clique_size_list: <class 'numpy.ndarray'>
occurrences: 11    249
13    208
17    183
20    183
15    177
Name: count, dtype: int64
-------------------

Testing the function for the validation case (a single allowed clique size):

In [15]:
# Validation case: only one clique size is allowed, so every graph in the
# batch is expected to receive that same value.
clique_sizes = np.asarray([20])

# Sample one clique size per graph (batch_size is defined in an earlier cell)
batch_clique_sizes = generate_batch_clique_sizes(
    allowed_clique_sizes=clique_sizes,
    batch_size=batch_size,
)

occurrences: 20    1000
Name: count, dtype: int64
