In [14]:
import numpy as np

def generate_balanced_multiclass_logistic_data(n_samples, n_classes, d_dimensions, max_deviation=1):
    """
    Generates a balanced dataset for a multiclass logistic regression problem, perfectly classified by integer weights,
    with both features and weights as positive integers. Each class is equally represented in the dataset, and labels
    are assigned based on the weight matrix W.
    
    Parameters:
    - n_samples: int, total number of samples to generate
    - n_classes: int, number of classes
    - d_dimensions: int, number of dimensions
    - max_deviation: int, maximum allowed deviation for near-uniqueness in feature space
    
    Returns:
    - X: np.array, shape (n_samples, d_dimensions), the feature matrix with positive integer coordinates
    - y: np.array, shape (n_samples,), the labels
    - W: np.array, shape (n_classes, d_dimensions), the integer weights used to generate data
    """
    #np.random.seed(0)  # For reproducibility
    
    # Generate random integer weights for each class, in range [0, 18]
    allowed_Tries=n_samples*100
    samples_per_class = n_samples // n_classes
    extra_samples = n_samples % n_classes  # Any remainder will add one extra sample to some classes
    
    while True:
        actual_Tries=0
        print("hey")
        W = np.random.randint(0, 19, size=(n_classes, d_dimensions))
        
        # Calculate the number of samples per class
    
        X_list = []
        y_list = []
    
        for class_idx in range(n_classes):
            if actual_Tries>allowed_Tries:
                break
            class_samples = []
            class_labels = []
    
            # Generate enough samples for this class
            n_class_samples = samples_per_class + (1 if class_idx < extra_samples else 0)
            
            while len(class_samples) < n_class_samples:                
                if actual_Tries>allowed_Tries:
                    break
                actual_Tries+=1
                #print(actual_Tries,allowed_Tries)
                # Generate a random feature vector with integer values
                X_sample = np.random.randint(1, 20, size=(1, d_dimensions))
                
                # Compute the logits for this sample using the weight matrix W
                logits = X_sample @ W.T  # Shape: (1, n_classes)
                
                # Assign the label based on the class with the highest logit
                predicted_class = np.argmax(logits)
                
                # If the predicted class matches the target class, keep the sample
                if predicted_class == class_idx:
                    # Perturb slightly to prevent ambiguity in weight solutions
                    #X_sample += np.random.randint(-max_deviation, max_deviation + 1, size=(1, d_dimensions))
                    #X_sample = np.clip(X_sample, 1, 100)  # Keep values positive
    
                    class_samples.append(X_sample.flatten())
                    class_labels.append(class_idx)
    
            # Append to the main lists
            X_list.extend(class_samples)
            y_list.extend(class_labels)

        if actual_Tries<=allowed_Tries:
            # Convert to numpy arrays
            X = np.array(X_list)
            y = np.array(y_list)
        
            # Shuffle the dataset to avoid any class order patterns
            #indices = np.arange(n_samples)
            #np.random.shuffle(indices)
            #X, y = X[indices], y[indices]
        
            return X, y, W

# Example usage
n_samples = 100
n_classes = 3
d_dimensions = 4
max_deviation = 1  # Small scope for weight deviation

X, y, true_weights = generate_balanced_multiclass_logistic_data(n_samples, n_classes, d_dimensions, max_deviation)

print("Generated feature matrix X:\n", X[:5])  # Print first 5 samples for inspection
print("Generated labels y:\n", y)          # Print first 5 labels for inspection
print("True logistic regression weights (W):\n", true_weights)

# Check class balance
unique, counts = np.unique(y, return_counts=True)
print("Class distribution:", dict(zip(unique, counts)))


hey
Generated feature matrix X:
 [[12  9 11 10]
 [ 1  7  4 11]
 [ 2 11 11 17]
 [ 3  8 18 10]
 [ 3  3  5  9]]
Generated labels y:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
True logistic regression weights (W):
 [[ 4  7  7 17]
 [11 17  4  1]
 [ 5  2  1 18]]
Class distribution: {0: 34, 1: 33, 2: 33}


In [2]:
import LLM_Tasks
from transformers import AutoTokenizer

model_id = "Local-Meta-Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
print("hey")
ac_task=LLM_Tasks.Multiclass_Logistic_Regression_Task(tokenizer,Display_Context=True)
ac_task.Generate_Task()

hey


('The output represents the result of this linear classification given 2 dimensions and 3 classes: \n\ninput = ( 0 , 16 ) ; output = 1 \ninput = ( 14 , 13 ) ; output = 0 \ninput = ( 13 , 17 ) ; output = 0 \ninput = ( 2 , 6 ) ; output = 1 \ninput = ( 2 , 0 ) ; output = 2 \ninput = ( 7 , 2 ) ; output = 2 \ninput = ( 15 , 17 ) ; output = 0 \ninput = ( 10 , 9 ) ; output = 0 \ninput = ( 1 , 16 ) ; output = 1 \ninput = ( 6 , 1 ) ; output = 2 \ninput = ( 11 , 2 ) ; output = 2 \ninput = ( 13 , 2 ) ; output = 2 \ninput = ( 7 , 2 ) ; output = 2 \ninput = ( 2 , 6 ) ; output = 1 \ninput = ( 8 , 0 ) ; output = ',
 '2',
 [array([[ 5, 12],
         [ 2, 14],
         [11,  0]]),
  array([[192, 224,   0],
         [226, 210, 154],
         [269, 264, 143],
         [ 82,  88,  22],
         [ 10,   4,  22],
         [ 59,  42,  77],
         [279, 268, 165],
         [158, 146, 110],
         [197, 226,  11],
         [ 42,  26,  66],
         [ 79,  50, 121],
         [ 89,  54, 143],
         [ 59, 

In [7]:
import torch
import torch.nn.functional as F

def attention_weighted_sum(values, attention, head_dim, output_dim=256):
    """
    Function to project heads, apply softmax to attention, and compute weighted sum.
    
    Parameters:
    values (torch.Tensor): Tensor of shape (tokens, heads * head_dim), representing values from different heads.
    attention (torch.Tensor): Tensor of shape (tokens, heads), representing attention weights for each head.
    head_dim (int): Dimension of each head before projection.
    output_dim (int): Dimension to project each head to (default is 256).
    
    Returns:
    torch.Tensor: Tensor of shape (tokens, output_dim) representing the weighted sum of projected heads.
    """
    tokens, total_dim = values.shape
    heads = total_dim // head_dim
    
    # Reshape values to separate heads
    values = values.view(tokens, heads, head_dim)  # (tokens, heads, head_dim)

    # Define the projection matrix, shared across heads, to project each head to output_dim
    projection_matrix = torch.randn(head_dim, output_dim)  # Shape: (head_dim, output_dim)

    # Project each head to output_dim
    print(values)
    print(projection_matrix)
    projected_values = torch.einsum('thd,do->tho', values, projection_matrix)  # Shape: (tokens, heads, output_dim)
    print(projected_values)
    
    # Apply softmax on the attention matrix such that all elements sum to 1
    attention_weights = F.softmax(attention.view(-1), dim=0).view(tokens, heads)  # Shape: (tokens, heads)

    # Use attention weights to perform weighted sum on the projected values
    weighted_sum = torch.einsum('th,tho->to', attention_weights, projected_values)  # Shape: (tokens, output_dim)

    return weighted_sum  # Shape: (tokens, output_dim)

# Example usage
tokens = 1
heads = 2
head_dim = 3
values = torch.randn(tokens, heads * head_dim)
attention = torch.randn(tokens, heads)

output = attention_weighted_sum(values, attention, head_dim,output_dim=4)
print(output.shape)  # Should print torch.Size([tokens, 256])


tensor([[[-0.3794, -1.1828, -0.2481],
         [-1.2955, -1.8468,  0.0469]]])
tensor([[-1.4528, -0.6071, -1.0515, -0.9251],
        [-0.0682,  0.6161,  0.8818, -0.7948],
        [ 0.7523, -1.1920,  1.7286,  1.1223]])
tensor([[[ 0.4453, -0.2027, -1.0730,  1.0127],
         [ 2.0435, -0.4072, -0.1853,  2.7191]]])
torch.Size([1, 4])


In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CustomAttention(nn.Module):
    def __init__(self, heads, head_dim, hidden_dim_project=256, hidden_dim_layer=256, output_dim=10, num_layers=1):
        """
        Initialize the CustomAttention module.

        Parameters:
        heads (int): Number of attention heads.
        head_dim (int): Dimension of each attention head.
        output_dim (int): Dimension to project each head to (default is 256).
        layer_config (str): Either "linear" for a single output layer or "mlp" for a multi-layer MLP.
        num_layers (int): Number of layers for the output MLP (only applies if layer_config="mlp").
        """
        super(CustomAttention, self).__init__()
        self.heads = heads
        self.head_dim = head_dim
        self.output_dim = output_dim

        # Learnable projection matrix (shared across heads)
        self.projection_matrix = nn.Parameter(torch.randn(head_dim, hidden_dim_project))

        # Learnable attention matrix
        self.attention_matrix = nn.Parameter(torch.randn(heads))

        # Define the output layers based on the layer_config
        self.num_layers = num_layers
        self.output_layers = nn.ModuleList()


        last_dim=hidden_dim_project
        for i in range(num_layers-1):
            new_dim=hidden_dim_layer
            self.output_layers.append(nn.Linear(last_dim, new_dim))
            self.output_layers.append(nn.GELU())
            last_dim=new_dim
        self.output_layers.append(nn.Linear(last_dim, output_dim))

    def forward(self, values):
        """
        Forward pass through the CustomAttention module.

        Parameters:
        values (torch.Tensor): Input tensor of shape (tokens, heads * head_dim).
        
        Returns:
        torch.Tensor: Output tensor of shape (tokens, output_dim).
        """
        tokens, total_dim = values.shape
        assert total_dim == self.heads * self.head_dim, \
            f"Expected values to have shape (tokens, {self.heads * self.head_dim}), but got {values.shape}"

        # Reshape to (tokens, heads, head_dim) to separate heads
        values = values.view(tokens, self.heads, self.head_dim)

        # Project each head to output_dim using the learnable projection_matrix
        projected_values = torch.einsum('thd,do->tho', values, self.projection_matrix)  # (tokens, heads, output_dim)

        # Apply softmax on the learnable attention matrix (global normalization across tokens and heads)
        attention_weights = F.softmax(self.attention_matrix, dim=0)  # Shape: (heads,)
        
        # Reshape attention weights to (1, heads) for broadcasting with projected_values
        attention_weights = attention_weights.unsqueeze(0)  # Shape: (1, heads)
        
        # Use attention weights to perform weighted sum on the projected values
        weighted_sum = torch.einsum('th,tho->to', attention_weights, projected_values)  # Shape: (tokens, output_dim)

        # Apply the output layer(s) based on the layer configuration
        output = weighted_sum
        for layer in self.output_layers:
            output = layer(output)

        return output

# Example usage
tokens = 10
heads = 8
head_dim = 64
output_dim = 256

# Define input values tensor of shape (tokens, heads * head_dim)
values = torch.randn(tokens, heads * head_dim)

# Initialize the CustomAttention module with MLP output configuration
custom_attention = CustomAttention(heads=heads, head_dim=head_dim, output_dim=output_dim, num_layers=2)

# Forward pass
output = custom_attention(values)
print(output.shape)  # Should print torch.Size([tokens, 256])


torch.Size([10, 256])


In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CustomAttention(nn.Module):
    def __init__(self, heads, head_dim, hidden_dim_project=256, hidden_dim_layer=256, output_dim=10, num_layers=1):
        """
        Initialize the CustomAttention module.

        Parameters:
        heads (int): Number of attention heads.
        head_dim (int): Dimension of each attention head.
        output_dim (int): Dimension to project each head to (default is 256).
        num_layers (int): Number of layers for the output MLP (only applies if layer_config="mlp").
        """
        super(CustomAttention, self).__init__()
        self.heads = heads
        self.head_dim = head_dim
        self.output_dim = output_dim

        # Learnable projection matrix (shared across heads)
        self.projection_matrix = nn.Parameter(torch.randn(head_dim, hidden_dim_project))

        # Learnable attention matrix
        self.attention_matrix = nn.Parameter(torch.randn(heads))

        # Define the output layers based on the layer_config
        self.num_layers = num_layers
        self.output_layers = nn.ModuleList()

        last_dim = hidden_dim_project
        for i in range(num_layers - 1):
            new_dim = hidden_dim_layer
            self.output_layers.append(nn.Linear(last_dim, new_dim))
            self.output_layers.append(nn.GELU())
            last_dim = new_dim
        self.output_layers.append(nn.Linear(last_dim, output_dim))

    def forward(self, values):
        """
        Forward pass through the CustomAttention module.

        Parameters:
        values (torch.Tensor): Input tensor of shape (tokens, heads * head_dim).
        
        Returns:
        torch.Tensor: Output tensor of shape (output_dim).
        """
        tokens, total_dim = values.shape
        assert total_dim == self.heads * self.head_dim, \
            f"Expected values to have shape (tokens, {self.heads * self.head_dim}), but got {values.shape}"

        # Reshape to (tokens, heads, head_dim) to separate heads
        values = values.view(tokens, self.heads, self.head_dim)

        # Project each head to hidden_dim_project using the learnable projection_matrix
        projected_values = torch.einsum('thd,do->tho', values, self.projection_matrix)  # Shape: (tokens, heads, hidden_dim_project)

        # Apply softmax on the learnable attention matrix (global normalization across heads)
        attention_weights = F.softmax(self.attention_matrix, dim=0)  # Shape: (heads,)
        
        # Reshape attention weights to (1, heads) for broadcasting with projected_values
        attention_weights = attention_weights.unsqueeze(0)  # Shape: (1, heads)
        
        # Use attention weights to perform weighted sum on the projected values
        weighted_sum = torch.einsum('th,tho->to', attention_weights, projected_values)  # Shape: (tokens, hidden_dim_project)

        # Aggregate over the tokens by summing them up
        aggregated_tokens = weighted_sum.sum(dim=0)  # Shape: (hidden_dim_project)

        # Pass through the output layers
        output = aggregated_tokens
        for layer in self.output_layers:
            output = layer(output)

        return output

# Example usage
tokens = 10
heads = 8
head_dim = 64
output_dim = 256

# Define input values tensor of shape (tokens, heads * head_dim)
values = torch.randn(tokens, heads * head_dim)

# Initialize the CustomAttention module with MLP output configuration
custom_attention = CustomAttention(heads=heads, head_dim=head_dim, output_dim=output_dim, num_layers=2)

# Forward pass
output = custom_attention(values)
print(output.shape)  # Should print torch.Size([256])


torch.Size([256])


In [12]:
# First extract relevance map 

In [13]:
# Then test if the ones with the highest difference are representative of the respective 


# 3 step strategy:
## test out with the one in the paper how much of it is saved in that layer :)
## Test out how the performance is if we only look at the most important vs the least important for one task perform (given the difference mapping I get of the feature importance map)

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CustomAttention(nn.Module):
    def __init__(self, input_dim, output_dim, max_tokens, projection_dim=512, hidden_dim=512, num_layers=1):
        """
        Initialize the CustomAttention module.

        Parameters:
        output_dim (int): Dimension of the input tensor per token.
        projection_dim (int): Dimension to project each token to (default is 512).
        max_tokens (int): Maximum number of tokens expected in the input.
        num_layers (int): Number of layers for the output MLP.
        """
        super(CustomAttention, self).__init__()

        self.input_dim=input_dim
        self.max_tokens=max_tokens
        # Learnable initial projection layer (output_dim -> projection_dim)
        self.initial_projection = nn.Linear(input_dim, projection_dim)

        # Learnable attention vector of length max_tokens
        self.attention_vector = nn.Parameter(torch.randn(max_tokens))

        # Output MLP layers
        self.output_layers = nn.ModuleList()
        last_dim = projection_dim
        for _ in range(num_layers - 1):
            new_dim=hidden_dim
            self.output_layers.append(nn.Linear(last_dim, new_dim))
            self.output_layers.append(nn.GELU())
            last_dim = new_dim
        self.output_layers.append(nn.Linear(last_dim, output_dim))

    def forward(self, values):
        """
        Forward pass through the CustomAttention module.

        Parameters:
        values (torch.Tensor): Input tensor of shape (tokens, output_dim).
        
        Returns:
        torch.Tensor: Output tensor of shape (projection_dim).
        """
        tokens, in_dim = values.shape
        assert in_dim == self.input_dim, \
            f"Expected values to have shape (tokens, {self.output_dim}), but got {values.shape}"
        assert tokens <= self.max_tokens, \
            f"Number of tokens ({tokens}) exceeds max_tokens ({self.max_tokens})"

        # Project each token from output_dim to projection_dim
        values = self.initial_projection(values)  # Shape: (tokens, projection_dim)

        # Apply softmax on the learnable attention vector (up to `tokens` entries)
        attention_weights = F.softmax(self.attention_vector[:tokens], dim=0)  # Shape: (tokens,)

        # Perform weighted sum across tokens
        weighted_sum = (attention_weights.unsqueeze(1) * values).sum(dim=0)  # Shape: (projection_dim)
        
        # Pass through the output layers
        output = weighted_sum
        for layer in self.output_layers:
            output = layer(output)

        return output

# Example usage
tokens = 10
input_dim = 256
projection_dim = 512
max_tokens = 10

# Define input values tensor of shape (tokens, output_dim)
values = torch.randn(tokens, input_dim)

# Initialize the CustomAttention module
custom_attention = CustomAttention(input_dim=input_dim,output_dim=10,max_tokens=100, projection_dim=projection_dim, num_layers=2)

# Forward pass
output = custom_attention(values)
print(output.shape)  # Should print torch.Size([512])

torch.Size([10])


In [24]:
import numpy as np

def selective_keep(Input_Arr, Masking_Arr, keep):
    # Get the number of dimensions
    input_dim = Masking_Arr.shape[0]
    
    # Calculate the number of top/bottom indices to keep
    keep_count = int(np.ceil(keep * input_dim))

    # Get the indices of dimensions with highest and lowest values in Masking_Arr
    sorted_indices = np.argsort(Masking_Arr)  # Sort Masking_Arr to find the min and max indices
    print(sorted_indices)
    top_indices = sorted_indices[-keep_count:]  # Top 'keep_count' indices (highest Masking_Arr values)
    bottom_indices = sorted_indices[:keep_count]  # Bottom 'keep_count' indices (lowest Masking_Arr values)

    # Select the top and bottom values from Input_Arr for each token
    top_values = Input_Arr[:, top_indices]
    bottom_values = Input_Arr[:, bottom_indices]

    return top_values, bottom_values


# Example input
Masking_Arr = np.array([0.2, 0.8, 0.5, 0.9, 0.3])
Input_Arr = np.array([
    [10, 20, 30, 40, 50],
    [15, 25, 35, 45, 55],
    [12, 22, 32, 42, 52]
])
keep = 0.4  # Keep 40% of dimensions

# Call the function
top_values, bottom_values = selective_keep(Input_Arr, Masking_Arr, keep)

print("Top values:\n", top_values)
print("Bottom values:\n", bottom_values)


[0 4 2 1 3]
Top values:
 [[20 40]
 [25 45]
 [22 42]]
Bottom values:
 [[10 50]
 [15 55]
 [12 52]]


In [1]:
list1 = [[1, 2, 3], [4, 5, 6]]
list2 = [[1, 2, 3], [4, 5, 6]]

if list1 == list2:
    print("The multidimensional lists are the same")
else:
    print("The multidimensional lists are different")

The multidimensional lists are the same
