# Prototype Dataset
This file generates the path-to-path dataset to train a prototype transformer

Each element of a logo's input and output sequence should consist of 269 parameters:
- DeepSVG Embedding (256)
- Type "EOS" (1) (open for discussion, but it makes the most sense in the predicting process ~Corni)
- Type (6)
- Param (6)

Input Sequence
- In the input sequence, the last 13 Parameters are set to zero
- In a final model, these parameters are aimed to be left out (as context vector isn't based on that)

Output Sequence
- The Output sequence consists of the selected paths represented by their DeepSVG Embedding
- The Animation is represented through the type and the parameters.
- Each output sequence ends with an EOS Token, where the EOS-type in the one-hot-encoded part is set to 1.

## Import DeepSVG Embedding

In [None]:
import pickle

import torch

# Load the pickled path embeddings from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load this file when it comes from a trusted source.
with open("data/embeddings/path_embedding.pkl", "rb") as f:
    deepsvg_embedding = pickle.load(f)

In [None]:
## duplicate for further use
# `embeddings` is an independent copy, whereas `input_sequences` is only a
# second name for the same object as `deepsvg_embedding` (no copy is made).
embeddings = deepsvg_embedding.copy() #for output sequences
input_sequences = deepsvg_embedding

In [None]:
# Cast the path id to int on the `embeddings` copy only, then preview it.
# presumably needed so the later merge on 'animation_id' matches — verify dtypes
embeddings['animation_id'] = embeddings['animation_id'].astype(int)
embeddings.head()

## Train / Test Split

### Logo Dataset Analysis
Most logos have only a few paths
Some logos have over 100 paths to animate

In [None]:
# Histogram of the number of rows (paths) per logo filename.
input_sequences['filename'].value_counts().hist(bins=100)

### Stratify on Number of Paths in Logo

In [None]:
# Number of rows (paths) per logo filename
logos = (
    input_sequences
    .groupby('filename')
    .size()
    .reset_index(name='count')
)

# Equally spaced quantile levels used as bucket boundaries for stratification
bins = 20
quantiles = [step / bins for step in range(1, bins + 1)]
print(quantiles)

# Path-count value found at each quantile level
percentiles = logos['count'].quantile(quantiles)
print(percentiles)

# Map a path count onto its quantile bucket label
def categorize_count(count):
    """
    Return the label of the first quantile bucket that contains ``count``.

    Reads the notebook-level ``percentiles`` (bucket upper bounds, in order)
    and ``quantiles`` (used only for the fallback label).

    Args:
        count: Number of paths of one logo.

    Returns:
        str: "Quantile i" for the first bound with ``count <= bound``, or
        "Quantile {len(quantiles)}" when no bound is large enough.
    """
    matching_buckets = (
        position
        for position, upper_bound in enumerate(percentiles)
        if count <= upper_bound
    )
    bucket = next(matching_buckets, len(quantiles))
    return f"Quantile {bucket}"

# Apply the categorization function to create a new column
logos['count_category'] = logos['count'].apply(categorize_count)
# Show how many logos fall into each bucket
logos['count_category'].value_counts()

### Train / Test split

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Split the DataFrame into training and testing sets, stratifying on 'count_category'
# (20 % of the logos go to the test set; random_state fixes the shuffle seed)
logos_train, logos_test = train_test_split(logos, test_size=0.2, stratify=logos['count_category'], random_state=42)

The distributions of lengths are now similar. Outliers with long sequences appear in both datasets.

In [None]:
# Histogram of paths per logo in the training split
logos_train['count'].hist(bins=100)

In [None]:
# Histogram of paths per logo in the test split
logos_test['count'].hist(bins=100)

In [None]:
# Replace the split DataFrames with just their unique logo filenames; from
# here on `logos_test` / `logos_train` no longer hold the DataFrames.
logos_test = logos_test['filename'].unique()
logos_train = logos_train['filename'].unique()
logos_test

### Discussion
- One might also stratify over used animation types.
- Are there enough good animations in the dataset for logos with lots of paths? -> Manual annotation

## Animation Dataset

In [None]:
import pickle
import pandas as pd

# Load the pickle file
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load this file when it comes from a trusted source.
with open("data/surrogate_model/animation_label.pkl", "rb") as f:
    imported_pickle = pickle.load(f)

# Create a DataFrame from the imported data
filtered_data = pd.DataFrame(imported_pickle, columns=["file", "animation_id", "model_output", "label"])

# Define the mapping for ratings
mapping_dict = {"Very Good": 6, "Good": 5, "Okay": 4, "Bad": 3, "Very Bad": 2, "no_rating": 1}

# Replace the 'label' column with the corresponding ratings using .map
# (a label that is not a key of mapping_dict ends up as NaN in 'rating')
filtered_data['rating'] = filtered_data['label'].map(mapping_dict)

# Extract unique logos by splitting 'file' with "_animation" and using .str.get(0)
#logos_unique = filtered_data['file'].str.split('_animation').str.get(0).unique()

In [None]:
#example
# Show every row whose file name contains "logo_5_"
filtered_data[filtered_data["file"].str.contains("logo_5_")]

### Some Statistics about the Animations we have here
Result: Many animations with only one animated path

In [None]:
# Non-null 'animation_id' entries per animation file
grouped = filtered_data.groupby('file')[['animation_id']].count()
print(f"{grouped.size} Animations initially")

# Frequency table: how many animation files have a given number of entries
histogram_table = grouped['animation_id'].value_counts().reset_index()
histogram_table.columns = ['Animation Length', 'Count']
histogram_table = histogram_table.sort_values('Animation Length')
print(histogram_table)

In [None]:
# Rows per label, with the label moved from the index into a regular column
grouped = filtered_data.groupby('label')[['file']].count().reset_index()
grouped.columns = ['Rating', 'Count']

# Share of each label among all counted rows, in percent
grouped['Percentage'] = grouped['Count'].div(grouped['Count'].sum()).mul(100)
grouped

### Investigate overall rating of animations

In [None]:
# Give every row the mean rating of all rows that share its animation file,
# then plot the distribution of these per-row averages.
filtered_data['average_rating'] = filtered_data.groupby('file')['rating'].transform('mean')
filtered_data['average_rating'].hist(bins=20)

In [None]:
# example
# Same rows as before, now including the 'rating' and 'average_rating' columns
filtered_data[filtered_data["file"].str.contains("logo_5_")]

### Now pick good Animations only
Pick the following animations
- Best per logo (include as many different logos as possible)
- Add all GOOD animations (additionally add as many animations as possible) TODO: Implement later

In [None]:
# Selection flag: 0 = not selected (default), set to 1 for rows to keep
filtered_data['include'] = 0

In [None]:
# use all animations with an average rating over 3
# (strictly greater: an average of exactly 3 is not selected)
minimum_rating = 3
temp = filtered_data['average_rating'] > minimum_rating
filtered_data.loc[temp, 'include'] = 1

In [None]:
# 'include' is 1 for kept rows, so its mean is the share that is KEPT;
# report the complement as the share that is left out.
print(f"{(1 - filtered_data['include'].mean()) * 100}% of the data is left out")

In [None]:
# For every known logo, mark its best-rated animation(s) as included.
# NOTE(review): the previous loop iterated over the column labels of the
# `logos` DataFrame and stopped after the first one, so no per-logo selection
# ever took place.

# Parse the logo id out of the animation file name. An exact id avoids the
# substring pitfall where "logo_5" would also match "logo_50_...".
logo_of_file = filtered_data['file'].str.extract(r'(logo_\d+)', expand=False)

# Only consider logos that are part of the logo table
is_known_logo = logo_of_file.isin(logos['filename'])

# Highest average rating among all animations of the same logo, per row
best_rating_of_logo = filtered_data.groupby(logo_of_file)['average_rating'].transform('max')

# 'average_rating' is constant within a file, so this selects whole files;
# ties for the best rating are all included.
is_best = is_known_logo & (filtered_data['average_rating'] == best_rating_of_logo)
filtered_data.loc[is_best, 'include'] = 1

In [None]:
# 'include' is 1 for kept rows, so its mean is the share that is KEPT;
# report the complement as the share that is left out.
print(f"Now {(1 - filtered_data['include'].mean()) * 100}% of the data is left out")

Note: No additional animations are selected

In [None]:
# Keep only the selected rows and every column except the selection flag
best_output = filtered_data.loc[
    filtered_data['include'] == 1,
    filtered_data.columns != 'include',
].copy()

In [None]:
# extract logo
# The capture group keeps the leading "logo_<digits>" part of the file name;
# rows whose file name does not match the pattern get NaN.
pattern = r'(logo_\d+)'
best_output['filename'] = best_output['file'].str.extract(pattern)

### Numeration to keep order later (corresponds to start-time)

In [None]:
# Number the rows of each animation file 1, 2, 3, ... in their current row
# order. groupby().cumcount() is the vectorised form of a per-file running
# counter and avoids a row-by-row Python loop.
best_output['order'] = best_output.groupby('file').cumcount() + 1
best_output.head()

### Some Statistics again


In [None]:
# Non-null 'animation_id' entries per remaining animation file
grouped = best_output.groupby('file')[['animation_id']].count()
print(f"{grouped.size} Animations left over")

# Frequency table: how many animation files have a given number of entries
histogram_table = grouped['animation_id'].value_counts().reset_index()
histogram_table.columns = ['Animation Length', 'Count']
histogram_table = histogram_table.sort_values('Animation Length')
print(histogram_table)

# Build sequences

## Make List of all Data Samples

In [None]:
# One row per (logo, animation file) pair with its number of rows in best_output
final_animations_index = best_output.groupby(['filename', 'file']).size().reset_index(name='animation_length')
final_animations_index.head()

In [None]:
# Preview the per-logo path counts that are merged in next
logos.head()

In [None]:
# Attach each logo's path count; with a left merge, logos that are missing
# from `logos` keep their row and get NaN in 'count'.
final_animations_index = final_animations_index.merge(logos[["filename","count"]], on='filename', how='left')
final_animations_index

### Plot: Number of Paths vs Animation Length

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Extract the two columns for the scatter plot
# (x: paths per logo, y: rows per selected animation)
x = final_animations_index['count']
y = final_animations_index['animation_length']

# Create a scatter plot
plt.scatter(x, y)

# Add labels and a title
plt.xlabel('Number of Paths')
plt.ylabel('Animation Length')
plt.title('Scatter Plot')

# Show the plot
plt.show()

## Build Input Sequences Dictionary

In [None]:
# One DataFrame of rows per logo, keyed by the logo filename.
# A single groupby pass replaces one full-table boolean filter per logo;
# sort=False keeps the keys in order of first appearance.
input_sequences_dict = {
    logo: logo_paths
    for logo, logo_paths in input_sequences.groupby("filename", sort=False)
}

In [None]:
# example
# Look up the input rows of the first logo in the training split
print(logos_train[0])
input_sequences_dict[logos_train[0]]

## Build Output Sequence (Embedded Paths with Animation Vector)

In [None]:
# Attach the path embedding to every selected animation row; the inner merge
# keeps only rows that have a matching embedding
output_sequence = pd.merge(best_output, embeddings, on=['filename', 'animation_id'], how='inner')

# Spread the 12 entries of each model output over the columns a1 ... a12
animation_vectors = pd.DataFrame(
    output_sequence["model_output"].to_list(),
    columns=[f"a{position}" for position in range(1, 13)],
)

# Remove the columns that are not part of the sequence tensor
output_sequence = output_sequence.drop(
    columns=['animation_id', 'model_output', 'label', 'rating', 'average_rating']
)
output_sequence['a0'] = 0 # EOS Feature in One-hot Encoding

# Both frames share the same positional index, so align them on it
output_sequence = pd.merge(output_sequence, animation_vectors, left_index=True, right_index=True)

In [None]:
# Preview the merged output rows (embedding columns followed by a0 ... a12)
output_sequence.head()

## Main Iteration over all samples

In [None]:
PADDING_VALUE = float('-inf')

def generate_input_sequence(logo_embeddings: pd.DataFrame, null_features: int, sequence_length : int, is_randomized : bool, random_state=None) -> torch.Tensor:
    """
    Build a torch tensor for the transformer input sequences.
    Includes
    - Randomization (optional)
    - Appending of zero-valued "null" feature columns
    - Generation of padding / truncation to a fixed length

    The passed DataFrame is not modified.

    Args:
        logo_embeddings (pd.DataFrame): DataFrame containing logo embeddings,
            one row per path. Must contain the columns 'filename' and
            'animation_id', which are removed; all remaining columns are
            treated as features.
        null_features (int): Number of zero-valued feature columns to append
            to each row.
        sequence_length (int): Target number of rows. Shorter inputs are
            padded with rows of PADDING_VALUE, longer ones are cut off.
        is_randomized (bool): shuffle order of paths (before truncation).
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the shuffle reproducible. Defaults to None (unseeded).

    Returns:
        torch.Tensor: Tensor of shape (sequence_length, n_features +
        null_features). Its dtype follows the dtype of the assembled values.

    Raises:
        KeyError: If 'filename' or 'animation_id' is missing.
    """
    # Non in-place drop: the caller's frame keeps its identifier columns
    features = logo_embeddings.drop(columns=['filename', 'animation_id'])

    # Randomization
    if is_randomized:
        features = features.sample(frac=1, random_state=random_state)
    # A fresh positional index keeps the column-wise concat below aligned
    features = features.reset_index(drop=True)

    # Null Features
    if null_features > 0:
        first_new_column = features.shape[1]
        zeros = pd.DataFrame(0,
                             index=features.index,
                             columns=range(first_new_column, first_new_column + null_features))
        features = pd.concat([features, zeros], axis=1, ignore_index=True)

    # Padding Generation: Add padding rows or cut off excess rows
    missing_rows = sequence_length - len(features)
    if missing_rows > 0:
        padding_rows = pd.DataFrame(PADDING_VALUE,
                                    index=range(missing_rows),
                                    columns=features.columns)
        features = pd.concat([features, padding_rows], ignore_index=True)
    elif missing_rows < 0:
        # Cut off excess rows
        features = features.iloc[:sequence_length]

    return torch.tensor(features.values)

In [None]:
EOS_TOKEN = float('-inf')
def generate_output_sequence(animation: pd.DataFrame, sequence_length: int, is_randomized : bool) -> torch.Tensor:
    """
    Build a torch tensor for the transformer output sequences.
    Includes
    - Randomization (later, when same start time)
    - Ordering of the rows by the 'order' column
    - A marker row at the start (SOS) and at the end (EOS); both are all
      zeros except for a0 = 1
    - Generation of padding / truncation to a fixed length

    The passed DataFrame is not modified. If the animation is too long,
    animation rows are dropped from its end so that the closing EOS row is
    still part of the result.

    Args:
        animation (pd.DataFrame): Rows of one animation. Must contain the
            columns 'file', 'filename' and 'order', which are removed; all
            remaining columns are treated as features.
        sequence_length (int): Target number of rows, including the SOS and
            EOS rows. Shorter sequences are padded with PADDING_VALUE rows.
        is_randomized (bool): shuffle order of paths, applies when same start
            time. The rows are sorted by 'order' afterwards, so only ties in
            'order' can change position.

    Returns:
        torch.Tensor: float32 tensor with sequence_length rows and one
        column per feature.

    Raises:
        KeyError: If 'file', 'filename' or 'order' is missing.
    """
    if is_randomized:
        animation = animation.sample(frac=1).reset_index(drop=True)
        print("Note: Randomization not implemented yet")

    # Non in-place sort/drop keeps the caller's frame intact. The stable sort
    # preserves the incoming order of rows that share the same 'order' value.
    steps = (animation
             .sort_values(by=['order'], kind='mergesort')
             .drop(columns=['file', 'filename', 'order']))

    # Reserve two rows for SOS and EOS so truncation never removes the EOS row
    steps = steps.iloc[:max(sequence_length - 2, 0)]

    # Marker row used as both SOS and EOS
    sos_eos_row = {col: 0 for col in steps.columns}
    sos_eos_row["a0"] = 1
    marker = pd.DataFrame([sos_eos_row])
    sequence = pd.concat([marker, steps, marker], ignore_index=True)

    # Padding Generation: Add padding rows or cut off excess rows
    missing_rows = sequence_length - len(sequence)
    if missing_rows > 0:
        padding_rows = pd.DataFrame([[PADDING_VALUE] * len(sequence.columns)] * missing_rows,
                                    columns=sequence.columns)
        sequence = pd.concat([sequence, padding_rows], ignore_index=True)
    elif missing_rows < 0:
        # Only reachable for sequence_length < 2, where both markers cannot fit
        sequence = sequence.iloc[:sequence_length]

    return torch.Tensor(sequence.values)

In [None]:
train_sequence_input_list = []
train_sequence_output_list = []
test_sequence_input_list = []
test_sequence_output_list = []

# Sets give constant-time membership tests for the split lookup below
train_logos = set(logos_train)
test_logos = set(logos_test)

# Split the output rows once by (logo, animation file) instead of filtering
# the whole table again for every sample
output_by_animation = {
    key: rows
    for key, rows in output_sequence.groupby(['filename', 'file'], sort=False)
}
# Used when an animation has no rows in output_sequence at all
no_output_rows = output_sequence.iloc[0:0]

for logo, file in zip(final_animations_index['filename'],   # e.g. logo_1
                      final_animations_index['file']):      # e.g. logo_1_animation_2
    print(f"Processing {logo} with {file}")

    input_tensor = generate_input_sequence(input_sequences_dict[logo].copy(),
                                           null_features=13, #TODO depends on architecture later
                                           sequence_length=128, #TODO design question: Max elements per Logo?
                                           is_randomized=True)

    # NOTE(review): generate_output_sequence adds a start row AND an end row,
    # so sequence_length has to cover the animation length + 2 — confirm 15
    # is large enough for the longest animation.
    animation_rows = output_by_animation.get((logo, file), no_output_rows)
    output_tensor = generate_output_sequence(animation_rows.copy(),
                                             sequence_length=15, #TODO Currently the max length of animations + 1 for EOS
                                             is_randomized=False)
    # append to lists
    if logo in train_logos:
        train_sequence_input_list.append(input_tensor)
        train_sequence_output_list.append(output_tensor)

    elif logo in test_logos:
        test_sequence_input_list.append(input_tensor)
        test_sequence_output_list.append(output_tensor)

    else:
        print(f"Some problem with {logo}. Neither in train or test set list.")
        
# Executed in 4 minutes ~Cornelius

## Result

In [None]:
# Stack the per-sample tensors along a new leading dimension.
# torch.stack requires all tensors in a list to have the same shape and
# fails on an empty list.
train_sequence_input = torch.stack(train_sequence_input_list)
train_sequence_output = torch.stack(train_sequence_output_list)
test_sequence_input = torch.stack(test_sequence_input_list)
test_sequence_output = torch.stack(test_sequence_output_list)

In [None]:
# Shapes of the stacked tensors; the first dimension is the number of samples
print(train_sequence_input.shape)
print(train_sequence_output.shape)
print(test_sequence_input.shape)
print(test_sequence_output.shape)

In [None]:
import os

# torch.save does not create missing directories, so make sure the target
# folder exists before writing the four dataset tensors.
os.makedirs('data/prototype_dataset', exist_ok=True)

torch.save(train_sequence_input, 'data/prototype_dataset/train_sequence_input.pt')
torch.save(train_sequence_output, 'data/prototype_dataset/train_sequence_output.pt')
torch.save(test_sequence_input, 'data/prototype_dataset/test_sequence_input.pt')
torch.save(test_sequence_output, 'data/prototype_dataset/test_sequence_output.pt')