# Prototype Dataset
This file generates the path-to-path dataset to train a prototype transformer

The input output sequence of a logo should consist of 270 Parameters
- DeepSVG Embedding (256)
- a0 Type "EOS" (1)
- a1 - a6 Type (6)
- a7 - a12 Param (6)
- a13 Time offset from the beginning of the animation in seconds (1)

Input Sequence
- In the input sequence, the last 14 parameters (a0 - a13) are set to zero
- In a final model, these parameters should be left out, as the context vector does not depend on them

Output Sequence
- The Output sequence consists of the selected paths represented by their DeepSVG Embedding
- The Animation is represented through the type and the parameters.
- Each output sequence ends with an EOS Token, where the EOS-type in the one-hot-encoded part is set to 1.

## Import DeepSVG Embedding

In [None]:
import pickle

import torch

# Read the pickled DeepSVG path embeddings from disk.
with open("data/embeddings/path_embedding.pkl", "rb") as embedding_file:
    deepsvg_embedding = pickle.load(embedding_file)

In [None]:
## duplicate for further use
# `embeddings` is the result of `.copy()`, so changes made to it do not touch
# the loaded object.
embeddings = deepsvg_embedding.copy() #for output sequences
# `input_sequences` is just a second name for the loaded object (no copy is made).
input_sequences = deepsvg_embedding

In [None]:
# Cast the path identifier to int on the output-side copy only.
embeddings['animation_id'] = embeddings['animation_id'].astype(int)
embeddings.head()  # preview of the first rows

## Train / Test Split

### Logo Dataset Analysis
Most logos have only a few paths
Some logos have over 100 paths to animate

In [None]:
# Might not execute
# input_sequences['filename'].value_counts().hist(bins=100)

### Stratify on Number of Paths in Logo

In [None]:
# Number of rows (paths) per logo file
logos = (
    input_sequences
    .groupby('filename')
    .size()
    .reset_index(name='count')
)

# Quantile levels 1/bins, 2/bins, ..., 1 used to categorize the logo sizes for stratification
bins = 20
quantiles = [step / bins for step in range(1, bins + 1)]
print(quantiles)

# Path-count threshold that belongs to each quantile level
percentiles = logos['count'].quantile(quantiles)
print(percentiles)

# Function to categorize based on quantiles
def categorize_count(count, thresholds=None):
    """Return the label of the first threshold that ``count`` does not exceed.

    Args:
        count: Value to categorize (in this notebook: number of paths of a logo).
        thresholds: Optional iterable of upper bounds in ascending order, one
            per category. When omitted, the notebook-level ``percentiles`` is
            used.

    Returns:
        ``"Quantile i"`` where ``i`` is the zero-based position of the first
        threshold with ``count <= threshold``. If ``count`` is larger than
        every threshold, ``"Quantile n"`` with ``n`` the number of thresholds.
    """
    if thresholds is None:
        # Resolved at call time so the function follows the current notebook state.
        thresholds = percentiles
    thresholds = list(thresholds)
    for position, upper_bound in enumerate(thresholds):
        if count <= upper_bound:
            return f"Quantile {position}"
    return f"Quantile {len(thresholds)}"

# Apply the categorization function to create a new column
logos['count_category'] = logos['count'].apply(categorize_count)
# Number of logos that fall into each category
logos['count_category'].value_counts()

### Train / Test split

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 80/20 split of the per-logo table. Stratifying on 'count_category' keeps the
# path-count categories represented in both parts; the seed is fixed so the
# split is reproducible.
logos_train, logos_test = train_test_split(
    logos,
    test_size=0.2,
    stratify=logos['count_category'],
    random_state=42,
)

The distributions of lengths are now similar. Outliers with a long sequence are in both datasets.

In [None]:
# Histogram of the number of paths per logo in the training split
logos_train['count'].hist(bins=100)

In [None]:
# Histogram of the number of paths per logo in the test split
logos_test['count'].hist(bins=100)

In [None]:
# From here on only the logo names of each split are kept
logos_train, logos_test = (
    logos_train['filename'].unique(),
    logos_test['filename'].unique(),
)
logos_test

### Discussion
- One might also stratify over used animation types.
- Are there enough good animations in the dataset for logos with lots of paths? -> Manual annotation

## Animation Dataset

In [None]:
import pickle
import pandas as pd

# Read the pickled animation label records
with open("data/surrogate_model/animation_label.pkl", "rb") as label_file:
    imported_pickle = pickle.load(label_file)

# Textual rating -> numeric score (higher is better, "no_rating" is the lowest)
mapping_dict = {"Very Good": 6, "Good": 5, "Okay": 4, "Bad": 3, "Very Bad": 2, "no_rating": 1}

# One row per imported record
filtered_data = pd.DataFrame(imported_pickle, columns=["file", "animation_id", "model_output", "label"])

# Numeric 'rating' column derived from 'label' through the mapping above
filtered_data['rating'] = filtered_data['label'].map(mapping_dict)

# Extract unique logos by splitting 'file' with "_animation" and using .str.get(0)
#logos_unique = filtered_data['file'].str.split('_animation').str.get(0).unique()

In [None]:
#example
# Rows whose 'file' contains the substring "logo_5_"
filtered_data[filtered_data["file"].str.contains("logo_5_")]

### Some Statistics about the Animations we have here
Result: Many animations with only one animated path

In [None]:
# Rows per animation file ('animation_id' entries counted per 'file')
grouped = filtered_data.groupby('file')[['animation_id']].count()
print(f"{grouped.size} Animations initially")

# Frequency table: how many animation files have a given number of rows
histogram_table = (
    grouped['animation_id']
    .value_counts()
    .reset_index()
    .set_axis(['Animation Length', 'Count'], axis=1)
    .sort_values(by='Animation Length')
)
print(histogram_table)

In [None]:
# Rows per textual label together with their share of all rows
grouped = filtered_data.groupby('label')[['file']].count().reset_index()
grouped.columns = ['Rating', 'Count']
grouped['Percentage'] = grouped['Count'].div(grouped['Count'].sum()).mul(100)
grouped

### Investigate overall rating of animations

In [None]:
# Mean 'rating' over all rows sharing the same 'file', written back to every row of that file
filtered_data['average_rating'] = filtered_data.groupby('file')['rating'].transform('mean')
# Distribution of the per-file average rating
filtered_data['average_rating'].hist(bins=20)

In [None]:
# example
# Rows whose 'file' contains the substring "logo_5_"
filtered_data[filtered_data["file"].str.contains("logo_5_")]

### Now pick good Animations only
Pick the following animations
- Best per logo (include as many different logos as possible)
- Add all GOOD animations (additionally add as many animations as possible) TODO: Implement later

In [None]:
# Selection flag, initialised to 0 (not selected) for every row
filtered_data['include'] = 0

In [None]:
# use all animations with an average rating over 3
minimum_rating = 3  # strict lower bound; 3 is the score mapped to "Bad"
# Boolean mask of the rows whose per-file average rating exceeds the bound
temp = filtered_data['average_rating'] > minimum_rating
filtered_data.loc[temp, 'include'] = 1

In [None]:
# 'include' is a 0/1 flag, so its mean is the share of rows that are kept;
# the complement is the share that is left out.
print(f"{(1 - filtered_data['include'].mean()) * 100}% of the data is left out")

In [None]:
# go through each logo to find the best animation
# For every logo listed in `logos`, flag each animation file whose average
# rating equals the best average rating of that logo (ties are all kept).
# Iterating the `logos` DataFrame directly would only yield its column labels,
# so the logo name is parsed from 'file' and the maximum is taken per group.
# Parsing the full "logo_<number>" token also keeps "logo_5" from matching "logo_50".
logo_of_file = filtered_data['file'].str.extract(r'(logo_\d+)', expand=False)
best_rating_of_logo = filtered_data.groupby(logo_of_file)['average_rating'].transform('max')
is_best = (
    logo_of_file.isin(logos['filename'])
    & (filtered_data['average_rating'] == best_rating_of_logo)
)
filtered_data.loc[is_best, 'include'] = 1

In [None]:
# 'include' is a 0/1 flag, so its mean is the share of rows that are kept;
# the complement is the share that is left out.
print(f"Now {(1 - filtered_data['include'].mean()) * 100}% of the data is left out")

Note: No additional animations are selected

In [None]:
# Keep the flagged rows only; the flag column itself is no longer needed
best_output = filtered_data.loc[filtered_data['include'] == 1].copy()
best_output = best_output.drop(columns=['include'])

In [None]:
# extract logo
# The single capture group yields the "logo_<number>" part of each 'file' value
pattern = r'(logo_\d+)'
best_output['filename'] = best_output['file'].str.extract(pattern, expand=False)

### Numbering to keep order later (corresponds to start-time)

In [None]:
# Define animation start time offset in seconds from animation start
TIME_OFFSET_STEP = 0.25

# Number the rows of every animation file in their current order (1, 2, 3, ...)
# and turn that position into a start time: the k-th row of a file gets
# k * TIME_OFFSET_STEP seconds in column 'a13'.
position_in_file = best_output.groupby('file').cumcount() + 1
best_output['a13'] = position_in_file * TIME_OFFSET_STEP
best_output.head()

### Stat: Animated paths per logo

In [None]:
# Unique (logo, path) pairs that occur in the selected animations
used_paths = (
    best_output[["filename", "animation_id"]]
    .drop_duplicates(keep='last')
    .reset_index(drop=True)
)
used_paths.head()

In [None]:
# Histogram of the number of distinct used paths per logo
used_paths.groupby(['filename']).count().hist()

### Some Statistics again


In [None]:
# Rows per remaining animation file
grouped = best_output.groupby('file')[['animation_id']].count()
print(f"{grouped.size} Animations left over")

# Frequency table: how many remaining animation files have a given number of rows
length_counts = grouped['animation_id'].value_counts()
histogram_table = length_counts.reset_index().set_axis(['Animation Length', 'Count'], axis=1)
histogram_table = histogram_table.sort_values(by='Animation Length')
print(histogram_table)

# Build sequences

## Make List of all Data Samples

In [None]:
# One row per (logo, animation file) with the number of rows of that animation
final_animations_index = (
    best_output
    .groupby(['filename', 'file'])
    .size()
    .rename('animation_length')
    .reset_index()
)
final_animations_index.head()

In [None]:
# Preview of the per-logo table (filename, count, count_category)
logos.head()

In [None]:
# Attach the total path count of each logo; the left join keeps every animation row
final_animations_index = pd.merge(
    final_animations_index,
    logos[["filename", "count"]],
    how='left',
    on='filename',
)
final_animations_index

### Plot: Number of Paths vs Animation Length

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Path count of the logo (x) against the number of rows in the animation (y)
x = final_animations_index['count']
y = final_animations_index['animation_length']

# Draw the points, then title and axis labels
plt.scatter(x, y)
plt.title('Scatter Plot')
plt.xlabel('Number of Paths')
plt.ylabel('Animation Length')

# Show the plot
plt.show()

## Bucketing
For batching the data later on, build buckets to group animations with similar length characteristics

In [None]:
# from dataset_helper import generate_buckets_2D
# generate_buckets_2D(final_animations_index,
#                  'count',
#                  'animation_length',
#                  [0.2, 0.4, 0.6, 0.8, 1],
#                  [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

## Build Input Sequences Dictionary

In [None]:
def build_input_sequences_dict(sequences: pd.DataFrame, used_paths: pd.DataFrame, is_used_path_filtering = False):
    """Group the input path rows by logo.

    Args:
        sequences: One row per path. Must contain a ``filename`` column and,
            when filtering is enabled, an ``animation_id`` column.
        used_paths: ``filename`` / ``animation_id`` pairs to keep. Only read
            when ``is_used_path_filtering`` is true; it is never modified.
        is_used_path_filtering: If true, keep only the rows of ``sequences``
            that have a matching pair in ``used_paths`` (inner merge).

    Returns:
        Dict mapping each ``filename`` to the DataFrame holding its rows, with
        keys in order of first appearance.
    """
    if is_used_path_filtering:
        initial_length = len(sequences)
        # Cast the merge key on a derived frame so the caller's `used_paths`
        # keeps its original dtype.
        # assumes sequences['animation_id'] already holds strings — TODO confirm
        used_paths_as_str = used_paths.assign(animation_id=used_paths['animation_id'].astype(str))
        sequences = pd.merge(sequences, used_paths_as_str, on=['filename', 'animation_id'])
        print(f"Filtering of input sequences on only used paths. Reduction from {initial_length} to {len(sequences)} paths.")

    # A single pass over the frame; sort=False preserves first-appearance order.
    return {logo: rows for logo, rows in sequences.groupby("filename", sort=False)}

In [None]:
# Per-logo input rows, restricted to the paths listed in `used_paths`
input_sequences_dict = build_input_sequences_dict(input_sequences, used_paths, is_used_path_filtering=True)
# Preview the entry of the first training logo
input_sequences_dict[logos_train[0]]

## Build Output Sequence (Embedded Paths with Animation Vector)

In [None]:
# Merge Dataframes
# Inner join: only selected animation rows that have an embedding are kept
output_sequence = best_output.merge(embeddings, on=['filename', 'animation_id'], how='inner')

# Expand every 'model_output' entry into the twelve columns a1 .. a12
animation_vectors = pd.DataFrame(
    output_sequence["model_output"].to_list(),
    columns=[f"a{number}" for number in range(1, 13)],
)

# Remove the columns that are not part of the sequence
output_sequence = output_sequence.drop(
    columns=['animation_id', 'model_output', 'label', 'rating', 'average_rating']
)
output_sequence['a0'] = 0 # EOS Feature in One-hot Encoding

# Align both frames on the row index and put them side by side
output_sequence = pd.merge(output_sequence, animation_vectors, left_index=True, right_index=True)

In [None]:
# Move column 'a13' to the back
column_to_move = 'a13'
# Every other column keeps its relative position; the moved one is appended last
new_order = [name for name in output_sequence.columns if name != column_to_move] + [column_to_move]
output_sequence = output_sequence.loc[:, new_order]

In [None]:
# Preview of the assembled output rows
output_sequence.head()

## Main Iteration over all samples

In [None]:
from dataset_helper import generate_dataset

# Build the train/test data from the animation index, the per-logo input rows
# and the output rows; the split is given as a dict of logo-name arrays.
# NOTE(review): how generate_dataset pads or truncates to the two sequence
# lengths is defined in dataset_helper — verify there.
dataset = generate_dataset(final_animations_index,
                           input_sequences_dict,
                           output_sequence,
                           {"train": logos_train, "test": logos_test},
                           sequence_length_input=8, # low as filtered
                           sequence_length_output=15, # input length
                           )
# Executed in 1 minute with shortened padding ~Cornelius

## Result

In [None]:
import os

# torch.save does not create missing directories, so make sure the target exists
os.makedirs('data/prototype_dataset', exist_ok=True)

# Write one tensor file per split ("train"/"test") and role ("input"/"output")
for split in ("train", "test"):
    for role in ("input", "output"):
        torch.save(dataset[split][role], f'data/prototype_dataset/{split}_sequence_{role}.pt')

In [None]:
# Report the size of every tensor (train before test, input before output)
for split_name in ("train", "test"):
    for part in ("input", "output"):
        print(dataset[split_name][part].size())