# Prototype Dataset
This file generates the path-to-path dataset used to train a prototype transformer.

The input and output sequences of a logo should consist of 269 parameters:
- DeepSVG Embedding (256)
- Type "EOS" (1) (open for discussion, but it makes the most sense in the predicting process ~Corni)
- Type (6)
- Param (6)

Input Sequence
- In the input sequence, the last 13 parameters (EOS, type, and param) are set to zero
- In a final model, these parameters are meant to be left out (as the context vector isn't based on them)

Output Sequence
- The Output sequence consists of the selected paths represented by their DeepSVG Embedding
- The Animation is represented through the type and the parameters.
- Each output sequence ends with an EOS Token, where the EOS-type in the one-hot-encoded part is set to 1.

# Import DeepSVG Embedding

In [None]:
import pickle
# Read the pickled DeepSVG path embeddings from disk.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open("data/embeddings/path_embedding.pkl", mode="rb") as embedding_file:
    deepsvg_embedding = pickle.load(embedding_file)

In [None]:
# Two handles on the loaded embeddings:
# - input_sequences refers to the loaded object itself (no copy is made),
# - embeddings is an independent copy used for the output sequences.
input_sequences = deepsvg_embedding
embeddings = deepsvg_embedding.copy()

In [None]:
# Cast the animation ids to integers, then preview the first rows.
embeddings = embeddings.astype({'animation_id': int})
embeddings.head()

# Build Input Sequence (Embedded Logo Paths)

In [None]:
# Show the input-sequence frame as the cell output
input_sequences

### Logo Dataset
Most logos have only a few paths
Some logos have over 100 paths to animate

In [None]:
# Histogram (100 bins) of how many rows each 'filename' has
input_sequences['filename'].value_counts().hist(bins=100)

In [None]:
# Number of rows per 'filename'
input_sequences['filename'].value_counts()

In [None]:
# One row per logo file together with the number of entries it has
logos = (
    input_sequences
    .groupby('filename')
    .size()
    .reset_index(name='count')
)

# The counts are bucketed below so that a later split can be stratified on them.

# Data-driven thresholds: empirical quantiles of the counts
quantiles = [0.1, 0.3, 0.5, 0.7, 0.9, 1]
percentiles = logos['count'].quantile(quantiles)
print(percentiles)

# Range-driven thresholds: the same fractions laid over the [min, max] interval
# NOTE(review): quantiles2 (evenly spaced fractions) is computed but not
# referenced anywhere in the visible code; percentiles_mock is built from
# `quantiles`. Confirm which of the two was intended.
quantiles2 = [(i + 1) / len(quantiles) for i in range(len(quantiles))]
count_min = logos['count'].min()
count_max = logos['count'].max()
percentiles_mock = [x * (count_max - count_min) + count_min for x in quantiles]
print(percentiles_mock)

# Mix with harmonic mean that emphasizes small values
def harmonic_mean(a, b):
    """Return the harmonic mean of two numbers.

    Computed as ``2 / (1/a + 1/b)``.

    Raises:
        ValueError: if either argument equals zero.
    """
    if 0 in (a, b):
        raise ValueError("Harmonic mean is undefined when one or both values are zero.")
    reciprocal_sum = (1 / a) + (1 / b)
    return 2 / reciprocal_sum

# Blend the data-driven and range-driven thresholds pairwise via the harmonic mean
percentiles_mix = [
    harmonic_mean(data_cut, range_cut)
    for data_cut, range_cut in zip(percentiles, percentiles_mock)
]
print(percentiles_mix)

# Function to categorize based on quantiles
def categorize_count(count):
    """Return the bucket label for ``count``.

    Walks the module-level ``percentiles_mix`` thresholds in order and returns
    ``"Quantile i"`` for the first index ``i`` whose threshold is >= ``count``.
    If ``count`` exceeds every threshold, the label uses ``len(quantiles)``
    as its index.
    """
    bucket = next(
        (idx for idx, upper in enumerate(percentiles_mix) if count <= upper),
        len(quantiles),
    )
    return f"Quantile {bucket}"

# Label every logo with its bucket, then show how many logos fall into each one
logos['count_category'] = logos['count'].map(categorize_count)
logos['count_category'].value_counts()

In [None]:
# Show the quantile values of the per-logo counts
percentiles

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 80/20 split of the logos, stratified on 'count_category'
X_train, X_test = train_test_split(
    logos,
    test_size=0.2,
    random_state=42,
    stratify=logos['count_category'],
)

# Bucket distribution of each subset
train_counts = X_train['count_category'].value_counts()
test_counts = X_test['count_category'].value_counts()

# Report both distributions (the second heading is preceded by a blank line)
for heading, counts in (
    ("Training set distribution:", train_counts),
    ("\nTesting set distribution:", test_counts),
):
    print(heading)
    print(counts)

In [None]:
# Histogram (50 bins) of the per-logo counts in the training split
X_train['count'].hist(bins=50)

In [None]:
# Histogram (50 bins) of the per-logo counts in the testing split
X_test['count'].hist(bins=50)

# Import Old Dataset

In [None]:
import pickle
import pandas as pd

# Read the pickled animation labels.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open("data/surrogate_model/animation_label.pkl", "rb") as label_file:
    imported_pickle = pickle.load(label_file)

# Tabulate the loaded records under four named columns
column_names = ["file", "animation_id", "model_output", "label"]
filtered_data = pd.DataFrame(imported_pickle, columns=column_names)

# Numeric score for each textual label (higher is better; "no_rating" is lowest)
mapping_dict = {
    "Very Good": 6,
    "Good": 5,
    "Okay": 4,
    "Bad": 3,
    "Very Bad": 2,
    "no_rating": 1,
}

# Labels missing from the mapping become NaN in 'rating'
filtered_data['rating'] = filtered_data['label'].map(mapping_dict)

# Logo identifier = the part of 'file' before "_animation"; keep each one once.
# NOTE: this rebinds `logos`, which earlier held the per-logo count table.
logos = filtered_data['file'].str.split('_animation').str[0].unique()

In [None]:
# Example: all rows whose 'file' contains "logo_5_"
filtered_data.loc[filtered_data["file"].str.contains("logo_5_")]

### Some Statistics about the Animations we have here
Result: Many animations with only one animated path

In [None]:
# Number of rows recorded for each animation file
grouped = filtered_data.groupby('file')[['animation_id']].count()
print(f"{grouped.size} Animations initially")

# Frequency table: how many animation files have a given number of rows
histogram_table = (
    grouped['animation_id']
    .value_counts()
    .reset_index()
    .set_axis(['Animation Length', 'Count'], axis=1)
    .sort_values(by='Animation Length')
)
print(histogram_table)

In [None]:
# Rows per label, with each label's share of the total in percent
grouped = filtered_data.groupby('label')[['file']].count().reset_index()
grouped.columns = ['Rating', 'Count']
total_rows = grouped['Count'].sum()
grouped['Percentage'] = grouped['Count'].div(total_rows).mul(100)
grouped

### Investigate overall rating of animations

In [None]:
# Mean rating of each animation file, broadcast back onto every row of that file
ratings_by_file = filtered_data.groupby('file')['rating']
filtered_data['average_rating'] = ratings_by_file.transform('mean')

# Distribution of the per-row average ratings (20 bins)
filtered_data['average_rating'].hist(bins=20)

In [None]:
# Example: all rows whose 'file' contains "logo_5_"
filtered_data.loc[filtered_data["file"].str.contains("logo_5_")]

### Now pick good Animations only
Pick the following animations
- Best per logo (include as many different logos as possible)
- Add all GOOD animations (additionally add as many animations as possible) TODO: Implement later

In [None]:
# Selection flag: start with every row excluded (0); later cells set it to 1
filtered_data['include'] = 0

In [None]:
# Select every row whose average rating is strictly above the threshold
minimum_rating = 3
filtered_data.loc[filtered_data['average_rating'].gt(minimum_rating), 'include'] = 1

In [None]:
# 'include' is 1 for rows that are kept, so its mean is the kept share;
# the share that is left out is the complement.
print(f"{(1 - filtered_data['include'].mean()) * 100}% of the data is left out")

In [None]:
# go through each logo to find the best animation
for logo in logos:
    # make a data frame that contains all the animations of one logo
    temp = filtered_data[filtered_data["file"].str.contains(logo)]

    best_logo = temp[temp['average_rating'] == temp['average_rating'].max()]
    best_logo = best_logo['file'].unique()
    
    filtered_data.loc[filtered_data['file'].isin(best_logo), 'include'] = 1
    break

In [None]:
# 'include' is 1 for rows that are kept, so its mean is the kept share;
# the share that is left out is the complement.
print(f"Now {(1 - filtered_data['include'].mean()) * 100}% of the data is left out")

Note: No additional animations are selected

In [None]:
# Keep only the selected rows; the helper flag is no longer needed afterwards
selected_rows = filtered_data['include'] == 1
best_output = filtered_data[selected_rows].drop(columns=['include'])

In [None]:
# Pull the "logo_<digits>" identifier out of each animation file name
pattern = r'(logo_\d+)'
best_output['filename'] = best_output['file'].str.extract(pattern, expand=False)

### Some Statistics again


In [None]:
# Number of rows recorded for each remaining animation file
grouped = best_output.groupby('file')[['animation_id']].count()
print(f"{grouped.size} Animations left over")

# Frequency table: how many animation files have a given number of rows
histogram_table = (
    grouped['animation_id']
    .value_counts()
    .reset_index()
    .set_axis(['Animation Length', 'Count'], axis=1)
    .sort_values(by='Animation Length')
)
print(histogram_table)

# Build Output Sequence (Embedded Paths with Animation Vector)

In [None]:
# Merge Dataframes
output_sequence = pd.merge(best_output, embeddings, on=['filename', 'animation_id'], how='inner')
animation_vectors = pd.DataFrame(output_sequence["model_output"].to_list(), columns=["a1","a2","a3","a4","a5","a6","a7","a8","a9","a10","a11","a12"])
output_sequence.drop(['animation_id', 'model_output', 'label', 'rating', 'average_rating', 'filename'], inplace=True, axis=1)
output_sequence['a0'] = 0 # EOS Feature in One-hot Encoding
output_sequence = pd.merge(output_sequence, animation_vectors, left_index=True, right_index=True)

In [None]:
# Show the assembled output-sequence frame as the cell output
output_sequence