In [None]:
import sys
sys.path.append('..')
import os
from itertools import product
from tqdm import tqdm
import pickle
import math
import numpy as np
import pandas as pd

from sklearn.manifold import MDS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from scipy.spatial.distance import cdist, pdist

import plotly.express as px    
import plotly.graph_objects as go
import plotly.io as pio
import matplotlib.pyplot as plt
# pio.kaleido.scope.mathjax = None

In [None]:
# Target column(s) used to build `y` from `data`. Kept as a list so that
# `data[TASK]` yields a DataFrame; `TASK[0]` is used where a single column is needed.
TASK = ['f_res']

In [None]:
# Load the ';'-separated data file and drop every row that contains a missing value.
data = pd.read_csv(os.path.join('..', 'data', 'storage', 'SIM-TRC-3', 'data.csv'), sep=';')
data = data.dropna()

# Wherever y_n is zero, force the paired x_n to zero as well.
for n in range(3):
    data.loc[data[f'y_{n}'] == 0, f'x_{n}'] = 0

# 'area' is the sum of the x_n * y_n products over the three (x_n, y_n) pairs.
data['area'] = data['x_0']*data['y_0'] + data['x_1']*data['y_1'] + data['x_2']*data['y_2']

# Feature columns and target column(s).
x_cols = ['x_0', 'x_1', 'x_2', 'y_0', 'y_1', 'y_2']
# Explicit copies detach x and y from `data`: later column assignments on them
# then act on independent frames and do not raise SettingWithCopyWarning.
x = data[x_cols].copy()
y = data[TASK].copy()

In [None]:
def scale_data(x, y):
    """Fit one scaler for the features and one for the targets.

    Missing values in both inputs are replaced by 0 before fitting.

    Parameters
    ----------
    x, y : pandas objects providing ``fillna``.

    Returns
    -------
    tuple
        ``(StandardScaler fitted on x, MinMaxScaler fitted on y)``.
    """
    feature_values = np.array(x.fillna(0))
    target_values = np.array(y.fillna(0))
    # ``fit`` returns the fitted estimator itself, so the result can be returned directly.
    return StandardScaler().fit(feature_values), MinMaxScaler().fit(target_values)
    
def calc_pca(x, y):
    """Project the standardised features onto their leading principal components.

    The features are standardised with the scaler returned by ``scale_data``;
    PCA then keeps as many components as needed to explain 90 % of the variance
    (``n_components=0.9`` with the ``'full'`` SVD solver).

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table. Missing values are replaced by 0, exactly as is done
        when the scaler is fitted.
    y : pandas object
        Only forwarded to ``scale_data``; it does not influence the projection.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``x`` (same index), one column per retained component.
    """
    scaler_x, _ = scale_data(x, y)
    # Transform the same NaN-filled ndarray representation the scaler was
    # fitted on: raw ``x`` would let NaNs reach PCA, and passing a DataFrame to
    # a scaler fitted on an ndarray triggers a feature-name warning.
    x_filled = np.array(x.fillna(0))
    pca = PCA(n_components=0.9, svd_solver='full', random_state=1)
    components = pca.fit_transform(scaler_x.transform(x_filled))
    return pd.DataFrame(components, index=x.index)

def calc_pca_dist(x, y, dist_metric='euclidean', n_neighbors=10):
    """Compute pairwise distances in PCA space and a per-sample distance score.

    Parameters
    ----------
    x, y :
        Forwarded to ``calc_pca``.
    dist_metric : str
        Metric name handed to ``scipy.spatial.distance.cdist``.
    n_neighbors : int, default 10
        Number of smallest entries of each distance-matrix row that are summed.
        Each row also contains the sample's distance to itself. If fewer than
        ``n_neighbors`` samples exist, all entries of the row are summed.

    Returns
    -------
    pca_dist_data : numpy.ndarray
        For every sample, the sum of the ``n_neighbors`` smallest distances in its row.
    pca_dist_matrix : numpy.ndarray
        Full square matrix of pairwise distances between the PCA-projected samples.
    """
    pca_points = np.array(calc_pca(x, y))
    pca_dist_matrix = cdist(pca_points, pca_points, dist_metric)

    n_samples = pca_dist_matrix.shape[1]
    k = min(n_neighbors, n_samples)
    if k < n_samples:
        # After partitioning at position k, the first k entries of each row are
        # the k smallest ones (in arbitrary order), which is all the sum needs.
        smallest = np.partition(pca_dist_matrix, k)[:, :k]
    else:
        # np.partition requires k < row length; with this few samples use the whole row.
        smallest = pca_dist_matrix
    pca_dist_data = np.sum(smallest, axis=1)
    return pca_dist_data, pca_dist_matrix

def is_pareto_efficient(costs, direction):
    """Flag the rows of ``costs`` that are kept as Pareto-efficient.

    Every column is multiplied by the matching entry of ``direction`` and the
    signed values are treated as quantities to minimise (so ``1`` minimises a
    column and ``-1`` maximises it).

    Parameters
    ----------
    costs : numpy.ndarray
        Two-dimensional array with one row per point and one column per objective.
    direction : sequence of numbers
        Per-column multiplier applied before comparing.

    Returns
    -------
    numpy.ndarray of bool
        ``True`` for every row that survives the filtering.
    """
    signed_costs = costs * direction
    keep = np.ones(len(signed_costs), dtype=bool)
    for idx in range(len(signed_costs)):
        if not keep[idx]:
            continue
        # Among the points still kept, retain only those that are strictly
        # better than the current point in at least one objective.
        survivors = signed_costs[keep]
        keep[keep] = (survivors < signed_costs[idx]).any(axis=1)
        # The current point never beats itself, so re-admit it explicitly.
        keep[idx] = True
    return keep

In [None]:
def rand_train_test_split(x, y, seed):
    """Randomly split features and targets into a train and a test part.

    ``y`` is first restricted/re-ordered to the index of ``x``. The training
    part receives ``ceil(0.8 * len(x))`` rows; the shuffle is seeded with
    ``seed + 42``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    aligned_y = y.loc[x.index]
    n_train = math.ceil(len(x)*0.8)
    parts = train_test_split(x, aligned_y, train_size=n_train, random_state=seed+42)
    return tuple(parts)

def dist_train_test_split(x, y, seed, dist_metric='euclidean'):
    """Split by distance score in PCA space: the most isolated 20 % form the test set.

    Every sample gets the distance score returned by ``calc_pca_dist``. Samples
    whose score is at most the 80 % quantile go to the training set, the rest
    to the test set.

    Unlike a version that stores the score as a ``'PCA'`` column on ``x``, this
    implementation leaves the caller's frame untouched, so later PCA/MDS
    computations on the same ``x`` are not influenced by a leftover helper column.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table; must contain the columns listed in ``x_cols``.
    y : pandas.DataFrame
        Targets, indexed like ``x``.
    seed :
        Unused; present so all split functions share one signature.
    dist_metric : str
        Metric name forwarded to ``calc_pca_dist``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` with the feature frames
        restricted to ``x_cols``.
    """
    dist_values, _ = calc_pca_dist(x, y, dist_metric=dist_metric)
    # Keep the score in a separate Series instead of writing it into ``x``.
    dist_score = pd.Series(dist_values, index=x.index)

    # Threshold below which a sample belongs to the (less isolated) 80 %.
    threshold = dist_score.quantile(0.8)
    in_train = dist_score <= threshold

    x_train, x_test = x[in_train], x[~in_train]
    y_train, y_test = y.loc[x_train.index], y.loc[x_test.index]
    return x_train[x_cols], x_test[x_cols], y_train, y_test

def extra_train_test_split(x, y, seed):
    """Extrapolation split: train on the central target range, test on the tails.

    Samples whose ``TASK[0]`` value lies between the 10 % and 90 % quantiles
    (both bounds included) form the training set; all remaining samples form
    the test set.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table; must contain the columns listed in ``x_cols``.
    y : pandas.DataFrame
        Targets; only the column ``TASK[0]`` is used.
    seed :
        Unused; present so all split functions share one signature.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``. The targets are returned as the
        single selected column (not as a frame with the ``TASK`` columns).
    """
    target = y[TASK[0]]
    lower_threshold = target.quantile(0.1)
    upper_threshold = target.quantile(0.9)

    inside = target.between(lower_threshold, upper_threshold, inclusive='both')
    y_train = target[inside]
    # Everything whose index label is not part of the training set is test data.
    outside = ~target.index.isin(y_train.index)
    y_test = target[outside]

    x_train = x.loc[y_train.index, x_cols]
    x_test = x.loc[y_test.index, x_cols]
    return x_train, x_test, y_train, y_test

def pareto_train_test_split(x, y, seed):
    """Split by peeling Pareto fronts: the outermost fronts become the test set.

    The target column(s) of ``y`` are combined with the ``'area'`` column of the
    module-level ``data`` frame. Pareto fronts of this table are repeatedly
    removed and added to the test set until only ``ceil(0.8 * len(x))`` training
    samples remain. If the last front would remove too many samples, a random
    subset of exactly the required size is drawn from it.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table; must contain the columns listed in ``x_cols``.
    y : pandas.DataFrame
        Targets, indexed like ``x``. The fixed ``direction`` below assumes a
        single target column followed by ``'area'``.
    seed : int or None
        Random state for sub-sampling the last front, which makes the split
        reproducible. ``None`` gives an unseeded draw.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` with features restricted to
        ``x_cols`` and targets restricted to ``TASK``.
    """
    objectives = y.copy()
    objectives['area'] = data['area']
    # Signs applied before minimising: the first column is maximised, 'area' is minimised.
    direction = [-1, 1]
    train_size = math.ceil(len(x)*0.8)

    # Start from an empty slice of the real index so the dtype is preserved.
    test_index = objectives.index[:0]
    train_index = objectives.index
    while len(train_index) > train_size:
        front_mask = is_pareto_efficient(np.array(objectives), direction=direction)
        front_index = objectives[front_mask].index

        # Number of samples that still have to leave the training set.
        surplus = len(train_index) - train_size
        if len(front_index) > surplus:
            # Taking the whole front would overshoot: draw just enough, reproducibly.
            front_index = front_index.to_series().sample(n=surplus, random_state=seed).index

        test_index = test_index.append(front_index)
        objectives = objectives.drop(front_index, axis=0)
        train_index = objectives.index.difference(test_index)

    x_train, x_test = x.loc[train_index], x.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    return x_train[x_cols], x_test[x_cols], y_train[TASK], y_test[TASK]

In [None]:
def mds_scaling(x, y, train_index, test_index, n_components=None):
    """Embed the samples with MDS on their PCA distances and label train/test rows.

    Parameters
    ----------
    x, y :
        Forwarded to ``calc_pca_dist`` to obtain the pairwise distance matrix.
    train_index, test_index :
        Index labels of the rows that receive the label ``'train'`` / ``'test'``.
    n_components : int, optional
        Dimensionality of the embedding. When omitted, the module-level
        ``DIMENSIONS`` constant is used.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``x``; the integer-named columns hold the embedding
        coordinates, and the ``'label'`` column holds ``'train'`` / ``'test'``
        (missing for rows contained in neither index).
    """
    if n_components is None:
        n_components = DIMENSIONS

    # Only the full distance matrix is needed here, not the per-sample score.
    _, pca_dist_matrix = calc_pca_dist(x, y)

    # Multidimensional scaling on the precomputed distances.
    mds = MDS(n_components=n_components, dissimilarity='precomputed', random_state=1)
    df = pd.DataFrame(mds.fit_transform(pca_dist_matrix), index=x.index)

    df.loc[train_index, 'label'] = 'train'
    df.loc[test_index, 'label'] = 'test'
    return df

In [None]:
# Marker colour per subset label.
color_map = {'train': '#00305e', 'test': '#3cc398'}

# Shared figure layout: white background, square canvas, a vertical legend
# placed to the right, and identical styling for the x and y axes.
layout = go.Layout(
    plot_bgcolor='white',
    width=750,
    height=750,
    legend={
        'title': 'subset',
        'traceorder': 'reversed',
        'font_size': 20,
        'orientation': "v",
        'y': 0.55,
        'xanchor': 'center',
        'x': 1.1,
    },
    # Both axes get the same fonts and line/grid colours.
    **{
        axis_name: dict(
            title_font={'size': 20},
            tickfont={'size': 16},
            zerolinecolor='black',
            linecolor='black',
            gridcolor='#cccccc',
        )
        for axis_name in ('xaxis', 'yaxis')
    },
)

def plot_dimensionality_reduction(df, num_dimensions, layout=layout, color_map=color_map, save_fig=None):
    """Scatter-plot a 2-D or 3-D embedding, coloured by the ``'label'`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Coordinates in the columns ``0``, ``1`` (and ``2`` for 3-D) plus a
        ``'label'`` column.
    num_dimensions : int
        ``2`` for a flat scatter plot, ``3`` for a 3-D scatter plot.
    layout : plotly layout
        Applied to the figure via ``update_layout``.
    color_map : dict
        Maps label values to colours.
    save_fig : str, optional
        If given (and truthy), the figure is written to this path.

    Returns
    -------
    The created plotly figure.

    Raises
    ------
    ValueError
        If ``num_dimensions`` is neither 2 nor 3.
    """
    if num_dimensions not in (2, 3):
        raise ValueError("Number of dimensions must be 2 or 3.")

    # Arguments shared by the 2-D and the 3-D variant.
    shared_args = dict(x=df[0], y=df[1], color=df['label'], color_discrete_map=color_map)

    if num_dimensions == 2:
        fig = px.scatter(
            size_max=10,
            opacity=1,
            labels={'x': 'Component 1', 'y': 'Component 2'},
            **shared_args,
        )
    else:
        fig = px.scatter_3d(
            z=df[2],
            opacity=0.8,
            title="3D Scatter Plot",
            labels={'x': 'Component 1', 'y': 'Component 2', 'z': 'Component 3'},
            **shared_args,
        )

    fig.update_layout(layout)
    # Tick spacing of 1 on both axes.
    fig.update_layout(xaxis=dict(dtick=1), yaxis=dict(dtick=1))
    fig.update_traces(marker_size=8)

    if save_fig:
        fig.write_image(save_fig)
    return fig

def plot_pareto_front(df, layout=layout, color_map=color_map, save_fig=None):
    """Scatter-plot ``'f_res'`` over ``'area'``, coloured by the ``'label'`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'area'``, ``'f_res'`` and ``'label'``.
    layout : plotly layout
        Applied to the figure via ``update_layout``.
    color_map : dict
        Maps label values to colours.
    save_fig : str, optional
        If given (and truthy), the figure is written to this path.

    Returns
    -------
    The created plotly figure.
    """
    axis_labels = {'x': 'Area [mm²]', 'y': 'Max Load Capacity [kN]'}
    fig = px.scatter(
        x=df['area'],
        y=df['f_res'],
        color=df['label'],
        color_discrete_map=color_map,
        labels=axis_labels,
    )

    fig.update_layout(layout)
    # Fixed tick spacing and axis ranges for this plot.
    fig.update_layout(
        xaxis=dict(dtick=10, range=[7, 65]),
        yaxis=dict(dtick=20, range=[0, 120]),
    )
    fig.update_traces(marker_size=8)

    if save_fig:
        fig.write_image(save_fig)
    return fig

In [None]:
# Dimensionality of the MDS embedding (read by mds_scaling).
DIMENSIONS = 2

from plotly.subplots import make_subplots
# Collects one embedding figure and one Pareto-front figure per split strategy,
# keyed 'pca_<function name>' / 'pareto_front_<function name>'.
plot_dict = dict()
for split_func in [rand_train_test_split, dist_train_test_split, extra_train_test_split, pareto_train_test_split]:
    # Hand every splitter its own copy of the features: a splitter that adds
    # helper columns to its input must not change what the MDS embeddings of
    # the following iterations are computed from.
    x_train, x_test, y_train, y_test = split_func(x.copy(), y, 1)

    # Embedding of all samples, labelled by the current split.
    df = mds_scaling(x, y, x_train.index, x_test.index)
    # The embedding is drawn as a flat 2-D scatter; rows without a label are dropped.
    fig = plot_dimensionality_reduction(df.dropna(), 2, save_fig=f'pca_{split_func.__name__}.pdf')
    plot_dict[f'pca_{split_func.__name__}'] = fig

    # Pareto-front view of the same split: every row not in the training set counts as test.
    data['label'] = np.where(data.index.isin(x_train.index), 'train', 'test')
    fig = plot_pareto_front(data, layout=layout, color_map=color_map, save_fig=f'pareto_front_{split_func.__name__}.pdf')
    plot_dict[f'pareto_front_{split_func.__name__}'] = fig

In [None]:
# Arrange all figures of plot_dict in one grid.
num_plots = len(plot_dict)
cols = 4
rows = math.ceil(num_plots / cols)  # enough rows to hold every plot
order = 'col'  # 'col': fill the grid column by column, 'row': row by row
if order not in ('row', 'col'):
    raise ValueError("order must be 'row' or 'col'.")

subplot = make_subplots(rows=rows, cols=cols, shared_yaxes=True, horizontal_spacing=0.03, vertical_spacing=0.1,
                        subplot_titles=['random sampling', 'pca_sampling',  'extra_sampling', 'pareto_sampling', '', '', '', ''])

# Axis attributes copied from every single figure onto its cell in the grid.
axis_properties = ('title', 'dtick', 'tickfont', 'range', 'title_font',
                   'gridcolor', 'zerolinecolor', 'linecolor')

for position, value in enumerate(plot_dict.values()):
    # Grid cell (1-based) of the plot at this 0-based position.
    if order == 'row':
        row_index = position // cols + 1
        col_index = position % cols + 1
    else:
        row_index = position % rows + 1
        col_index = position // rows + 1

    for trace in value.data:
        # Only the traces of the very first plot keep their legend entries.
        if position != 0:
            trace.update(showlegend=False)
        subplot.add_trace(trace, row=row_index, col=col_index)

    # The grid's axes are numbered row by row, starting at 1 in the top-left cell.
    j = (row_index - 1) * cols + col_index

    for axis_name in ('xaxis', 'yaxis'):
        source_axis = value.layout[axis_name]
        subplot.update_layout({f'{axis_name}{j}': {prop: source_axis[prop] for prop in axis_properties}})

# Overall size and background, plus a horizontal legend above the grid.
subplot.update_layout(
    width=1200,  # Width of the entire subplot
    height=700,  # Height of the entire subplot
    plot_bgcolor='white',
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=0.6, font_size=20
    ),
)

subplot.write_image('splits_overview.pdf')
subplot.show()