# Grow Performance Predictor (PP) fit size then predict with multiple prods

In this notebook, we will do the following
  - Start with the WebOfScience dataset
  - Use the 90 classes with the highest class accuracy size
    - 90 classes deteremined in notebook: 
      - 'WebOfScience - filter accuracy ordered classes to 0.80 acc and 90 classes.ipynb'
  - Randomize the dataset.
    - Train set is first 4,500 examples in randomized dataset. 
      - Mimics the size of the representative workspace
    - Remaining 29,120 can be used to fit PP.
  - Create base SVC model with the train set.
  - Run the basic PP ShortTextClassificationWrapper varying the PP fit size.
  - Start with 500 and then double the size.
    - 500, 1000, 2000, 4000, 8000, 16000, 29120 (remainder)
  - save the y_pred and y_score with the prod and log examples.
  - Display the SVC and PP accuracies for SVC and each PP run

Split remaining dataset into multiple prods to see the variation of the predictions.

In [1]:
import gzip
from IPython.display import display, HTML
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.utils import shuffle
import time
from typing import List

from uq360.algorithms.blackbox_metamodel.short_text_classification import ShortTextClassificationWrapper

os.environ["PYTHONWARNINGS"] = 'ignore'

pd.options.display.max_colwidth = 100

%load_ext autoreload
%autoreload 2

# Increase the width of the notebook so that it is the width of the browser 
# which allows larger size for the dashboard
display(HTML('<style>.container { width:100% !important; }</style>'))

2022-08-10 22:06:28.367390: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-10 22:06:28.367421: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


#### Load workspace dataset

In [2]:
%%time
# X is input data that include text sequences 
# Y is target value 
# YL1 is target value of level one (parent label)
# YL2 is target value of level one (child label)
x_gzip_file = '../../../data/WebOfScience/WebOfScience/WOS46985/X.txt.gzip'
y_file = '../../../data/WebOfScience/WebOfScience/WOS46985/Y.txt'
yl1_file = '../../../data/WebOfScience/WebOfScience/WOS46985/YL1.txt'
yl2_file = '../../../data/WebOfScience/WebOfScience/WOS46985/YL2.txt'

with gzip.open(x_gzip_file, 'rt') as f:
    lines = f.readlines()
df_x = pd.DataFrame(lines, columns=['example'])
df_y = pd.read_csv(y_file, header=None, names=['intent'])
df_yl1 = pd.read_csv(yl1_file, header=None, names=['yl1'])
df_yl2 = pd.read_csv(yl2_file, header=None, names=['yl2'])
data = [{'intent': 'y', 
         'n unique': len(np.unique(df_y['intent'])),
         'min n uniq': min(np.unique(df_y['intent'], return_counts=True)[1]),
         'max n uniq': max(np.unique(df_y['intent'], return_counts=True)[1])
        },
        {'intent': 'yl1', 
         'n unique': len(np.unique(df_yl1['yl1'])),
         'min n uniq': min(np.unique(df_yl1['yl1'], return_counts=True)[1]),
         'max n uniq': max(np.unique(df_yl1['yl1'], return_counts=True)[1])
        }
]
display(HTML(pd.DataFrame(data).to_html()))

df_merge = pd.concat([df_x, df_y], axis=1, sort=False)
print(f'df_merge.shape = {df_merge.shape}')

x = df_merge['example'].to_numpy()
y = df_merge['intent'].to_numpy().ravel()
print(f'x.shape        = {x.shape}')
print(f'y.shape        = {y.shape}')

# display(HTML(df_merge.head(4).to_html()))

Unnamed: 0,intent,n unique,min n uniq,max n uniq
0,y,134,43,750
1,yl1,7,3297,14625


df_merge.shape = (46985, 2)
x.shape        = (46985,)
y.shape        = (46985,)
CPU times: user 213 ms, sys: 65.8 ms, total: 279 ms
Wall time: 296 ms


#### Encode with USE encoder

In [3]:
%%time
class MiniLMEmbedding:
    def __init__(self):
        self.transformer = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    def encode(self, input_sentences: List[str]) -> np.array:
        sentences = [sentence.lower() for sentence in input_sentences]
        embedded_sentences = [self.embed_sentence(s) for s in sentences]
        return np.array(embedded_sentences)
    def embed_sentence(self, sentence: str) -> np.array:
        embedding = self.transformer.encode(sentence, show_progress_bar=False, convert_to_numpy=True)
        return embedding

encoded_file = '../../../data/WebOfScience/WebOfScience/WOS46985/X_encoded.csv'
if os.path.exists(encoded_file):
    df = pd.read_csv(encoded_file, header=None)
    x_encoded = df.to_numpy()
else:
    encoder = MiniLMEmbedding()
    x_encoded = encoder.encode(x)
    # Save to file
    df = pd.DataFrame(x_encoded)
    df.to_csv(encoded_file, header=False, index=False)

print(f'x_encoded.shape = {x_encoded.shape}')

x_encoded.shape = (46985, 384)
CPU times: user 4.42 s, sys: 305 ms, total: 4.72 s
Wall time: 4.73 s


#### Determine subset datasert for running experiment
- keep_intents is generated in notebook:
  - 'WebOfScience - filter accuracy ordered classes to 0.80 acc and 90 classes.ipynb'

In [4]:
keep_intents = [64, 122, 12, 113, 62, 49, 66, 2, 68, 45, 103, 97, 70, 48, 115, 98, 3, 57, 61, 8, 74, 47, 127, 112, 65, 31, 99, 9, 79, 114, 35, 63, 111, 94, 101, 92, 46, 100, 69, 93, 96, 42, 25, 60, 39, 106, 121, 44, 33, 109, 14, 130, 81, 53, 17, 58, 71, 132, 80, 0, 83, 37, 55, 90, 85, 32, 75, 105, 22, 38, 56, 41, 128, 5, 21, 84, 43, 54, 36, 77, 27, 131, 72, 73, 118, 7, 108, 23, 26, 124]
print(f'len(keep_intents) = {len(keep_intents)}')
print(f'keep_intents = {list(keep_intents)}')
# Gather the example indices for the examples to keep
keep_indices = [i for i in range(len(y)) if y[i] in keep_intents]
print(f'len(keep_indices) = {len(keep_indices)}')
y_sub = y[keep_indices]
x_sub_encoded = x_encoded[keep_indices]
print(f'y_sub.shape         = {y_sub.shape}')
print(f'x_sub_encoded.shape = {x_sub_encoded.shape}')

len(keep_intents) = 90
keep_intents = [64, 122, 12, 113, 62, 49, 66, 2, 68, 45, 103, 97, 70, 48, 115, 98, 3, 57, 61, 8, 74, 47, 127, 112, 65, 31, 99, 9, 79, 114, 35, 63, 111, 94, 101, 92, 46, 100, 69, 93, 96, 42, 25, 60, 39, 106, 121, 44, 33, 109, 14, 130, 81, 53, 17, 58, 71, 132, 80, 0, 83, 37, 55, 90, 85, 32, 75, 105, 22, 38, 56, 41, 128, 5, 21, 84, 43, 54, 36, 77, 27, 131, 72, 73, 118, 7, 108, 23, 26, 124]
len(keep_indices) = 33620
y_sub.shape         = (33620,)
x_sub_encoded.shape = (33620, 384)


#### Randomize the dataset
- Train set is first 4,500 examples in randomized dataset.
  - Mimics the size of the representative workspace
- Remaining 29,120 can be used to fit PP.

In [5]:
x, y = shuffle(x_sub_encoded, y_sub, random_state=42)
print(f'y_sub.shape         = {y_sub.shape}')
print(f'x_sub_encoded.shape = {x_sub_encoded.shape}')
print(f'y.shape             = {y.shape}')
print(f'x.shape             = {x.shape}')

train_size = 4500
# train_size = 500
x_train = x[:train_size]
y_train = y[:train_size]
x_test = x[- (len(y) - train_size):]
y_test = y[- (len(y) - train_size):]
print(f'y_train.shape = {y_train.shape}')
print(f'x_train.shape = {x_train.shape}')
print(f'y_test.shape  = {y_test.shape}')
print(f'x_test.shape  = {x_test.shape}')

y_sub.shape         = (33620,)
x_sub_encoded.shape = (33620, 384)
y.shape             = (33620,)
x.shape             = (33620, 384)
y_train.shape = (4500,)
x_train.shape = (4500, 384)
y_test.shape  = (29120,)
x_test.shape  = (29120, 384)


#### Fit a basic SVM classifier

In [6]:
%%time
def train_model_svm(x, y):
    model = SVC(probability=True)
    model.fit(x, y)    
    return model

model = train_model_svm(x_train, y_train)

CPU times: user 57.3 s, sys: 200 ms, total: 57.5 s
Wall time: 58.8 s


#### Fit the Performance Predictor

In [7]:
%%time
pointwise_features=['confidence_top',
                    'confidence_delta',
                    'confidence_entropy',
                    'class_frequency',
                    'mlp',
                    'svc',
                    'predicted_class',
                    'one_class_svm',
                    'pca']
calibrator = 'isotonic_regression'
metamodels_considered = {'svm': pointwise_features,
                         'gbm': pointwise_features,
                         'mlp': pointwise_features}

experiments = []
for size in [500, 1000, 2000, 4000, 8000, 16000]:
# for size in [100, 200]:
    pp = ShortTextClassificationWrapper(base_model=model, calibrator=calibrator, metamodels_considered=metamodels_considered)
    x_tst = x_test[:size]
    y_tst = y_test[:size]

    # fit PP
    start = time.time()
    pp.fit(x_train, y_train, x_tst, y_tst)
    print(f'{size} dur={time.time() - start} - pp_fitted')

    start = time.time()
    svc_pred = model.predict(x_tst)
    print(f'{size} test dur={time.time() - start} - svc_predict')

    start = time.time()
    pp_accuracy, _, pp_score = pp.predict(x_tst)
    print(f'{size} test dur={time.time() - start} - pp_accuracy  = {pp_accuracy / 100}')

    experiments.append({'dataset': 'test',
                        'pp fit size': len(y_tst),
                        'pp pred size': len(y_tst),
                        'y': y_tst,
                        'svc_pred': svc_pred,
                        'pp_accuracy': pp_accuracy / 100,
                        'pp_score': pp_score[0],
                       })

    for prod_run in range(1, 60):
        x_prod = x_test[size * prod_run:size * (prod_run + 1)]
        y_prod = y_test[size * prod_run:size * (prod_run + 1)]
        start = time.time()
        svc_pred = model.predict(x_prod)
        print(f'{size} prod_{prod_run} dur={time.time() - start} - svc_predict')

        start = time.time()
        pp_accuracy, _, pp_score = pp.predict(x_prod)
        print(f'{size} prod_{prod_run} dur={time.time() - start} - pp_accuracy  = {pp_accuracy / 100}')

        experiments.append({'dataset': f'prod_{prod_run}',
                            'pp fit size': len(y_tst),
                            'pp pred size': len(y_prod),
                            'y': y_prod,
                            'svc_pred': svc_pred,
                            'pp_accuracy': pp_accuracy / 100,
                            'pp_score': pp_score[0],
                           })
        if size * (prod_run + 1) > len(y_test):
            break

Predictor type : text_ensemble
calibrator : isotonic_regression
metamodels considered: {'svm': ['confidence_top', 'confidence_delta', 'confidence_entropy', 'class_frequency', 'mlp', 'svc', 'predicted_class', 'one_class_svm', 'pca'], 'gbm': ['confidence_top', 'confidence_delta', 'confidence_entropy', 'class_frequency', 'mlp', 'svc', 'predicted_class', 'one_class_svm', 'pca'], 'mlp': ['confidence_top', 'confidence_delta', 'confidence_entropy', 'class_frequency', 'mlp', 'svc', 'predicted_class', 'one_class_svm', 'pca']}
Features extracted for : ['class_frequency', 'confidence_delta', 'confidence_entropy', 'confidence_top', 'mlp_1', 'mlp_2', 'one_class_svm', 'pca_1', 'pca_2', 'predicted_class', 'svc_1', 'svc_2']
Balancing data encountered a problem. Using unbalanced data.
Balancing data encountered a problem. Using unbalanced data.
Balancing data encountered a problem. Using unbalanced data.
500 dur=813.2612860202789 - pp_fitted
500 test dur=1.6453895568847656 - svc_predict
Incoming data i

500 prod_21 dur=1.8637526035308838 - svc_predict
Incoming data is already encoded
Features extracted for : ['class_frequency', 'confidence_delta', 'confidence_entropy', 'confidence_top', 'mlp_1', 'mlp_2', 'one_class_svm', 'pca_1', 'pca_2', 'predicted_class', 'svc_1', 'svc_2']
500 prod_21 dur=4.766974449157715 - pp_accuracy  = 0.6386710523988126
500 prod_22 dur=2.213740348815918 - svc_predict
Incoming data is already encoded
Features extracted for : ['class_frequency', 'confidence_delta', 'confidence_entropy', 'confidence_top', 'mlp_1', 'mlp_2', 'one_class_svm', 'pca_1', 'pca_2', 'predicted_class', 'svc_1', 'svc_2']
500 prod_22 dur=5.598948955535889 - pp_accuracy  = 0.6355373769950535
500 prod_23 dur=2.157313346862793 - svc_predict
Incoming data is already encoded
Features extracted for : ['class_frequency', 'confidence_delta', 'confidence_entropy', 'confidence_top', 'mlp_1', 'mlp_2', 'one_class_svm', 'pca_1', 'pca_2', 'predicted_class', 'svc_1', 'svc_2']
500 prod_23 dur=5.3602392673492

500 prod_45 dur=1.6514496803283691 - svc_predict
Incoming data is already encoded
Features extracted for : ['class_frequency', 'confidence_delta', 'confidence_entropy', 'confidence_top', 'mlp_1', 'mlp_2', 'one_class_svm', 'pca_1', 'pca_2', 'predicted_class', 'svc_1', 'svc_2']
500 prod_45 dur=5.016313791275024 - pp_accuracy  = 0.6403518611776324
500 prod_46 dur=2.4154398441314697 - svc_predict
Incoming data is already encoded
Features extracted for : ['class_frequency', 'confidence_delta', 'confidence_entropy', 'confidence_top', 'mlp_1', 'mlp_2', 'one_class_svm', 'pca_1', 'pca_2', 'predicted_class', 'svc_1', 'svc_2']
500 prod_46 dur=4.702132225036621 - pp_accuracy  = 0.6319919330441792
500 prod_47 dur=1.5062661170959473 - svc_predict
Incoming data is already encoded
Features extracted for : ['class_frequency', 'confidence_delta', 'confidence_entropy', 'confidence_top', 'mlp_1', 'mlp_2', 'one_class_svm', 'pca_1', 'pca_2', 'predicted_class', 'svc_1', 'svc_2']
500 prod_47 dur=3.89233469963

1000 prod_7 dur=4.534213542938232 - svc_predict
Incoming data is already encoded
Features extracted for : ['class_frequency', 'confidence_delta', 'confidence_entropy', 'confidence_top', 'mlp_1', 'mlp_2', 'one_class_svm', 'pca_1', 'pca_2', 'predicted_class', 'svc_1', 'svc_2']
1000 prod_7 dur=10.410158395767212 - pp_accuracy  = 0.6256961589035206
1000 prod_8 dur=3.397735118865967 - svc_predict
Incoming data is already encoded
Features extracted for : ['class_frequency', 'confidence_delta', 'confidence_entropy', 'confidence_top', 'mlp_1', 'mlp_2', 'one_class_svm', 'pca_1', 'pca_2', 'predicted_class', 'svc_1', 'svc_2']
1000 prod_8 dur=7.981476783752441 - pp_accuracy  = 0.6239158274827536
1000 prod_9 dur=4.917804718017578 - svc_predict
Incoming data is already encoded
Features extracted for : ['class_frequency', 'confidence_delta', 'confidence_entropy', 'confidence_top', 'mlp_1', 'mlp_2', 'one_class_svm', 'pca_1', 'pca_2', 'predicted_class', 'svc_1', 'svc_2']
1000 prod_9 dur=10.065426111221

Features extracted for : ['class_frequency', 'confidence_delta', 'confidence_entropy', 'confidence_top', 'mlp_1', 'mlp_2', 'one_class_svm', 'pca_1', 'pca_2', 'predicted_class', 'svc_1', 'svc_2']
Balancing data encountered a problem. Using unbalanced data.
Balancing data encountered a problem. Using unbalanced data.
Balancing data encountered a problem. Using unbalanced data.
2000 dur=571.4198591709137 - pp_fitted
2000 test dur=6.401398181915283 - svc_predict
Incoming data is already encoded
Features extracted for : ['class_frequency', 'confidence_delta', 'confidence_entropy', 'confidence_top', 'mlp_1', 'mlp_2', 'one_class_svm', 'pca_1', 'pca_2', 'predicted_class', 'svc_1', 'svc_2']
2000 test dur=14.604142904281616 - pp_accuracy  = 0.6206402060741439
2000 prod_1 dur=6.239683389663696 - svc_predict
Incoming data is already encoded
Features extracted for : ['class_frequency', 'confidence_delta', 'confidence_entropy', 'confidence_top', 'mlp_1', 'mlp_2', 'one_class_svm', 'pca_1', 'pca_2', '

4000 prod_4 dur=38.32997918128967 - pp_accuracy  = 0.614950278850181
4000 prod_5 dur=16.478713035583496 - svc_predict
Incoming data is already encoded
Features extracted for : ['class_frequency', 'confidence_delta', 'confidence_entropy', 'confidence_top', 'mlp_1', 'mlp_2', 'one_class_svm', 'pca_1', 'pca_2', 'predicted_class', 'svc_1', 'svc_2']
4000 prod_5 dur=36.393545389175415 - pp_accuracy  = 0.6152916113788877
4000 prod_6 dur=15.172109603881836 - svc_predict
Incoming data is already encoded
Features extracted for : ['class_frequency', 'confidence_delta', 'confidence_entropy', 'confidence_top', 'mlp_1', 'mlp_2', 'one_class_svm', 'pca_1', 'pca_2', 'predicted_class', 'svc_1', 'svc_2']
4000 prod_6 dur=35.06008219718933 - pp_accuracy  = 0.6175965355736696
4000 prod_7 dur=4.12351655960083 - svc_predict
Incoming data is already encoded
Features extracted for : ['class_frequency', 'confidence_delta', 'confidence_entropy', 'confidence_top', 'mlp_1', 'mlp_2', 'one_class_svm', 'pca_1', 'pca_2'

#### Summarize results

In [8]:
def check_correctness(intents, y_predictions, y_scores, threshold=0.9):
    n_correct = 0
    true_high = 0
    true_low = 0
    false_high = 0
    false_low = 0
    n_y = len(intents)
    for y, y_pred, y_score in zip(intents, y_predictions, y_scores):
        if y == y_pred:
            n_correct += 1
            if y_score > threshold:
                true_high += 1
            else:
                true_low += 1
        else:
            if y_score > threshold:
                false_high += 1
            else:
                false_low += 1
    acc = n_correct / n_y
    th = true_high / n_y
    tl = true_low / n_y
    fh = false_high / n_y
    fl = false_low / n_y

    return acc, th, tl, fh, fl

In [9]:
# plt.figure()
data = []
for e in experiments:
    y_true = [1 if y==p else 0 for y,p in zip(e['y'], e['svc_pred'])]
#     fpr, tpr, thresholds = roc_curve(y_true, e['pp_score'], pos_label=1)
    auc = roc_auc_score(y_true, e['pp_score'])
#     plt.plot(fpr, tpr, label=f'{auc:.2%} - {e["pp fit size"]} - {e["dataset"]}')

    acc, th, tl, fh, fl = check_correctness(e['y'],  e['svc_pred'],  e['pp_score'])
    data.append({'dataset': e['dataset'],
                 'pp fit size': e['pp fit size'],
                 'svc accuracy': acc,
                 'pp accuracy': e['pp_accuracy'],
                 'TH': th,
                 'TL': tl,
                 'FH': fh,
                 'FL': fl,
                 'AUC': auc,
    })

pct_fmt = '{:,.2%}'.format
fmt = {col: pct_fmt for col in ['svc accuracy', 'pp accuracy', 'TH', 'TL', 'FH', 'FL', 'AUC']}
display(HTML(pd.DataFrame(data).head(20).to_html(formatters=fmt)))
df = pd.DataFrame(data)

# plt.title('Receiver Operating Characteristic')
# plt.legend(loc='lower right')
# plt.ylabel('True Positive Rate')
# plt.xlabel('False Positive Rate')
# plt.show()

Unnamed: 0,dataset,pp fit size,svc accuracy,pp accuracy,TH,TL,FH,FL,AUC
0,test,500,62.60%,61.88%,0.00%,62.60%,0.00%,37.40%,90.72%
1,prod_1,500,62.60%,64.13%,0.00%,62.60%,0.00%,37.40%,73.72%
2,prod_2,500,58.00%,63.79%,0.00%,58.00%,0.00%,42.00%,72.16%
3,prod_3,500,59.60%,63.96%,0.00%,59.60%,0.00%,40.40%,70.48%
4,prod_4,500,64.80%,64.03%,0.00%,64.80%,0.00%,35.20%,69.83%
5,prod_5,500,62.40%,64.60%,0.00%,62.40%,0.00%,37.60%,74.63%
6,prod_6,500,66.00%,64.40%,0.00%,66.00%,0.00%,34.00%,73.95%
7,prod_7,500,61.20%,64.11%,0.00%,61.20%,0.00%,38.80%,70.63%
8,prod_8,500,62.00%,63.85%,0.00%,62.00%,0.00%,38.00%,75.00%
9,prod_9,500,64.80%,64.29%,0.00%,64.80%,0.00%,35.20%,73.84%


In [10]:
# pd.DataFrame(data).to_csv('WOS-growPPfitSize-diffProd.csv', index=False)

In [11]:
# df = pd.read_csv('WOS-growPPfitSize-diffProd.csv')

# pct_fmt = '{:,.2%}'.format
# fmt = {col: pct_fmt for col in ['svc accuracy', 'pp accuracy', 'TH', 'TL', 'FH', 'FL', 'AUC']}
# display(HTML(df.to_html(formatters=fmt)))

In [12]:
df_test = df.loc[df['dataset'] == 'test']
df_prod = df.loc[df['dataset'] != 'test']
df_test.reset_index(inplace=True, drop=True)
display(HTML(df_test.to_html()))
display(HTML(df_prod.head().to_html()))

Unnamed: 0,dataset,pp fit size,svc accuracy,pp accuracy,TH,TL,FH,FL,AUC
0,test,500,0.626,0.618847,0.0,0.626,0.0,0.374,0.907212
1,test,1000,0.626,0.622911,0.028,0.598,0.0,0.374,0.855594
2,test,2000,0.607,0.62064,0.1325,0.4745,0.0,0.393,0.955779
3,test,4000,0.6215,0.624141,0.16275,0.45875,0.0,0.3785,0.957341
4,test,8000,0.6235,0.623957,0.19925,0.42425,0.01175,0.36475,0.816682
5,test,16000,0.62375,0.631098,0.193938,0.429812,0.014563,0.361687,0.805379


Unnamed: 0,dataset,pp fit size,svc accuracy,pp accuracy,TH,TL,FH,FL,AUC
1,prod_1,500,0.626,0.641316,0.0,0.626,0.0,0.374,0.737173
2,prod_2,500,0.58,0.637889,0.0,0.58,0.0,0.42,0.721576
3,prod_3,500,0.596,0.639647,0.0,0.596,0.0,0.404,0.704831
4,prod_4,500,0.648,0.640278,0.0,0.648,0.0,0.352,0.69832
5,prod_5,500,0.624,0.646013,0.0,0.624,0.0,0.376,0.746326


In [13]:
# display the
# - test pp prediction
# - multiple prod pp predictions (mean, min & max)
pct_fmt = '{:,.2%}'.format
fmt = {col: pct_fmt for col in ['svc accuracy', 'pp accuracy', 'TH', 'TL', 'FH', 'FL', 'AUC']}

display(HTML(df_test.to_html(formatters=fmt)))

df_mean = df_prod.groupby(['pp fit size']).mean()
df_mean.reset_index(inplace=True)
display(HTML(df_mean.to_html(formatters=fmt)))

df_min = df_prod.groupby(['pp fit size']).min()
df_min.reset_index(inplace=True)
display(HTML(df_min.to_html(formatters=fmt)))


df_max = df_prod.groupby(['pp fit size']).max()
df_max.reset_index(inplace=True)
display(HTML(df_max.to_html(formatters=fmt)))

Unnamed: 0,dataset,pp fit size,svc accuracy,pp accuracy,TH,TL,FH,FL,AUC
0,test,500,62.60%,61.88%,0.00%,62.60%,0.00%,37.40%,90.72%
1,test,1000,62.60%,62.29%,2.80%,59.80%,0.00%,37.40%,85.56%
2,test,2000,60.70%,62.06%,13.25%,47.45%,0.00%,39.30%,95.58%
3,test,4000,62.15%,62.41%,16.28%,45.88%,0.00%,37.85%,95.73%
4,test,8000,62.35%,62.40%,19.93%,42.43%,1.18%,36.48%,81.67%
5,test,16000,62.38%,63.11%,19.39%,42.98%,1.46%,36.17%,80.54%


Unnamed: 0,pp fit size,svc accuracy,pp accuracy,TH,TL,FH,FL,AUC
0,500,62.35%,63.96%,0.00%,62.35%,0.00%,37.65%,73.37%
1,1000,62.25%,62.53%,3.21%,59.03%,0.23%,37.52%,77.07%
2,2000,62.51%,60.33%,0.57%,61.94%,0.04%,37.45%,78.77%
3,4000,62.35%,61.65%,0.79%,61.55%,0.03%,37.62%,79.00%
4,8000,62.52%,62.19%,19.51%,43.01%,1.41%,36.06%,79.89%
5,16000,62.49%,62.87%,19.08%,43.41%,1.39%,36.11%,79.78%


Unnamed: 0,pp fit size,dataset,svc accuracy,pp accuracy,TH,TL,FH,FL,AUC
0,500,prod_1,56.67%,63.13%,0.00%,56.67%,0.00%,33.20%,68.92%
1,1000,prod_1,56.67%,60.22%,2.60%,53.33%,0.00%,34.30%,72.53%
2,2000,prod_1,61.05%,59.92%,0.40%,60.35%,0.00%,35.80%,76.67%
3,4000,prod_1,61.25%,61.44%,0.60%,60.36%,0.00%,36.48%,78.06%
4,8000,prod_1,62.18%,61.94%,19.20%,42.55%,1.21%,35.43%,79.38%
5,16000,prod_1,62.49%,62.87%,19.08%,43.41%,1.39%,36.11%,79.78%


Unnamed: 0,pp fit size,dataset,svc accuracy,pp accuracy,TH,TL,FH,FL,AUC
0,500,prod_9,66.80%,64.94%,0.00%,66.80%,0.00%,43.33%,77.88%
1,1000,prod_9,65.50%,63.95%,4.10%,61.40%,0.60%,43.33%,80.15%
2,2000,prod_9,64.15%,61.23%,0.80%,63.55%,0.10%,38.85%,80.70%
3,4000,prod_7,63.48%,61.90%,0.90%,62.70%,0.09%,38.66%,79.66%
4,8000,prod_3,62.99%,62.37%,19.85%,43.50%,1.58%,36.61%,80.30%
5,16000,prod_1,62.49%,62.87%,19.08%,43.41%,1.39%,36.11%,79.78%
