In [None]:
import os
import json
from zipfile import ZipFile
import itertools
from typing import List

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.preprocessing import PowerTransformer
from sklearn.neighbors import NearestNeighbors
from tqdm.notebook import tqdm

from cycler import cycler
import seaborn as sb
from matplotlib.colors import LinearSegmentedColormap

import sys
sys.path.append('../')
from vibrodiagnostics import (
    mafaulda, 
    extraction,
    visualize,
    models
)

# Raise the default matplotlib font size for every figure drawn below.
plt.rcParams.update({'font.size': 14})

In [None]:
# Run configuration.
# PARTS, FFT_WINDOW, EXTRACT and MODEL_TYPE are not referenced by any other
# cell visible in this notebook.
PARTS = 1
FFT_WINDOW = 2**15
EXTRACT = False
# When True, the expensive cells recompute their results and overwrite the
# cached files; when False, the cached files are read instead.
GENERATE = False
# Forwarded as `power_transform=` to the `models.*` calls and embedded in the
# names of the cached result files.
POWER_TRANSFORM = False
MODEL_TYPE = 'knn'

# Hyperparameter grid: numbers of neighbours and feature-subset sizes that are
# enumerated and later used as axis ticks.
K_NEIGHBORS_OPTIONS = (3, 5, 11)
NUM_FEATURES_OPTIONS = (2, 3, 4)

In [None]:
# Root folder for all inputs and cached outputs, relative to the notebook.
PATH = '../datasets'
FEATURES_PATH = os.path.join(PATH, 'features')
# Cache of the all-features experiment; the file name encodes POWER_TRANSFORM.
RESULTS_PATH = os.path.join(FEATURES_PATH, f'results#{POWER_TRANSFORM}.json')
# Cache of the feature-selection accuracies.
BEST_FEATURES_PATH = os.path.join(FEATURES_PATH, 'best_features_accuracy.csv')
# DATASET_PATH and LABELED_DATASET_PATH are not referenced by any other cell
# visible in this notebook.
DATASET_PATH = os.path.join(PATH, 'MAFAULDA.zip')
LABELED_DATASET_PATH = os.path.join(FEATURES_PATH, 'MAFAULDA_LABEL.csv')
# Folder with one CSV per scenario holding the enumerated k-NN models.
KNN_BRUTE_FORCE_PATH = os.path.join(PATH, 'knn-accuracy-distribution')
# Folder with the results of the incremental (online) learning runs.
ONLINE_GRADUAL_PATH = os.path.join(PATH, 'knn-incremental-accuracy')

# Feature tables keyed by domain code ('TD' / 'FD').
FEATURES = {
    'TD': os.path.join(FEATURES_PATH, 'MAFAULDA_TD.csv'),
    'FD': os.path.join(FEATURES_PATH, 'MAFAULDA_FD.csv'),
}

In [None]:
def make_filename(row: dict, separator: str = '#'):
    """Build the cache file name that identifies one scenario.

    The name is made of the scenario's domain, dimension, bearing and
    severity flag followed by the global POWER_TRANSFORM setting, each
    converted with `str` and joined by `separator`. No extension is added.

    :param row: mapping with the keys 'domain', 'dim', 'bearing', 'severity'
    :param separator: string placed between the fields
    :return: the joined file name
    """
    fields = (
        row['domain'],
        row['dim'],
        row['bearing'],
        row['severity'],
        POWER_TRANSFORM,
    )
    return separator.join(map(str, fields))

#### Display example severities

In [None]:
# Load the time-domain features of bearing A and attach severity labels.
# The frame is only assigned here; nothing is displayed by this cell.
df = extraction.load_features(FEATURES['TD'], mafaulda.BEARING_A_COLUMNS, mafaulda.LABEL_COLUMNS) 
# NOTE(review): the positional `True` is presumably the `keep` flag that other
# cells pass by keyword - confirm against `mafaulda.label_severity`.
df = mafaulda.label_severity(df, 'A', 0.5, True)

#### Generate different feature sets

In [None]:
# Build one scenario record per (domain, dimension, severity mode, bearing).
# For every combination the records are appended in the order A, B, A+B,
# first without severity labelling and then with it.
datasets = []
domains = ('TD', 'FD')
dimensions = (1, 3)
# Input columns per bearing: a single axis (dim 1) or all axes (dim 3).
columns = {
    'A': {
        1: ['ay'],
        3: mafaulda.BEARING_A_COLUMNS
    },
    'B': {
        1: ['by'],
        3: mafaulda.BEARING_B_COLUMNS
    }
}

for domain in domains:
    for dim in dimensions:
        for severity in (False, True):
            frames = []
            for bearing in ('A', 'B'):
                frame = extraction.load_features(
                    FEATURES[domain], columns[bearing][dim], mafaulda.LABEL_COLUMNS
                )
                if severity:
                    frame = mafaulda.label_severity(frame, bearing, 0.5, keep=True)
                else:
                    frame = mafaulda.mark_severity(frame, bearing)
                frames.append(frame)
                datasets.append({
                    'domain': domain, 'dim': dim, 'bearing': bearing,
                    'severity': severity, 'data': frame
                })

            # Both bearings stacked into one frame with a fresh index.
            merged = pd.concat(frames).reset_index(drop=True)
            datasets.append({
                'domain': domain, 'dim': dim, 'bearing': 'A+B',
                'severity': severity, 'data': merged
            })


# Shallow copies: the dicts are new, the frames inside them are shared.
datasets_raw_dict = [d.copy() for d in datasets]
datasets_raw = pd.DataFrame.from_records(datasets_raw_dict)
datasets_raw.loc[0, 'data']

In [None]:
# Show the scenario table: one row per domain / dim / bearing / severity record.
datasets_raw

In [None]:
# Build the stream used for online learning: rows are ordered by severity
# level and shuffled (with a fixed seed) inside each level.
datasets_online = [d.copy() for d in datasets_raw_dict]
for row in datasets_online:
    df = row['data']

    shuffled_groups = []
    for level, frame in df.sort_values(by='severity_level').groupby('severity_level'):
        shuffled_groups.append(frame.sample(frac=1, random_state=10))

    order = list(pd.concat(shuffled_groups).index)
    df = df.loc[order].reset_index(drop=True)

    # With severity labelling, everything below the 0.5 threshold is 'normal'.
    if row['severity'] is True:
        below_threshold = df['severity_level'] < 0.5
        df.loc[below_threshold, 'label'] = 'normal'

    visualize.evolution_of_severity_levels(df)
    row['data'] = mafaulda.clean_columns(df)

datasets_online = pd.DataFrame.from_records(datasets_online)
datasets_online['data'][0]

In [None]:
# Label occurrences in the first bearing-A stream without severity labelling.
is_bearing_a = datasets_online['bearing'].eq('A')
severity_flag = datasets_online['severity'].eq(False)
df = datasets_online[is_bearing_a & severity_flag].head(1)
x = visualize.plot_label_occurences(df['data'].iloc[0]['label'])

In [None]:
# Label occurrences in the first bearing-A stream with severity labelling.
is_bearing_a = datasets_online['bearing'].eq('A')
severity_flag = datasets_online['severity'].eq(True)
df = datasets_online[is_bearing_a & severity_flag].head(1)
x = visualize.plot_label_occurences(df['data'].iloc[0]['label'])

In [None]:
# Build the cleaned scenario table from the raw records.
# The records and their frames are copied first, so the frames held by
# `datasets_raw_dict` keep their original labels and this cell gives the same
# result when it is run again.
datasets = []
for raw in datasets_raw_dict:
    row = raw.copy()
    df = row['data'].copy()
    # With severity labelling, everything below the 0.5 threshold is 'normal'.
    if row['severity'] is True:
        df.loc[df['severity_level'] < 0.5, 'label'] = 'normal'
    row['data'] = mafaulda.clean_columns(df)
    datasets.append(row)

datasets = pd.DataFrame.from_records(datasets)
datasets = datasets[datasets['domain'].isin(['TD', 'FD'])]
datasets['data'][0]

#### Number of members in each class

In [None]:
# Count the members of each class, using the first frame of every
# (severity, bearing) group.
label_counts = []
for (severity, bearing), group in datasets.groupby(by=['severity', 'bearing']):
    counts = group['data'].iloc[0]['label'].value_counts().to_dict()
    label_counts.append({
        'bearing': bearing,
        'severity': severity,
        **counts,
        'sum': sum(counts.values()),
    })

pd.DataFrame.from_records(label_counts)

#### Range of values in features

In [None]:
# One row of box plots (a panel per feature) for the first frame of every
# (domain, dim, bearing) group, followed by its summary statistics.
for name, group in datasets.groupby(by=['domain', 'dim', 'bearing']):
    df = group['data'].iloc[0].drop(columns=['label'])

    fig, ax = plt.subplots(nrows=1, ncols=len(df.columns), figsize=(20, 4))
    print(name)
    for axis, col in zip(ax, df.columns):
        df.boxplot(column=[col], ax=axis, color='black')
    print(df.describe())
    fig.tight_layout()
    plt.show()

#### Range of values in features - Power transform

In [None]:
# Same box plots as above, after a standardised Yeo-Johnson power transform
# has been fitted to and applied on each frame.
for name, group in datasets.groupby(by=['domain', 'dim', 'bearing']):
    df = group['data'].iloc[0].drop(columns=['label'])

    pt = PowerTransformer(method='yeo-johnson', standardize=True)
    df[df.columns] = pt.fit_transform(df)

    fig, ax = plt.subplots(nrows=1, ncols=len(df.columns), figsize=(20, 4))
    print(name)
    for axis, col in zip(ax, df.columns):
        df.boxplot(column=[col], ax=axis, color='black')
    fig.tight_layout()
    plt.show()

#### Correlation of features to RPM

In [None]:
# Feature names per domain, taken from the first cleaned frame of each domain.
td_columns = list(
    datasets.loc[datasets['domain'] == 'TD', 'data'].iloc[0].drop(columns=['label']).columns
)
fd_columns = list(
    datasets.loc[datasets['domain'] == 'FD', 'data'].iloc[0].drop(columns=['label']).columns
)

# Pearson correlation of every feature with 'rpm', computed on the raw frames.
datasets_corr = []
for raw in datasets_raw_dict:
    row = raw.copy()
    df = row.pop('data')
    domain = row['domain']
    columns = td_columns if domain == 'TD' else fd_columns
    for col in columns:
        row[f'{domain}-{col}'] = np.corrcoef(df[col], df['rpm'])[0, 1]
    datasets_corr.append(row)

# One table per domain, placed side by side.
datasets_corr_domains = pd.concat(
    [
        pd.DataFrame.from_records(
            [d for d in datasets_corr if d['domain'] == code]
        ).drop(columns=['domain'])
        for code in ('TD', 'FD')
    ],
    axis=1,
)
datasets_corr_domains.T

In [None]:
# Summary statistics of the correlation table, transposed so that every
# described column becomes a row.
datasets_corr_domains.describe().T

In [None]:
# Keep only the correlation columns (drop the scenario descriptors).
is_descriptor = datasets_corr_domains.columns.isin(('dim', 'bearing', 'severity'))
df = datasets_corr_domains[datasets_corr_domains.columns[~is_descriptor]]

# Sort features by mean
by_feature = df.T
feature_means = by_feature.mean(axis=1)
df = by_feature.loc[feature_means.sort_values(ascending=False).index].T

df.boxplot(figsize=(10, 5), color='black', notch=True)
plt.xlabel('Feature')
plt.ylabel('Correlation')
plt.xticks(rotation=90)
plt.show()

#### Correlation of features among themselves

In [None]:
# Colour map that is black at both ends and white in the middle.
cmap = LinearSegmentedColormap.from_list('', ['black', 'white', 'black'])

# Pairwise feature correlations for the first frame of every scenario group.
for name, group in datasets.groupby(by=['domain', 'dim', 'bearing']):
    df = group['data'].iloc[0].drop(columns=['label'])
    print(name)
    columns = df.columns.tolist()
    correlations = df[columns].corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    sb.heatmap(correlations, cmap=cmap, vmin=-1, vmax=1, annot=True, ax=ax, fmt='.0%')
    fig.tight_layout()
    plt.show()

#### Neighbourhood of same class

In [None]:
def neighborhood_of_same_class(severity: bool, bearing: str):
    """Measure how often a point's label matches the majority label nearby.

    For each domain in the global `domains`, the 3-dimensional scenario of
    the given bearing and severity mode is taken from the global `datasets`.
    For every even k from 2 to 98 a nearest-neighbour index is fitted on the
    features and queried with the same points; a point counts when its own
    label equals the most frequent label among the k returned rows.

    NOTE(review): the query points are the fitted points, so each point is
    presumably returned among its own neighbours, which would explain why
    results are stored under `k - 1` - confirm.

    :param severity: value matched against the 'severity' column
    :param bearing: value matched against the 'bearing' column
    :return: DataFrame built from {domain: {k - 1: percentage of points}}
    """
    frame = {}
    for domain in domains:
        scenario = (
            (datasets['dim'] == 3)
            & (datasets['domain'] == domain)
            & (datasets['bearing'] == bearing)
            & (datasets['severity'] == severity)
        )
        df = datasets[scenario]['data'].values[0]
        labels = df['label']
        dx = df.drop(columns='label')

        rows = {}
        for k in range(2, 100, 2):
            neigh = NearestNeighbors(n_neighbors=k).fit(dx)
            _, indices = neigh.kneighbors(dx)
            agreeing = sum(
                1
                for idx, nearby in enumerate(indices)
                # own label equals the most frequent label of the neighbourhood
                if labels.iloc[idx] == (
                    labels.iloc[nearby]
                    .value_counts()
                    .sort_values(ascending=False)
                    .index.values[0]
                )
            )
            rows[k - 1] = (agreeing / len(dx)) * 100

        frame[domain] = rows

    return pd.DataFrame.from_records(frame)

In [None]:
# Neighbourhood agreement for bearing A without severity labelling.
df = neighborhood_of_same_class(severity=False, bearing='A')
df.plot(
    grid=True,
    xlabel='Majority of neighbours with same label',
    ylabel='Fraction of dataset [%]',
)
plt.show()

In [None]:
# Neighbourhood agreement for bearing A with severity labelling.
df = neighborhood_of_same_class(severity=True, bearing='A')
df.plot(
    grid=True,
    xlabel='Majority of neighbours with same label',
    ylabel='Fraction of dataset [%]',
)
plt.show()

#### All features on each scenario

In [None]:
# Evaluate every scenario with all of its features. The results are cached
# in RESULTS_PATH and always read back from there, so the cell yields the
# same structure whether or not it regenerated the file.
results = []
if GENERATE is True:
    for index, row in tqdm(datasets.iterrows()):
        source = row['data']
        y = source['label']
        x = source.drop(columns=['label'])
        r = models.all_features(
            x, y, power_transform=POWER_TRANSFORM
        )
        r.update({
            'domain': row['domain'],
            'dim': row['dim'],
            'bearing': row['bearing'],
            'severity': row['severity']
        })
        results.append(r)
    # Context managers make sure the file handles are closed (and the data
    # flushed) before the file is read again below.
    with open(RESULTS_PATH, 'w') as results_file:
        json.dump(results, results_file)

with open(RESULTS_PATH, 'r') as results_file:
    results = json.load(results_file)
results[:1]

In [None]:
# Reload the cached results and spread the per-k test accuracies into columns.
with open(RESULTS_PATH, 'r') as results_file:
    results = json.load(results_file)

records = []
for entry in results:
    # Keep the scenario descriptors, drop the raw 'k' / 'train' / 'test' lists.
    record = {
        key: value
        for key, value in entry.items()
        if key not in ('k', 'train', 'test')
    }
    # One column per k, holding the matching test accuracy.
    record.update(dict(zip(entry['k'], entry['test'])))
    records.append(record)

k_all_features_test = pd.DataFrame.from_records(records)
k_all_features_test = k_all_features_test[k_all_features_test['domain'].isin(('TD', 'FD'))]
k_all_features_test

In [None]:
def compare_all_features_k_accuracies(results: pd.DataFrame, domain: str, severity: bool):
    """Plot accuracy against k, with one line per (domain, dim, bearing) row.

    The 'severity' column is dropped, the scenario descriptors become the
    index and the table is transposed so that the remaining columns form the
    x axis.

    :param results: accuracy table with the scenario descriptor columns
    :param domain: not used by the function body
    :param severity: not used by the function body
    """
    accuracies = (
        results
        .drop(columns=['severity'])
        .set_index(['domain', 'dim', 'bearing'])
        .T
    )
    ax = accuracies.plot(figsize=(12, 6), marker='v')

    ax.set_ylabel('Accuracy')
    ax.set_xlabel('K-neighbors')
    ax.grid(True)
    plt.show()

# One accuracy-vs-k figure per (domain, severity) combination.
for scenario, group in k_all_features_test.groupby(by=['domain', 'severity']):
    print(*scenario)
    compare_all_features_k_accuracies(group, *scenario)

#### Enumerate feature combinations on each scenario

In [None]:
results = []
Path(KNN_BRUTE_FORCE_PATH).mkdir(parents=True, exist_ok=True)

# Enumerate k-NN models over feature subsets for every scenario and store
# each summary under the scenario's file name.
if GENERATE is True:
    for index, row in tqdm(datasets.iterrows()):
        features = row['data'].drop(columns=['label'])
        target = row['data']['label']

        summary = models.enumerate_models(
            features, target, row['domain'],
            power_transform=POWER_TRANSFORM,
            k_neighbors=K_NEIGHBORS_OPTIONS,
            num_of_features=NUM_FEATURES_OPTIONS
        )
        destination = os.path.join(KNN_BRUTE_FORCE_PATH, make_filename(row))
        summary.to_csv(destination, index=False)

In [None]:
# Box plots of the test accuracy for every stored enumeration, grouped once
# by 'f' and once by 'k'. Directory entries that are not files are ignored.
for filename in os.listdir(KNN_BRUTE_FORCE_PATH):
    path = os.path.join(KNN_BRUTE_FORCE_PATH, filename)
    if os.path.isfile(path):
        models_summary = pd.read_csv(path)
        print(filename)
        for first, second in (('f', 'k'), ('k', 'f')):
            visualize.boxplot_enumerate_models_accuracy(models_summary, 'test', first, second)

#### Bar chart for specific number of features

In [None]:
# Accuracy of each feature-selection method for every scenario and every
# (number of features, k) pair; cached in BEST_FEATURES_PATH.
if GENERATE is True:
    results = []
    for index, row in tqdm(datasets.iterrows()):
        features = row['data'].drop(columns=['label'])
        target = row['data']['label']
        models_summary = pd.read_csv(os.path.join(KNN_BRUTE_FORCE_PATH, make_filename(row)))
        scenario = {
            'dim': row['dim'],
            'bearing': row['bearing'],
            'severity': row['severity'],
        }

        for fnum, k in itertools.product(NUM_FEATURES_OPTIONS, K_NEIGHBORS_OPTIONS):
            result = models.feature_selection_accuracies(
                features, target,
                row['domain'],
                models_summary,
                k_neighbors=k,
                number_of_features=fnum,
                power_transform=POWER_TRANSFORM
            )
            # Tag every returned record with its scenario and hyperparameters.
            for r in result:
                r.update({**scenario, 'k': k, 'f': fnum})
            results.extend(result)

    results = pd.DataFrame.from_records(results)
    results.to_csv(BEST_FEATURES_PATH, index=False)
else:
    results = pd.read_csv(BEST_FEATURES_PATH)
results

In [None]:
# Export the slice for 3 features and 5 neighbours. The target is derived
# from PATH like every other file location in this notebook.
frame = results[
    (results['f'] == 3) &
    (results['k'] == 5)
]
frame.to_csv(os.path.join(PATH, 'mafaulda_knn_feature_selection.csv'), index=False)
frame

In [None]:
# One bar chart per scenario and hyperparameter combination.
scenario_keys = ['dim', 'bearing', 'severity', 'domain', 'k', 'f']
for name, group in results.groupby(by=scenario_keys):
    print(name)
    visualize.plot_models_performance_bar(group)

#### Accuracy and percentile of feature selection methods to number of features
- The best features do not always reach the 100th percentile: the best subset is chosen by ranking accuracies on the training set, whereas the distribution of accuracies is taken from the validation set.

In [None]:
# Test accuracy of every selection method against the number of features.
for name, group in results.groupby(by=['dim', 'bearing', 'severity', 'domain', 'k']):
    print(name)
    accuracy_by_f = group.pivot(index='f', columns='set', values='test_accuracy')
    accuracy_by_f.plot(
        figsize=(8, 5), marker='o', grid=True,
        xlabel='Number of features', ylabel='Accuracy',
    )
    plt.xticks(NUM_FEATURES_OPTIONS)
    plt.show()

In [None]:
# Test percentile of every selection method against the number of features.
for name, group in results.groupby(by=['dim', 'bearing', 'severity', 'domain', 'k']):
    print(name)
    percentile_by_f = group.pivot(index='f', columns='set', values='test_percentile')
    percentile_by_f.plot(
        figsize=(8, 5), marker='o', grid=True,
        xlabel='Number of features', ylabel='Percentile',
    )
    plt.xticks(NUM_FEATURES_OPTIONS)
    plt.show()

#### Accuracy and percentile of feature selection methods to number of neighbours (k)

In [None]:
# Test accuracy of every selection method against the number of neighbours.
for name, group in results.groupby(by=['dim', 'bearing', 'severity', 'domain', 'f']):
    print(name)
    accuracy_by_k = group.pivot(index='k', columns='set', values='test_accuracy')
    accuracy_by_k.plot(
        figsize=(8, 6), marker='o', grid=True,
        xlabel='k-Neighbors', ylabel='Accuracy',
    )
    plt.xticks(K_NEIGHBORS_OPTIONS)
    plt.show()

In [None]:
# Test percentile of every selection method against the number of neighbours.
for name, group in results.groupby(by=['dim', 'bearing', 'severity', 'domain', 'f']):
    print(name)
    # `xlabel` (not `label`) names the x axis, as in the accuracy plot above.
    (group[['k', 'set', 'test_percentile']]
     .pivot(index='k', columns='set', values='test_percentile')
     .plot(figsize=(8, 6), marker='o', grid=True, xlabel='k-Neighbors', ylabel='Percentile'))
    plt.xticks(K_NEIGHBORS_OPTIONS)
    plt.show()

#### Distribution of percentiles reached by feature selection methods in various subsets

In [None]:
# Density of the test percentile reached by each selection method, with the
# median printed per method.
fig, ax = plt.subplots(1, 1, figsize=(7, 5))
for method in ('Rank product', 'Mutual information', 'F statistic', 'Correlation'):
    df = results[results['set'] == method]
    print(method, df['test_percentile'].median())
    sb.kdeplot(data=df, x='test_percentile', bw_adjust=0.2, label=method, ax=ax)
# Axis decoration is the same for every method, so it is applied once.
ax.set_xlabel('Percentile')
ax.grid(True)
plt.legend()
plt.show()

In [None]:
# Density of the test accuracy reached by each selection method, with the
# median printed per method.
fig, ax = plt.subplots(1, 1, figsize=(7, 5))
for method in ('Rank product', 'Mutual information', 'F statistic', 'Correlation'):
    df = results[results['set'] == method]
    print(method, df['test_accuracy'].median())
    sb.kdeplot(data=df, x='test_accuracy', bw_adjust=0.8, label=method, ax=ax)
# Axis decoration is the same for every method, so it is applied once.
ax.set_xlabel('Accuracy')
ax.grid(True)
plt.legend()
plt.show()

#### In how many cases is rank product best among all the selection methods?

In [None]:
def count_wins_of_methods(datasets, results, methods):
    """Count how often each method reaches the highest test percentile.

    `results` is grouped by every column of `datasets` except 'data', plus
    'k' and 'f'. Within a group the row with the largest 'test_percentile'
    among the rows whose 'set' is in `methods` is the winner. Groups that
    contain none of the methods have no winner and are left out of the total.

    :param datasets: scenario table; only its column names are used
    :param results: table with the grouping columns, 'set' and 'test_percentile'
    :param methods: iterable of method names (values of 'set') to compare
    :return: DataFrame indexed by method plus a 'total' row, with the columns
        'count' (number of wins), 'percentage' (share of the total) and
        'score' (mean winning percentile, 0 for a method that never won;
        missing for the 'total' row)
    """
    methods = list(methods)
    columns = [column for column in datasets.columns if column != 'data'] + ['k', 'f']
    counts = {key: 0.0 for key in methods + ['total']}
    percentiles = {method: [] for method in methods}

    for _, group in results.groupby(by=columns):
        candidates = group[group['set'].isin(methods)]
        if candidates.empty:
            # None of the compared methods was evaluated in this scenario.
            continue
        winner = candidates.sort_values(by='test_percentile', ascending=False).iloc[0]

        counts['total'] += 1
        counts[winner['set']] += 1
        percentiles[winner['set']].append(winner['test_percentile'])

    # `or [0]` keeps the mean defined for methods without any win.
    scores = {method: np.mean(values or [0]) for method, values in percentiles.items()}
    scores = pd.DataFrame.from_dict(scores, orient='index', columns=['score'])

    summary = pd.DataFrame.from_dict(counts, orient='index', columns=['count'])
    summary['percentage'] = 100 * (summary['count'] / counts['total'])
    return summary.join(scores)

In [None]:
# Compare only the feature-selection methods. The list keeps the order in
# which the methods first appear in `results`, so the rows of the output
# table come out in the same order on every run.
excluded = {'PCA PC', 'All features', 'Best features'}
methods = [name for name in results['set'].unique() if name not in excluded]
count_wins_of_methods(datasets, results, methods)

In [None]:
# Same comparison with 'PCA PC' included. The list keeps the order in which
# the methods first appear in `results`, so the rows of the output table come
# out in the same order on every run.
excluded = {'All features', 'Best features'}
methods = [name for name in results['set'].unique() if name not in excluded]
count_wins_of_methods(datasets, results, methods)

In [None]:
# Same comparison with everything except 'Best features'. The list keeps the
# order in which the methods first appear in `results`, so the rows of the
# output table come out in the same order on every run.
excluded = {'Best features'}
methods = [name for name in results['set'].unique() if name not in excluded]
count_wins_of_methods(datasets, results, methods)

#### Histogram of model accuracy distribution and vertical line for individual feature selection methods
- dim=3, bearing=A, severity=False, k=5, f=3

In [None]:
# 2 x 2 figure: one column per domain, training accuracies in the top row and
# testing accuracies in the bottom row. Each panel shows the histogram of all
# enumerated k-NN models and a vertical line per feature-selection method.
fig, ax = plt.subplots(2, 2, figsize=(16, 9))
colors = ['r', 'g', 'b', 'orange', 'm', 'y', 'k']
hist_style = dict(
    grid=True,
    bins=30,
    edgecolor='black',
    linewidth=0.5,
    color='gray',
    range=(0.6, 1)
)
# (column of the distribution, column of the selection results, title suffix)
panels = (
    ('train', 'train_accuracy', ', Training set'),
    ('test', 'test_accuracy', ', Testing set'),
)

for i, domain in enumerate(FEATURES):
    row = {
        'domain': domain,
        'dim': 3,
        'bearing': 'A',
        'severity': False,
        'k': 5,
        'f': 3
    }
    filename = make_filename(row)
    print(filename)

    distribution = pd.read_csv(os.path.join(KNN_BRUTE_FORCE_PATH, filename))
    same_hyperparams = (distribution['k'] == row['k']) & (distribution['f'] == row['f'])
    distribution = distribution[same_hyperparams]

    # Rows of `results` that match every field of the scenario.
    matches = pd.Series(True, index=results.index)
    for key, value in row.items():
        matches &= results[key] == value
    mselection = results[matches]

    for j, (subset, accuracy_column, title_suffix) in enumerate(panels):
        axis = ax[j][i]
        axis.set_xlabel('Accuracy')
        axis.set_ylabel('Number of k-NN models')
        axis.set_title(visualize.DOMAIN_TITLES[domain] + title_suffix)
        distribution[subset].hist(ax=axis, **hist_style)

        sel = mselection.set_index('set')[accuracy_column].to_dict()
        for c, (method, accuracy) in zip(cycler(color=colors), sel.items()):
            axis.axvline(accuracy, linestyle='-', lw=2, label=f'{method} ({accuracy*100:.1f})', alpha=0.8, **c)
        axis.legend()

plt.tight_layout()
plt.show()

In [None]:
# Show the table of online-learning scenarios.
datasets_online

#### Online gradual learning with step 1
k = 5, f = 3
(90 minutes)

In [None]:
def make_filename_incremental(domain: str, severity: bool, window_len: int, learn_skip: int):
    """Compose the CSV file name of one incremental-learning run.

    :return: the four arguments joined by '#', followed by '.csv'
    """
    fields = (domain, severity, window_len, learn_skip)
    return '#'.join(str(field) for field in fields) + '.csv'


def parse_filename_incremental(filename: str) -> tuple:
    """Split an incremental-learning file name into its '#'-separated fields.

    A trailing '.csv' extension is removed as a whole suffix; `str.rstrip`
    would instead strip any run of the characters '.', 'c', 's' and 'v' and
    could eat into the last field.

    :param filename: name such as 'TD#False#10#0.csv'
    :return: tuple of the fields as strings, e.g. ('TD', 'False', '10', '0')
    """
    extension = '.csv'
    if filename.endswith(extension):
        filename = filename[:-len(extension)]
    return tuple(filename.split('#'))


def incremental_learning(num_of_features: int, folder: str, window_len: int, learn_skip: int):
    """Run online k-NN learning for every feature subset and store the accuracies.

    For each (domain, severity) group of the global `datasets_online`, the
    first scenario with bearing 'A' and dimension 3 is used as the stream.
    Every combination of `num_of_features` feature columns is passed to
    `models.knn_online_learn`; the accuracy per step of each run becomes one
    row of a CSV written to `folder`.

    :param num_of_features: size of the feature subsets
    :param folder: output directory, created when missing
    :param window_len: forwarded to `models.knn_online_learn`
    :param learn_skip: forwarded to `models.knn_online_learn`
    """
    Path(folder).mkdir(parents=True, exist_ok=True)

    grouped = datasets_online.groupby(by=['domain', 'severity'])
    for (domain, severity), group in tqdm(grouped):
        print(domain, severity)
        selection = (group['bearing'] == 'A') & (group['dim'] == 3)
        stream = group[selection]['data'].values[0]
        columns = stream.drop(columns=['label']).columns

        results = []
        for features in itertools.combinations(columns, r=num_of_features):
            history = models.knn_online_learn(
                stream[list(features)].copy(),
                stream['label'].copy(),
                window_len=window_len,
                learn_skip=learn_skip
            )
            # step -> accuracy for this feature subset
            results.append({v['step']: v['accuracy'] for v in history.to_dict('records')})

        target = os.path.join(
            folder,
            make_filename_incremental(domain, severity, window_len, learn_skip)
        )
        pd.DataFrame.from_records(results).to_csv(target, index=False)


def load_incremental_distributions(win_desired: List[int], skip: List[int]):
    """Load stored incremental-learning runs and summarise them per step.

    Every regular file in ONLINE_GRADUAL_PATH whose name parses into four
    fields (domain, severity, window length, label skip) is considered; runs
    are kept when the window length is in `win_desired` and the skip is in
    `skip`. Directory entries that are not files, or whose names do not match
    the pattern, are ignored.

    :param win_desired: window lengths to load
    :param skip: label-skip values to load
    :return: DataFrame with one row per run: 'domain', 'severity', 'win' and
        'skip' as the strings parsed from the file name, 'steps' (column
        names of the file as integers) and the per-step minimum, maximum and
        median in 'lower_boundary', 'upper_boundary' and 'middle_line'
    """
    graphs = []
    for filename in os.listdir(ONLINE_GRADUAL_PATH):
        path = os.path.join(ONLINE_GRADUAL_PATH, filename)
        if not os.path.isfile(path):
            continue

        fields = parse_filename_incremental(filename)
        if len(fields) != 4:
            continue
        # `file_skip` keeps the value parsed from the name apart from the
        # `skip` filter passed by the caller.
        domain, severity, win, file_skip = fields
        try:
            wanted = int(win) in win_desired and int(file_skip) in skip
        except ValueError:
            # Window length or skip in the name is not an integer.
            continue
        if not wanted:
            continue

        m = pd.read_csv(path)
        steps = m.T.index.astype(int).to_numpy()
        graphs.append({
            'domain': domain,
            'severity': severity,
            'win': win,
            'skip': file_skip,
            'steps': steps,
            'lower_boundary': m.min().to_numpy(),
            'upper_boundary': m.max().to_numpy(),
            'middle_line': m.median().to_numpy()
        })

    return pd.DataFrame.from_records(graphs)


def plot_model_evolution(d: dict, ax, color='gray', line_color='darkgreen', alpha=0.5, label=None):
    """Draw the accuracy envelope and the middle line of one run on `ax`.

    Prints the last step together with the last lower, middle and upper
    values, fills the band between the boundaries, draws the lower boundary
    dashed and the upper one dash-dotted in `color`, and the middle line in
    `line_color`.

    :param d: mapping with 'steps', 'lower_boundary', 'middle_line' and
        'upper_boundary' sequences
    :param ax: axes-like object to draw on
    :param color: colour of the two boundary lines
    :param line_color: colour of the middle line
    :param alpha: opacity of the filled band (its colour is fixed to '#ccc')
    :param label: legend label of the middle line
    """
    steps = d['steps']
    lower = d['lower_boundary']
    middle = d['middle_line']
    upper = d['upper_boundary']

    print(steps[-1], lower[-1], middle[-1], upper[-1])

    ax.fill_between(steps, lower, upper, color='#ccc', alpha=alpha)
    for boundary, dashes in ((lower, '--'), (upper, '-.')):
        ax.plot(steps, boundary, linestyle=dashes, color=color)
    ax.plot(steps, middle, color=line_color, label=label)

    ax.set_xlabel('Observations')
    ax.set_ylabel('Accuracy')
    ax.grid(True)

def graph_evolution_comparison(models: pd.DataFrame, hyperparam: str):
    """Overlay the runs of each (domain, severity) group in one figure.

    The runs of a group are sorted by `hyperparam` and each is drawn by
    `plot_model_evolution` in its own colour, labelled with its `hyperparam`
    value. Runs are paired with a three-colour cycle through `zip`, so at
    most three runs per group are drawn.

    :param models: table of runs with 'domain', 'severity' and `hyperparam`
    :param hyperparam: column used for ordering and for the legend labels
    """
    palette = cycler(color=['#ffbe0b', '#3a86ff', '#3a5a40'])
    for (domain, severity), group in models.groupby(by=['domain', 'severity']):
        fig, ax = plt.subplots(1, 1, figsize=(10, 6))
        print(domain, severity)
        ordered_runs = group.sort_values(by=hyperparam).to_dict('records')
        for style, run in zip(palette, ordered_runs):
            shade = style['color']
            plot_model_evolution(
                run, ax,
                color=shade, line_color=shade,
                alpha=0.1, label=run[hyperparam]
            )
        plt.legend()
        plt.show()

In [None]:
number_of_features = 3
learning_window_lengths = (1, 10, 100)
labels_skips = (0, 10, 50)

if GENERATE is True:
    # First every window length with the first skip value, then the second
    # window length with each of the remaining skip values.
    schedule = [(win, labels_skips[0]) for win in learning_window_lengths]
    schedule += [(learning_window_lengths[1], skip) for skip in labels_skips[1:]]

    for win, skip in schedule:
        incremental_learning(number_of_features, ONLINE_GRADUAL_PATH, win, skip)

In [None]:
# Load the runs with window length 1 and no skipped labels.
win_desired = [1]
skip_desired = [0]
# NOTE(review): this rebinds `models`, which the import cell bound to the
# `vibrodiagnostics.models` module; cells that call `models.<function>` will
# fail if they are run again after this one.
models = load_incremental_distributions(win_desired, skip_desired)
models

In [None]:
# One figure per (domain, severity) group, showing the first run of the group.
for scenario, group in models.groupby(by=['domain', 'severity']):
    fig, ax = plt.subplots(figsize=(10, 5))
    print(*scenario)
    first_run = group.to_dict('records')[0]
    plot_model_evolution(first_run, ax)
    plt.show()

#### Different lengths of tumbling windows

In [None]:
# Compare all window lengths with no skipped labels.
win_desired = learning_window_lengths
skip_desired = [0]
# NOTE(review): `models` is rebound to a DataFrame here, shadowing the
# imported `vibrodiagnostics.models` module.
models = load_incremental_distributions(win_desired, skip_desired)
graph_evolution_comparison(models, 'win')

#### Different amount of skipping true labels

In [None]:
# Compare all label-skip values at window length 10.
win_desired = [10]
skip_desired = labels_skips
# NOTE(review): `models` is rebound to a DataFrame here, shadowing the
# imported `vibrodiagnostics.models` module.
models = load_incremental_distributions(win_desired, skip_desired)
graph_evolution_comparison(models, 'skip')