In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the project's src.utils package) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from src.utils.tree import QPLTree
from src.utils.schema import DBSchema, NoiseStrategy
from datasets import load_dataset

In [None]:
# Load the database schemas. The linking cell looks one up per example with
# schemas[db_id]. NOTE(review): presumably read from the project's default
# schemas file, since no path is passed here -- confirm in DBSchema.
schemas = DBSchema.from_db_schemas_file()

In [None]:
# For every example of every split in the completer dataset, record the full
# schema and, for each noise level and noise strategy, the schema obtained by
# linking it against the schema items referenced by the example's QPL tree.
completer_ds = load_dataset("d4nieldev/qpl-completer-ds")

# i / 10 instead of 0.1 * i: the product yields values such as
# 0.30000000000000004, which would leak float error into `noise` and the plots.
noise_levels = [i / 10 for i in range(11)]

# original[k]   -> full schema of example k
# linked[j][k]  -> {NoiseStrategy value: linked schema} for example k at noise_levels[j]
original = []
linked = [[] for _ in noise_levels]

for split in completer_ds:
    for example in completer_ds[split]:
        schema = schemas[example['db_id']]
        original.append(schema)

        # Prefix lines keep only the text before the first ' ; ' separator;
        # the current line is appended as-is.
        qpl_lines = []
        if example['prefix_qpl']:
            qpl_lines.extend(line.split(' ; ')[0] for line in example['prefix_qpl'].split('\n'))
        qpl_lines.append(example['qpl_line'])
        qpl_tree = QPLTree.from_qpl_lines(qpl_lines)

        for level_idx, noise in enumerate(noise_levels):
            # Kept under its own name so the dataset row `example` is never shadowed.
            linked_by_strategy = {}
            for noise_strategy in NoiseStrategy:
                linked_by_strategy[noise_strategy.value] = schema.link(
                    qpl_tree.get_schema_items(),
                    noise=noise,
                    noise_strategy=noise_strategy,
                )
            linked[level_idx].append(linked_by_strategy)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# One panel per noise strategy: mean +/- std (across examples) of
# len(linked schema) / len(full schema) at each noise level.
strategies = list(NoiseStrategy)

# squeeze=False keeps `axes` two-dimensional even for a single strategy;
# plt.subplots(1, 1) would otherwise return a bare Axes that cannot be indexed.
fig, axes = plt.subplots(1, len(strategies), figsize=(15, 5), squeeze=False)

for ax, strategy in zip(axes[0], strategies):
    means = []
    stds = []
    # linked holds one list of per-example results per noise level, in the
    # same order as noise_levels.
    for level_links in linked:
        ratios = [
            len(links[strategy.value]) / len(original[k])
            for k, links in enumerate(level_links)
        ]
        means.append(np.mean(ratios))
        stds.append(np.std(ratios))
    ax.errorbar(noise_levels, means, yerr=stds, fmt='-o')
    ax.set_title(f'Strategy: {strategy.value}')
    ax.set_xlabel('Noise Level')
    ax.set_ylabel('Avg. Included Schema Items out of Total')

plt.suptitle('Schema Items Included vs Noise Level for Different Strategies')
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# One panel per noise strategy: mean +/- std (across examples) of
# len(linked m_schema()) / len(full m_schema()) at each noise level.
strategies = list(NoiseStrategy)

# The full-schema length does not depend on noise level or strategy, so it is
# computed once per example instead of once per (level, strategy) pair.
# NOTE(review): assumes m_schema() is deterministic for a given schema -- confirm.
full_lengths = [len(schema.m_schema()) for schema in original]

# squeeze=False keeps `axes` two-dimensional even for a single strategy;
# plt.subplots(1, 1) would otherwise return a bare Axes that cannot be indexed.
fig, axes = plt.subplots(1, len(strategies), figsize=(15, 5), squeeze=False)

for ax, strategy in zip(axes[0], strategies):
    means = []
    stds = []
    # linked holds one list of per-example results per noise level, in the
    # same order as noise_levels.
    for level_links in linked:
        ratios = [
            len(links[strategy.value].m_schema()) / full_lengths[k]
            for k, links in enumerate(level_links)
        ]
        means.append(np.mean(ratios))
        stds.append(np.std(ratios))
    ax.errorbar(noise_levels, means, yerr=stds, fmt='-o')
    ax.set_title(f'Strategy: {strategy.value}')
    ax.set_xlabel('Noise Level')
    # The plotted quantity is a ratio to the full schema's length, not a raw
    # character count, so the label says so.
    ax.set_ylabel('M-Schema Length (fraction of full schema)')

plt.suptitle('M-Schema Length vs Noise Level for Different Strategies')
plt.tight_layout()
plt.show()