In [1]:
import configparser
import logging
import os
import pathlib
import random
import re
import sys
from itertools import permutations, combinations

import nbformat
import numpy as np
import pandas as pd
from nbformat import NotebookNode

from compare import position_pairs
from logs import add_file_handler
from logs import global_logger as logging
from llm import PRPmodel
from sort import heapsort, quicksort, bubble_sort
from filter import filter_images, filter_output_cells, format_markdown
from execute import load_notebooks

In [47]:
# For every exercise, load its notebooks, pass them through `filter_images`
# and `filter_output_cells`, and print the mean number of cells per notebook
# (rounded to two decimals).
# After the loop, `notebooks` stays bound to the notebooks of the last
# exercise in the tuple; later cells read it.
for a in ('corona_pandemie', 'corona_warn_app_analyse', 'reproduktionszahl', 'werbeindustrie'):
    exercise_dir = f'./data/shared-dataset-v2/{a}'
    notebooks: dict[str, NotebookNode] = load_notebooks(exercise_dir)

    notebooks = filter_images(notebooks)
    notebooks = filter_output_cells(notebooks)

    # Only the notebook objects are needed here; iterating `.values()` avoids
    # binding an unused key (previously named `id`, shadowing the builtin).
    cell_counts = [len(nb.cells) for nb in notebooks.values()]
    print(a, np.mean(cell_counts).round(2))


[38;20m2025-03-26 23:21:23,224 [DEBUG] Loading notebook 'corona_pandemie_066e' with id='066e'[0m
[38;20m2025-03-26 23:21:23,229 [DEBUG] Loading notebook 'corona_pandemie_146f' with id='146f'[0m
[38;20m2025-03-26 23:21:23,235 [DEBUG] Loading notebook 'corona_pandemie_1953' with id='1953'[0m
[38;20m2025-03-26 23:21:23,239 [DEBUG] Loading notebook 'corona_pandemie_21de' with id='21de'[0m
[38;20m2025-03-26 23:21:23,245 [DEBUG] Loading notebook 'corona_pandemie_2281' with id='2281'[0m
[38;20m2025-03-26 23:21:23,249 [DEBUG] Loading notebook 'corona_pandemie_23a6' with id='23a6'[0m
[38;20m2025-03-26 23:21:23,253 [DEBUG] Loading notebook 'corona_pandemie_3669' with id='3669'[0m
[38;20m2025-03-26 23:21:23,257 [DEBUG] Loading notebook 'corona_pandemie_44bd' with id='44bd'[0m
[38;20m2025-03-26 23:21:23,262 [DEBUG] Loading notebook 'corona_pandemie_4c2a' with id='4c2a'[0m
[38;20m2025-03-26 23:21:23,266 [DEBUG] Loading notebook 'corona_pandemie_4c41' with id='4c41'[0m
[38;20m20

corona_pandemie 30.46


[38;20m2025-03-26 23:21:23,854 [DEBUG] Loading notebook 'corona_warn_app_analyse_c938' with id='c938'[0m
[38;20m2025-03-26 23:21:23,862 [DEBUG] Loading notebook 'corona_warn_app_analyse_cb76' with id='cb76'[0m
[38;20m2025-03-26 23:21:23,868 [DEBUG] Loading notebook 'corona_warn_app_analyse_d28c' with id='d28c'[0m
[38;20m2025-03-26 23:21:23,872 [DEBUG] Loading notebook 'corona_warn_app_analyse_eac9' with id='eac9'[0m
[38;20m2025-03-26 23:21:23,880 [DEBUG] Loading notebook 'corona_warn_app_analyse_fb4b' with id='fb4b'[0m
[38;20m2025-03-26 23:21:23,884 [DEBUG] Loading notebook 'corona_warn_app_analyse_fe28' with id='fe28'[0m
[34;20m2025-03-26 23:21:23,891 [INFO] Loaded 35 notebooks.[0m
[38;20m2025-03-26 23:21:24,022 [DEBUG] Loading notebook 'reproduktionszahl_066e' with id='066e'[0m
[38;20m2025-03-26 23:21:24,030 [DEBUG] Loading notebook 'reproduktionszahl_146f' with id='146f'[0m
[38;20m2025-03-26 23:21:24,033 [DEBUG] Loading notebook 'reproduktionszahl_1953' with id='1

corona_warn_app_analyse 12.69


[38;20m2025-03-26 23:21:24,304 [DEBUG] Loading notebook 'werbeindustrie_066e' with id='066e'[0m
[38;20m2025-03-26 23:21:24,310 [DEBUG] Loading notebook 'werbeindustrie_146f' with id='146f'[0m
[38;20m2025-03-26 23:21:24,316 [DEBUG] Loading notebook 'werbeindustrie_1953' with id='1953'[0m
[38;20m2025-03-26 23:21:24,322 [DEBUG] Loading notebook 'werbeindustrie_21de' with id='21de'[0m
[38;20m2025-03-26 23:21:24,326 [DEBUG] Loading notebook 'werbeindustrie_2281' with id='2281'[0m
[38;20m2025-03-26 23:21:24,341 [DEBUG] Loading notebook 'werbeindustrie_23a6' with id='23a6'[0m
[38;20m2025-03-26 23:21:24,349 [DEBUG] Loading notebook 'werbeindustrie_3669' with id='3669'[0m
[38;20m2025-03-26 23:21:24,355 [DEBUG] Loading notebook 'werbeindustrie_44bd' with id='44bd'[0m
[38;20m2025-03-26 23:21:24,361 [DEBUG] Loading notebook 'werbeindustrie_4c2a' with id='4c2a'[0m
[38;20m2025-03-26 23:21:24,365 [DEBUG] Loading notebook 'werbeindustrie_4c41' with id='4c41'[0m
[38;20m2025-03-26 2

reproduktionszahl 12.2
werbeindustrie 6.06


np.float64(12.2)

In [12]:
# Number of cells in the notebook stored under key '4c2a' of the currently
# bound `notebooks` dict.
# NOTE(review): execution counts are out of order (In [12] follows In [47]),
# so which exercise `notebooks` held when this ran is unclear — confirm.
len(notebooks['4c2a'].cells)

33

In [22]:
# Print the nbformat major and minor version of each notebook in `notebooks`,
# one notebook per line.
for n in notebooks.values():
    version = (n.nbformat, n.nbformat_minor)
    print(*version)

4 1
4 1
4 1
4 1
4 1
4 4
4 1
4 1
4 1
4 4
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 4


In [19]:
# Toy data for trying out Kendall correlation: two rows of ten values each.
# The first row contains repeated values (ties); the second is 1..10.
# An earlier variant of the first row used 6 instead of the fourth 5:
#   [1, 2, 3, 5, 5, 5, 6, 10, 10, 10]
ord_row = [1, 2, 3, 5, 5, 5, 5, 10, 10, 10]
seq_row = list(range(1, 11))
testdata = pd.DataFrame([ord_row, seq_row])

In [20]:
# Turn the two rows into two columns and label them 'ord' and 'seq'.
# NOTE: this rebinds `testdata` from itself, so the cell is not idempotent;
# run it exactly once after `testdata` has been (re)created.
testdata = testdata.T.set_axis(['ord', 'seq'], axis='columns')
testdata

Unnamed: 0,ord,seq
0,1,1
1,2,2
2,3,3
3,5,4
4,5,5
5,5,6
6,5,7
7,10,8
8,10,9
9,10,10


In [21]:
# Pairwise Kendall rank correlation between the columns of `testdata`.
testdata.corr(method='kendall')

Unnamed: 0,ord,seq
ord,1.0,0.894427
seq,0.894427,1.0


In [34]:
# Hard-coded result sequences; every list is in non-decreasing order.
# NOTE(review): the cp/cw/re/we prefixes presumably stand for the exercises
# corona_pandemie, corona_warn_app_analyse, reproduktionszahl and
# werbeindustrie, with one entry per notebook — confirm.
sorted_cp_results = [0, 2, 5, 11, 14, 25, 26, 27, 29, 30, 31, 31, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 38, 38, 39, 39, 40, 40, 42, 42, 47, 47, 50]
sorted_cw_results = [0, 0, 0, 2, 4, 6, 6, 6, 7, 7, 8, 9, 9, 9, 10, 13, 13, 14, 15, 15, 17, 18, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20]
sorted_re_results = [0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 7, 7, 7, 9, 9, 10, 10, 11, 12, 12, 12, 15, 17, 18, 20, 22, 23, 25, 28]
sorted_we_results = [0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 9, 10, 10]

# The sequences are compared position by position, so they must all have the
# same length; fail here rather than letting a mismatch surface later as NaNs.
_result_lengths = {
    len(seq)
    for seq in (sorted_cp_results, sorted_cw_results, sorted_re_results, sorted_we_results)
}
assert len(_result_lengths) == 1, f'result lists differ in length: {sorted(_result_lengths)}'

# Strictly increasing reference sequence 0..n-1, sized from the data instead
# of a hard-coded 35.
best_possible_sorting = list(range(len(sorted_cp_results)))

In [42]:
# Collect the result sequences as columns of one frame, with the reference
# ordering in the 'best' column.
# Building from a dict gives the columns their names directly (no transpose
# and rename step) and raises if the sequences differ in length instead of
# silently padding with NaN.
testdata = pd.DataFrame({
    'cp': sorted_cp_results,
    'cw': sorted_cw_results,
    're': sorted_re_results,
    'we': sorted_we_results,
    'best': best_possible_sorting,
})
testdata

Unnamed: 0,cp,cw,re,we,best
0,0,0,0,0,0
1,2,0,0,0,1
2,5,0,0,0,2
3,11,2,0,0,3
4,14,4,0,0,4
5,25,6,4,0,5
6,26,6,4,1,6
7,27,6,4,2,7
8,29,7,4,2,8
9,30,7,4,3,9


In [44]:
# Pairwise Kendall rank correlation between all columns of `testdata`,
# rounded to six decimals for display.
testdata.corr(method='kendall').round(6)

Unnamed: 0,cp,cw,re,we,best
cp,1.0,0.947645,0.948522,0.953961,0.983904
cw,0.947645,1.0,0.919519,0.932722,0.953543
re,0.948522,0.919519,1.0,0.940966,0.952661
we,0.953961,0.932722,0.940966,1.0,0.956183
best,0.983904,0.953543,0.952661,0.956183,1.0
