In [1]:
%load_ext autoreload
import json
import sys
import pathlib
from matplotlib import pylab as plt
import numpy as np
import pandas as pd
import os
import sys
import seaborn as sns

def load_notebooks(directory: str, prefix: str) -> dict[str, dict]:
    """Load every ``*.ipynb`` file below ``directory`` as parsed JSON.

    Args:
        directory: Folder that is searched recursively for notebook files.
        prefix: Leading part of each file stem that is stripped to form the key.

    Returns:
        Mapping from the shortened file stem to the parsed notebook JSON;
        empty when no notebook file is found.
    """
    notebooks: dict[str, dict] = {}
    # Sorting makes the key order independent of the filesystem's listing order.
    for file in sorted(pathlib.Path(directory).rglob('*.ipynb')):
        # Notebook files are UTF-8 JSON; do not rely on the platform default encoding.
        with open(file, encoding='utf-8') as f:
            notebooks[file.stem.removeprefix(prefix)] = json.load(f)
    return notebooks


# One dict per assignment, keyed by the notebook id that remains after the prefix.
cp_data: dict[str, dict] = load_notebooks('data/shared-dataset/corona_pandemie', 'corona_pandemie_')
cw_data: dict[str, dict] = load_notebooks('data/shared-dataset/corona_warn_app_analyse', 'corona_warn_app_analyse_')
rp_data: dict[str, dict] = load_notebooks('data/shared-dataset/reproduktionszahl', 'reproduktionszahl_')
wr_data: dict[str, dict] = load_notebooks('data/shared-dataset/werbeindustrie', 'werbeindustrie_')

In [2]:
# Show the ids of the loaded corona_pandemie notebooks (file stems without the prefix).
cp_data.keys()

dict_keys(['066e', '146f', '1953', '21de', '2281', '23a6', '3669', '44bd', '4c2a', '4c41', '4ca2', '63e1', '6b4e', '6c5b', '6f62', '7cc7', '88a6', '8ac3', '8c43', '8d45', '8eaf', '9eb2', 'a044', 'abd2', 'ad47', 'b14b', 'b56b', 'baa4', 'c926', 'c938', 'cb76', 'd28c', 'eac9', 'fb4b', 'fe28'])

In [3]:
def filter_images(data):
    """Blank out embedded PNG payloads in the outputs of all code cells.

    ``data`` maps an arbitrary key to a parsed notebook (a dict with a
    ``'cells'`` list). The notebooks are modified in place: every
    ``'image/png'`` entry of a code-cell output is replaced by ``None``.
    Nothing is returned.
    """
    for notebook in data.values():
        for nb_cell in notebook['cells']:
            # Only code cells that actually carry outputs are of interest.
            if nb_cell['cell_type'] != 'code' or 'outputs' not in nb_cell:
                continue
            for out in nb_cell['outputs']:
                if 'data' not in out:
                    continue
                if 'image/png' in out['data']:
                    # Keep the key, drop the (large) encoded image.
                    out['data']['image/png'] = None

# Strip the embedded images from all four datasets (in place).
for _dataset in (cp_data, cw_data, rp_data, wr_data):
    filter_images(_dataset)

In [4]:
from transformers import AutoTokenizer
import os
from dotenv import load_dotenv

# Read a local .env file (if any) into the environment, then fetch the
# Hugging Face access token; it is None when the variable is not set.
load_dotenv()
HUGGINGFACE = os.environ.get("HUGGINGFACE")

# Tokenizer used for counting; "deepseek-ai/DeepSeek-R1" was the alternative tried before.
DEFAULT_TOKENIZER_MODEL = "meta-llama/Llama-3.3-70B-Instruct"

# Loaded tokenizers by model name, so repeated calls do not reload them.
_TOKENIZER_CACHE: dict = {}


def _get_tokenizer(model_name: str):
    """Return the tokenizer for ``model_name``, loading it only on first use."""
    if model_name not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[model_name] = AutoTokenizer.from_pretrained(
            model_name,
            token=HUGGINGFACE)
    return _TOKENIZER_CACHE[model_name]


def calc_tokens(text: str, max_length: int = 0,
                model_name: str = DEFAULT_TOKENIZER_MODEL) -> int:
    """Count the tokens ``text`` encodes to, without special tokens.

    Args:
        text: The string to tokenize.
        max_length: If positive and ``text`` is longer, the text is cut into
            pieces of at most ``max_length`` characters that are tokenized
            separately and summed. Zero or a negative value disables chunking.
            Chunk boundaries are character positions, so a chunked count can
            differ from the unchunked one.
        model_name: Hugging Face model id whose tokenizer is used.

    Returns:
        The number of tokens (special tokens such as BOS/EOS are not added;
        pass through ``add_special_tokens=True`` in the code if they should
        be included).
    """
    tokenizer = _get_tokenizer(model_name)

    if max_length > 0 and len(text) > max_length:
        chunks = (text[i:i + max_length] for i in range(0, len(text), max_length))
    else:
        chunks = (text,)

    return sum(len(tokenizer.encode(chunk, add_special_tokens=False)) for chunk in chunks)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [5]:
# Token count per notebook, keyed "<dataset label>_<notebook id>".
_labelled_datasets = (("cp", cp_data), ("cw", cw_data), ("rp", rp_data), ("wr", wr_data))
tokens = {
    f"{label}_{notebook_id}": calc_tokens(str(notebook))
    for label, dataset in _labelled_datasets
    for notebook_id, notebook in dataset.items()
}

# Mean number of tokens per notebook across all datasets.
print(sum(tokens.values())/len(tokens))

4378.15


In [6]:
# Total token count over all notebooks of the four datasets.
sum(tokens.values())

612941

In [7]:
%autoreload 2
from llm import send_prompt

# Single trial request with two corona_pandemie notebooks, to inspect the raw response.
# NOTE(review): send_prompt lives in the local llm module; its prompt is not visible here.
send_prompt(str(cp_data['4c41']), str(cp_data['9eb2']))

{'id': 'chatcmpl-470f4e7f0e7e419885ae1bc1cfd5a216',
 'object': 'chat.completion',
 'created': 1738915687,
 'model': 'meta-llama-3-70b-instruct',
 'choices': [{'index': 0,
   'message': {'role': 'assistant', 'content': 'Notebook A', 'tool_calls': []},
   'logprobs': None,
   'finish_reason': 'stop',
   'stop_reason': None}],
 'usage': {'prompt_tokens': 27389,
  'total_tokens': 27393,
  'completion_tokens': 4,
  'prompt_tokens_details': None},
 'prompt_logprobs': None}

In [8]:
def _extract_answer(response):
    """Return the stripped assistant text of a chat-completion response.

    Returns None when the response is empty or carries no usable text; a
    non-empty response with an unexpected layout is printed for inspection.
    """
    if not response:
        return None
    try:
        content = response["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        content = None
    if not isinstance(content, str):
        print("Unerwartetes Antwortformat:", response)
        return None
    return content.strip()


def sort_function(d1, d2, retry=4, data=None):
    """Compare two notebooks by asking the LLM which one is better.

    Args:
        d1: Key of the notebook presented first.
        d2: Key of the notebook presented second.
        retry: Maximum number of requests before giving up.
        data: Mapping from key to notebook; defaults to ``cp_data``.

    Returns:
        -1 if the model answers "Notebook A", 1 for "Notebook B", and 0 for
        any other answer. 0 is also returned when no usable response was
        obtained within ``retry`` attempts, so a failed comparison is
        indistinguishable from a tie.
    """
    notebooks = cp_data if data is None else data
    # Build the prompt texts once; they do not change between retries.
    first, second = str(notebooks[d1]), str(notebooks[d2])

    for attempts_left in range(retry, 0, -1):
        answer = _extract_answer(send_prompt(first, second))
        if answer is not None:
            if answer == "Notebook A":
                print(f"{d1} > {d2}")
                return -1
            if answer == "Notebook B":
                print(f"{d1} < {d2}")
                return 1
            print(f"{d1} == {d2}")
            return 0
        print(f"Retrying... {attempts_left - 1}")

    print("Maximale Anzahl an Versuchen erreicht.")
    return 0

In [9]:
# Show the available corona_pandemie notebook ids again (same expression as in the earlier cell).
cp_data.keys()

dict_keys(['066e', '146f', '1953', '21de', '2281', '23a6', '3669', '44bd', '4c2a', '4c41', '4ca2', '63e1', '6b4e', '6c5b', '6f62', '7cc7', '88a6', '8ac3', '8c43', '8d45', '8eaf', '9eb2', 'a044', 'abd2', 'ad47', 'b14b', 'b56b', 'baa4', 'c926', 'c938', 'cb76', 'd28c', 'eac9', 'fb4b', 'fe28'])

In [10]:
%autoreload 2
from sort import quicksort
import random

# Ranking: 4c41, fe28, 4c2a, 63e1, 21de, 6c5b, fb4b, 146f
#check_idxs = ['4c41', 'fe28', '4c2a', '63e1', '21de', '6c5b', 'fb4b', '146f']
check_idxs = list(cp_data.keys())
# Shuffle with a fixed seed on a private generator, so the order handed to the
# sort is reproducible across runs and the global random state is untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(check_idxs)

# Every comparison made by the sort is an LLM request via sort_function.
cp_data_sorted = quicksort(check_idxs, sort_function)

c926 > 4ca2
44bd > c926
c926 > 4ca2
c938 > c926
88a6 > c926
ad47 > c926
baa4 > c926
fb4b < c926
d28c > c926
6c5b > c926
8c43 > c926
146f < c926
a044 < c926
8d45 > c926
abd2 < c926
066e < c926
6f62 < c926
2281 > c926
b56b > c926
7cc7 > c926
21de < c926
eac9 > c926
6b4e > c926
b14b > c926
9eb2 < c926
4c41 > c926
63e1 > c926
fe28 > c926
cb76 > c926
23a6 > c926
4c2a < c926
3669 > c926
8eaf > c926
1953 < c926
8ac3 < c926
c926 > c926
c926 > 8ac3
c926 > 1953
c926 > 8eaf
c926 > 3669
c926 > 4c2a
c926 > 23a6
c926 > cb76
c926 > fe28
c926 > 63e1
c926 > 4c41
c926 > 9eb2
c926 > b14b
c926 > 6b4e
c926 > eac9
c926 > 21de
c926 > 7cc7
c926 > b56b
c926 > 2281
c926 > 6f62
c926 > 066e
c926 > abd2
c926 > 8d45
c926 > a044
c926 > 146f
c926 > 8c43
c926 > 6c5b
c926 > d28c
c926 > fb4b
c926 > baa4
c926 > ad47
c926 > 88a6
c926 > c938
b56b > 44bd
4ca2 > b56b
b56b > 44bd
c938 > b56b
88a6 > b56b
ad47 > b56b
baa4 > b56b
fb4b < b56b
d28c > b56b
6c5b > b56b
8c43 > b56b
146f < b56b
a044 > b56b
8d45 > b56b
abd2 > b56b
066e

In [11]:
# Display the ordering currently stored in cp_data_sorted.
cp_data_sorted

['d28c',
 'fb4b',
 'baa4',
 'ad47',
 '88a6',
 'c938',
 '4ca2',
 '2281',
 '146f',
 '8c43',
 '63e1',
 '8d45',
 '6b4e',
 '3669',
 '23a6',
 '9eb2',
 'b14b',
 '8eaf',
 '6f62',
 '066e',
 'abd2',
 '44bd',
 '4c41',
 'a044',
 'fe28',
 '1953',
 'cb76',
 '7cc7',
 '4c2a',
 '21de',
 '6c5b',
 'eac9',
 '8ac3',
 'b56b',
 'c926']

In [12]:
# Reference table for the corona_pandemie notebooks; later cells read its 'id' and 'rank' columns.
corona_pandemie_points = pd.read_csv('data/shared-dataset/corona_pandemie_points.csv')

In [29]:
# Reference rank next to the position each notebook got in cp_data_sorted,
# indexed by notebook id.
dfm = (
    corona_pandemie_points[['id', 'rank']]
    .assign(quick_sorted_idx=lambda frame: frame['id'].map(cp_data_sorted.index))
    .set_index('id')
)
# Off-diagonal entry of the 2x2 Kendall correlation matrix of the two columns.
kendall = dfm.corr(method='kendall').iloc[0, 1]
print("Kendall:   ", kendall)

Kendall:    -0.06149400462680908


In [30]:
# Show the comparison table: reference rank and position in cp_data_sorted per notebook id.
dfm

Unnamed: 0_level_0,rank,quick_sorted_idx
id,Unnamed: 1_level_1,Unnamed: 2_level_1
4c41,1,22
eac9,2,31
d28c,2,0
6b4e,4,12
1953,4,25
c926,6,34
8eaf,6,17
8d45,8,11
cb76,8,26
fe28,10,24


In [32]:
%autoreload 2
from sort import heapsort
import random

check_idxs = list(cp_data.keys())
# Shuffle with a fixed seed on a private generator, so the order handed to the
# sort is reproducible across runs and the global random state is untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(check_idxs)

# NOTE: this rebinds cp_data_sorted, the same name the quicksort cell assigns.
cp_data_sorted = heapsort(check_idxs, sort_function)

4ca2 > b14b
b14b > fb4b
b56b > c926
c926 > 066e
7cc7 > 8eaf
8eaf < 44bd
8ac3 < 4c41
4c41 > ad47
2281 > 9eb2
9eb2 < baa4
c938 > d28c
d28c > a044
146f < 21de
21de < 6f62
abd2 < eac9
eac9 > 8d45
8c43 > 1953
1953 < 23a6
066e > fb4b
4ca2 > b14b
b14b > 3669
fb4b < 3669
ad47 < 44bd
7cc7 > 8eaf
8eaf > 4c2a
44bd > 4c2a
a044 < baa4
2281 > 9eb2
9eb2 < cb76
baa4 > cb76
8d45 > 6f62
146f < 21de
21de < 63e1
6f62 < 63e1
3669 > 23a6
8c43 > 1953
1953 < fe28
23a6 > fe28
cb76 > 4c2a
ad47 < 44bd
7cc7 < 8eaf
Error in request: 400 
Retrying... 3
Error in request: 400 
Retrying... 2
Error in request: 400 
Retrying... 1
Error in request: 400 
Retrying... 0
Maximale Anzahl an Versuchen erreicht.
fe28 > 63e1
8d45 > 6f62
146f < 21de
21de < 88a6
6f62 < 88a6
63e1 > 88a6
88a6 < 4c2a
cb76 > 44bd
ad47 > 8eaf
Error in request: 400 
Retrying... 3
Error in request: 400 
Retrying... 2
Error in request: 400 
Retrying... 1
Error in request: 400 
Retrying... 0
Maximale Anzahl an Versuchen erreicht.
7cc7 > 6b4e
8eaf > 6b4e
Ra

KeyboardInterrupt: 

In [None]:
# qwen2.5-coder-32b-instruct