In [60]:
%load_ext autoreload
import json
import sys
import pathlib
from matplotlib import pylab as plt
import numpy as np
import pandas as pd
import os
import sys
import seaborn as sns

def load_notebooks(directory: str, prefix: str) -> dict[str, dict]:
    """Load every ``*.ipynb`` file below ``directory`` as parsed JSON.

    Args:
        directory: Folder that is searched recursively for notebook files.
        prefix: Leading part of the file stem that is removed to build the
            dictionary key (a stem without this prefix is used unchanged).

    Returns:
        Mapping from the shortened file stem to the parsed notebook JSON.
    """
    notebooks: dict[str, dict] = {}
    # rglob() yields paths in arbitrary order; sorting keeps the key order
    # (and everything derived from it) stable across machines and runs.
    for path in sorted(pathlib.Path(directory).rglob('*.ipynb')):
        # Notebook files are UTF-8 encoded JSON. Being explicit avoids decode
        # errors on platforms whose default encoding is not UTF-8.
        with open(path, encoding='utf-8') as f:
            notebooks[path.stem.removeprefix(prefix)] = json.load(f)
    return notebooks


cp_data: dict[str, dict] = load_notebooks(
    'data/shared-dataset/corona_pandemie', 'corona_pandemie_')

cw_data: dict[str, dict] = load_notebooks(
    'data/shared-dataset/corona_warn_app_analyse', 'corona_warn_app_analyse_')

rp_data: dict[str, dict] = load_notebooks(
    'data/shared-dataset/reproduktionszahl', 'reproduktionszahl_')

wr_data: dict[str, dict] = load_notebooks(
    'data/shared-dataset/werbeindustrie', 'werbeindustrie_')

In [61]:
# Show which notebooks were loaded for the corona_pandemie dataset; the keys
# are the file stems with the 'corona_pandemie_' prefix removed.
cp_data.keys()

dict_keys(['066e', '146f', '1953', '21de', '2281', '23a6', '3669', '44bd', '4c2a', '4c41', '4ca2', '63e1', '6b4e', '6c5b', '6f62', '7cc7', '88a6', '8ac3', '8c43', '8d45', '8eaf', '9eb2', 'a044', 'abd2', 'ad47', 'b14b', 'b56b', 'baa4', 'c926', 'c938', 'cb76', 'd28c', 'eac9', 'fb4b', 'fe28'])

In [76]:
def filter_images(data):
    """Blank out embedded PNG payloads in code-cell outputs, in place.

    ``data`` maps an arbitrary key to a parsed notebook dict with a ``cells``
    list. For every code cell that has ``outputs``, each output whose ``data``
    bundle contains an ``image/png`` entry gets that entry set to ``None``.
    The key itself is kept. Nothing is returned.
    """
    for notebook in data.values():
        for cell in notebook['cells']:
            # Only code cells that actually carry outputs are of interest.
            if cell['cell_type'] != 'code' or 'outputs' not in cell:
                continue
            for output in cell['outputs']:
                if 'data' not in output:
                    continue
                bundle = output['data']
                if 'image/png' in bundle:
                    # Drop the payload but keep the key.
                    bundle['image/png'] = None

# Strip the PNG payloads from all four datasets (each dict is modified in place).
for dataset in (cp_data, cw_data, rp_data, wr_data):
    filter_images(dataset)

In [100]:
from transformers import AutoTokenizer
import os
from dotenv import load_dotenv

# Load variables from a local .env file (python-dotenv) into the environment,
# then read the Hugging Face access token that is passed to the tokenizer
# loader. The value is None when the variable is not set.
load_dotenv()
HUGGINGFACE = os.environ.get("HUGGINGFACE")

# Building a tokenizer via from_pretrained() is expensive, and calc_tokens() is
# called once per notebook, so each model's tokenizer is created only once.
_TOKENIZERS: dict = {}


def _get_tokenizer(model_name: str):
    """Return the cached tokenizer for ``model_name``, loading it on first use."""
    if model_name not in _TOKENIZERS:
        _TOKENIZERS[model_name] = AutoTokenizer.from_pretrained(
            model_name,
            token=HUGGINGFACE)
    return _TOKENIZERS[model_name]


def calc_tokens(text: str, max_length: int = 0,
                model_name: str = "meta-llama/Llama-3.3-70B-Instruct") -> int:
    """Count the tokens ``text`` produces with the given model's tokenizer.

    Args:
        text: The string to tokenize.
        max_length: If non-zero and ``text`` is longer than this many
            characters, the text is cut into consecutive character chunks of
            this size and each chunk is tokenized separately. Chunk borders
            can split words, so the total may differ slightly from
            tokenizing the whole text at once. ``0`` disables chunking.
        model_name: Hugging Face model id whose tokenizer is used, e.g.
            "deepseek-ai/DeepSeek-R1" as an alternative to the default.

    Returns:
        The number of tokens, excluding special tokens such as BOS/EOS.

    Raises:
        ValueError: If ``max_length`` is negative (it would otherwise yield
            no chunks and silently report 0 tokens).
    """
    if max_length < 0:
        raise ValueError(f"max_length must be >= 0, got {max_length}")

    tokenizer = _get_tokenizer(model_name)

    if max_length != 0 and len(text) > max_length:
        chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]
    else:
        chunks = [text]

    # Pass add_special_tokens=True instead if special tokens (e.g. BOS/EOS)
    # should be included in the count.
    return sum(len(tokenizer.encode(chunk, add_special_tokens=False)) for chunk in chunks)

In [103]:
# Token count per notebook, keyed "<dataset prefix>_<notebook key>".
tokens = {}

datasets_by_prefix = {"cp": cp_data, "cw": cw_data, "rp": rp_data, "wr": wr_data}
for prefix, notebooks in datasets_by_prefix.items():
    for key, notebook in notebooks.items():
        # The notebook is measured as the str() of its parsed dict.
        tokens[f"{prefix}_{key}"] = calc_tokens(str(notebook))

# Average number of tokens per notebook.
print(sum(tokens.values()) / len(tokens))

4378.15


In [104]:
# Total token count over all notebooks of the four datasets.
sum(tokens.values())

612941

In [116]:
%autoreload 2
from llm import send_prompt

# Single trial comparison of two corona_pandemie notebooks. Each notebook is
# passed as the str() of its parsed dict; send_prompt comes from the local
# `llm` module.
send_prompt(str(cp_data['4c41']), str(cp_data['9eb2']))

{'id': 'chatcmpl-d628aa494ef548fdab040dd8a935fe01',
 'object': 'chat.completion',
 'created': 1738882075,
 'model': 'meta-llama-3-70b-instruct',
 'choices': [{'index': 0,
   'message': {'role': 'assistant', 'content': 'Notebook A', 'tool_calls': []},
   'logprobs': None,
   'finish_reason': 'stop',
   'stop_reason': None}],
 'usage': {'prompt_tokens': 27358,
  'total_tokens': 27362,
  'completion_tokens': 4,
  'prompt_tokens_details': None},
 'prompt_logprobs': None}

In [127]:
def sort_function(d1, d2, retry=4, data=None):
    """Comparator that asks the LLM which of two notebooks ranks first.

    Args:
        d1: Key of the notebook passed to ``send_prompt`` as first argument
            (the one the model refers to as "Notebook A").
        d2: Key of the notebook passed as second argument ("Notebook B").
        retry: Maximum number of requests to make when the response is empty
            or malformed. Zero or a negative value makes no request.
        data: Mapping from key to parsed notebook. Defaults to ``cp_data``.

    Returns:
        -1 if the model answers "Notebook A", 1 if it answers "Notebook B",
        and 0 for any other answer or when every attempt failed.
    """
    # NOTE(review): the recorded run answers "Notebook A" for almost every
    # pair, which may indicate a position bias of the model. Consider also
    # querying the swapped order. TODO confirm.
    notebooks = cp_data if data is None else data
    # Characters the model may wrap around its answer (quotes, markdown, dots).
    decoration = " \t\r\n.!:\"'`*"

    # A loop instead of recursion: terminates for any ``retry`` value.
    for remaining in range(retry, 0, -1):
        response = send_prompt(str(notebooks[d1]), str(notebooks[d2]))
        if response:
            choices = response["choices"] if "choices" in response else []
            # The content can be None (e.g. a tool-call-only message). That is
            # treated as a malformed response instead of crashing on .strip().
            content = choices[0]["message"]["content"] if len(choices) > 0 else None
            if content is not None:
                msg = content.strip()
                print(msg)
                # Compare case-insensitively and ignore surrounding
                # punctuation so that e.g. "Notebook A." is not read as a tie.
                verdict = msg.strip(decoration).casefold()
                if verdict == "notebook a":
                    return -1
                if verdict == "notebook b":
                    return 1
                return 0
            print("Unerwartetes Antwortformat:", response)

        print(f"Retrying... {remaining - 1}")

    print("Maximale Anzahl an Versuchen erreicht.")
    return 0

In [118]:
# List the available corona_pandemie notebook keys again; these are the keys
# accepted by sort_function.
cp_data.keys()

dict_keys(['066e', '146f', '1953', '21de', '2281', '23a6', '3669', '44bd', '4c2a', '4c41', '4ca2', '63e1', '6b4e', '6c5b', '6f62', '7cc7', '88a6', '8ac3', '8c43', '8d45', '8eaf', '9eb2', 'a044', 'abd2', 'ad47', 'b14b', 'b56b', 'baa4', 'c926', 'c938', 'cb76', 'd28c', 'eac9', 'fb4b', 'fe28'])

In [128]:
%autoreload 2
from sort import quicksort
import random

# Ranking: 4c41, fe28, 4c2a, 63e1, 21de, 6c5b, fb4b, 146f
# (presumably a reference order for this small subset; TODO confirm its origin)
#check_idxs = ['4c41', 'fe28', '4c2a', '63e1', '21de', '6c5b', 'fb4b', '146f']
check_idxs = list(cp_data.keys())

# Shuffle the list so the result does not depend on the load order. A
# dedicated, seeded generator keeps the comparison sequence (and therefore
# the LLM calls and the final ranking) reproducible between runs; change
# SHUFFLE_SEED to try a different starting order.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(check_idxs)

cp_data_sorted = quicksort(check_idxs, sort_function)

Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook B
Notebook A
Notebook A
Notebook B
Notebook A
Notebook A
Notebook B
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook B
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook B
Notebook A
Notebook B
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook B
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A
Notebook A

KeyboardInterrupt: 

In [129]:
# Show the ranking returned by quicksort.
# NOTE(review): if the sorting cell was interrupted before it finished, the
# assignment never happened and this still shows the result of an earlier run
# (the recorded output lists 8 keys although cp_data holds 35). Confirm the
# sort completed before relying on this value.
cp_data_sorted

['146f', '4c41', '21de', '63e1', '4c2a', 'fb4b', 'fe28', '6c5b']