In [1]:
import os
import sys
import pandas as pd

sys.path.append('/home/fboehning/fboehning')

In [2]:
# IMPORT: labtools
import labtools
from labtools.labhandler import *
from labtools.datahandle import *
from labtools.display_utils import *
from labtools.directory_utils import *
from labtools.dictionary_utils import *
from labtools.experiment_utils import *

set_display_for_dataframe()
pd.set_option('display.max_rows', 7)

In [None]:
# IMPORT: beesup_llm
import beesup_llm.llm; from beesup_llm.llm import *
import beesup_llm.extraction; from beesup_llm.extraction import *
import beesup_llm.extraction.pipeline; from beesup_llm.extraction.pipeline import *
import beesup_llm.extraction.experiment; from beesup_llm.extraction.experiment import *

In [4]:
import importlib
prefixpaths=['labtools','beesup_llm']

def get_modules_with_prefixpath(prefixpaths):
    """Collect already-imported modules whose name begins with a given prefix.

    Args:
        prefixpaths: A single prefix string, or an iterable of prefix strings.

    Returns:
        dict: module name -> the corresponding ``sys.modules`` entry, for
        every name that starts with at least one prefix. Matching is a plain
        ``str.startswith`` check, so the prefix ``'lab'`` also selects
        ``'labtools'``.
    """
    wanted = [prefixpaths] if isinstance(prefixpaths, str) else prefixpaths

    # Iterate over a copy of sys.modules rather than the live mapping.
    snapshot = sys.modules.copy()

    # Prefix-major order keeps the same insertion order as a nested loop.
    return {
        name: mod
        for prefix in wanted
        for name, mod in snapshot.items()
        if name.startswith(prefix)
    }

def reimport(prefixpaths=prefixpaths):
    """Reload matching modules and re-publish their public names here.

    Every module returned by ``get_modules_with_prefixpath(prefixpaths)`` is
    reloaded via ``importlib.reload``; afterwards each attribute listed by
    ``dir(module)`` whose name does not start with ``'_'`` is written into
    this namespace's ``globals()``.

    Args:
        prefixpaths: Prefix string or iterable of prefixes. The default is the
            module-level ``prefixpaths`` value as it was when this function
            was defined.
    """
    matched = get_modules_with_prefixpath(prefixpaths)
    for module_path in matched:
        module = matched[module_path]
        # The reload targets the current sys.modules entry; attributes are
        # then read from the module object captured in `matched`.
        importlib.reload(sys.modules[module_path])

        public = {}
        for name in dir(module):
            if name.startswith('_'):
                continue
            public[name] = getattr(module, name)
        globals().update(public)

import logging

# Record layout: timestamp, source file, logger name, function, level, message.
LOGGING_FORMAT = '%(asctime)s - %(filename)s - %(name)s - %(funcName)s - %(levelname)s - %(message)s'

# Configure the root logger with the layout above at INFO level.
logging.basicConfig(format=LOGGING_FORMAT, level=logging.INFO)

def set_info(prefixpaths=prefixpaths):
    """Switch the root logger and all matching module loggers to INFO.

    Args:
        prefixpaths: Prefix string or iterable of prefixes selecting the
            imported modules (via ``get_modules_with_prefixpath``) whose
            same-named loggers are adjusted. The default is the module-level
            ``prefixpaths`` value as it was when this function was defined.
    """
    # The root level does not depend on the loop variable: set it exactly
    # once, and also when no imported module matches the prefixes.
    logging.getLogger().setLevel(logging.INFO)

    for module_path in get_modules_with_prefixpath(prefixpaths):
        logging.getLogger(module_path).setLevel(logging.INFO)

def set_debug(prefixpaths=prefixpaths):
    """Switch the root logger and all matching module loggers to DEBUG.

    Args:
        prefixpaths: Prefix string or iterable of prefixes selecting the
            imported modules (via ``get_modules_with_prefixpath``) whose
            same-named loggers are adjusted. The default is the module-level
            ``prefixpaths`` value as it was when this function was defined.
    """
    # The root level does not depend on the loop variable: set it exactly
    # once, and also when no imported module matches the prefixes.
    logging.getLogger().setLevel(logging.DEBUG)

    for module_path in get_modules_with_prefixpath(prefixpaths):
        logging.getLogger(module_path).setLevel(logging.DEBUG)

set_info() 

In [5]:
#LOAD LLM
# `copy` is not imported explicitly anywhere above; import it here so this
# cell does not rely on a wildcard import happening to expose the name.
import copy

llm_pipe=LlamaPipeline()
llm_pipe.prepare_inference()
# Keep a deep copy so `model` is a separate object from whatever
# llm_pipe.get_model() returns.
# NOTE(review): this duplicates the loaded model in memory - confirm it is needed.
model=copy.deepcopy(llm_pipe.get_model())

2025-03-17 08:47:28,030 - llm.py - labtools.labhandler - load_model - INFO - Loading model meta-llama/Meta-Llama-3.1-8B-Instruct
2025-03-17 08:47:28,926 - modeling.py - accelerate.utils.modeling - get_balanced_memory - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
#LOAD DATA
from datasets import load_from_disk

# Read the dataset stored on disk; its 'train' and 'test' entries are used below.
dataset_ds = load_from_disk('../beede_llm/data/beede_llm_dataset')

# One DataFrame per split, each tagged with a 'split' column.
# The 'test' entry is labelled 'eval' here.
train_df = pd.DataFrame(dataset_ds['train']).assign(split='train')
eval_df = pd.DataFrame(dataset_ds['test']).assign(split='eval')

# Stack both frames (each frame keeps its own row index) and display the result.
dataset_df = pd.concat([train_df, eval_df])
dataset_df

Unnamed: 0,report_passage,report_scheme,gold_completion,source,is_real,split
0,Beitrag zur Hautflüglerfauna von Bran-\ndenbur...,paragraphs,"```json\n{\n ""meta_scientific_name"": ""Andrena...",https://www.zobodat.at/pdf/Maerkische-Ent-Nach...,True,train
1,"Christoph Saure\n\nHoplitis villosa (Schenck, ...",paragraphs,"```json\n{\n ""meta_location"": ""Berlin, Dahlem...",https://www.zobodat.at/pdf/Maerkische-Ent-Nach...,True,train
2,Beitrag zur Hautflüglerfauna von Bran-\ndenbur...,paragraphs,"```json\n{\n ""meta_scientific_name"": ""Andrena...",https://www.zobodat.at/pdf/Maerkische-Ent-Nach...,True,train
...,...,...,...,...,...,...
27,Für das Stadtgebiet Bielefeld nördlich des Te...,list_table,"```json\n{\n ""meta_location"": ""Nordrhein-West...",https://www.zobodat.at/pdf/Ber-Natwiss-Ver-Bie...,True,eval
28,Für das Stadtgebiet Bielefeld nördlich des Te...,list_table,"```json\n{\n ""meta_location"": ""Nordrhein-West...",https://www.zobodat.at/pdf/Ber-Natwiss-Ver-Bie...,True,eval
29,Die Insekten des Naturschutzparkes\nder Lünebu...,list_table,"```json\n{\n ""meta_location"": ""Lüneburger Hei...",https://www.zobodat.at/pdf/Abh-natwiss-Verein-...,True,eval


In [None]:
# Wrap the combined train/eval DataFrame in a Datahandle labelled
# 'beede_llm_dataset'.
datah=Datahandle(
    label='beede_llm_dataset',
    data_df=dataset_df,
)
# Last expression of the cell: displays the handle's config.
datah.config
# Saving is left disabled here; uncomment to call datah.save().
#datah.save()

In [7]:
reimport()

In [14]:
getattr(None, 'hello', None)

In [5]:
# Define a new ExtractionExperiment and save it.
experiment=ExtractionExperiment(
    # A freshly constructed LlamaPipeline is passed as the LLM pipeline.
    llm_pipe=LlamaPipeline(),
    # presumably refers to the stored Datahandle with id 1 - TODO confirm
    data_df=Datahandle(1),

    # Extraction pipeline with few-shots enabled and the extraction prompt disabled.
    extraction_pipe=ExtractionPipeline(
        use_few_shots = True,
        use_extraction_prompt = False
    ),

    # Training / checkpoint settings passed to the experiment.
    num_train_epochs=10,
    save_strategy='epoch',
    save_total_limit=1,

    # Stage switches: base-model evaluation, LoRA-model evaluation, training.
    do_eval_base_model=True,
    do_eval_lora_model=True,
    do_train=True
    )

# Not the last expression of the cell, so by default its value is not displayed.
experiment.config
experiment.save()

In [9]:
reimport()

In [29]:
set_info()

In [6]:
# Request an overview from a default-constructed ExtractionExperiment,
# passing the listed keypaths, and display the result.
experiments_df=ExtractionExperiment().get_overview(keypaths=['done','num_train_epochs','do_eval_base_model','do_train','_path'])
experiments_df

Unnamed: 0,class_parts,id,label,done,num_train_epochs,do_eval_base_model,do_train,_path
0,"[FinetuningExperiment, ExtractionExperiment]",1,ExtractionExperiment,False,10,True,True,/home/fboehning/fboehning/extraction_lab/Finet...


In [7]:
generate_multirun(experiments_df)


Run the following command to start the multirun:

	conda activate beesup; python /home/fboehning/fboehning/labtools/experiment_runner.py /home/fboehning/fboehning/extraction_lab/FinetuningExperiment/ExtractionExperiment/0001_ExtractionExperiment/multirun.yaml


{'experiment_paths': ['/home/fboehning/fboehning/extraction_lab/FinetuningExperiment/ExtractionExperiment/0001_ExtractionExperiment']}

In [7]:
# Construct the experiment with the already-loaded `model` passed in.
# NOTE(review): -1 presumably selects the most recent stored experiment - confirm.
experiment=ExtractionExperiment(-1, model=model)
# Optional overrides to switch individual stages off before running:
# experiment.do_train=False
# experiment.do_eval_base_model=False
# experiment.do_eval_lora_model=False
# Last expression of the cell: displays the experiment's config.
experiment.config

{'id': 1,
 'label': 'ExtractionExperiment',
 'lab_name': 'extraction_lab',
 'class_parts': ['FinetuningExperiment', 'ExtractionExperiment'],
 'module_path': 'beesup_llm.extraction.extraction_experiment',
 'done': False,
 'do_eval_base_model': True,
 'do_eval_lora_model': True,
 'do_train': True,
 'lora_config': {'r': 32,
  'lora_alpha': 3,
  'use_rslora': True,
  'target_modules': 'all-linear',
  'lora_dropout': 0.05,
  'bias': 'none',
  'task_type': 'CAUSAL_L'},
 'sft_config': {'num_train_epochs': 2,
  'output_dir': '/home/fboehning/fboehning/extraction_lab/FinetuningExperiment/ExtractionExperiment/0001_ExtractionExperiment',
  'auto_find_batch_size': True,
  'per_device_train_batch_size': 8,
  'gradient_accumulation_steps': 1,
  'learning_rate': 0.0002,
  'optim': 'paged_adamw_8bit',
  'save_strategy': 'epoch',
  'save_total_limit': 1,
  'eval_strategy': 'no',
  'logging_strategy': 'steps',
  'logging_steps': 1,
  'logging_first_step': True,
  'do_train': True,
  'do_eval': False,
  

In [8]:
experiment.do_eval_base_model

True

In [9]:
experiment.run()

2025-03-17 08:48:44,794 - finetuning_experiment.py - labtools.labhandler - run_entry - INFO - Run Experiment
2025-03-17 08:48:49,024 - finetuning_experiment.py - labtools.labhandler - evaluate_base_model - INFO - Evaluating base model
2025-03-17 08:48:49,025 - finetuning_experiment.py - beesup_llm.finetuning_experiment - on_epoch_end - INFO - Start Evaluation 0-0 
2025-03-17 08:48:49,027 - extraction_experiment.py - labtools.labhandler - on_epoch_end - INFO - 0-0  kwargs={'evaluators': [<beesup_llm.extraction.extraction_experiment.ExtractionEvaluator object at 0x7fca98ecbfb0>]}
2025-03-17 08:49:22,524 - finetuning_experiment.py - labtools.labhandler - load_trainer - INFO - Loading trainer
2025-03-17 08:49:22,526 - finetuning_experiment.py - labtools.labhandler - get_lora_model - INFO - Loading Lora model


trainable params: 83,886,080 || all params: 8,114,147,328 || trainable%: 1.0338249554642545


2025-03-17 08:49:38,792 - finetuning_experiment.py - labtools.labhandler - train - INFO - Start Training


{'loss': 0.052, 'grad_norm': 0.10685398429632187, 'learning_rate': 0.00015000000000000001, 'epoch': 0.5}


2025-03-17 08:50:31,572 - finetuning_experiment.py - beesup_llm.finetuning_experiment - on_epoch_end - INFO - Start Evaluation 1-2 
2025-03-17 08:50:31,580 - extraction_experiment.py - labtools.labhandler - on_epoch_end - INFO - 1-2  kwargs={'processing_class': PreTrainedTokenizerFast(name_or_path='meta-llama/Meta-Llama-3.1-8B-Instruct', vocab_size=128000, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|begin_of_text|>', 'eos_token': '<|eot_id|>', 'pad_token': '<|eot_id|>'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	128000: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128003: AddedToken(

{'loss': 0.0346, 'grad_norm': 0.10775291174650192, 'learning_rate': 0.0001, 'epoch': 1.0}


The model 'PeftModel' is not supported for . Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausa

{'loss': 0.0185, 'grad_norm': 0.045758046209812164, 'learning_rate': 5e-05, 'epoch': 1.5}
{'loss': 0.0127, 'grad_norm': 0.05856623128056526, 'learning_rate': 0.0, 'epoch': 2.0}


2025-03-17 08:51:54,744 - finetuning_experiment.py - beesup_llm.finetuning_experiment - on_epoch_end - INFO - Start Evaluation 2-4 
2025-03-17 08:51:54,751 - extraction_experiment.py - labtools.labhandler - on_epoch_end - INFO - 2-4  kwargs={'processing_class': PreTrainedTokenizerFast(name_or_path='meta-llama/Meta-Llama-3.1-8B-Instruct', vocab_size=128000, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|begin_of_text|>', 'eos_token': '<|eot_id|>', 'pad_token': '<|eot_id|>'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	128000: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128003: AddedToken(

{'train_runtime': 164.6288, 'train_samples_per_second': 0.049, 'train_steps_per_second': 0.024, 'train_loss': 0.02943698945455253, 'epoch': 2.0}


In [37]:
get_timestamp()

'2025-03-14_14-56-16'

In [38]:
experiment.timestamp_start=get_timestamp()
experiment.save_config()

In [36]:
hasattr(experiment,'save_config')

True

In [25]:
experiments_df._path.to_list()

['/home/fboehning/fboehning/extraction_lab/FinetuningExperiment/ExtractionExperiment/0001_ExtractionExperiment']

In [18]:
os.getcwd()

'/home/fboehning/fboehning/extraction_lab'

In [17]:
experiments_df.iloc[0]._path

'/home/fboehning/fboehning/extraction_lab/FinetuningExperiment/ExtractionExperiment/0001_ExtractionExperiment'