In [1]:
!pip install pandas
!pip install openpyxl
!pip install langchain_text_splitters
!pip install -U transformers
!pip install peft

[0m

In [None]:
from huggingface_hub import login
import tqdm
# Authenticate against the Hugging Face Hub. `login()` is called without
# arguments, so no access token is hardcoded in the notebook.
# NOTE(review): presumably required to download the Mistral checkpoints used
# further down - confirm that they are gated.
login()

In [3]:
import pandas as pd
import time
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [4]:
def read_text(file_path):
    """Return the complete content of the UTF-8 encoded text file at ``file_path``.

    Raises whatever ``open`` raises (e.g. ``FileNotFoundError``) when the file
    cannot be opened.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return handle.read()

In [5]:
def update_pipeline(model_name):
    """Build a text-generation pipeline for ``model_name``.

    Two kinds of names are accepted:

    * A name containing ``"lora"`` (e.g. ``"lora-results_2m"``): everything
      after the first ``"-"`` is taken as the path of a LoRA adapter, which is
      loaded on top of ``mistralai/Mistral-7B-Instruct-v0.2``.
    * Any other name is passed unchanged to ``from_pretrained`` for both the
      tokenizer and the model.

    Args:
        model_name: Model identifier or ``"lora-<adapter path>"``.

    Returns:
        The object returned by ``transformers.pipeline("text-generation", ...)``,
        configured with ``max_new_tokens=200``.

    Raises:
        ValueError: If a LoRA name carries no adapter path after the first ``"-"``.
    """
    # Heavy libraries are imported lazily so that merely defining the function is cheap.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
    from peft import PeftModel

    if "lora" in model_name:
        base_model_name = "mistralai/Mistral-7B-Instruct-v0.2"
        # Split at the FIRST "-" only, so adapter paths that themselves contain
        # hyphens (e.g. "lora-results-2m") are not truncated.
        _, separator, lora_model_path = model_name.partition("-")
        if not separator or not lora_model_path:
            raise ValueError(
                f"LoRA model name must look like 'lora-<adapter path>', got {model_name!r}"
            )

        # Load base model and tokenizer
        model = AutoModelForCausalLM.from_pretrained(
            base_model_name, torch_dtype=torch.float16, device_map="auto"
        )
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)

        # Load LoRA adapter on top of the base model
        model = PeftModel.from_pretrained(model, lora_model_path)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype=torch.float16, device_map="auto"
        )

    # Fall back to the EOS token for padding. Applied to both branches so that
    # LoRA and plain models are configured the same way.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=200,  # upper bound on the length of the generated answer
        pad_token_id=tokenizer.eos_token_id,
    )

In [6]:
def question_answer(pipe, text, question):
    """Ask ``pipe`` a yes/no question about ``text`` and return the model's reply.

    A German prompt is built from ``question`` and ``text`` and passed to
    ``pipe``. From the first result's ``'generated_text'`` the leading
    ``len(prompt)`` characters are dropped and surrounding whitespace is
    stripped.

    NOTE(review): this assumes the generated text starts with an exact echo of
    the prompt - confirm for the pipelines in use.
    """
    prompt = f"Beantworte kurz die Frage \n{question} basierend auf folgendem Text mit ja oder nein: \n{text}"
    outputs = pipe(prompt)
    generated = outputs[0]['generated_text']
    continuation = generated[len(prompt):]
    return continuation.strip()

In [7]:
def generate(file_path, model_name, pipeline, question):
    """Summarise one file, ask one question about it and score the answer.

    Steps: read the file, build a pipeline via ``update_pipeline(model_name)``,
    call ``summarize``, call ``question_answer`` on the full text, and map the
    answer to a label with ``evaluate_answer``.

    Returns:
        Tuple ``(summary, answer, evaluation)``.

    NOTE(review): the ``pipeline`` argument is never read - a fresh pipeline is
    always built from ``model_name``. ``summarize`` is not defined in the
    visible part of this notebook, so calling this function will fail unless it
    is defined elsewhere.
    """
    document = read_text(file_path)
    text_pipe = update_pipeline(model_name)
    summary = summarize(document, text_pipe)
    reply = question_answer(text_pipe, document, question)
    score = evaluate_answer(reply)
    return summary, reply, score

In [8]:
# Load the sample table. The inference loop further down iterates over its rows
# with `itertuples()` and reads a file name from positional field 2 of each row.
data = pd.read_excel("Sample_combined.xlsx")
# Uncomment to restrict the run to the first two rows (quick smoke test):
#data = data[:2]

In [9]:
# Example inputs for a single-document run (see the commented-out
# `generate(file_path, model_name, pipeline, question1)` call below).
file_path = "Gerichtsurteile/BAG/jb-KARE600028845.txt"
model_name = "mistralai/Mistral-Small-24B-Instruct-2501"
# Yes/no questions in German; they are inserted verbatim into the prompt.
question1 = "Werden in dem Urteil Zahlungsstreitigkeiten behandelt?"
question2 = "Behandelt das Urteil den Arbeitsschutz?"
# NOTE(review): "Stafttat" / "Unversehrheit" look like typos for "Straftat" /
# "Unversehrtheit". Left unchanged here because editing the text changes the
# prompt the models receive.
question3 = "Wird eine Stafttat gegen die körperliche Unversehrheit betrachtet?"
question4 = "Ist der Streitgegenstand ein Patentsachverhalt?"

In [10]:
# Evaluation of a model answer
def evaluate_answer(result):
    """Map a free-text answer to a numeric yes/no label.

    Matching is case-insensitive and done on whole words, so words that merely
    contain "ja" (e.g. "Jahr", "Januar") no longer count as an affirmative.

    Args:
        result: Answer text produced by the model.

    Returns:
        1 if the answer contains the word "ja" but not "nein",
        0 if it contains "nein" but not "ja",
        2 otherwise (both or neither present).
    """
    import re

    words = set(re.findall(r"\w+", result.lower()))
    says_yes = "ja" in words
    says_no = "nein" in words
    if says_yes and not says_no:
        return 1
    if says_no and not says_yes:
        return 0
    return 2

#generate(file_path, model_name, pipeline, question1)

In [None]:
import gc

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Yes/no questions (German) asked about every document.
# NOTE(review): "Stafttat" / "Unversehrheit" look like typos for "Straftat" /
# "Unversehrtheit"; left unchanged because editing them changes the prompt.
questions = ["Werden in dem Urteil Zahlungsstreitigkeiten behandelt?",
             "Behandelt das Urteil den Arbeitsschutz?",
             "Wird eine Stafttat gegen die körperliche Unversehrheit betrachtet?",
             "Ist der Streitgegenstand ein Patentsachverhalt?"]

# Names containing "lora" are resolved by update_pipeline() to a LoRA adapter.
model_list = ["mistralai/Mistral-Small-24B-Instruct-2501","mistralai/Mistral-7B-Instruct-v0.2","lora-results_2m","lora-results_4m"]

# Documents with more tokens than this are split into chunks before prompting.
MAX_TOKENS = 4000

# Loop-invariant, so it is built once instead of once per long document.
# NOTE(review): chunk_size is passed without a length function, so it is
# presumably measured in characters while MAX_TOKENS is in tokens - confirm.
text_splitter = RecursiveCharacterTextSplitter(separators = ["\n\n","\n","."],chunk_size = 2000,chunk_overlap = 0)


def _classify_answer(answer):
    """Return 1 (yes), 0 (no) or 2 (undecided) for one model answer.

    Only the first six and last six characters of the lower-cased answer are
    inspected (concatenated). "yes"/"ja" is checked before "no"/"nein".
    NOTE(review): this is substring matching, so e.g. "no" also matches inside
    longer words; kept as-is so labels stay comparable with earlier runs.
    """
    lowered = answer.lower()
    window = lowered[:6] + lowered[-6:]
    if "yes" in window or "ja" in window:
        return 1
    if "no" in window or "nein" in window:
        return 0
    return 2


def _merge_chunk_labels(labels):
    """Combine per-chunk labels: any 1 wins, otherwise any 0, otherwise 2."""
    if 1 in labels:
        return 1
    if 0 in labels:
        return 0
    return 2


# Maps each model name to a list with one entry per document; each entry is a
# list with one label per question.
results_dict = dict()
for model_id in model_list:
    # Named `pipe` so the imported `transformers.pipeline` is not shadowed.
    pipe = update_pipeline(model_id)
    # Count tokens with the tokenizer of the model under test. Previously the
    # tokenizer was always loaded from the global `model_name`, regardless of
    # which model was being evaluated.
    tokenizer = pipe.tokenizer

    model_results = list()

    for row in data.itertuples():
        # row[0] is the DataFrame index; row[2] is used as the document's file name.
        try:
            content = read_text("Sample_1/"+row[2])
        except FileNotFoundError:
            content = read_text("Sample_2/"+row[2])

        token_count = len(tokenizer.encode(content))
        if token_count > MAX_TOKENS:
            parts = text_splitter.split_text(content)
        else:
            # A short document is handled as a single chunk.
            parts = [content]

        results = []
        for question in questions:
            part_labels = []
            for part in parts:
                # `answer` stays a module-level name; later cells inspect it.
                answer = question_answer(pipe, part, question)
                part_labels.append(_classify_answer(answer))
            results.append(_merge_chunk_labels(part_labels))

        model_results.append(results)
        print(row[0])

    results_dict[model_id] = model_results

    # Release the model before loading the next one. Dropping the references
    # and emptying the CUDA cache replaces the former fixed 15 s wait, which
    # did not free any memory by itself.
    del pipe, tokenizer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

2025-02-15 08:35:02.391311: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

Device set to use cuda:0


0
1


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61


In [None]:
import os

# Output file name for every model name used as a key in `results_dict`.
file_names = {"lora-results_2m" : "2m_non_summarized.csv",
              "lora-results_4m" : "4m_non_summarized.csv",
              "mistralai/Mistral-7B-Instruct-v0.2" : "7b-instruct_no_summary.csv",
              "mistralai/Mistral-Small-24B-Instruct-2501": "24b-instruct_no_summary.csv"}

output_dir = "no_summary_results"
# `DataFrame.to_csv` does not create missing parent directories; create the
# folder up front so the results of a long run are not lost at write time.
os.makedirs(output_dir, exist_ok=True)

# One CSV per model: one row per document, one column per question.
for i in results_dict.keys():
    pd.DataFrame(results_dict[i]).to_csv(os.path.join(output_dir, file_names[i]))

In [None]:
# Scratch: show whatever `i` is currently bound to (a loop variable left over
# from an earlier cell; the value depends on which cell ran last).
i

In [None]:
# Scratch: dump all collected labels as plain text.
print(results_dict)

In [None]:
# Scratch: inspect the most recent model answer left over from the inference loop.
answer

In [None]:
#pd.DataFrame(results_dict["mistralai/Mistral-Small-24B-Instruct-2501"]).to_csv("27bMistral_2.csv")

In [None]:
# Check whether the first 15 characters of the answer contain an affirmative.
# The former condition `"yes" or "ja" in ...` was always true, because the
# non-empty literal "yes" is truthy on its own.
if any(token in answer.lower()[0:15] for token in ("yes", "ja")):
    print("yes")

In [None]:
#pd.DataFrame(results_dict['mistralai/Mistral-Small-24B-Instruct-2501']).to_csv("27b_results_2.csv")

In [None]:
# Display the collected labels, keyed by model name.
results_dict

In [None]:
"testa"[-1:]