In [None]:
# Uncomment to install the required packages into the kernel's environment:
# %pip install accelerate transformers datasets scikit-learn flash-attn

In [None]:
import glob

import pandas as pd
from datasets import Dataset
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    TextClassificationPipeline,
    Trainer,
    TrainingArguments,
)


In [None]:
# Glob pattern matching your Parquet files
parquet_files_path = "./your_router_data_path/*.parquet"

# Collect the matching files in sorted order: glob.glob returns them in
# arbitrary order, which would change the row order of `data` and with it
# the seeded train/validation/test split from one machine to the next.
parquet_files = sorted(glob.glob(parquet_files_path))
if not parquet_files:
    # Fail with an actionable message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No Parquet files match {parquet_files_path!r}")

# Load every file and concatenate them into one frame
data_frames = [pd.read_parquet(file) for file in parquet_files]
data = pd.concat(data_frames, ignore_index=True)

# Check that the column names are correct: later cells read "text"
# (tokenization) and "label" (evaluation), so stop early if either is missing.
print(data.columns)
missing_columns = {"text", "label"} - set(data.columns)
if missing_columns:
    raise KeyError(f"Missing required column(s): {sorted(missing_columns)}")


In [None]:
split = 0.90
RANDOM_SEED = 42

# First hold out (1 - split) of the rows, then cut that hold-out in half to
# obtain the validation and test frames. Both cuts use the same seed.
df_train, df_test_unfixed = train_test_split(
    data,
    test_size=(1-split),
    random_state=RANDOM_SEED,
)
df_val, df_test = train_test_split(
    df_test_unfixed,
    test_size=0.5,
    random_state=RANDOM_SEED,
)

# Replace the shuffled original row labels with a fresh 0..n-1 index on
# every frame produced above.
df_train, df_test_unfixed, df_val, df_test = (
    frame.reset_index(drop=True)
    for frame in (df_train, df_test_unfixed, df_val, df_test)
)


In [None]:
# The same base checkpoint supplies both the tokenizer and the model weights
base_checkpoint = "Qwen/Qwen1.5-0.5B"

# Tokenizer, with the end-of-sequence token reused as the padding token
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
tokenizer.pad_token = tokenizer.eos_token

# Sequence-classification model with a 5-label head
model = AutoModelForSequenceClassification.from_pretrained(
    base_checkpoint,
    num_labels=5,
    device_map="auto",
)
# Keep the model config's padding id in sync with the tokenizer's
model.config.pad_token_id = tokenizer.pad_token_id


In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook's tokenizer.

    Truncation is enabled; no padding is requested here. The tokenizer's
    result is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Wrap each pandas split in a Dataset and tokenize it in batches, one split
# per statement.
ds_train = Dataset.from_pandas(df_train).map(preprocess_function, batched=True)
ds_eval = Dataset.from_pandas(df_val).map(preprocess_function, batched=True)
ds_test = Dataset.from_pandas(df_test).map(preprocess_function, batched=True)


In [None]:
# Training arguments: where checkpoints go, how long to train, batch sizes
# per device, and the optimizer's learning rate / weight decay.
training_args = TrainingArguments(
    output_dir="./select_your_router_path",
    num_train_epochs=7,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    weight_decay=0.1,
)


In [None]:
# Collator handed to the Trainer, built from the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Assemble the Trainer from the model, its arguments, the train/eval
# datasets, the tokenizer and the collator.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
)


In [None]:
# Run the training loop with the trainer configured in the previous cell
trainer.train()


In [None]:
# Evaluate on the test set
preds = trainer.predict(ds_test)

# Predicted class = index of the largest value along the last axis of the
# prediction array; true classes come from the dataset's "label" column.
pred_labels = preds.predictions.argmax(axis=-1)
true_labels = ds_test["label"]

# Per-class classification report
report = classification_report(true_labels, pred_labels)
print(report)


In [None]:
# Same evaluation as for the test set, run on the training set
preds_train = trainer.predict(ds_train)

# Arg-max over the last axis gives the predicted class per sample
pred_labels_train = preds_train.predictions.argmax(axis=-1)
true_labels_train = ds_train["label"]

report_train = classification_report(true_labels_train, pred_labels_train)
print(report_train)


In [None]:
# Text-classification pipeline around the fine-tuned model. The loop below
# calls `pipe`, and no earlier cell defines it, so it is built here from the
# trainer's model and the tokenizer used for training.
pipe = TextClassificationPipeline(model=trainer.model, tokenizer=tokenizer)

# Score of the first prediction the pipeline returns for each training sample
largest_score = []

# `sample_idx` replaces a counter that was named `iter` and shadowed the
# builtin; progress is printed every 100 samples (and at the end) instead of
# once per sample so the cell output stays readable.
num_samples = len(ds_train)
for sample_idx, sample in enumerate(ds_train, start=1):
    if sample_idx % 100 == 0 or sample_idx == num_samples:
        print(sample_idx)
    largest_score.append(pipe(sample["text"])[0]["score"])

In [None]:
import numpy as np

# Average of the scores collected in `largest_score`; displayed as the cell output
np.asarray(largest_score).mean()

In [None]:
# Save the fine-tuned router into a "router" subfolder of the training output
# directory. The path is derived from `training_args.output_dir` so it cannot
# drift from the configured directory (the previous f-string had no
# placeholders and repeated the directory literal).
trainer.save_model(f"{training_args.output_dir}/router")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

# Reload the router from the directory this notebook saves it to, rather than
# from a local checkpoint path that no cell here produces, so the notebook
# runs top to bottom on a fresh kernel.
model_name = "./select_your_router_path/router"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Wrap the reloaded model in a pipeline and classify one sample sentence
classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)
res = classifier("Test your router model here to see if it fits your requirements")
print(res)