In [1]:
import torch
from torch.utils.data import DataLoader
import sys
import os
import pathlib # Для работы с путями

# Environment report: the PyTorch build in use and whether a CUDA device is usable.
print(f"PyTorch version: {torch.__version__}")
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
if not has_cuda:
    print("CUDA is NOT available. PyTorch is using CPU.")
else:
    # torch.version.cuda is the CUDA version PyTorch was compiled with.
    print(f"CUDA version used by PyTorch: {torch.version.cuda}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    active_device = torch.cuda.current_device()
    print(f"Current CUDA device: {active_device}")
    print(f"Device name: {torch.cuda.get_device_name(active_device)}")
# --------------------------------------------------------------------------------
# Make the project root directory (ml_service) importable
# --------------------------------------------------------------------------------
# The root is derived from the kernel's current working directory: the notebook
# is expected to be started either from ml_service/ or from ml_service/notebooks/.
# In the second case the parent directory is used, so that the project-local
# imports that follow this cell section can be resolved.
module_path = os.path.abspath('.')
if os.path.basename(module_path) == "notebooks":
    module_path = os.path.dirname(module_path)

# Report what actually happened: when the cell is re-run the path is already
# present, and claiming it was "added" again would be misleading.
if module_path not in sys.path:
    sys.path.append(module_path)
    print(f"Added to sys.path: {module_path}")
else:
    print(f"Already on sys.path: {module_path}")


from config import main_config
from src.modeling.dataset import ContractChunkDataset
from src.modeling.models import ContractVulnerabilityClassifier
from src.modeling.trainer import train_model, DEVICE # DEVICE импортируется из trainer

# Sanity check: show BASE_DIR from the project config and the DEVICE value
# imported from the trainer module.
print(
    f"Main config base dir: {main_config.BASE_DIR}",
    f"Pytorch device in use: {DEVICE}",
    sep="\n",
)

PyTorch version: 2.7.0+cu118
CUDA available: True
CUDA version used by PyTorch: 11.8
Number of GPUs: 1
Current CUDA device: 0
Device name: NVIDIA GeForce RTX 4070 SUPER
Added to sys.path: e:\Code\diplom\ml_service


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda
Main config base dir: E:\Code\diplom\ml_service
Pytorch device in use: cuda


In [2]:
# --------------------------------------------------------------------------------
# 2. Experiment configuration (could be moved to a separate cell or to params.yaml for DVC)
# --------------------------------------------------------------------------------
# Seed PyTorch's random number generators so that a "Restart & Run All" repeats
# the same run (e.g. the shuffled order of the training DataLoader). Some CUDA
# kernels may still be nondeterministic, so results can differ slightly on GPU.
SEED = 42
torch.manual_seed(SEED)

# Name of the tokenizer that was used when the chunks were created
TOKENIZER_NAME = "microsoft/codebert-base"
# '/' is replaced so that the name can be embedded in file names
TOKENIZER_NAME_FOR_PATH = TOKENIZER_NAME.replace('/', '_')

# Training parameters
NUM_EPOCHS = 5  # A small number to start with; can be increased later
LEARNING_RATE = 2e-5  # Typical value for fine-tuning
BATCH_SIZE = 16  # Must fit into the available GPU/CPU memory
# If VRAM is limited, try BATCH_SIZE = 8 or 4

# Path where the best model is saved
MODEL_FILENAME = f"vuln_classifier_{TOKENIZER_NAME_FOR_PATH}_chunks_best.pt"
MODEL_SAVE_PATH = main_config.MODEL_DIR / MODEL_FILENAME
main_config.MODEL_DIR.mkdir(parents=True, exist_ok=True)  # Create the directory if it does not exist

MLFLOW_EXPERIMENT_NAME = main_config.EXPERIMENT_NAME

# The number of vulnerability types is taken from the config
NUM_LABELS = len(main_config.VULNERABILITY_COUNT_COLUMNS)
print(f"Number of labels (vulnerability types): {NUM_LABELS}")

Number of labels (vulnerability types): 7


In [3]:
# --------------------------------------------------------------------------------
# 3. Loading the Datasets and DataLoaders
# --------------------------------------------------------------------------------
PROCESSED_DATA_DIR_CHUNKS = main_config.PROCESSED_DATA_DIR / "chunked_data"
# The suffix is built from the chunking settings in the config and the tokenizer
# name, so the files are only found if these match the names on disk.
file_suffix = (f"t{main_config.MAX_TOTAL_TOKENS}_c{main_config.MODEL_CHUNK_SIZE}"
               f"_o{main_config.CHUNK_OVERLAP}_{TOKENIZER_NAME_FOR_PATH}.pt")

PATH_TRAIN_CHUNKS = PROCESSED_DATA_DIR_CHUNKS / f"train_chunks_{file_suffix}"
PATH_TRAIN_CHUNK_LABELS = PROCESSED_DATA_DIR_CHUNKS / f"train_chunk_labels_{file_suffix}"
PATH_TRAIN_ORIGINAL_INDICES = PROCESSED_DATA_DIR_CHUNKS / f"train_original_indices_{file_suffix}"

PATH_TEST_CHUNKS = PROCESSED_DATA_DIR_CHUNKS / f"test_chunks_{file_suffix}"
PATH_TEST_CHUNK_LABELS = PROCESSED_DATA_DIR_CHUNKS / f"test_chunk_labels_{file_suffix}"
PATH_TEST_ORIGINAL_INDICES = PROCESSED_DATA_DIR_CHUNKS / f"test_original_indices_{file_suffix}"

# --- DEBUG OUTPUT ---
print("\n--- Debugging File Paths ---")
print(f"Looking for TRAIN chunks at: {PATH_TRAIN_CHUNKS}")
print(f"Looking for TRAIN labels at: {PATH_TRAIN_CHUNK_LABELS}")
print(f"Looking for TEST (validation) chunks at: {PATH_TEST_CHUNKS}")
print(f"Looking for TEST (validation) labels at: {PATH_TEST_CHUNK_LABELS}")
print(f"File suffix being used: {file_suffix}")
print(f"  Based on MAX_TOTAL_TOKENS: {main_config.MAX_TOTAL_TOKENS}")
print(f"  Based on MODEL_CHUNK_SIZE: {main_config.MODEL_CHUNK_SIZE}")
print(f"  Based on CHUNK_OVERLAP: {main_config.CHUNK_OVERLAP}")
print(f"  Based on TOKENIZER_NAME_FOR_PATH: '{TOKENIZER_NAME_FOR_PATH}'")
print("--- End Debugging File Paths ---\n")
# --- END OF DEBUG OUTPUT ---


def _load_chunk_dataset(chunks_path, labels_path, indices_path, split_title, split_word):
    """Build a ContractChunkDataset for one split, or return None if files are missing.

    Args:
        chunks_path: Path object of the chunk file (required).
        labels_path: Path object of the chunk-labels file (required).
        indices_path: Path object of the original-indices file; it is handed
            to the dataset only when the file exists, otherwise None is passed.
        split_title: Split name used in the error headline (e.g. "Training").
        split_word: Split name used in the per-file diagnostics (e.g. "train").

    Returns:
        The constructed dataset, or None after printing which of the required
        files were not found.
    """
    required = (("chunks", chunks_path), ("chunk labels", labels_path))
    missing = [(kind, path) for kind, path in required if not path.exists()]
    if missing:
        print(f"ERROR: {split_title} data files for chunks not found. Searched at {PROCESSED_DATA_DIR_CHUNKS}")
        # Name every missing file so that a suffix/config mismatch is easy to spot.
        for kind, path in missing:
            print(f"  Specifically, {split_word} {kind} file '{path.name}' was NOT found in the directory.")
        return None
    return ContractChunkDataset(
        chunk_list_path=str(chunks_path),
        labels_path=str(labels_path),
        original_indices_path=str(indices_path) if indices_path.exists() else None,
    )


# Missing data is deliberately not fatal here: the variables stay None and the
# checks below simply skip creating the corresponding DataLoader.
train_dataset = _load_chunk_dataset(
    chunks_path=PATH_TRAIN_CHUNKS,
    labels_path=PATH_TRAIN_CHUNK_LABELS,
    indices_path=PATH_TRAIN_ORIGINAL_INDICES,
    split_title="Training",
    split_word="train",
)
if train_dataset is not None:
    print(f"\nTrain dataset loaded. Length: {len(train_dataset)}")

# In this example the test set doubles as the validation set.
# Ideally a separate validation set should be used.
val_dataset = _load_chunk_dataset(
    chunks_path=PATH_TEST_CHUNKS,
    labels_path=PATH_TEST_CHUNK_LABELS,
    indices_path=PATH_TEST_ORIGINAL_INDICES,
    split_title="Test/Validation",
    split_word="test",
)
if val_dataset is not None:
    print(f"Validation (using test set) dataset loaded. Length: {len(val_dataset)}")

train_dataloader = None
val_dataloader = None

# NOTE(review): the truthiness checks presumably fall back to len(), so an empty
# dataset is treated like a missing one - confirm against ContractChunkDataset.
if train_dataset:
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    print(f"Train DataLoader created. Batches per epoch: ~{len(train_dataloader)}")
if val_dataset:
    # Evaluation data is not shuffled.
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    print(f"Validation DataLoader created. Batches per epoch: ~{len(val_dataloader)}")


--- Debugging File Paths ---
Looking for TRAIN chunks at: E:\Code\diplom\ml_service\data\processed\chunked_data\train_chunks_t4096_c512_o64_microsoft_codebert-base.pt
Looking for TRAIN labels at: E:\Code\diplom\ml_service\data\processed\chunked_data\train_chunk_labels_t4096_c512_o64_microsoft_codebert-base.pt
Looking for TEST (validation) chunks at: E:\Code\diplom\ml_service\data\processed\chunked_data\test_chunks_t4096_c512_o64_microsoft_codebert-base.pt
Looking for TEST (validation) labels at: E:\Code\diplom\ml_service\data\processed\chunked_data\test_chunk_labels_t4096_c512_o64_microsoft_codebert-base.pt
File suffix being used: t4096_c512_o64_microsoft_codebert-base.pt
  Based on MAX_TOTAL_TOKENS: 4096
  Based on MODEL_CHUNK_SIZE: 512
  Based on CHUNK_OVERLAP: 64
  Based on TOKENIZER_NAME_FOR_PATH: 'microsoft_codebert-base'
--- End Debugging File Paths ---

Loading chunk list from: E:\Code\diplom\ml_service\data\processed\chunked_data\train_chunks_t4096_c512_o64_microsoft_codebert-

In [4]:
# --------------------------------------------------------------------------------
# 4. Model initialisation
# --------------------------------------------------------------------------------
# Both DataLoaders are required; otherwise `model` is set to None.
if not (train_dataloader and val_dataloader):
    print("\nSkipping model initialization and training due to missing data.")
    model = None
else:
    model = ContractVulnerabilityClassifier(
        num_labels=NUM_LABELS,
        dropout_rate=0.1,  # Tunable
        base_model_name=TOKENIZER_NAME,  # Same name as the one used for the tokenizer
    )
    print("\nModel initialized.")


Model initialized with base: microsoft/codebert-base
Hidden size: 768
Number of labels: 7

Model initialized.


In [5]:
# --------------------------------------------------------------------------------
# 5. Run training
# --------------------------------------------------------------------------------
# Training needs the model and both DataLoaders; otherwise the step is skipped.
if not (model and train_dataloader and val_dataloader):
    print("\nSkipping training.")
else:
    print("\n--- Starting Model Training ---")
    training_kwargs = dict(
        model=model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        num_epochs=NUM_EPOCHS,
        learning_rate=LEARNING_RATE,
        model_save_path=str(MODEL_SAVE_PATH),  # passed as a plain string
        mlflow_experiment_name=MLFLOW_EXPERIMENT_NAME,
    )
    trained_model = train_model(**training_kwargs)
    print("\n--- Model Training Finished ---")
    # If no file exists at MODEL_SAVE_PATH, the message points to MLflow artifacts instead.
    print(f"Best model saved to: {MODEL_SAVE_PATH if MODEL_SAVE_PATH.exists() else 'MLflow artifacts'}")


--- Starting Model Training ---
MLflow Run ID: 95883e75e9534216a913a6957cda855b

Starting training for 5 epochs...

--- Epoch 1/5 ---


                                                                                       

Train Loss: 0.3602
  Train accuracy_exact_match: 0.3928
  Train f1_micro: 0.7918
  Train precision_micro: 0.8245
  Train recall_micro: 0.7616
  Train f1_macro: 0.6492
  Train precision_macro: 0.7956
  Train recall_macro: 0.5986
  Train roc_auc_macro: 0.8514


                                                                           

Validation Loss: 0.3284
  Validation accuracy_exact_match: 0.4481
  Validation f1_micro: 0.8048
  Validation precision_micro: 0.8801
  Validation recall_micro: 0.7414
  Validation f1_macro: 0.7077
  Validation precision_macro: 0.8461
  Validation recall_macro: 0.6265
  Validation roc_auc_macro: 0.8899
New best validation F1-macro: 0.7077. Saving model...

--- Epoch 2/5 ---


                                                                                      

Train Loss: 0.2822
  Train accuracy_exact_match: 0.4993
  Train f1_micro: 0.8456
  Train precision_micro: 0.8729
  Train recall_micro: 0.8200
  Train f1_macro: 0.7609
  Train precision_macro: 0.8471
  Train recall_macro: 0.7085
  Train roc_auc_macro: 0.9148


                                                                           

Validation Loss: 0.3058
  Validation accuracy_exact_match: 0.4985
  Validation f1_micro: 0.8318
  Validation precision_micro: 0.8791
  Validation recall_micro: 0.7894
  Validation f1_macro: 0.7539
  Validation precision_macro: 0.8447
  Validation recall_macro: 0.6929
  Validation roc_auc_macro: 0.9066
New best validation F1-macro: 0.7539. Saving model...

--- Epoch 3/5 ---


                                                                                      

Train Loss: 0.2376
  Train accuracy_exact_match: 0.5635
  Train f1_micro: 0.8733
  Train precision_micro: 0.8980
  Train recall_micro: 0.8500
  Train f1_macro: 0.8107
  Train precision_macro: 0.8769
  Train recall_macro: 0.7632
  Train roc_auc_macro: 0.9410


                                                                           

Validation Loss: 0.2976
  Validation accuracy_exact_match: 0.5250
  Validation f1_micro: 0.8448
  Validation precision_micro: 0.8838
  Validation recall_micro: 0.8091
  Validation f1_macro: 0.7683
  Validation precision_macro: 0.8579
  Validation recall_macro: 0.7098
  Validation roc_auc_macro: 0.9114
New best validation F1-macro: 0.7683. Saving model...

--- Epoch 4/5 ---


                                                                                      

Train Loss: 0.2058
  Train accuracy_exact_match: 0.6148
  Train f1_micro: 0.8928
  Train precision_micro: 0.9152
  Train recall_micro: 0.8716
  Train f1_macro: 0.8424
  Train precision_macro: 0.8966
  Train recall_macro: 0.8001
  Train roc_auc_macro: 0.9562


                                                                           

Validation Loss: 0.3007
  Validation accuracy_exact_match: 0.5438
  Validation f1_micro: 0.8526
  Validation precision_micro: 0.8788
  Validation recall_micro: 0.8279
  Validation f1_macro: 0.7891
  Validation precision_macro: 0.8436
  Validation recall_macro: 0.7472
  Validation roc_auc_macro: 0.9144
New best validation F1-macro: 0.7891. Saving model...

--- Epoch 5/5 ---


                                                                                      

Train Loss: 0.1812
  Train accuracy_exact_match: 0.6543
  Train f1_micro: 0.9065
  Train precision_micro: 0.9267
  Train recall_micro: 0.8871
  Train f1_macro: 0.8642
  Train precision_macro: 0.9108
  Train recall_macro: 0.8260
  Train roc_auc_macro: 0.9664


                                                                           

Validation Loss: 0.3080
  Validation accuracy_exact_match: 0.5600
  Validation f1_micro: 0.8593
  Validation precision_micro: 0.8792
  Validation recall_micro: 0.8402
  Validation f1_macro: 0.7955
  Validation precision_macro: 0.8448
  Validation recall_macro: 0.7583
  Validation roc_auc_macro: 0.9153
New best validation F1-macro: 0.7955. Saving model...

Training finished.

--- Model Training Finished ---
Best model saved to: E:\Code\diplom\ml_service\models\vuln_classifier_microsoft_codebert-base_chunks_best.pt
