In [5]:
import mlflow
import mlflow.pytorch
import torch
import yaml
from ultralytics import YOLO  # YOLOv8 is included in the ultralytics package
import os
import shutil
import wandb
import optuna
from sklearn.model_selection import KFold
import numpy as np

wandb.login()

# os.environ["MLFLOW_TRACKING_USERNAME"] = "nandhinirajasekaran-se"
# os.environ["MLFLOW_TRACKING_PASSWORD"] = "e0da9cc45afdd51fa62069d891bd56429c52b2b4"

# ✅ Set up MLflow tracking
project ='YOLOv5 Training'
mlflow.set_tracking_uri("https://dagshub.com/chinmay-nagesh/SYDE770-dagshub.mlflow")
mlflow.set_experiment(project)

def objective(trial):
    """Objective function for Optuna Hyperparameter Optimization"""
    
    experiment = "experiment_" + str(trial.number)
    
    # ✅ Suggest values for hyperparameters
    epochs = trial.suggest_int("epochs", 10, 50, step=1)
    batch_size = trial.suggest_int("batch_size", 32, 128, step=32)
    imgsz = trial.suggest_int("imgsz", 500, 800, step=32)
    lr0 = trial.suggest_float("lr0", 1e-5, 1e-3, log=True)  # Updated to suggest_float
    momentum = trial.suggest_float("momentum", 0.85, 0.95)  # Updated to suggest_float
    weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-3, log=True)  # Updated to suggest_float
    freeze_layers = trial.suggest_int("freeze_layers", 10, 20)
    
    # ✅ Define model path and dataset path
    model_path = "yolov8n.pt"  # Use YOLOv8 Nano as the base model
    dataset_yaml = "C:/Users/Chinmay Nagesh/Desktop/Yolo/SYDE770/Normal_cups/yolo_dataset/dataset.yaml"
    
    # ✅ Check for GPU availability
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    
    # ✅ Start MLflow Experiment
    try:
        with mlflow.start_run():
            run_id = mlflow.active_run().info.run_id
            print("Logged model with run ID:", run_id)
            
            # ✅ Initialize W&B run
            wandb.init(
                project=project,
                name=experiment,
                config={
                    "epochs": epochs,
                    "batch_size": batch_size,
                    "img_size": imgsz,
                    "model": model_path,
                    "momentum": momentum,
                    "lr0": lr0,
                    "weight_decay": weight_decay,
                    "freeze_layers": freeze_layers,
                    "dataset": dataset_yaml,
                    "device": device
                }
            )
            
            # ✅ Load YOLOv8 Model
            model = YOLO(model_path)
            
            # ✅ Load dataset YAML
            with open(dataset_yaml, 'r') as f:
                dataset_config = yaml.safe_load(f)
            num_classes = dataset_config.get('nc', None)
            
            if num_classes is None:
                raise ValueError("Error: Number of classes (nc) not found in dataset YAML.")
            
            # ✅ k-Fold Cross Validation
            kfold = KFold(n_splits=5, shuffle=True, random_state=42)
            fold_results = []
            
            for fold, (train_idx, val_idx) in enumerate(kfold.split(np.arange(100))):  # Assuming 100 samples for simplicity
                print(f"Fold {fold + 1}")
                
                # ✅ Train model
                results = model.train(
                    data=dataset_yaml,
                    epochs=epochs,
                    batch=batch_size,
                    imgsz=imgsz,
                    lr0=lr0,
                    momentum=momentum,
                    weight_decay=weight_decay,
                    freeze=freeze_layers,
                    project=project,
                    name=f"{experiment}_fold{fold + 1}",
                    hsv_h=0.0, hsv_s=0.0, hsv_v=0.0,
                    fliplr=0.5, flipud=0.0, 
                    mosaic=1.0, mixup=1.0, copy_paste=0.0,
                    scale=0.5, translate=0.1, shear=0.0, perspective=0.0,
                    dropout=0.2,  # Adding dropout for regularization
                    device=device  # Ensure training happens on the specified device
                )
                
                # ✅ Log Metrics
                metrics = results.results_dict  # YOLOv8 stores metrics in results_dict
                mAP50 = metrics.get("metrics/mAP50(B)", 0)
                mAP5095 = metrics.get("metrics/mAP50-95(B)", 0)
                precision = metrics.get("metrics/precision(B)", 0)
                recall = metrics.get("metrics/recall(B)", 0)
                
                mlflow.log_metric(f"fold{fold + 1}_mAP50-95", mAP5095)
                mlflow.log_metric(f"fold{fold + 1}_mAP50", mAP50)
                mlflow.log_metric(f"fold{fold + 1}_precision", precision)
                mlflow.log_metric(f"fold{fold + 1}_recall", recall)
                
                wandb.log({f"fold{fold + 1}_mAP50-95": mAP5095, f"fold{fold + 1}_mAP50": mAP50, f"fold{fold + 1}_precision": precision, f"fold{fold + 1}_recall": recall})
                
                fold_results.append(mAP5095)
            
            # ✅ Calculate average metrics across folds
            avg_mAP5095 = np.mean(fold_results)
            mlflow.log_metric("avg_mAP50-95", avg_mAP5095)
            wandb.log({"avg_mAP50-95": avg_mAP5095})
            
            # ✅ Save Best Model Checkpoint
            model_save_path = os.path.join(project, experiment, "weights")
            best_model_path = os.path.join(model_save_path, "best.pt")
            
            if os.path.exists(best_model_path):
                torch.save(model.model.state_dict(), "best.pt")
                mlflow.log_artifact(best_model_path, artifact_path=experiment)
                wandb.save(best_model_path)
            else:
                print("⚠ Warning: Best model file not found for logging.")
            
            print(f"✅ Trial {trial.number} complete: avg_mAP50-95 = {avg_mAP5095}")
        
            return avg_mAP5095  # Ensure return statement is inside with block
    
    except Exception as e:
        print("⚠ MLflow API Error:", e)
        return None  # Ensure function always returns a value

# ✅ Run Optuna Optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

# ✅ Print Best Parameters
print("✅ Best Hyperparameters:", study.best_params)

# ✅ End MLflow Run
mlflow.end_run()
wandb.finish()

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\Chinmay Nagesh\_netrc
[34m[1mwandb[0m: Currently logged in as: [33mchinmay-nagesh[0m ([33mchinmay_nagesh[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[I 2025-03-19 21:29:14,556] A new study created in memory with name: no-name-d7eb5651-efe6-4b1f-b7b4-dad5251160e4


Using device: cpu


[W 2025-03-19 21:29:14,945] Trial 0 failed with parameters: {'epochs': 39, 'batch_size': 64, 'imgsz': 628, 'lr0': 1.5006113956228919e-05, 'momentum': 0.9306822226345832, 'weight_decay': 0.0003497064392954925, 'freeze_layers': 13} because of the following error: The value None could not be cast to float..
[W 2025-03-19 21:29:14,946] Trial 0 failed with value None.
[W 2025-03-19 21:29:15,149] Trial 1 failed with parameters: {'epochs': 23, 'batch_size': 96, 'imgsz': 532, 'lr0': 1.332092511189097e-05, 'momentum': 0.899595643449277, 'weight_decay': 0.0004929946402945825, 'freeze_layers': 11} because of the following error: The value None could not be cast to float..


⚠ MLflow API Error: API request to endpoint /api/2.0/mlflow/runs/create failed with error code 403 != 200. Response body: ''
Using device: cpu
⚠ MLflow API Error: API request to endpoint /api/2.0/mlflow/runs/create failed with error code 403 != 200. Response body: ''


[W 2025-03-19 21:29:15,150] Trial 1 failed with value None.
[W 2025-03-19 21:29:15,359] Trial 2 failed with parameters: {'epochs': 16, 'batch_size': 32, 'imgsz': 692, 'lr0': 6.854228645789149e-05, 'momentum': 0.9154366259765446, 'weight_decay': 0.00015415586804750543, 'freeze_layers': 16} because of the following error: The value None could not be cast to float..


Using device: cpu
⚠ MLflow API Error: API request to endpoint /api/2.0/mlflow/runs/create failed with error code 403 != 200. Response body: ''


[W 2025-03-19 21:29:15,360] Trial 2 failed with value None.
[W 2025-03-19 21:29:15,557] Trial 3 failed with parameters: {'epochs': 32, 'batch_size': 128, 'imgsz': 532, 'lr0': 2.5246617959901625e-05, 'momentum': 0.9027941434823414, 'weight_decay': 0.00012440181549841901, 'freeze_layers': 12} because of the following error: The value None could not be cast to float..


Using device: cpu
⚠ MLflow API Error: API request to endpoint /api/2.0/mlflow/runs/create failed with error code 403 != 200. Response body: ''


[W 2025-03-19 21:29:15,558] Trial 3 failed with value None.
[W 2025-03-19 21:29:15,760] Trial 4 failed with parameters: {'epochs': 33, 'batch_size': 96, 'imgsz': 596, 'lr0': 8.596959035258498e-05, 'momentum': 0.86357221674925, 'weight_decay': 4.823631471889016e-05, 'freeze_layers': 11} because of the following error: The value None could not be cast to float..


Using device: cpu
⚠ MLflow API Error: API request to endpoint /api/2.0/mlflow/runs/create failed with error code 403 != 200. Response body: ''


[W 2025-03-19 21:29:15,762] Trial 4 failed with value None.


Using device: cpu


[W 2025-03-19 21:29:16,069] Trial 5 failed with parameters: {'epochs': 28, 'batch_size': 32, 'imgsz': 596, 'lr0': 0.0004312288451045686, 'momentum': 0.8507935938495053, 'weight_decay': 4.6476711219333786e-05, 'freeze_layers': 20} because of the following error: The value None could not be cast to float..
[W 2025-03-19 21:29:16,069] Trial 5 failed with value None.
[W 2025-03-19 21:29:16,272] Trial 6 failed with parameters: {'epochs': 27, 'batch_size': 96, 'imgsz': 500, 'lr0': 0.000435842978390437, 'momentum': 0.8989379614279502, 'weight_decay': 3.5195477650071904e-05, 'freeze_layers': 10} because of the following error: The value None could not be cast to float..


⚠ MLflow API Error: API request to endpoint /api/2.0/mlflow/runs/create failed with error code 403 != 200. Response body: ''
Using device: cpu
⚠ MLflow API Error: API request to endpoint /api/2.0/mlflow/runs/create failed with error code 403 != 200. Response body: ''


[W 2025-03-19 21:29:16,274] Trial 6 failed with value None.
[W 2025-03-19 21:29:16,480] Trial 7 failed with parameters: {'epochs': 30, 'batch_size': 64, 'imgsz': 532, 'lr0': 3.5326061725416506e-05, 'momentum': 0.9254446118142061, 'weight_decay': 0.00014204142892949902, 'freeze_layers': 17} because of the following error: The value None could not be cast to float..


Using device: cpu
⚠ MLflow API Error: API request to endpoint /api/2.0/mlflow/runs/create failed with error code 403 != 200. Response body: ''


[W 2025-03-19 21:29:16,483] Trial 7 failed with value None.


Using device: cpu


[W 2025-03-19 21:29:16,683] Trial 8 failed with parameters: {'epochs': 10, 'batch_size': 128, 'imgsz': 564, 'lr0': 0.000175215985918468, 'momentum': 0.9161389777856013, 'weight_decay': 1.4041980029552753e-05, 'freeze_layers': 13} because of the following error: The value None could not be cast to float..
[W 2025-03-19 21:29:16,684] Trial 8 failed with value None.
[W 2025-03-19 21:29:16,865] Trial 9 failed with parameters: {'epochs': 10, 'batch_size': 128, 'imgsz': 532, 'lr0': 6.902036692626329e-05, 'momentum': 0.9019101447324677, 'weight_decay': 1.06510523807279e-05, 'freeze_layers': 17} because of the following error: The value None could not be cast to float..


⚠ MLflow API Error: API request to endpoint /api/2.0/mlflow/runs/create failed with error code 403 != 200. Response body: ''
Using device: cpu
⚠ MLflow API Error: API request to endpoint /api/2.0/mlflow/runs/create failed with error code 403 != 200. Response body: ''


[W 2025-03-19 21:29:16,867] Trial 9 failed with value None.


ValueError: No trials are completed yet.