In [1]:
# Install the notebook's third-party dependencies into the *kernel's* environment.
# `%pip` is used instead of `!pip` because the shell escape runs whichever `pip`
# is first on PATH, which is not necessarily the interpreter backing this kernel.
# TODO: pin versions (e.g. `pkg==X.Y.Z`) so a fresh run reproduces this environment.
%pip install mlflow-skinny opencv-python pycocotools python-dotenv



In [2]:
from dotenv import load_dotenv
# Read key/value pairs from the local "env.txt" file into the process environment.
# Being the last expression of the cell, the call's boolean result is echoed as output.
# NOTE(review): presumably this supplies the MLflow tracking settings used further
# down — confirm against the contents of env.txt.
load_dotenv(dotenv_path="env.txt")

True

In [3]:
import sys

# Make the modules under ./toy_model importable by the cells below.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
if "./toy_model" not in sys.path:
    sys.path.append("./toy_model")



In [9]:
import torch
from dataset import RTSDataset
from transforms import get_transform
from utils import collate_fn
from model import get_model_instance_segmentation
from engine import evaluate, train_one_epoch

In [5]:
# Select the compute device: the CUDA GPU when one is available, otherwise the CPU.
# torch.device accepts the backend name directly, so pick the name first.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# Hyperparameters for the run; the whole mapping is logged to MLflow as-is.
params = dict(
    lr=0.005,             # SGD learning rate
    momentum=0.9,         # SGD momentum
    weight_decay=0.0005,  # SGD weight decay
    step_size=3,          # StepLR: number of scheduler steps between decays
    gamma=0.1,            # StepLR: multiplicative decay factor
    epochs=1,             # epoch count recorded with the run's hyperparameters
)

In [7]:
# Build the two datasets from their annotation files: the training split uses
# train-mode transforms, the val/test split uses eval-mode transforms.
dataset = RTSDataset(
    "data/coco_rts_train.json",
    get_transform(train=True),
)
dataset_test = RTSDataset(
    "data/coco_rts_valtest.json",
    get_transform(train=False),
)

# Wrap each dataset in a loader. Training batches are shuffled pairs of samples;
# evaluation walks the val/test split one sample at a time in a fixed order.
# Both loaders batch through the project's collate_fn.
data_loader = torch.utils.data.DataLoader(
    dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=2,
)
data_loader_test = torch.utils.data.DataLoader(
    dataset_test,
    collate_fn=collate_fn,
    shuffle=False,
    batch_size=1,
)

# Build the two-class instance-segmentation model and place it on the selected
# device in one expression (Module.to returns the module itself).
# NOTE(review): num_classes=2 presumably means background + one object class — confirm.
model = get_model_instance_segmentation(num_classes=2).to(device)

# Optimise only the parameters that still require gradients.
opt_params = list(filter(lambda p: p.requires_grad, model.parameters()))

# SGD and the step-wise LR decay both take their settings straight from `params`,
# so the values logged with the run are the ones actually used here.
optimizer = torch.optim.SGD(
    opt_params,
    **{name: params[name] for name in ("lr", "momentum", "weight_decay")},
)
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    **{name: params[name] for name in ("step_size", "gamma")},
)




In [8]:
import mlflow

# Train and evaluate the model inside a single MLflow run, then store the
# trained model as a run artifact.
with mlflow.start_run() as run:
    # Log the hyperparameters that drive this run.
    mlflow.log_params(params)
    # NOTE(review): MLflow's PyTorch autologging targets PyTorch Lightning's
    # Trainer.fit; it is likely a no-op for this hand-written loop — confirm.
    mlflow.pytorch.autolog()

    # Take the epoch count from `params` so the logged "epochs" value always
    # matches the number of epochs actually trained (it was hard-coded to 2
    # while params["epochs"] == 1 was being logged).
    num_epochs = params["epochs"]

    # Pre-initialise so the summary print below is defined even for zero epochs.
    metrics = None
    for epoch in range(num_epochs):
        # One pass over the training loader; the active run is handed to the
        # project's training helper.
        metrics = train_one_epoch(
            model,
            optimizer,
            data_loader,
            device,
            epoch,
            print_freq=10,
            run=run,
        )
        # Advance the learning-rate schedule once per epoch.
        lr_scheduler.step()
        # Evaluate on the val/test loader after every epoch.
        evaluate(model, data_loader_test, device=device)

    # Show what train_one_epoch returned for the last epoch (None if none ran).
    print(metrics)

    # Persist the trained model under the "model" artifact path of this run.
    mlflow.pytorch.log_model(model, "model")


Epoch: [0]  [  0/378]  eta: 0:05:50  lr: 0.000018  loss: 3.3356 (3.3356)  loss_classifier: 0.8176 (0.8176)  loss_box_reg: 0.2335 (0.2335)  loss_mask: 2.1165 (2.1165)  loss_objectness: 0.1543 (0.1543)  loss_rpn_box_reg: 0.0137 (0.0137)  time: 0.9265  data: 0.0113  max mem: 2262
Epoch: [0]  [ 10/378]  eta: 0:01:16  lr: 0.000151  loss: 2.9474 (3.0572)  loss_classifier: 0.6870 (0.6476)  loss_box_reg: 0.2335 (0.2357)  loss_mask: 1.8043 (1.8385)  loss_objectness: 0.1745 (0.3095)  loss_rpn_box_reg: 0.0196 (0.0259)  time: 0.2091  data: 0.0091  max mem: 2429
Epoch: [0]  [ 20/378]  eta: 0:01:10  lr: 0.000283  loss: 1.9810 (2.3407)  loss_classifier: 0.3381 (0.4379)  loss_box_reg: 0.1316 (0.1871)  loss_mask: 1.0531 (1.4151)  loss_objectness: 0.1299 (0.2728)  loss_rpn_box_reg: 0.0190 (0.0278)  time: 0.1603  data: 0.0296  max mem: 2429
Epoch: [0]  [ 30/378]  eta: 0:01:02  lr: 0.000416  loss: 1.2313 (1.9890)  loss_classifier: 0.1994 (0.3629)  loss_box_reg: 0.1324 (0.1877)  loss_mask: 0.6797 (1.1558) 

2024/10/20 03:50:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run upset-squid-715 at: https://pdg.mflow.software.ncsa.illinois.edu/#/experiments/0/runs/75ca418026b641c389f37f50e7ce91ac.
2024/10/20 03:50:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://pdg.mflow.software.ncsa.illinois.edu/#/experiments/0.
