In [1]:
import torch

import os

torch.cuda.is_available()


True

In [2]:


print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

x = torch.randn(10, device="cuda")
print(x)

True
NVIDIA GeForce GTX 1050 Ti
tensor([ 0.8434,  1.9352,  0.1330,  1.1786,  0.6408,  0.8275, -0.0499, -0.7203,
        -0.1425, -1.3493], device='cuda:0')


In [14]:
import warnings

warnings.filterwarnings(
    "ignore",
    message="TypedStorage is deprecated",
    category=UserWarning
)


# Training


In [15]:
pwd

'/home/katharina/projects/jupyter/Bachelorarbeit-CIFAR10-main'

In [16]:
# --- Device + Pfade

import os, torch
from datetime import datetime

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

#REPO_DIR = "/content/drive/MyDrive/Bachelorarbeit-CIFAR10"
ARTIFACTS_DIR = os.path.join("./artifacts")

RUN_ID = datetime.now().strftime("%Y%m%d_%H%M%S")

DATA_DIR   = "./data"  
MODEL_DIR  = os.path.join(ARTIFACTS_DIR, "models")
RUNS_DIR   = os.path.join(ARTIFACTS_DIR, "runs")
RESULT_DIR   = os.path.join(ARTIFACTS_DIR, "results")

for d in [DATA_DIR, MODEL_DIR, RUNS_DIR, RESULT_DIR]:
    os.makedirs(d, exist_ok=True)

#print("REPO_DIR:", REPO_DIR)
print("ARTIFACTS_DIR:", ARTIFACTS_DIR)


Device: cuda
ARTIFACTS_DIR: ./artifacts


In [17]:
import random, numpy as np

BATCH_SIZE   = 32      # CPU often faster than 32; keep same for all models
LR           = 1e-3    
WEIGHT_DECAY = 1e-4
EPOCHS       = 2      # oder 50 – je nach Zeit
PATIENCE     = 10
MIN_DELTA    = 1e-3
NUM_WORKERS  = 2       # in Colab CPU oft stabil; ggf. 2 testen
VAL_SPLIT    = 0.1
SEED         = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

print("BATCH_SIZE:", BATCH_SIZE, "EPOCHS:", EPOCHS)


BATCH_SIZE: 32 EPOCHS: 2


In [18]:
import importlib

import data as data
import train_eval as train_eval
import models.vgg16 as vgg16

importlib.reload(data)
importlib.reload(train_eval)
importlib.reload(vgg16)

from data import get_cifar10_loaders
from train_eval import train_model, eval_clean  # eval_clean ist optional, nur für kurzen Check

# --- Modelle importieren ---
from models.lenet import LeNet5_CIFAR10
from models.vgg16 import VGG16_CIFAR10_BN
from models.resnet34 import ResNet34_CIFAR10


In [19]:
# --- DataLoader (Train/Val)

trainloader, valloader, testloader = get_cifar10_loaders(
    batch_size=BATCH_SIZE,
    root=DATA_DIR,
    num_workers=NUM_WORKERS,
    val_ratio=VAL_SPLIT,
    seed=SEED
)

print("Train batches:", len(trainloader), "Val batches:", len(valloader), "Test batches:", len(testloader))


Files already downloaded and verified
Files already downloaded and verified
Train batches: 1407 Val batches: 157 Test batches: 313


In [20]:

# --- Modell wählen ---
model_name = "lenet5"   # "lenet5" | "vgg16" | "resnet34"

if model_name == "lenet5":
    model = LeNet5_CIFAR10().to(device)
elif model_name == "vgg16":
    model = VGG16_CIFAR10_BN(num_classes=10, dropout=0.0).to(device)
elif model_name == "resnet34":
    model = ResNet34_CIFAR10(num_classes=10).to(device)
else:
    raise ValueError("Unknown model_name")

print("Selected model:", model_name)
print("Model device:", next(model.parameters()).device)


Selected model: lenet5
Model device: cuda:0


In [21]:
# --- Run-Verzeichnis + Logging/Checkpoint-Pfade

model_name = f"{model_name}_{EPOCHS}ep"

RUN_DIR  = os.path.join(RUNS_DIR, f"{model_name}_{RUN_ID}")
CKPT_DIR = os.path.join(RUN_DIR, "checkpoints")

LOG_CSV  = os.path.join(RUN_DIR, f"{model_name}_metrics.csv")
LOG_JSON = os.path.join(RUN_DIR, f"{model_name}_history.json")

os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(CKPT_DIR, exist_ok=True)

print(model_name)
print("RUN_DIR:", RUN_DIR)


lenet5_2ep
RUN_DIR: ./artifacts/runs/lenet5_2ep_20260114_204743


In [22]:
# --- Training starten

history = train_eval.train_model(
    model=model,
    train_loader=trainloader,
    val_loader=valloader,
    device=device,
    epochs=EPOCHS,
    lr=LR,
    weight_decay=WEIGHT_DECAY,
    early_stopping=True,
    patience=PATIENCE,
    min_delta=MIN_DELTA,
    ckpt_dir=CKPT_DIR,
    run_name=model_name,
    resume=True,
    log_csv_path=LOG_CSV,
    log_json_path=LOG_JSON
)

print("Training done. Best epoch:", history.get("best_epoch"), "Best val_loss:", history.get("best_val_loss"))


Epochs:   0%|          | 0/2 [00:00<?, ?it/s]

Train 1/2:   0%|          | 0/1407 [00:00<?, ?it/s]

Epoch 1/2 | train_loss=1.6409 | val_loss=1.4825 | val_acc=46.36% | lr=1.00e-03 | time=17.23s | gpu_peak=22.2MB


Train 2/2:   0%|          | 0/1407 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fe8386687c0>
Traceback (most recent call last):
  File "/home/katharina/projects/jupyter/.venv/lib/python3.13/site-packages/torch/utils/data/dataloader.py", line 1604, in __del__
    self._shutdown_workers()
  File "/home/katharina/projects/jupyter/.venv/lib/python3.13/site-packages/torch/utils/data/dataloader.py", line 1587, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.13/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fe8386687c0>
Traceback (most recent call last):
  File "/home/katharina/projects/jupyter/.venv/lib/python3.13/site-packages/torch/utils/data/dataloader.py", line 1604, in __del__
    self._shutdown_workers()
  File "/home/katharina/projects/jupyter/.venv/lib/python3.13/site-

Epoch 2/2 | train_loss=1.4065 | val_loss=1.4036 | val_acc=50.10% | lr=1.00e-03 | time=19.50s | gpu_peak=22.7MB
Training done. Best epoch: 2 Best val_loss: 1.4035722696857087


In [23]:
# --- Bestes Modell exportieren
#vgg16_25ep_20260114_141539

MODEL_PATH = os.path.join(MODEL_DIR, f"{model_name}_{EPOCHS}ep.pt")
torch.save(model.state_dict(), MODEL_PATH)
print("Saved BEST model to:", MODEL_PATH)
print("Metric log CSV:", LOG_CSV)


Saved BEST model to: ./artifacts/models/lenet5_2ep_2ep_20260114_141539.pt
Metric log CSV: ./artifacts/results/lenet5_2ep_metrics.csv


In [12]:
# --- finaler Clean-Test-Check

test_loss, test_acc = eval_clean(model, testloader, device)
print(f"[FINAL CLEAN TEST] loss={test_loss:.4f} | acc={test_acc:.2f}%")


[FINAL CLEAN TEST] loss=0.4882 | acc=86.86%
