In [181]:
import cv2
import torch
import torchvision
from torchvision import transforms as T
import numpy as np
import time
import json
from pathlib import Path
from torch.utils.data import Dataset, DataLoader,random_split
import os
# Source video to run detection on, resolved to an absolute path.
INPUT_VIDEO = str(Path("crowd.mp4").resolve())
# Destination of the annotated video, resolved to an absolute path.
OUTPUT_VIDEO = str(Path("crowd_1.mp4").resolve())
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Minimum detection score kept when annotating the video.
CONF_THRESH = 0.5
# Label id that the video loop treats as "person" when filtering detections.
PERSON_CLASS_ID = 1
# Resize factor applied to frames and to annotation boxes.
SCALE = 2.0
# Divisor applied to the frame rate of the output video writer.
DOWNSAMPLE_FOR_SPEED = 1
# Directory containing the frame images referenced by the annotation file.
INPUT_FRAMES_DIR = "frames"
# JSON annotation file with `images` and `annotations` lists (bbox as x, y, w, h).
ANNOTATION_FILE = "instances_Validation.json"
# Training hyperparameters: loader batch size, epoch count, AdamW learning rate.
BATCH_SIZE = 2
NUM_EPOCHS = 5
LR = 1e-4
print(DEVICE)

cuda


In [182]:
class VideoFramesDataset(Dataset):
    """Detection dataset over frame images described by a COCO-style JSON file.

    Each item is ``(image, target)`` where ``target`` holds ``boxes`` as
    ``[x1, y1, x2, y2]`` float32 rows, ``labels`` as int64 category ids,
    ``image_id`` as a one-element tensor and ``image_name`` as the file name.

    Args:
        frames_dir: Directory that contains the frame images.
        annotation_file: Path to the JSON file with ``images`` and
            ``annotations`` lists (``bbox`` stored as ``[x, y, w, h]``).
        transform: Optional callable applied to the RGB image array.
        scale: Optional resize factor for images and boxes. ``None`` (default)
            falls back to the module-level ``SCALE`` at access time.
    """

    def __init__(self, frames_dir, annotation_file, transform=None, scale=None):
        self.frames_dir = frames_dir
        self.transform = transform
        self.scale = scale
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(annotation_file, "r", encoding="utf-8") as f:
            self.coco = json.load(f)

        self.images = {im['id']: im for im in self.coco['images']}
        # Group annotations by the image they belong to.
        self.annotations = {}
        for ann in self.coco['annotations']:
            self.annotations.setdefault(ann['image_id'], []).append(ann)

        self.ids = list(self.images.keys())

    def __len__(self):
        return len(self.ids)

    def _resolve_scale(self):
        """Return the explicit scale, or the module-level ``SCALE`` if unset."""
        return SCALE if self.scale is None else self.scale

    def _build_target(self, img_id, img_info, scale):
        """Build the target dict for one image, with boxes multiplied by ``scale``."""
        boxes = []
        labels = []
        for ann in self.annotations.get(img_id, []):
            x, y, w, h = ann['bbox']
            if scale != 1.0:
                x, y, w, h = x * scale, y * scale, w * scale, h * scale
            # Convert [x, y, w, h] to corner format [x1, y1, x2, y2].
            boxes.append([x, y, x + w, y + h])
            labels.append(ann['category_id'])

        if boxes:
            boxes_t = torch.tensor(boxes, dtype=torch.float32)
        else:
            # An image without annotations must still yield an (0, 4) tensor;
            # torch.tensor([]) would have shape (0,), which detection models reject.
            boxes_t = torch.zeros((0, 4), dtype=torch.float32)

        return {"boxes": boxes_t,
                "labels": torch.tensor(labels, dtype=torch.int64),
                "image_id": torch.tensor([img_id]),
                "image_name": img_info['file_name']
                }

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB, resize it by the scale, and return it with its target.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        img_id = self.ids[idx]
        img_info = self.images[img_id]
        img_path = os.path.join(self.frames_dir, img_info['file_name'])
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {img_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        scale = self._resolve_scale()
        if scale != 1.0:
            height, width = int(img.shape[0] * scale), int(img.shape[1] * scale)
            img = cv2.resize(img, (width, height))

        target = self._build_target(img_id, img_info, scale)

        if self.transform:
            img = self.transform(img)

        return img, target

In [183]:
# Convert HxWxC uint8 frames into CxHxW float tensors in [0, 1].
transform = T.Compose([
    T.ToTensor(),
])
dataset = VideoFramesDataset(INPUT_FRAMES_DIR, ANNOTATION_FILE, transform=transform)

# NOTE(review): only 30% of the frames are used for training and 70% for
# validation — confirm this ratio is intentional.
train_size = int(0.3 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split so the same frames land in train/val on every run; otherwise
# metrics are not comparable between runs or with a reloaded checkpoint.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)


def collate_detection_batch(batch):
    """Turn a list of (image, target) pairs into (images, targets) tuples.

    Detection samples have varying box counts, so they cannot be stacked.
    """
    return tuple(zip(*batch))


train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_detection_batch)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_detection_batch)

In [184]:
# Start from the default pretrained detector. The string form "DEFAULT" is
# the supported spelling; passing a boolean to `weights` is the deprecated
# legacy interface and emits a warning.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT", progress=True)
# Freeze the backbone so only the remaining parts of the detector are trained.
for param in model.backbone.parameters():
    param.requires_grad = False
# Replace the box predictor with a fresh head for 2 classes.
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(
    in_features, num_classes=2
)
model.to(DEVICE)
model.train()


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [None]:
# Optional warm start from a previously saved state dict.
checkpoint_path = Path("model.pth")
if checkpoint_path.exists():
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
    # NOTE(review): only load checkpoints from a trusted source — torch.load
    # unpickles the file.
    state_dict = torch.load(checkpoint_path, map_location=DEVICE)
    model.load_state_dict(state_dict)
else:
    # On a fresh run the checkpoint is only written at the end of the
    # notebook, so a missing file is expected rather than an error.
    print(f"Checkpoint {checkpoint_path} not found; keeping current weights.")
model.to(DEVICE)

In [185]:
# Hand the optimizer only the parameters that still require gradients.
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.AdamW(
    params,
    lr=LR,
    weight_decay=1e-4,
)

In [196]:
# Environment sanity check: print the path of the Python interpreter that
# backs this kernel.
import sys
print(sys.executable)

C:\Users\festa\PycharmProjects\object_detect\gpu-env\Scripts\python.exe


In [186]:
for epoch in range(NUM_EPOCHS):
    # Put the model in training mode at the start of every epoch.
    model.train()
    total_loss = 0.0
    for imgs, targets in train_loader:
        imgs = [img.to(DEVICE) for img in imgs]

        # Forward only the box and label tensors; the remaining target
        # entries are dropped before the model call.
        targets = [
            {key: t[key].to(DEVICE) for key in t if key in ("boxes", "labels")}
            for t in targets
        ]

        # In training mode the model returns a dict of losses; add them up.
        loss_dict = model(imgs, targets)
        losses = sum(loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        total_loss += losses.item()

    # Mean loss per batch over the epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Loss: {avg_loss:.4f}")

Epoch [1/5], Loss: 1.2420
Epoch [2/5], Loss: 0.9768
Epoch [3/5], Loss: 0.8599
Epoch [4/5], Loss: 0.7826
Epoch [5/5], Loss: 0.7281


In [187]:
model.eval()

results = []

# Inference only: without no_grad the outputs would carry autograd history
# and waste memory.
with torch.no_grad():
    for images, targets in val_loader:
        images = [img.to(DEVICE) for img in images]
        outputs = model(images)

        for target, output in zip(targets, outputs):
            # The dataset always supplies image_id as a one-element tensor.
            image_id = int(target['image_id'].item())

            # Convert [x1, y1, x2, y2] to [x, y, w, h] once per image, on a
            # CPU copy so the model output is left untouched.
            boxes_xywh = output['boxes'].detach().cpu().clone()
            boxes_xywh[:, 2:] -= boxes_xywh[:, :2]

            detections = zip(
                boxes_xywh.tolist(),
                output['labels'].tolist(),
                output['scores'].tolist(),
            )
            for bbox, label, score in detections:
                results.append({
                    "image_id": image_id,
                    "category_id": int(label),
                    "bbox": [float(v) for v in bbox],
                    "score": float(score)
                })

with open("results_val.json", "w") as f:
    json.dump(results, f)

In [188]:
def draw_transparent_box(img, xyxy, label_text, score, box_color=(0,255,0), alpha=0.25):
    """Draw a semi-transparent filled box with an outline and a score label, in place.

    Args:
        img: Image array that is modified in place.
        xyxy: Box corners ``(x1, y1, x2, y2)``; values are truncated to int.
        label_text: Text shown before the score.
        score: Value rendered with two decimals after the label.
        box_color: Colour of the fill and outline.
        alpha: Weight of the filled overlay when blended with the image.
    """
    x1, y1, x2, y2 = map(int, xyxy)
    overlay = img.copy()

    # Fill the box on a copy, then blend the copy back for the transparency.
    cv2.rectangle(overlay, (x1, y1), (x2, y2), box_color, -1)
    cv2.addWeighted(overlay, alpha, img, 1 - alpha, 0, img)

    cv2.rectangle(img, (x1, y1), (x2, y2), box_color, 2)

    text = f"{label_text}: {score:.2f}"
    (tw, th), baseline = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
    tx = max(0, x1)
    ty = y1 - 6
    if ty - th - baseline < 0:
        # Not enough room above the box: the label would be drawn at negative
        # y and clipped. Place it just inside the top edge instead.
        ty = max(0, y1) + th + baseline

    cv2.rectangle(img, (tx, ty - th - baseline), (tx + tw, ty + baseline), (0,0,0), -1)
    cv2.putText(img, text, (tx, ty - 2), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,255,255), 1, cv2.LINE_AA)

In [189]:
def non_max_suppression(boxes, scores, iou_threshold=0.5):
    """Run torchvision NMS and return the kept indices as a NumPy array.

    Args:
        boxes: Boxes as ``[x1, y1, x2, y2]`` rows (array, tensor or nested list).
        scores: One score per box.
        iou_threshold: Boxes overlapping a higher-scored box by more than this
            IoU are dropped.

    Returns:
        int64 NumPy array of kept indices; empty when there are no boxes.
    """
    # as_tensor avoids the extra copy (and warning) that torch.tensor makes
    # when the input is already a tensor.
    boxes_t = torch.as_tensor(boxes, dtype=torch.float32)
    scores_t = torch.as_tensor(scores, dtype=torch.float32)
    if boxes_t.numel() == 0:
        # Nothing to suppress; an empty list would otherwise reach nms with
        # the wrong shape.
        return np.empty((0,), dtype=np.int64)
    idxs = torchvision.ops.nms(boxes_t, scores_t, iou_threshold)
    # .cpu() keeps this working when the inputs were CUDA tensors.
    return idxs.cpu().numpy()

In [190]:
model.eval()
cap = cv2.VideoCapture(INPUT_VIDEO)
if not cap.isOpened():
    raise RuntimeError(f"Не удалось открыть видео {INPUT_VIDEO}")
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
# Some containers report 0 FPS; fall back to 25.
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
# The output frame size matches the resized frames written by the loop.
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) * SCALE)
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) * SCALE)
out = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, fps / DOWNSAMPLE_FOR_SPEED, (width, height))
if not out.isOpened():
    # A writer that failed to open silently drops every frame, so fail
    # early and do not leak the capture handle.
    cap.release()
    raise RuntimeError(f"Не удалось открыть файл для записи {OUTPUT_VIDEO}")
# Counters updated by the processing loop.
frame_idx = 0
total_time = 0.0
processed = 0

In [191]:
coco_boxes = []
# CUDA kernels are launched asynchronously, so the device must be
# synchronised around the timed region for the measurement to be meaningful.
sync_cuda = DEVICE == "cuda"
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_idx += 1

        # The writer FPS is divided by DOWNSAMPLE_FOR_SPEED, so only every
        # N-th frame may be processed and written to keep playback speed
        # unchanged. With the value 1 no frame is skipped.
        if DOWNSAMPLE_FOR_SPEED > 1 and (frame_idx - 1) % DOWNSAMPLE_FOR_SPEED != 0:
            continue

        if SCALE != 1.0:
            frame = cv2.resize(frame, (width, height))

        img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        tensor = transform(img_rgb).to(DEVICE)

        if sync_cuda:
            torch.cuda.synchronize()
        t0 = time.perf_counter()
        with torch.no_grad():
            outputs = model([tensor])
        if sync_cuda:
            torch.cuda.synchronize()
        total_time += time.perf_counter() - t0
        processed += 1

        out_dict = outputs[0]
        boxes = out_dict['boxes'].cpu().numpy()
        labels = out_dict['labels'].cpu().numpy()
        scores = out_dict['scores'].cpu().numpy()

        # Keep person detections only.
        mask_person = labels == PERSON_CLASS_ID
        boxes = boxes[mask_person]
        scores = scores[mask_person]

        # Drop low-confidence detections.
        keep_mask = scores >= CONF_THRESH
        boxes = boxes[keep_mask]
        scores = scores[keep_mask]

        if len(boxes) > 0:
            keep_idxs = non_max_suppression(boxes, scores, iou_threshold=0.5)
            boxes = boxes[keep_idxs]
            scores = scores[keep_idxs]

            for bb, sc in zip(boxes, scores):
                x1, y1, x2, y2 = bb
                draw_transparent_box(frame, (x1, y1, x2, y2), "person", float(sc), box_color=(0,200,0), alpha=0.18)
                w, h = x2 - x1, y2 - y1
                # Record the detection with an [x, y, w, h] box; image_id is
                # the 1-based frame number.
                coco_boxes.append({"image_id": frame_idx,
                                    "category_id": PERSON_CLASS_ID,
                                    "bbox": [float(x1), float(y1), float(w), float(h)],
                                    "score": float(sc)
                                    })
        avg_inf = total_time / processed if processed > 0 else 0.0
        info_text = f"Inf time/frame: {avg_inf:.3f}s, device: {DEVICE}"
        cv2.putText(frame, info_text, (10, height - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.45, (255,255,255), 1, cv2.LINE_AA)
        out.write(frame)

finally:
    # Always release the handles so the output file is finalised.
    cap.release()
    out.release()


In [192]:
# Guard against a video that produced no frames; dividing by zero here would
# hide the real problem behind a ZeroDivisionError.
avg_inference_time = total_time / processed if processed else 0.0
print(f"Сохранено в {OUTPUT_VIDEO}. Среднее время инференса на кадр = {avg_inference_time:.3f}s (для {processed} кадров).")

Сохранено в C:\Users\festa\PycharmProjects\object_detect\crowd_1.mp4. Среднее время инференса на кадр = 0.082s (для 705 кадров).


In [193]:
def _scale_coco_inplace(coco, scale):
    """Multiply all geometry in a COCO-style dict by ``scale``, in place.

    Scales annotation boxes, areas (by ``scale`` squared), polygon
    segmentations and image ``width``/``height``, so the file stays
    consistent. Non-list (e.g. RLE) segmentations are left untouched.
    """
    for image in coco.get("images", []):
        for key in ("width", "height"):
            if key in image:
                # Same rounding as the frame resize: int(size * scale).
                image[key] = int(image[key] * scale)

    for ann in coco["annotations"]:
        x, y, w, h = ann["bbox"]
        ann["bbox"] = [x * scale, y * scale, w * scale, h * scale]
        if "area" in ann:
            ann["area"] = ann["area"] * scale * scale
        segmentation = ann.get("segmentation")
        if isinstance(segmentation, list):
            ann["segmentation"] = [
                [coord * scale for coord in poly] if isinstance(poly, list) else poly
                for poly in segmentation
            ]


# Read the same annotation file the dataset uses rather than a second
# hard-coded copy of its name.
with open(ANNOTATION_FILE, encoding="utf-8") as f:
    data = json.load(f)

_scale_coco_inplace(data, SCALE)

with open("instances_scaled.json", "w") as f:
    json.dump(data, f, indent=2)

print("BBoxes масштабированы под SCALE =", SCALE)

BBoxes масштабированы под SCALE = 2.0


In [194]:
# Persist the detections collected in `coco_boxes` as indented JSON.
Path("results.json").write_text(json.dumps(coco_boxes, indent=2))

print("Результаты сохранены в results.json")

Результаты сохранены в results.json


In [195]:
# Save only the model's state dict (not the whole module) to model.pth; this
# is the file the checkpoint-loading cell reads with load_state_dict.
torch.save(model.state_dict(), "model.pth")