In [None]:
# !pip install ultralytics

In [14]:
import torch

# Sanity check: print the version string of the PyTorch build this kernel imports.
print(torch.__version__)

2.0.1+cu118


In [11]:
from ultralytics import YOLO
import cv2

# Minimum confidence a detection needs before it is logged and drawn.
CONF_THRESHOLD = 0.85

# Create a YOLOv8 model
model = YOLO("yolov8n.pt")

# Start webcam
cap = cv2.VideoCapture(0)

try:
    # The context manager closes the results file even if the loop raises.
    with open("object_locations.txt", "w") as f:
        while True:
            # Read one frame from the webcam
            ret, frame = cap.read()
            if not ret:
                break

            # predict() returns a list with one Results object per input image;
            # it is not a raw tensor, so it has no `.shape` to iterate over.
            results = model.predict(frame)
            boxes = results[0].boxes

            # boxes.xyxy holds corner coordinates (x1, y1, x2, y2) in pixels,
            # boxes.conf the confidence and boxes.cls the class index.
            detections = zip(boxes.xyxy.tolist(), boxes.conf.tolist(), boxes.cls.tolist())
            for bbox, class_score, class_value in detections:
                # Skip anything at or below the confidence threshold.
                if class_score <= CONF_THRESHOLD:
                    continue

                class_id = int(class_value)
                x1, y1, x2, y2 = (int(v) for v in bbox)

                # Log the detection as corner coordinates in pixels.
                f.write(f"Class: {class_id}, BBox: {[x1, y1, x2, y2]}\n")

                # Draw the bounding box
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

                # Draw the class and confidence score
                label = f"Class: {class_id}, Confidence: {class_score:.2f}"
                cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # Display the frame
            cv2.imshow("Frame", frame)

            # Break the loop if 'q' is pressed
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Cleanup: always release the camera and close the preview window.
    cap.release()
    cv2.destroyAllWindows()

Downloading https:\github.com\ultralytics\assets\releases\download\v0.0.0\yolov8n.pt to yolov8n.pt...
100%|██████████| 6.23M/6.23M [00:00<00:00, 14.9MB/s]


TypeError: Descriptors cannot not be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

In [None]:
from ultralytics import YOLO

# Load the YOLOv8 "m" checkpoint and export it in ONNX format with a fixed imgsz of [480, 640].
# NOTE(review): presumably (height, width) - it matches the (1, 3, 480, 640) input used by the
# ONNX Runtime cells below; confirm against the exporter documentation.
model = YOLO("yolov8m.pt") 
model.export(format="onnx", imgsz=[480,640])

In [None]:
# !pip install onnxruntime

In [4]:
import onnxruntime
import numpy as np

# Load the ONNX model
ort_session = onnxruntime.InferenceSession("yolov8m.onnx")

# Look up the names of the session's first input and first output
input_name = ort_session.get_inputs()[0].name
output_name = ort_session.get_outputs()[0].name

# Build a random float32 tensor with the chosen input size
input_shape = (1, 3, 480, 640)
x = np.random.random(size=input_shape).astype(np.float32)

# Run the model once, requesting only that output
result = ort_session.run(
    output_names=[output_name],
    input_feed={input_name: x},
)

# Print the result
print(result)

[array([[[5.1777692e+00, 1.0288152e+01, 1.7757238e+01, ...,
         5.3694403e+02, 5.7199719e+02, 5.8175946e+02],
        [3.7307391e+00, 3.7775056e+00, 4.2060409e+00, ...,
         4.2660059e+02, 4.1951974e+02, 4.0324100e+02],
        [1.0757503e+01, 2.0699392e+01, 3.4478863e+01, ...,
         2.6600211e+02, 2.8358252e+02, 2.6613248e+02],
        ...,
        [7.7486038e-07, 5.3644180e-07, 4.4703484e-07, ...,
         1.6391277e-06, 2.0861626e-06, 1.9073486e-06],
        [4.4703484e-07, 2.0861626e-07, 2.0861626e-07, ...,
         2.0265579e-06, 2.2649765e-06, 2.1755695e-06],
        [1.1920929e-06, 4.4703484e-07, 2.9802322e-07, ...,
         1.9073486e-06, 2.1457672e-06, 2.1457672e-06]]], dtype=float32)]


In [56]:
import cv2
import onnxruntime
import numpy as np
from PIL import Image
from torchvision import transforms

# Tunable settings for this cell.
ONNX_MODEL_PATH = "yolov8m.onnx"
INPUT_HEIGHT, INPUT_WIDTH = 480, 640  # must match the size the model was exported with
CONF_THRESHOLD = 0.85                 # minimum class score to keep a detection
IOU_THRESHOLD = 0.5                   # overlap above which NMS drops the weaker box

# Load the ONNX model
ort_session = onnxruntime.InferenceSession(ONNX_MODEL_PATH)

# Get the input and output names from the ONNX Runtime session
input_name = ort_session.get_inputs()[0].name
output_name = ort_session.get_outputs()[0].name

# Build the preprocessing pipeline once instead of once per frame.
# Only resize + scale to [0, 1]: ImageNet mean/std normalisation is not applied,
# because the YOLOv8 export expects plain 0-1 RGB input.
preprocess = transforms.Compose([
    transforms.Resize((INPUT_HEIGHT, INPUT_WIDTH)),
    transforms.ToTensor(),
])

# Create a VideoCapture for the webcam (or a video file)
cap = cv2.VideoCapture(0) # Use 0 for webcam, or replace with video file path

try:
    # The context manager closes the results file even if the loop raises.
    with open("object_locations.txt", "w") as f:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_h, frame_w = frame.shape[:2]

            # The model wants RGB, but `frame` must stay BGR because it is drawn on
            # and shown with OpenCV; convert into a separate array.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img = preprocess(Image.fromarray(rgb_frame)).unsqueeze(0).numpy()

            # Run the ONNX model
            results = ort_session.run([output_name], {input_name: img})

            # NOTE: the YOLOv8 detection export is expected to return an array of shape
            # (1, 4 + num_classes, num_candidates): rows 0-3 are cx, cy, w, h in
            # model-input pixels and the remaining rows are per-class scores.
            # Transpose so that each row is one candidate box.
            predictions = results[0][0].T
            class_scores = predictions[:, 4:]
            class_ids = class_scores.argmax(axis=1)
            confidences = class_scores.max(axis=1)

            # Keep only the confident candidates.
            keep = confidences > CONF_THRESHOLD
            candidates = predictions[keep]
            class_ids = class_ids[keep]
            confidences = confidences[keep]

            if len(candidates) > 0:
                # Convert centre/size boxes to top-left/size and rescale them from the
                # model input resolution to the resolution of the captured frame.
                scale_x = frame_w / INPUT_WIDTH
                scale_y = frame_h / INPUT_HEIGHT
                widths = candidates[:, 2] * scale_x
                heights = candidates[:, 3] * scale_y
                lefts = candidates[:, 0] * scale_x - widths / 2
                tops = candidates[:, 1] * scale_y - heights / 2
                nms_boxes = np.stack([lefts, tops, widths, heights], axis=1).tolist()

                # The raw output contains many overlapping candidates per object;
                # (class-agnostic) non-maximum suppression keeps the best one.
                kept = cv2.dnn.NMSBoxes(nms_boxes, confidences.tolist(), CONF_THRESHOLD, IOU_THRESHOLD)

                # The index container's shape differs between OpenCV versions; flatten it.
                for idx in np.array(kept, dtype=int).flatten():
                    left, top, width, height = nms_boxes[idx]
                    x1, y1 = int(left), int(top)
                    x2, y2 = int(left + width), int(top + height)
                    class_id = int(class_ids[idx])
                    confidence = float(confidences[idx])

                    # Log the detection as corner coordinates in frame pixels.
                    f.write(f"Class: {class_id}, BBox: {[x1, y1, x2, y2]}\n")

                    # Draw the bounding box
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

                    # Draw the class and confidence score
                    label = f"Class: {class_id}, Confidence: {confidence:.2f}"
                    cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # Display the frame
            cv2.imshow("Frame", frame)

            # Break the loop if 'q' is pressed
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Cleanup: always release the camera and close the preview window.
    cap.release()
    cv2.destroyAllWindows()


NoSuchFile: [ONNXRuntimeError] : 3 : NO_SUCHFILE : Load model from yolov8m.onnx failed:Load model yolov8m.onnx failed. File doesn't exist

In [None]:
import cv2

from yolov8 import YOLOv8

# Initialize the webcam
cap = cv2.VideoCapture(0)

# Initialize YOLOv8 object detector
model_path = "models/yolov8m.onnx"
yolov8_detector = YOLOv8(model_path, conf_thres=0.5, iou_thres=0.5)

cv2.namedWindow("Detected Objects", cv2.WINDOW_NORMAL)
try:
    while cap.isOpened():

        # Read frame from the video
        ret, frame = cap.read()

        if not ret:
            break

        # Run the detector on the frame; the returned values are not used directly
        # here because draw_detections() is called on the detector right after.
        boxes, scores, class_ids = yolov8_detector(frame)

        combined_img = yolov8_detector.draw_detections(frame)
        cv2.imshow("Detected Objects", combined_img)

        # Press key q to stop
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera and close the window even if detection raised;
    # otherwise the device stays locked until the kernel is restarted.
    cap.release()
    cv2.destroyAllWindows()

## YOLOV5

In [22]:
# !git clone https://github.com/ultralytics/yolov5.git
# %cd yolov5
# !pip install -r requirements.txt





In [23]:
# Show the PyTorch version, then load the pretrained 'yolov5s' model from the
# 'ultralytics/yolov5' repository through torch.hub.
# NOTE: this cell does not import torch itself; it relies on an earlier cell having done so.
print(torch.__version__)
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

Using cache found in C:\Users\user/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2023-7-17 Python-3.9.17 torch-2.0.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3060 Laptop GPU, 6144MiB)



2.0.1+cu118


Downloading https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s.pt to yolov5s.pt...
100%|██████████| 14.1M/14.1M [00:01<00:00, 9.48MB/s]

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


## 성능확인

In [12]:
import cv2
import torch
from PIL import Image
import time

# Load the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torch.hub.load('ultralytics/yolov5', 'yolov5s').to(device)  # use the appropriate model

# Initialize webcam
cap = cv2.VideoCapture(0)

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("Can't receive frame (stream end?). Exiting ...")
            break

        # OpenCV delivers BGR frames, while the model interprets PIL images as RGB;
        # convert first so inference does not run on colour-swapped input.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img = Image.fromarray(rgb_frame)

        # Perform inference. perf_counter() is a high-resolution monotonic clock;
        # time.time() can be too coarse to time a call lasting a few milliseconds.
        start_time = time.perf_counter()
        results = model(img)
        if device.type == 'cuda':
            # Wait for queued GPU work so the measurement covers the whole inference.
            torch.cuda.synchronize()
        inference_time = time.perf_counter() - start_time

        # Compute FPS (Frames Per Second); guard against a zero-length interval.
        fps = 1.0 / inference_time if inference_time > 0 else float('inf')

        # Draw bounding boxes and labels; render() returns one annotated image
        # per input image, and a single image was passed in.
        annotated_image = results.render()[0]

        # Print FPS
        print(f'FPS: {fps:.2f}')

        # The annotated image is RGB (same as the model input); convert back to
        # BGR because cv2.imshow expects BGR.
        annotated_image = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)

        # Display the resulting frame
        cv2.imshow('YOLOv5 Object Detection', annotated_image)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # When everything is done (or on error), release the capture
    cap.release()
    cv2.destroyAllWindows()

Using cache found in C:\Users\user/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2023-7-17 Python-3.9.17 torch-2.0.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3060 Laptop GPU, 6144MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


FPS: 75.86
FPS: 62.59
FPS: 77.04
FPS: 125.04
FPS: 99.94
FPS: 91.16
FPS: 91.07
FPS: 110.58
FPS: 111.43
FPS: 91.15
FPS: 100.01
FPS: 94.71
FPS: 111.11
FPS: 99.01
FPS: 68.72
FPS: 90.02
FPS: 111.34
FPS: 76.95
FPS: 83.33
FPS: 91.12
FPS: 85.72
FPS: 91.24
FPS: 99.79
FPS: 125.02
FPS: 111.03
FPS: 124.82
FPS: 125.03
FPS: 90.93
FPS: 83.34
FPS: 111.05
FPS: 66.77
FPS: 110.33
FPS: 105.34
FPS: 124.91
FPS: 111.40
FPS: 117.61
FPS: 121.19
FPS: 111.11
FPS: 69.93
FPS: 125.02
FPS: 125.44
FPS: 110.08
FPS: 110.74
FPS: 108.31
FPS: 122.64
FPS: 99.99
FPS: 110.95
FPS: 87.12
FPS: 126.74
FPS: 125.02
FPS: 91.74
FPS: 100.19
FPS: 111.37
FPS: 99.32
FPS: 58.75
FPS: 83.34
FPS: 74.13
FPS: 90.88
FPS: 83.34
FPS: 63.49
FPS: 83.48
FPS: 83.55
FPS: 95.20
FPS: 64.78
FPS: 90.93
FPS: 90.08
FPS: 76.90
FPS: 91.06
FPS: 103.28
FPS: 90.91
FPS: 100.01
FPS: 92.48
FPS: 83.47
FPS: 69.11
FPS: 85.91
FPS: 83.34
FPS: 90.73
FPS: 84.95
FPS: 90.92
FPS: 100.22
FPS: 105.57
FPS: 83.44
FPS: 106.97
FPS: 101.64
FPS: 90.46
FPS: 91.09
FPS: 99.99
FPS: 99.

## EXPORT AS ONNX

In [10]:
from ultralytics import YOLO
import matplotlib.pyplot as plt
import cv2
import torch

# Pick the GPU when CUDA is usable, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Fetch YOLOv5s through torch.hub and move it onto the selected device
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
model = model.to(device)

# Random 1x3x640x640 tensor used as the example input for the export
dummy_input = torch.randn((1, 3, 640, 640)).to(device)

# Write the model to "test.onnx" (opset 16, constant folding disabled, verbose logging on)
torch.onnx.export(
    model,
    dummy_input,
    "test.onnx",
    verbose=True,
    opset_version=16,
    do_constant_folding=False,
)

Using cache found in C:\Users\user/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2023-7-17 Python-3.9.17 torch-2.0.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3060 Laptop GPU, 6144MiB)

Downloading https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s.pt to yolov5s.pt...
100%|██████████| 14.1M/14.1M [00:03<00:00, 4.89MB/s]

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 
  y = self.model(im, augment=augment, visualize=visualize) if augment or visualize else self.model(im)
  if self.dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:


verbose: False, log level: Level.ERROR



In [13]:
from yolov5 import YOLOv5
import cv2
import numpy as np

# Minimum confidence a detection needs before it is logged and drawn.
CONF_THRESHOLD = 0.85

# Create a YOLOv5 model
model = YOLOv5("yolov5s.pt")

# Start webcam
cap = cv2.VideoCapture(0)
# Class-id -> label lookup in COCO order. If you are using a custom model, replace these names with your actual class names.
# NOTE(review): a few labels ("stop_sign", "sofa", "pottedplant", "diningtable", "tvmonitor") look like
# they differ from the names shipped with YOLOv5 - confirm before relying on the label text.
class_names = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop_sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"]

# Performance metrics (placeholders: they are never updated because a live
# webcam feed has no ground truth to compare against).
tp, fp, fn = 0, 0, 0

try:
    # The context manager closes the results file even if the loop raises.
    with open("object_locations_yolov5.txt", "w") as f:
        while True:
            # Read one frame from the webcam
            ret, frame = cap.read()
            if not ret:
                break

            # Predict using the YOLO model
            results = model.predict(frame)
            detections = results.pred[0]  # Use the 'pred' attribute to get the detections

            # Each row is (x1, y1, x2, y2, confidence, class): corner coordinates that
            # are already in pixels of the input frame, so no rescaling and no
            # centre/size conversion is needed.
            for *bbox, confidence, class_id in detections.tolist():
                # We will consider the detection valid if the confidence score is greater than 0.85
                if confidence <= CONF_THRESHOLD:
                    continue

                x1, y1, x2, y2 = (int(v) for v in bbox)
                class_id = int(class_id)

                # Write to file (corner coordinates in pixels)
                f.write(f"Class: {class_id}, BBox: {[x1, y1, x2, y2]}\n")

                # Draw the bounding box
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

                # Draw the class and confidence score
                class_name = class_names[class_id]
                label = f"{class_name}: {confidence:.2f}"

                cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # Display the frame
            cv2.imshow("Frame", frame)

            # Break the loop if 'q' is pressed
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Cleanup: always release the camera and close the preview window.
    cap.release()
    cv2.destroyAllWindows()

YOLOv5  2023-7-18 Python-3.9.17 torch-2.0.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3060 Laptop GPU, 6144MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


## ONLY PERSON

In [6]:
from yolov5 import YOLOv5
import cv2

# Minimum confidence a detection needs before it is logged and drawn.
CONF_THRESHOLD = 0.85

# Create a YOLOv5 model
model = YOLOv5("yolov5s.pt")

# Start webcam
cap = cv2.VideoCapture(0)

# Class-id -> label lookup in COCO order.
# NOTE(review): a few labels ("stop_sign", "sofa", "pottedplant", "diningtable", "tvmonitor") look like
# they differ from the names shipped with YOLOv5 - confirm before relying on the label text.
class_names = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop_sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"]

# Look the target class id up once instead of once per detection.
PERSON_CLASS_ID = class_names.index('person')

try:
    # The context manager closes the results file even if the loop raises.
    with open("person_locations_yolov5.txt", "w") as f:
        while True:
            # Read one frame from the webcam
            ret, frame = cap.read()
            if not ret:
                break

            # Predict using the YOLO model
            results = model.predict(frame)
            detections = results.pred[0]  # Use the 'pred' attribute to get the detections

            # Each row is (x1, y1, x2, y2, confidence, class): corner coordinates that
            # are already in pixels of the input frame, so no rescaling and no
            # centre/size conversion is needed.
            for *bbox, confidence, class_id in detections.tolist():
                class_id = int(class_id)

                # Only keep confident detections (score greater than 0.85) of the 'person' class
                if confidence <= CONF_THRESHOLD or class_id != PERSON_CLASS_ID:
                    continue

                x1, y1, x2, y2 = (int(v) for v in bbox)

                # Write to file (corner coordinates in pixels)
                f.write(f"Class: {class_id}, BBox: {[x1, y1, x2, y2]}\n")

                # Draw the bounding box
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

                # Draw the class and confidence score
                class_name = class_names[class_id]
                label = f"{class_name}: {confidence:.2f}"

                cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # Display the frame
            cv2.imshow("Frame", frame)

            # Break the loop if 'q' is pressed
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Cleanup: always release the camera and close the preview window.
    cap.release()
    cv2.destroyAllWindows()

YOLOv5  2023-7-18 Python-3.9.17 torch-2.0.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3060 Laptop GPU, 6144MiB)

  from .autonotebook import tqdm as notebook_tqdm
Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 
