In [1]:
#YOLOv8 Classification
#pip install ultralytics

from ultralytics import YOLO

# Load a model
model = YOLO("yolov8n.pt")  # load a pretrained model (recommended for training)

# Use the model
results = model("https://ultralytics.com/images/bus.jpg" , save=True)  # predict on an image

  from .autonotebook import tqdm as notebook_tqdm

Downloading https://ultralytics.com/images/bus.jpg to 'bus.jpg'...
100%|███████████████████████████████████████████████████████████████████████████████| 476k/476k [00:00<00:00, 1.02MB/s]
image 1/1 D:\Computer_Vision\class\0515\bus.jpg: 640x480 4 persons, 1 bus, 1 stop sign, 535.1ms
Speed: 24.9ms preprocess, 535.1ms inference, 9.0ms postprocess per image at shape (1, 3, 640, 480)
Results saved to [1mruns\detect\predict[0m


In [2]:
#YOLOv8 Classification
#pip install ultralytics

from ultralytics import YOLO
import cv2
import numpy as np

model = YOLO('yolov8n.pt')
cap = cv2.VideoCapture(0)
#cap.set(3, 640)
#cap.set(4, 480)

while True:
    ret, img = cap.read()
    if ret == False:
        break
    img = cv2.flip(img, 1)
    # BGR to RGB conversion is performed under the hood
    # see: https://github.com/ultralytics/ultralytics/issues/2575
    results = model.predict(img, conf=0.25)  # Set your desired confidence threshold, default = 0.25

    for result in results:
        for box in result.boxes:
            print(box.xyxy) #box.xyxy.cpu()
            left, top, right, bottom = np.array(box.xyxy, dtype=np.uint16).squeeze() #convert from tensor to list
            print(left, top, right, bottom)
            width = right - left
            height = bottom - top
            center = (left + int((right-left)/2), top + int((bottom-top)/2))
            label = results[0].names[int(box.cls)]
            confidence = float(box.conf.cpu())

            cv2.rectangle(img, (left, top),(right, bottom), (255, 0, 0), 2)
            cv2.putText(img, label+' '+'{:.2f}'.format(confidence),(left+5, bottom-10),cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 1, cv2.LINE_AA)
        
    cv2.imshow('YOLO V8 Detection', img)     
    if cv2.waitKey(1) & 0xFF == 27:
        break

cap.release()
cv2.destroyAllWindows()


0: 480x640 (no detections), 486.0ms
Speed: 5.0ms preprocess, 486.0ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cup, 1 keyboard, 406.5ms
Speed: 6.0ms preprocess, 406.5ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 14.4951,   1.0375, 524.7303, 359.1754]])
14 1 524 359
tensor([[  1.1997, 332.9853, 450.9841, 413.9505]])
1 332 450 413
tensor([[498.9273, 132.4090, 639.7971, 357.1010]])
498 132 639 357


0: 480x640 1 person, 2 keyboards, 621.2ms
Speed: 7.0ms preprocess, 621.2ms inference, 4.5ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 57.4367,   0.0000, 525.5173, 361.8698]])
57 0 525 361
tensor([[  0.4278, 331.2693, 427.6001, 410.0147]])
0 331 427 410
tensor([[  0.7789, 332.8336, 639.4015, 428.5322]])
0 332 639 428


0: 480x640 1 person, 2 keyboards, 362.6ms
Speed: 12.3ms preprocess, 362.6ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[4.3441e+01, 2.3676e-01, 5.4443e+02, 3.6106e+02]])
43 0 544 361
tensor([[4.3048e-01, 3.3203e+02, 4.3482e+02, 4.0543e+02]])
0 332 434 405
tensor([[  2.0720, 331.4066, 639.5984, 428.1846]])
2 331 639 428


0: 480x640 1 person, 2 keyboards, 454.5ms
Speed: 7.0ms preprocess, 454.5ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 45.6898,   0.5630, 533.1709, 359.9066]])
45 0 533 359
tensor([[  0.5806, 332.3620, 457.7258, 406.4760]])
0 332 457 406
tensor([[  3.9765, 339.6194, 639.8061, 421.6728]])
3 339 639 421


0: 480x640 1 person, 2 keyboards, 331.1ms
Speed: 6.0ms preprocess, 331.1ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[2.6459e+01, 4.3372e-01, 5.4786e+02, 3.6278e+02]])
26 0 547 362
tensor([[  1.2313, 331.7412, 402.6336, 407.0311]])
1 331 402 407
tensor([[  1.0170, 333.4573, 638.8832, 432.9255]])
1 333 638 432


0: 480x640 1 person, 2 keyboards, 425.9ms
Speed: 7.0ms preprocess, 425.9ms inference, 5.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 40.4927,   0.0000, 561.7373, 363.5919]])
40 0 561 363
tensor([[  0.4321, 332.1569, 398.1884, 404.2536]])
0 332 398 404
tensor([[  0.0000, 334.8349, 613.6946, 416.9216]])
0 334 613 416


0: 480x640 1 person, 1 keyboard, 339.1ms
Speed: 7.0ms preprocess, 339.1ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[2.3480e+01, 5.1355e-01, 5.3261e+02, 3.5962e+02]])
23 0 532 359
tensor([[  1.5794, 332.6299, 514.4758, 413.5966]])
1 332 514 413


0: 480x640 1 person, 2 keyboards, 309.7ms
Speed: 3.0ms preprocess, 309.7ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 50.0420,   1.0793, 524.5593, 354.6973]])
50 1 524 354
tensor([[  0.4421, 330.9036, 416.8078, 407.6597]])
0 330 416 407
tensor([[  4.9692, 341.0447, 640.0000, 420.6089]])
4 341 640 420


0: 480x640 1 person, 2 keyboards, 294.2ms
Speed: 4.0ms preprocess, 294.2ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 47.6346,   2.6809, 529.0189, 364.8982]])
47 2 529 364
tensor([[  0.4663, 332.1002, 403.6017, 405.8344]])
0 332 403 405
tensor([[  2.0291, 331.5338, 547.1880, 414.5905]])
2 331 547 414


0: 480x640 1 person, 2 keyboards, 302.2ms
Speed: 7.0ms preprocess, 302.2ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 49.2817,   2.4982, 514.8207, 358.7042]])
49 2 514 358
tensor([[  0.7455, 331.5558, 436.2049, 404.7783]])
0 331 436 404
tensor([[3.6615e-01, 3.3755e+02, 6.4000e+02, 4.3022e+02]])
0 337 640 430


0: 480x640 1 person, 2 keyboards, 364.0ms
Speed: 8.0ms preprocess, 364.0ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[3.8635e+01, 3.1876e-01, 5.5753e+02, 3.6710e+02]])
38 0 557 367
tensor([[  0.6598, 331.1754, 450.7448, 411.4106]])
0 331 450 411
tensor([[  1.7671, 335.7587, 640.0000, 414.4851]])
1 335 640 414


0: 480x640 1 person, 2 keyboards, 362.1ms
Speed: 15.0ms preprocess, 362.1ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 54.3694,   1.1538, 597.9576, 363.1435]])
54 1 597 363
tensor([[  0.6179, 332.4188, 431.2454, 406.4610]])
0 332 431 406
tensor([[  0.9731, 337.6719, 640.0000, 418.0867]])
0 337 640 418


0: 480x640 1 person, 2 keyboards, 300.2ms
Speed: 4.0ms preprocess, 300.2ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 37.2806,   1.7401, 573.8551, 369.6096]])
37 1 573 369
tensor([[  0.5646, 332.1410, 443.2630, 410.1425]])
0 332 443 410
tensor([[  3.3140, 339.5142, 640.0000, 421.6809]])
3 339 640 421


0: 480x640 1 person, 2 keyboards, 292.2ms
Speed: 4.0ms preprocess, 292.2ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 57.8177,   1.1301, 611.2139, 376.3822]])
57 1 611 376
tensor([[  0.5645, 332.1559, 445.6667, 410.2110]])
0 332 445 410
tensor([[3.7427e-01, 3.3116e+02, 6.1818e+02, 4.2461e+02]])
0 331 618 424


0: 480x640 1 person, 2 keyboards, 288.7ms
Speed: 7.0ms preprocess, 288.7ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 42.0582,   1.2169, 597.5853, 367.0310]])
42 1 597 367
tensor([[  0.5869, 331.4472, 435.4838, 412.8809]])
0 331 435 412
tensor([[  0.8727, 335.8755, 640.0000, 419.8757]])
0 335 640 419


0: 480x640 1 person, 2 keyboards, 355.6ms
Speed: 7.0ms preprocess, 355.6ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 47.6880,   1.1813, 573.4406, 361.6923]])
47 1 573 361
tensor([[  0.5378, 331.7216, 447.4998, 408.5811]])
0 331 447 408
tensor([[  4.8061, 339.8619, 639.7431, 418.6471]])
4 339 639 418


0: 480x640 1 person, 2 keyboards, 292.7ms
Speed: 7.0ms preprocess, 292.7ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 33.3296,   1.5632, 590.9714, 367.4719]])
33 1 590 367
tensor([[  1.6326, 334.6691, 640.0000, 417.1328]])
1 334 640 417
tensor([[  0.5663, 331.9673, 411.6721, 407.4595]])
0 331 411 407


0: 480x640 1 person, 2 keyboards, 289.2ms
Speed: 9.0ms preprocess, 289.2ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 56.0801,   0.6935, 589.3120, 374.0099]])
56 0 589 374
tensor([[  0.5608, 332.2816, 476.7002, 405.8185]])
0 332 476 405
tensor([[  5.4612, 340.8344, 639.4777, 418.9467]])
5 340 639 418


0: 480x640 1 person, 2 keyboards, 311.7ms
Speed: 7.0ms preprocess, 311.7ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 40.1199,   1.3483, 586.1220, 366.5460]])
40 1 586 366
tensor([[  0.5047, 332.6569, 408.1354, 405.5779]])
0 332 408 405
tensor([[  1.4206, 335.4365, 640.0000, 416.2056]])
1 335 640 416


0: 480x640 1 person, 2 keyboards, 299.2ms
Speed: 5.0ms preprocess, 299.2ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[5.1631e+01, 3.1845e-01, 5.5111e+02, 3.6049e+02]])
51 0 551 360
tensor([[  0.6315, 331.3934, 474.1183, 415.6541]])
0 331 474 415
tensor([[  1.6951, 340.6926, 640.0000, 422.3241]])
1 340 640 422


0: 480x640 1 person, 2 keyboards, 282.3ms
Speed: 4.0ms preprocess, 282.3ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 28.6202,   0.0000, 534.7079, 364.8906]])
28 0 534 364
tensor([[  0.5234, 331.1763, 476.5544, 416.1917]])
0 331 476 416
tensor([[  1.9830, 335.2362, 640.0000, 417.9125]])
1 335 640 417


0: 480x640 1 person, 2 keyboards, 286.7ms
Speed: 11.0ms preprocess, 286.7ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 51.7392,   0.8460, 513.9304, 356.9515]])
51 0 513 356
tensor([[  0.5261, 331.2499, 465.2854, 417.0819]])
0 331 465 417
tensor([[  1.9949, 339.0797, 640.0000, 425.4233]])
1 339 640 425


0: 480x640 1 person, 2 keyboards, 285.2ms
Speed: 8.0ms preprocess, 285.2ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 48.5943,   0.0000, 510.1404, 353.9104]])
48 0 510 353
tensor([[  0.5600, 331.5117, 426.3686, 413.2901]])
0 331 426 413
tensor([[  1.4343, 335.8716, 640.0000, 418.3781]])
1 335 640 418


0: 480x640 1 person, 2 keyboards, 300.7ms
Speed: 9.0ms preprocess, 300.7ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[3.4384e+01, 3.0191e-01, 5.5482e+02, 3.5927e+02]])
34 0 554 359
tensor([[6.1890e-01, 3.3417e+02, 6.3926e+02, 4.4158e+02]])
0 334 639 441
tensor([[  0.5188, 331.0288, 462.2134, 412.7095]])
0 331 462 412


0: 480x640 1 person, 2 keyboards, 315.2ms
Speed: 6.0ms preprocess, 315.2ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 50.2416,   0.5321, 518.9005, 354.9004]])
50 0 518 354
tensor([[  0.4925, 330.7905, 440.7215, 413.8171]])
0 330 440 413
tensor([[  2.7285, 332.6278, 596.2950, 423.7166]])
2 332 596 423


0: 480x640 1 person, 2 keyboards, 289.2ms
Speed: 5.0ms preprocess, 289.2ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 50.4891,   0.9745, 531.4077, 362.2303]])
50 0 531 362
tensor([[  0.5671, 331.7377, 458.1943, 406.4651]])
0 331 458 406
tensor([[  2.1506, 333.3620, 580.0846, 424.4234]])
2 333 580 424


0: 480x640 1 person, 2 keyboards, 287.7ms
Speed: 4.0ms preprocess, 287.7ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 55.4550,   1.0298, 515.6289, 355.6089]])
55 1 515 355
tensor([[  0.5329, 331.4083, 417.3629, 408.7719]])
0 331 417 408
tensor([[2.0276e-01, 3.3219e+02, 6.3983e+02, 4.2845e+02]])
0 332 639 428


0: 480x640 1 person, 2 keyboards, 299.2ms
Speed: 4.0ms preprocess, 299.2ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 47.5201,   1.0051, 548.9156, 362.1961]])
47 1 548 362
tensor([[  0.5182, 331.6243, 424.6895, 412.8488]])
0 331 424 412
tensor([[  0.0000, 335.2504, 639.7819, 430.3438]])
0 335 639 430


0: 480x640 1 person, 2 keyboards, 302.7ms
Speed: 6.0ms preprocess, 302.7ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 46.0318,   0.9110, 538.3695, 360.1722]])
46 0 538 360
tensor([[  0.6234, 331.6467, 454.0021, 410.0708]])
0 331 454 410
tensor([[5.5035e-01, 3.3353e+02, 6.3970e+02, 4.3155e+02]])
0 333 639 431


0: 480x640 1 person, 1 keyboard, 273.3ms
Speed: 7.0ms preprocess, 273.3ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 51.4176,   1.3920, 512.2371, 356.8557]])
51 1 512 356
tensor([[  0.4325, 331.2148, 430.2378, 410.0185]])
0 331 430 410


0: 480x640 1 person, 2 keyboards, 286.7ms
Speed: 5.0ms preprocess, 286.7ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 50.2229,   1.9858, 512.6377, 355.1247]])
50 1 512 355
tensor([[  0.6025, 331.1154, 475.1950, 415.5295]])
0 331 475 415
tensor([[  1.4659, 336.8557, 640.0000, 415.2979]])
1 336 640 415


0: 480x640 1 person, 2 keyboards, 374.0ms
Speed: 6.0ms preprocess, 374.0ms inference, 18.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 48.0787,   0.6303, 570.2433, 365.0987]])
48 0 570 365
tensor([[  0.4853, 330.7632, 469.6505, 411.5789]])
0 330 469 411
tensor([[  3.1623, 335.5983, 640.0000, 422.4375]])
3 335 640 422


0: 480x640 1 person, 2 keyboards, 388.0ms
Speed: 5.0ms preprocess, 388.0ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 39.4070,   1.2854, 550.8613, 358.1499]])
39 1 550 358
tensor([[  0.5489, 330.9285, 459.0804, 410.4468]])
0 330 459 410
tensor([[  2.6451, 332.9634, 607.7883, 432.2184]])
2 332 607 432


0: 480x640 1 person, 2 keyboards, 342.1ms
Speed: 5.0ms preprocess, 342.1ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 48.9807,   1.1717, 555.4160, 360.9150]])
48 1 555 360
tensor([[  0.5385, 330.4208, 502.9276, 418.1483]])
0 330 502 418
tensor([[  6.7582, 339.7473, 639.4183, 421.7837]])
6 339 639 421


0: 480x640 1 person, 2 keyboards, 295.2ms
Speed: 4.0ms preprocess, 295.2ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[6.5580e+01, 6.9305e-02, 5.5115e+02, 3.6352e+02]])
65 0 551 363
tensor([[  0.6992, 332.2346, 424.3977, 408.1460]])
0 332 424 408
tensor([[  1.3326, 335.2369, 640.0000, 419.8247]])
1 335 640 419


0: 480x640 1 person, 1 chair, 3 keyboards, 287.2ms
Speed: 4.0ms preprocess, 287.2ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[5.3930e+01, 1.1722e-01, 5.8070e+02, 3.6847e+02]])
53 0 580 368
tensor([[  0.4749, 332.3427, 421.6941, 404.6927]])
0 332 421 404
tensor([[  5.5859, 339.5314, 639.4122, 421.2059]])
5 339 639 421
tensor([[  2.0662, 330.5584, 573.7052, 408.0768]])
2 330 573 408
tensor([[ 35.7975, 225.8548, 111.2858, 335.5005]])
35 225 111 335


0: 480x640 1 person, 2 keyboards, 282.8ms
Speed: 3.0ms preprocess, 282.8ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)



tensor([[ 65.2834,   1.1251, 529.4443, 360.7604]])
65 1 529 360
tensor([[  2.2452, 335.2304, 640.0000, 420.4541]])
2 335 640 420
tensor([[1.9904e-01, 3.3350e+02, 4.3778e+02, 4.1946e+02]])
0 333 437 419


0: 480x640 1 person, 3 keyboards, 292.2ms
Speed: 8.0ms preprocess, 292.2ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)


tensor([[  1.8519,   0.6808, 593.3256, 373.0809]])
1 0 593 373
tensor([[1.1093e-01, 3.3468e+02, 3.9375e+02, 4.0767e+02]])
0 334 393 407
tensor([[  0.6330, 333.9727, 551.3941, 412.8098]])
0 333 551 412
tensor([[  0.8573, 320.5051, 639.9390, 434.4242]])
0 320 639 434


## Practice : People detector and analyzer
1. Input images from wiiplay.mp4 with frame number between 41000 and 44000.
2. Use YOLOv8 to detect people, mark as red rectangle, and count how many persons in each frame. (hint: check label == 'person')
3. Try to find out which frame contains the most number of persons. (print the number of persons on the upper-left corner)
4. (optional) Try to find out which frame containes the largest person. (print the size of its bounding box on the upper-left corner)
5. (optional) Try to find out which frame containes the smallest person. (print the size of its bounding box on the upper-left corner)
6. Show the three output frames you found.
7. Verify the correctness of your output, then adjust the desired confidence threshold for improvement. 
8. Upload your Jupyter code file (*.ipynb)