In [None]:
from ultralytics import YOLO

model = YOLO("yolo11n.pt")  

results = model.train(data="data.yaml", epochs=100, imgsz=640)

In [11]:
from ultralytics import YOLO
import cv2
from PIL import Image, ImageDraw, ImageFont
import numpy as np

# Load YOLO models
person_model = YOLO("yolo11n.pt")
emotion_model = YOLO("runs/detect/train11/weights/last.pt")

# Mapping cảm xúc sang Tiếng Việt
emotion_mapping = {
    "Anger": "Giận dữ",
    "Contempt": "Khinh thường",
    "Disgust": "Chán ghét",
    "Fear": "Sợ hãi",
    "Happy": "Vui vẻ",
    "Neutral": "Bình thường",
    "Sad": "Buồn bã",
    "Surprise": "Ngạc nhiên",
}

# Load font Unicode (tải font Arial hoặc Tahoma nếu chưa có)
font_path = "arial.ttf"  # Thay đổi đường dẫn tới file font của bạn
font = ImageFont.truetype(font_path, 20)

# Open webcam
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Error: Could not open webcam.")
    exit()

while True:
    ret, frame = cap.read()
    if not ret:
        print("Error: Could not read frame.")
        break

    # Chuyển frame sang dạng PIL để vẽ chữ Unicode
    pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(pil_image)

    # Nhận diện người từ model đầu tiên
    person_results = person_model(frame)

    for result in person_results[0].boxes.data.tolist():
        x1, y1, x2, y2, score, class_id = result
        if int(class_id) == 0:  # Chỉ xử lý người
            person_crop = frame[int(y1):int(y2), int(x1):int(x2)]
            emotion_results = emotion_model(person_crop)

            if len(emotion_results[0].boxes.data) > 0:
                emotion_class_id = int(emotion_results[0].boxes.data[0][5])
                english_emotion = emotion_model.names[emotion_class_id]
                emotion_label = emotion_mapping.get(english_emotion, "Không xác định")
            else:
                emotion_label = "Không xác định"

            # Vẽ bounding box
            draw.rectangle([x1, y1, x2, y2], outline="Red", width=3)

            # Hiển thị nhãn cảm xúc bằng tiếng Việt
            draw.text((x1 + 5, y1 - 25), f"Cảm xúc: {emotion_label}", font=font, fill="red")

    # Chuyển lại về OpenCV để hiển thị
    frame = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    cv2.imshow("Nhan dien cam xuc", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()



0: 480x640 (no detections), 19.9ms
Speed: 2.6ms preprocess, 19.9ms inference, 1.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 16.9ms
Speed: 3.7ms preprocess, 16.9ms inference, 3.3ms postprocess per image at shape (1, 3, 480, 640)

0: 608x640 1 Contempt, 16.9ms
Speed: 4.7ms preprocess, 16.9ms inference, 1.0ms postprocess per image at shape (1, 3, 608, 640)

0: 480x640 1 person, 13.9ms
Speed: 1.0ms preprocess, 13.9ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 640x640 (no detections), 16.4ms
Speed: 3.6ms preprocess, 16.4ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 480x640 1 person, 13.8ms
Speed: 1.0ms preprocess, 13.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 544x640 1 Neutral, 13.5ms
Speed: 3.1ms preprocess, 13.5ms inference, 2.0ms postprocess per image at shape (1, 3, 544, 640)

0: 480x640 1 person, 13.0ms
Speed: 2.0ms preprocess, 13.0ms inference, 2.1ms postprocess per image a

In [15]:
!yolo task=detect mode=predict model="yolo11n.pt" source="IMG_20220711_180148.jpg"

Ultralytics 8.3.49 🚀 Python-3.12.4 torch-2.5.1+cu124 CUDA:0 (NVIDIA GeForce RTX 3050 Laptop GPU, 4096MiB)
YOLO11n summary (fused): 238 layers, 2,616,248 parameters, 0 gradients, 6.5 GFLOPs

image 1/1 d:\Downloads\Compressed\Train\IMG_20220711_180148.jpg: 480x640 16 persons, 1 umbrella, 60.2ms
Speed: 1.5ms preprocess, 60.2ms inference, 114.7ms postprocess per image at shape (1, 3, 480, 640)
Results saved to [1mruns\detect\predict3[0m
💡 Learn more at https://docs.ultralytics.com/modes/predict
