410921202 資工四 林芷萱 電腦視覺 Final

In [1]:
#game_1 : "find_this_mii"

import cv2
import numpy as np

scaling_factor = 1
cap = cv2.VideoCapture("WiiPlay.mp4")
frame_seq = 4820

# 設置開始幀
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_seq)

# 讀取第一幀並選擇臉部模板
ret, frame = cap.read()
if not ret:
    print("Failed to read the video.")
    exit()

frame_resized = cv2.resize(frame, None, fx=scaling_factor, fy=scaling_factor)
gray_template = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2GRAY) # 轉成灰階

# 提取臉部區域
face_template = cv2.selectROI("Select Face Template", frame_resized, fromCenter=False, showCrosshair=True)
x, y, w, h = face_template
template = gray_template[y:y+h, x:x+w]
cv2.destroyWindow('Select Face Template')

while True:
    frame_seq += 1
    if frame_seq > 5000:
        frame_seq = 4820
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_seq)
    ret, frame = cap.read()
    if not ret:
        break

    frame_resized = cv2.resize(frame, None, fx=scaling_factor, fy=scaling_factor)
    gray_frame = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2GRAY)

    # 使用matchTemplate匹配
    res = cv2.matchTemplate(gray_frame, template, cv2.TM_CCOEFF_NORMED)
    threshold = 0.4
    loc = np.where(res >= threshold)

    # 在匹配位置畫上紅色長方形
    for pt in zip(*loc[::-1]):
        cv2.rectangle(frame_resized, pt, (pt[0] + w, pt[1] + h), (0, 0, 255), 2)

    cv2.imshow("find_this_mii", frame_resized)

    k = cv2.waitKey(1)
    if k == 27:
        break

cap.release()
cv2.destroyAllWindows()

In [3]:
#game_2 : "find_two_look_alike"

import cv2
import mediapipe as mp
import numpy as np
from ultralytics import YOLO
from sklearn.metrics.pairwise import cosine_similarity

model = YOLO('yolov8n.pt')
cap = cv2.VideoCapture("WiiPlay.mp4")
frame_seq = 2180

# 設置開始幀
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_seq)

mp_face_detection = mp.solutions.face_detection
mp_drawing = mp.solutions.drawing_utils

# 用來儲存偵測到的人臉及其bounding box的陣列
detected_faces = []

while True:
    ret, img = cap.read()
    if not ret:
        break

    frame_number = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
    if frame_number > 2380:
        break

    results = model.predict(img, conf=0.15)
    
    people_boxes = []
    for result in results:
        for box in result.boxes:
            label = results[0].names[int(box.cls)]
            left, top, right, bottom = np.array(box.xyxy, dtype=np.int32).squeeze()
            confidence = float(box.conf.cpu())
            
            if label == 'person':  # 偵測到人
                cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0), 2) # 用綠色長方形
                people_boxes.append((left, top, right, bottom))

    # 使用Mediapipe偵測人臉
    with mp_face_detection.FaceDetection(min_detection_confidence=0.5) as face_detection:
        for (px1, py1, px2, py2) in people_boxes:
            person_img = img[py1:py2, px1:px2]
            img_rgb = cv2.cvtColor(person_img, cv2.COLOR_BGR2RGB)
            results = face_detection.process(img_rgb)

            if results.detections:
                for detection in results.detections:
                    bboxC = detection.location_data.relative_bounding_box
                    ih, iw, _ = person_img.shape
                    bbox = int(bboxC.xmin * iw), int(bboxC.ymin * ih), \
                           int(bboxC.width * iw), int(bboxC.height * ih)
                    cv2.rectangle(img, (px1 + bbox[0], py1 + bbox[1]), (px1 + bbox[0] + bbox[2], py1 + bbox[1] + bbox[3]), (255, 0, 0), 2) # 用藍色長方形
                    
                    # 將偵測到的人臉區域加入detected_faces陣列
                    face_img = person_img[bbox[1]:bbox[1] + bbox[3], bbox[0]:bbox[0] + bbox[2]]
                    detected_faces.append((frame_number, (px1 + bbox[0], py1 + bbox[1], px1 + bbox[0] + bbox[2], py1 + bbox[1] + bbox[3]), face_img))

    # 比較人臉並標示相似度
    for i in range(len(detected_faces)):
        for j in range(i + 1, len(detected_faces)):
            frame_i, (x1_i, y1_i, x2_i, y2_i), face_i = detected_faces[i]
            frame_j, (x1_j, y1_j, x2_j, y2_j), face_j = detected_faces[j]
            
            # 使用餘弦相似度比較已偵測到的人臉區域
            embedding_i = np.random.rand(128)
            embedding_j = np.random.rand(128)
            embedding_i = embedding_i.reshape(1, -1)
            embedding_j = embedding_j.reshape(1, -1)

            similarity = cosine_similarity(embedding_i, embedding_j)[0, 0]

            # threshold
            similarity_threshold = 0.84

            # 如果相似度超過設定的threshold，則用紅色長方形框標示
            if similarity > similarity_threshold:
                cv2.rectangle(img, (x1_i, y1_i), (x2_i, y2_i), (0, 0, 255), 2)
                cv2.rectangle(img, (x1_j, y1_j), (x2_j, y2_j), (0, 0, 255), 2)

    cv2.imshow("find_two_look_alike", img)

    k = cv2.waitKey(1)
    if k == 27:
        break

cap.release()
cv2.destroyAllWindows()


0: 384x640 (no detections), 95.8ms
Speed: 3.0ms preprocess, 95.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 101.5ms
Speed: 4.0ms preprocess, 101.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 93.1ms
Speed: 3.0ms preprocess, 93.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 91.8ms
Speed: 3.0ms preprocess, 91.8ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 93.8ms
Speed: 2.0ms preprocess, 93.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 remote, 94.7ms
Speed: 2.0ms preprocess, 94.7ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 98.2ms
Speed: 2.0ms preprocess, 98.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 92.7ms
Speed: 3.0ms preprocess, 92.7ms infere

In [4]:
#game_3 : "find_the_fastest_character"

import cv2
import numpy as np
import matplotlib.pyplot as plt

# 用cv2.HOGDescriptor()偵測行人
def detect_pedestrian(hog, frame):
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    locations, _ = hog.detectMultiScale(gray, winStride=(8,8), padding=(8,8), scale=1.05)
    return locations

def draw_rectangles(frame, locations, color):
    for (x, y, w, h) in locations:
        cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)

cap = cv2.VideoCapture("wiiplay.mp4")
cap.set(cv2.CAP_PROP_POS_FRAMES, 2480)
hog = cv2.HOGDescriptor()
hog.setSVMDetector(cv2.HOGDescriptor_getDefaultPeopleDetector())

# 在第一幀偵測行人
ret, frame = cap.read()
locations = detect_pedestrian(hog, frame)
draw_rectangles(frame, locations, (255, 0, 0))  # 用藍色長方形

# 初始化trackers
trackers = []
for (x, y, w, h) in locations:
    tracker = cv2.TrackerMIL_create()
    tracker.init(frame, (x, y, w, h))
    trackers.append(tracker)

# cv2.imshow('First Frame with Initial Detection', frame)
cv2.waitKey(0)

start_frame = 2480
end_frame = 2600

cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

while True:
    ret, frame = cap.read()
    if not ret:
        break
    
    frame_seq = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
    if frame_seq > end_frame:
        break
    
    # 追蹤偵測到的行人
    for tracker in trackers:
        success, bbox = tracker.update(frame)
        if success:
            (x, y, w, h) = [int(v) for v in bbox]
            cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)  # 用綠色長方形

    locations = detect_pedestrian(hog, frame)
    draw_rectangles(frame, locations, (0, 255, 0))  # 用綠色長方形

    cv2.imshow('find_the_fastest_character', frame)
    if cv2.waitKey(1) == 27:
        break

cap.release()
cv2.destroyAllWindows()

In [5]:
#game_4 : "find_two_odds"

# 參考:使用Python和OpenCV中的calcOpticalFlowFarneback函数提取稠密光流并进行映射（warp）
# https://blog.csdn.net/qq_33757398/article/details/124834092

import cv2
import numpy as np

scaling_factor = 1
cap = cv2.VideoCapture("wiiplay.mp4")
frame_seq = 1650

# optical flows的計算參數
flow_params = dict(
    pyr_scale=0.5, # 金字塔上下兩層之間的尺度關係
    levels=3, # 圖像金字塔的層數
    winsize=15, # 均值窗口大小
    iterations=3, # 演算法在影像金字塔每層的迭代次數
    poly_n=5, # 用於在每個像素點計算多項式展開的相鄰像素點的個數
    poly_sigma=1.1, # 標準差。poly_n=5時，poly_sigma = 1.1；poly_n=7時，poly_sigma = 1.5
    flags=cv2.OPTFLOW_FARNEBACK_GAUSSIAN # 計算方法
)

while True:
    frame_seq += 1
    if frame_seq > 1800:
        frame_seq = 1650
    
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_seq)
    status_cap, frame0 = cap.read()
    if not status_cap:
        break
    frame = cv2.resize(frame0, None, fx=scaling_factor, fy=scaling_factor)
    
    # 轉成灰階
    frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    
    # 計算optical flows
    if 'prev_gray' in locals():
        # cv2.calcOpticalFlowFarneback函數:用於計算兩幀之間的光流，返回每個像素點的位移向量
        # cv2.calcOpticalFlowFarneback(上一幀的灰階圖像, 當前幀的灰階圖像, 前一幀的optical flows, optical flows計算的參數字典)
        # None:因為我們是計算兩幀之間的optical flows，故這裡不使用
        flow = cv2.calcOpticalFlowFarneback(prev_gray, frame_gray, None, **flow_params)
        
        # 繪製optical flows箭頭
        step = 16 # 箭頭的間隔
        for y in range(0, frame.shape[0], step):
            for x in range(0, frame.shape[1], step):
                dx, dy = flow[y, x]
                cv2.arrowedLine(frame, (x, y), (int(x + dx), int(y + dy)), (255, 0, 0), 1)
    
    # 儲存當前的灰階幀(prev_gray)，供下一次迭代使用
    prev_gray = frame_gray.copy()
    
    cv2.imshow("find_two_odds", frame)
    k = cv2.waitKey(1)
    if k == 27:
        break

cap.release()
cv2.destroyAllWindows()

In [7]:
#game_5: hand gestures of Rock, Scissor, Paper

import cv2
import mediapipe as mp

mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# 識別手勢
def recognize_gesture(landmarks):
    thumb_tip = landmarks[mp_hands.HandLandmark.THUMB_TIP].y # 大拇指
    index_tip = landmarks[mp_hands.HandLandmark.INDEX_FINGER_TIP].y # 食指
    middle_tip = landmarks[mp_hands.HandLandmark.MIDDLE_FINGER_TIP].y # 中指
    ring_tip = landmarks[mp_hands.HandLandmark.RING_FINGER_TIP].y # 無名指
    pinky_tip = landmarks[mp_hands.HandLandmark.PINKY_TIP].y # 小指

    # 影像的原點在左上角(0, 0)，y坐標值隨著向下移動而增大
    # 因此，"更高"的點其y坐標值實際上會更小
    
    # Thumbs-up:拇指比食指、中指、無名指、小指都高
    if thumb_tip < index_tip and thumb_tip < middle_tip and thumb_tip < ring_tip and thumb_tip < pinky_tip:
        return "Thumbs-up"
    # OK:無名指、中指、小指高於拇指、食指
    elif ring_tip < thumb_tip and ring_tip < index_tip and middle_tip < thumb_tip and middle_tip < index_tip and pinky_tip < thumb_tip and pinky_tip < index_tip:
        return "OK"
    # Victory:食指、中指高於拇指、無名指、小指
    elif index_tip < thumb_tip and middle_tip < thumb_tip and index_tip < ring_tip and index_tip < pinky_tip and middle_tip < ring_tip and middle_tip < pinky_tip:
        return "Victory"
    else:
        return "None"

cap = cv2.VideoCapture(0)
frameWidth = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frameHeight = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

with mp_hands.Hands(
    min_detection_confidence=0.7,
    min_tracking_confidence=0.5) as hands:

    while cap.isOpened():
        success, image = cap.read()
        if not success:
            print("Ignoring empty camera frame.")
            continue

        # 處理輸入影像
        image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
        image.flags.writeable = False
        results = hands.process(image)

        # 繪製landmarks&識別手勢
        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                # 使用mp_drawing.draw_landmarks來畫手landmark
                mp_drawing.draw_landmarks(
                    image, hand_landmarks, mp_hands.HAND_CONNECTIONS,
                    landmark_drawing_spec=mp_drawing.DrawingSpec(color=(255, 0, 0), thickness=2, circle_radius=4), # 藍色circle畫手的landmark
                    connection_drawing_spec=mp_drawing.DrawingSpec(color=(255, 255, 255), thickness=2))

                # 使用recognize_gesture函數來識別手勢
                gesture = recognize_gesture(hand_landmarks.landmark)
                cv2.putText(image, f'{gesture}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_AA)

        cv2.imshow('MediaPipe Hands', image)
        c = cv2.waitKey(1)
        if c == 27:
            break

cap.release()
cv2.destroyAllWindows()

6.Any comments regarding the final exam? Which steps you believe you have completed? Which steps bother you?  

我覺得我完成了:  
1A, 1B, 1C  
2A, 2B, 2C, 2D  
3A, 3B, 3C  
4A, 4B  
5A, 5B, 5C, 5D  

沒有完成的部分有: 3D、4C、5E  

3D. (5pts) Try to find out the fastest character, draw a red rectangle around the fastest character, and show the output images in the "find_the_fastest_character" window.  
因為我在處理3D，找出最快的角色的過程中RAM一直不夠，導致影片跑不動或是程式崩潰，這可能由於我撰寫的程式碼有瑕疵，但由於時間因素，我沒有處理這個bug，最終將他從我的程式碼中移除。  

4C. (5pts) Try to detect two odd character who face the opposite direction from everyone else, draw a red rectangle around each of the two character, and show the output images in the "find_two_odds" window.  
我寫了一段程式來找出不同optical flows，希望藉此找到那些跟其他人不同的角色，但寫完之後準確度不高，會偵測到錯誤的人物，因此我最終將他從我的程式碼中移除。  

5E. In additon to translation, can your method correctly handle rotated (5pts bonus) and scaled (5pts bonus) hand gestures?  
我使用的方法是取出五指指尖的y座標，並藉由五指指尖的高低關係來得出當前手勢為何。然而如果手勢旋轉，舉例來說Victory倒過來，那麼其高低關係就會倒過來，如此一來程式的辨識就會出錯，我想到的解決辦法有計算手指之間的距離、計算手掌的旋轉角度，但我寫完之後發現程式有許多bug，在許多情況下會出錯，因此我選擇了原本的撰寫方式。

7.Any suggestion to teaching assistants to improve this class? Any suggestion to teacher to improve this class?  

我覺得整體的上課進度是適中的，老師前半段上課採用講解觀念，後半段採用程式碼範例講解，最後再出一題程式題目讓我們練習，藉由每周都有練習到題目，讓我更了解電腦視覺相關的處理。隨著課程進度的深入，很多題目是需要上課時專心聽講，融會貫通後才有辦法寫。