In [4]:
import os
import re
import cv2
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from tqdm import tqdm
import dlib
import cv2
import numpy as np
from imutils import face_utils
from EAR import eye_aspect_ratio
from MAR import mouth_aspect_ratio


In [None]:

 
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor('./dlib_shape_predictor/shape_predictor_68_face_landmarks.dat')

EYE_AR_THRESH = 0.154
MOUTH_AR_THRESH = 0.685

(lStart, lEnd) = face_utils.FACIAL_LANDMARKS_IDXS["left_eye"]
(rStart, rEnd) = face_utils.FACIAL_LANDMARKS_IDXS["right_eye"]
(mStart, mEnd) = (49, 68)


In [6]:
import os
import numpy as np
import cv2
from tqdm import tqdm
from imutils import face_utils

# 只用 Drowsy 类数据
train_dir = "../data/splitted_Data/train/Drowsy"

ear_values = []
mar_values = []

for img_name in tqdm(os.listdir(train_dir), desc="Processing Drowsy"):
    if not img_name.lower().endswith(".png"):
        continue
    
    img_path = os.path.join(train_dir, img_name)
    frame = cv2.imread(img_path)
    if frame is None:
        continue
        
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    rects = detector(gray, 0)
    if len(rects) == 0:
        continue
    
    rect = rects[0]
    shape = predictor(gray, rect)
    shape = face_utils.shape_to_np(shape)

    leftEye = shape[lStart:lEnd]
    rightEye = shape[rStart:rEnd]
    ear = (eye_aspect_ratio(leftEye) + eye_aspect_ratio(rightEye)) / 2.0

    mouth = shape[mStart:mEnd]
    mar = mouth_aspect_ratio(mouth)

    ear_values.append(ear)
    mar_values.append(mar)

ear_values = np.array(ear_values)
mar_values = np.array(mar_values)


Processing Drowsy: 100%|██████████| 17878/17878 [01:20<00:00, 220.75it/s]


In [8]:
 
EYE_AR_THRESH = np.percentile(ear_values, 10)  

 
MOUTH_AR_THRESH = np.percentile(mar_values, 90)  

print("Learned EAR threshold:", EYE_AR_THRESH)
print("Learned MAR threshold:", MOUTH_AR_THRESH)

Learned EAR threshold: 0.15488429431352235
Learned MAR threshold: 0.685159674296453


In [26]:
test_root = "../data/splitted_Data/test"
classes = ["Non Drowsy", "Drowsy"]  
SEQ_LEN = 5                
MIN_DROWSY_IN_SEQ = 3      


In [27]:

def group_frames_by_video(folder):

    video_dict = {}
    for img_name in os.listdir(folder):
        if not img_name.lower().endswith((".png", ".jpg", ".jpeg")):
            continue
        m = re.match(r"[A-Za-z]+", img_name)
        if not m:
            continue
        prefix = m.group()  
        video_dict.setdefault(prefix, []).append(os.path.join(folder, img_name))

     
    for vid in video_dict:
        video_dict[vid].sort()
    return video_dict

In [28]:

def predict_drowsiness(frame):
    """
    
    "Drowsy" 或 "Non Drowsy"
    """

    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    rects = detector(gray, 0)

    if len(rects) == 0:
        return "Non Drowsy"   

    rect = rects[0]
    shape = predictor(gray, rect)
    shape = face_utils.shape_to_np(shape)

    # EAR
    leftEye = shape[lStart:lEnd]
    rightEye = shape[rStart:rEnd]
    ear = (eye_aspect_ratio(leftEye) + eye_aspect_ratio(rightEye)) / 2.0

    # MAR
    mouth = shape[mStart:mEnd]
    mar = mouth_aspect_ratio(mouth)


    # Decision rules
    if ear < EYE_AR_THRESH:
        return "Drowsy"
    if mar > MOUTH_AR_THRESH:
        return "Drowsy"
    # if head_tilt_degree[0] > HEADPOSE_THRESH:
        # return "Drowsy"

    return "Non Drowsy"

In [None]:
def evaluate_sequences_for_class(class_name):
    folder = os.path.join(test_root, class_name)
    video_dict = group_frames_by_video(folder)

    y_true_video = []
    y_pred_video = []

    y_true_frame = []
    y_pred_frame = []

    for vid, frame_paths in tqdm(video_dict.items(), desc=f"Eval {class_name} (video-level)"):
        
        frame_preds = []
        for fp in frame_paths:
            frame = cv2.imread(fp)
            pred_label = predict_drowsiness(frame)  # Only one prediction per frame
            
            frame_preds.append(pred_label)

            y_true_frame.append(class_name)
            y_pred_frame.append(pred_label)

        video_is_drowsy = False
        if len(frame_preds) < SEQ_LEN:
            drowsy_count = sum(p == "Drowsy" for p in frame_preds)
            if drowsy_count >= len(frame_preds) / 2:
                video_is_drowsy = True
        else:
            for i in range(len(frame_preds) - SEQ_LEN + 1):
                window = frame_preds[i:i+SEQ_LEN]
                if sum(p == "Drowsy" for p in window) >= MIN_DROWSY_IN_SEQ:
                    video_is_drowsy = True
                    break

        pred_label = "Drowsy" if video_is_drowsy else "Non Drowsy"

        y_true_video.append(class_name)
        y_pred_video.append(pred_label)

    return y_true_video, y_pred_video, y_true_frame, y_pred_frame


In [31]:
y_true_video_all = []
y_pred_video_all = []
y_true_frame_all = []
y_pred_frame_all = []

for cls in classes:
    yt_v, yp_v, yt_f, yp_f = evaluate_sequences_for_class(cls)
    y_true_video_all.extend(yt_v)
    y_pred_video_all.extend(yp_v)
    y_true_frame_all.extend(yt_f)
    y_pred_frame_all.extend(yp_f)

label_map = {"Non Drowsy": 0, "Drowsy": 1}


Eval Non Drowsy (video-level): 100%|██████████| 26/26 [00:04<00:00,  5.52it/s]
Eval Drowsy (video-level): 100%|██████████| 28/28 [00:05<00:00,  5.32it/s]


In [34]:
y_true_v_bin = [label_map[x] for x in y_true_video_all]
y_pred_v_bin = [label_map[x] for x in y_pred_video_all]

print("\n=== Video-level Evaluation ===")
print("Accuracy:", accuracy_score(y_true_v_bin, y_pred_v_bin))
print("F1 Score:", f1_score(y_true_v_bin, y_pred_v_bin, average="binary"))
print(classification_report(y_true_v_bin, y_pred_v_bin))
print(confusion_matrix(y_true_v_bin, y_pred_v_bin))



=== Video-level Evaluation ===
Accuracy: 0.6111111111111112
F1 Score: 0.5116279069767442
              precision    recall  f1-score   support

           0       0.56      0.85      0.68        26
           1       0.73      0.39      0.51        28

    accuracy                           0.61        54
   macro avg       0.65      0.62      0.59        54
weighted avg       0.65      0.61      0.59        54

[[22  4]
 [17 11]]


In [35]:
y_true_f_bin = [label_map[x] for x in y_true_frame_all]
y_pred_f_bin = [label_map[x] for x in y_pred_frame_all]

print("\n=== Frame-level Evaluation ===")
print("Accuracy:", accuracy_score(y_true_f_bin, y_pred_f_bin))
print("F1 Score:", f1_score(y_true_f_bin, y_pred_f_bin, average="binary"))
print(classification_report(y_true_f_bin, y_pred_f_bin))
print(confusion_matrix(y_true_f_bin, y_pred_f_bin))



=== Frame-level Evaluation ===
Accuracy: 0.48493543758967
F1 Score: 0.2463261021693492
              precision    recall  f1-score   support

           0       0.47      0.86      0.61       973
           1       0.57      0.16      0.25      1118

    accuracy                           0.48      2091
   macro avg       0.52      0.51      0.43      2091
weighted avg       0.52      0.48      0.41      2091

[[838 135]
 [942 176]]


In [21]:
for (a,b) in zip(y_true_all, y_pred_all):
    print(f"True: {a}, Pred: {b}")

True: Non Drowsy, Pred: Non Drowsy
True: Non Drowsy, Pred: Non Drowsy
True: Non Drowsy, Pred: Drowsy
True: Non Drowsy, Pred: Drowsy
True: Non Drowsy, Pred: Drowsy
True: Non Drowsy, Pred: Non Drowsy
True: Non Drowsy, Pred: Drowsy
True: Non Drowsy, Pred: Non Drowsy
True: Non Drowsy, Pred: Non Drowsy
True: Non Drowsy, Pred: Non Drowsy
True: Non Drowsy, Pred: Drowsy
True: Non Drowsy, Pred: Drowsy
True: Non Drowsy, Pred: Drowsy
True: Non Drowsy, Pred: Non Drowsy
True: Non Drowsy, Pred: Non Drowsy
True: Non Drowsy, Pred: Non Drowsy
True: Non Drowsy, Pred: Drowsy
True: Non Drowsy, Pred: Non Drowsy
True: Non Drowsy, Pred: Non Drowsy
True: Non Drowsy, Pred: Non Drowsy
True: Non Drowsy, Pred: Drowsy
True: Non Drowsy, Pred: Non Drowsy
True: Non Drowsy, Pred: Drowsy
True: Non Drowsy, Pred: Drowsy
True: Non Drowsy, Pred: Non Drowsy
True: Non Drowsy, Pred: Non Drowsy
True: Drowsy, Pred: Drowsy
True: Drowsy, Pred: Non Drowsy
True: Drowsy, Pred: Drowsy
True: Drowsy, Pred: Drowsy
True: Drowsy, Pred: Dr