In [None]:
#%pip install mediapipe

In [None]:
import mediapipe as mp
from util import draw_landmarks_on_image, draw_letter_on_image
from image_landmarker import ImageLandmarker
from livestream_landmarker import LivestreamLandmarker
import json
import cv2 as cv
import numpy as np

In [None]:
# Base name of the ASL letters dataset: the archive is read from f"{asl_dir}.zip"
# and extracted into a directory with this same name.
asl_dir = "american-sign-language-letters.v1i.coco"

In [None]:
# Unpack the dataset archive that sits next to this notebook into `asl_dir`.
import zipfile

with zipfile.ZipFile(asl_dir + ".zip", mode="r") as archive:
    archive.extractall(path=asl_dir)

In [None]:
# Load the COCO annotation file for the training split.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead
# of relying on the platform default (which is not UTF-8 on every OS).
with open(f'{asl_dir}/train/_annotations.coco.json', encoding="utf-8") as file:
    annotation_dict = json.load(file)

# Index image records by id so every annotation can be joined to its image
# with a single dictionary lookup.
image_lookup = {item["id"]: item for item in annotation_dict["images"]}

# One [image_id, file_name, category_id] entry per annotation; annotations that
# point at an image id missing from the "images" section are skipped.
labeled_images = [
    [
        annotation["image_id"],
        image_lookup[annotation["image_id"]]["file_name"],
        annotation["category_id"],
    ]
    for annotation in annotation_dict["annotations"]
    if annotation["image_id"] in image_lookup
]


In [None]:
# Map each COCO category id to its name (used to turn predictions into labels).
letters = {entry['id']: entry['name'] for entry in annotation_dict['categories']}

In [None]:
def process_detection_result(detection_result):
    """Flatten a detection result into a list of ``[x, y, z]`` triples.

    Args:
        detection_result: Object exposing ``hand_landmarks``, an iterable of
            hands where each hand is an iterable of landmarks with ``x``,
            ``y`` and ``z`` attributes.

    Returns:
        A list with one ``[x, y, z]`` entry per landmark. Landmarks of all
        hands are concatenated in iteration order; the list is empty when
        ``hand_landmarks`` is empty.
    """
    return [
        [point.x, point.y, point.z]
        for hand in detection_result.hand_landmarks
        for point in hand
    ]

In [None]:
def detect_and_add_image(X, Y, detector, image, category_id):
    """Run hand detection on one image and record a sample if a hand is found.

    Args:
        X: List of feature samples; appended to in place.
        Y: List of labels parallel to ``X``; appended to in place.
        detector: Object whose ``detect(mp_image)`` returns a result exposing
            ``hand_landmarks``.
        image: Pixel array wrapped as an ``mp.Image`` in SRGB format.
        category_id: Label stored in ``Y`` when landmarks are detected.

    Returns:
        The same ``X`` and ``Y`` lists, unchanged when nothing was detected.
    """
    detection_result = detector.detect(
        mp.Image(image_format=mp.ImageFormat.SRGB, data=image)
    )

    # Nothing detected: leave the dataset untouched.
    if detection_result.hand_landmarks == []:
        return X, Y

    X.append(process_detection_result(detection_result))
    Y.append(category_id)
    return X, Y

In [None]:
# Build the landmark dataset: every labeled image is run through the detector
# twice (as-is and mirrored) and each successful detection becomes one sample.
X = []
Y = []

# Create the detector before entering `try`, so that a constructor failure is
# reported as-is instead of being masked by a NameError in `finally`.
detector = ImageLandmarker()
try:
    for _, file_name, category_id in labeled_images:
        filename = f"{asl_dir}/train/" + file_name

        # cv.imread returns None (no exception) when the file is missing or
        # cannot be decoded; skip such files with a visible warning rather
        # than failing later inside cvtColor with an opaque error.
        bgr_image = cv.imread(filename)
        if bgr_image is None:
            print(f"Skipping unreadable image: {filename}")
            continue

        rgb_image = cv.cvtColor(bgr_image, cv.COLOR_BGR2RGB)
        flipped_image = cv.flip(rgb_image, 1)  # flip to make it detect both hands equally

        for candidate in (rgb_image, flipped_image):
            X, Y = detect_and_add_image(X, Y, detector, candidate, category_id)
finally:
    detector.close()

X = np.array(X)
Y = np.array(Y)

# Collapse each sample's landmark triples into a single flat feature vector.
# NOTE(review): this assumes every sample has the same number of landmarks
# (i.e. the detector reports one hand per image) — confirm the detector config.
X_flat = X.reshape(X.shape[0], -1)
        

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


# Hold out 20% of the samples for evaluation; the split is seeded so it is
# identical on every run.
# NOTE(review): X_flat contains both the original and the mirrored detection of
# each image, and this split is per sample, so the two variants of one image
# can land on different sides of the split — consider a grouped split.
X_train, X_test, y_train, y_test = train_test_split(
    X_flat, Y, test_size=0.2, random_state=42
)

# Seed the forest as well: without random_state the bootstrap samples and
# feature subsets differ between runs, so the reported accuracy would not be
# reproducible under Restart & Run All.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report


# Predict the held-out samples once and report per-class precision/recall/F1;
# overall accuracy alone hides which letters are being confused.
# zero_division=0 reports 0 (instead of warning) for classes never predicted.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

accuracy = clf.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
def get_ASL_prediction(detection_result):
    """Predict the ASL letter for a hand-landmark detection result.

    Args:
        detection_result: Detector output. It may be ``None`` or lack a
            ``hand_landmarks`` attribute when nothing has been detected yet.

    Returns:
        ``letters.get(<predicted category id>)`` when landmarks are present,
        otherwise ``letters.get(0)`` (the same fallback used when there is no
        detection result at all). Either may be ``None`` if the id is not a
        key of ``letters``.
    """
    # Check for the attribute explicitly instead of wrapping the whole
    # prediction in `except AttributeError`: that handler also swallowed
    # genuine AttributeErrors raised during feature extraction or prediction.
    hand_landmarks = getattr(detection_result, "hand_landmarks", None)
    if not hand_landmarks:
        return letters.get(0)

    # NOTE(review): if more than one hand is reported, the flattened vector is
    # longer than a single-hand sample — confirm the detector's hand limit
    # matches what the classifier was trained on.
    processed_result = process_detection_result(detection_result)
    features = np.array(processed_result).reshape(1, -1)
    letter_id = clf.predict(features)[0]
    return letters.get(letter_id)
    

In [None]:
# Live demo: read webcam frames, detect hand landmarks, classify the letter and
# show the annotated frame until 'q' is pressed or the stream ends.

# Acquire the camera before `try` and start with detector=None, so that the
# cleanup in `finally` never touches an unbound name if setup fails part-way.
cap = cv.VideoCapture(0)
detector = None
try:
    if not cap.isOpened():
        # Do not call exit() here: inside a notebook it shuts the kernel down.
        print("Cannot open camera")
    else:
        detector = LivestreamLandmarker()
        while True:
            ret, frame = cap.read()
            # if frame is read correctly ret is True
            if not ret:
                print("Can't receive frame (stream end?). Exiting ...")
                break

            # OpenCV delivers BGR frames, but the image is declared as SRGB and
            # the training data was converted BGR -> RGB before detection, so
            # do the same conversion here to keep inference consistent.
            rgb_frame = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
            mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)
            detector.detect_async(mp_image)

            # Draw on the original BGR frame, which is what cv.imshow expects.
            landmarked_image = draw_landmarks_on_image(frame, detector.result)
            ASL_letter = get_ASL_prediction(detector.result)
            annotated_image = draw_letter_on_image(landmarked_image, ASL_letter)
            cv.imshow('frame', annotated_image)

            # Wait up to 100 ms for a key press: caps the loop at ~10 FPS
            # (1000 ms / 100 ms). Mask to the low byte before comparing.
            if cv.waitKey(100) & 0xFF == ord('q'):
                break
finally:
    if detector is not None:
        detector.close()
    cap.release()
    cv.destroyAllWindows()