In [1]:
# correct code

import cv2
import numpy as np
import math
import os
import tkinter as tk
from tkinter import messagebox
from PIL import Image, ImageTk
import pyttsx3
from cvzone.HandTrackingModule import HandDetector
import mediapipe as mp
from tensorflow.keras.models import load_model

# ===== Constants =====
# Side length, in pixels, of the square image that predict_cnn feeds to the model.
IMG_SIZE = 100
# Margin, in pixels, added around the detected hand's bounding box before cropping.
OFFSET = 20
# Length of the flat feature vector predict_mlp pads/truncates to
# (presumably 33 pose + 21 left-hand + 21 right-hand landmarks — TODO confirm).
EXPECTED_LANDMARK_SIZE = 225  # For Words model (75 landmarks * 3 values)

# ===== Global Variables =====
# Which prediction path update_frame uses: "cnn", "mlp", or None before a load.
current_model = None
# Keras model currently used for prediction (set by load_model_cnn / load_model_mlp).
model = None
# Class names, indexed by the position of the model's highest output.
labels = []
# Capture device at index 0; frames are read from it in update_frame.
cap = cv2.VideoCapture(0)
# Hand detector used by the CNN path; limited to one hand.
detector = HandDetector(maxHands=1)
# Gestures stored by the "Record Gesture" button, in recording order.
recorded_gestures = []
# Text-to-speech engine used by the "Speak" button.
engine = pyttsx3.init()
engine.setProperty('rate', 150)  # Slower speech rate

# Initialize MediaPipe Holistic (for Words model)
mp_holistic = mp.solutions.holistic
holistic = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5)

# ===== GUI Setup =====
# Every widget below is packed right after it is created, so the order of
# these statements is also the order in which the widgets are laid out.
root = tk.Tk()
root.title("ASL Recognition System")
root.geometry("1000x800")

# Webcam display
canvas = tk.Canvas(root, width=640, height=480)
canvas.pack(pady=10)

# Recognized gesture text box
# (update_frame overwrites its content with each non-empty prediction)
text_box = tk.Entry(root, font=('Arial', 24), width=20)
text_box.pack(pady=10)

# Recorded gestures label
# (recorded_label is the static caption; recorded_text shows the recorded list)
recorded_label = tk.Label(root, text="Recorded Gestures:", font=('Arial', 14))
recorded_label.pack(pady=5)
recorded_text = tk.Label(root, text="", font=('Arial', 14), wraplength=300, height=4, relief="solid")
recorded_text.pack(pady=5)

# Button frame
# (parent of the Record / Speak / Clear buttons)
button_frame = tk.Frame(root)
button_frame.pack(pady=10)

# Mode switch frame
# (parent of the Alphabets / Numbers / Words buttons)
mode_frame = tk.Frame(root)
mode_frame.pack(pady=10)

# ===== Core Functions =====
def load_model_cnn(model_path, label_list):
    """Load CNN model for Alphabets/Numbers and make it the active model.

    Args:
        model_path: Path of the saved Keras model file.
        label_list: Class names, indexed by the model's output position.

    On success the module-level ``model``, ``labels`` and ``current_model``
    are replaced and an info dialog is shown.  If the file is missing or
    cannot be loaded, an error dialog is shown and the previously active
    model (if any) is left untouched.
    """
    global model, labels, current_model
    if not os.path.exists(model_path):
        messagebox.showerror("Error", "Model file not found!")
        return

    try:
        loaded = load_model(model_path)
    except Exception as exc:
        # GUI boundary: a corrupt or incompatible file must be reported to the
        # user instead of propagating out of a button callback / start-up call.
        messagebox.showerror("Error", f"Could not load model: {exc}")
        return

    # Switch the globals only after the load succeeded, so a failure never
    # leaves a half-updated state behind.
    model = loaded
    labels = label_list
    current_model = "cnn"

    # Guarded lookup: an empty label list must not raise after the switch.
    is_alpha = bool(label_list) and label_list[0].isalpha()
    mode_name = 'Alphabets' if is_alpha else 'Numbers'
    messagebox.showinfo("Model Loaded", f"Switched to {mode_name} mode")

def load_model_mlp(model_path, label_list):
    """Load MLP model for Words and make it the active model.

    Args:
        model_path: Path of the saved Keras model file.
        label_list: Class names, indexed by the model's output position.

    On success the module-level ``model``, ``labels`` and ``current_model``
    are replaced and an info dialog is shown.  If the file is missing or
    cannot be loaded, an error dialog is shown and the previously active
    model (if any) is left untouched.
    """
    global model, labels, current_model
    if not os.path.exists(model_path):
        messagebox.showerror("Error", "Model file not found!")
        return

    try:
        loaded = load_model(model_path)
    except Exception as exc:
        # GUI boundary: a corrupt or incompatible file must be reported to the
        # user instead of propagating out of a button callback / start-up call.
        messagebox.showerror("Error", f"Could not load model: {exc}")
        return

    # Switch the globals only after the load succeeded.
    model = loaded
    labels = label_list
    current_model = "mlp"
    messagebox.showinfo("Model Loaded", "Switched to Words mode")

def predict_cnn(img):
    """Classify the hand visible in ``img`` with the CNN (Alphabets/Numbers).

    The hand's bounding box is cropped with an ``OFFSET`` margin, scaled to
    fit an ``IMG_SIZE`` x ``IMG_SIZE`` white square while keeping its aspect
    ratio, divided by 255 and passed to the active model.

    Args:
        img: Frame to search for a hand.

    Returns:
        A ``(prediction, bbox)`` tuple.  ``prediction`` is the predicted
        label, or ``""`` when no hand was found or the prediction step
        raised.  ``bbox`` is ``(x, y, w, h)`` of the hand, or ``None`` when
        no hand was found.
    """
    hands, img = detector.findHands(img)
    if not hands:
        return "", None

    x, y, w, h = hands[0]['bbox']
    bbox = (x, y, w, h)
    prediction = ""

    square = np.ones((IMG_SIZE, IMG_SIZE, 3), np.uint8) * 255
    # Crop window: bounding box plus margin, clipped to the frame.
    top = max(0, y - OFFSET)
    bottom = min(y + h + OFFSET, img.shape[0])
    left = max(0, x - OFFSET)
    right = min(x + w + OFFSET, img.shape[1])
    img_crop = img[top:bottom, left:right]

    try:
        if h / w > 1:
            # Taller than wide: fill the full height, centre horizontally.
            scaled_w = math.ceil(IMG_SIZE / h * w)
            pad = math.ceil((IMG_SIZE - scaled_w) / 2)
            square[:, pad:pad + scaled_w] = cv2.resize(img_crop, (scaled_w, IMG_SIZE))
        else:
            # Wider than tall (or square): fill the full width, centre vertically.
            scaled_h = math.ceil(IMG_SIZE / w * h)
            pad = math.ceil((IMG_SIZE - scaled_h) / 2)
            square[pad:pad + scaled_h, :] = cv2.resize(img_crop, (IMG_SIZE, scaled_h))

        batch = np.expand_dims(square / 255.0, axis=0)
        scores = model.predict(batch)
        prediction = labels[np.argmax(scores)]
    except Exception as e:
        # Best effort: a failed frame is reported and yields an empty prediction.
        print("Prediction error:", e)

    return prediction, bbox


def predict_mlp(img):
    """Classify the sign visible in ``img`` with the MLP (Words).

    Runs MediaPipe Holistic on the frame, flattens the x/y/z values of the
    detected pose, left-hand and right-hand landmarks (in that order) into
    one vector, pads it with zeros or truncates it to
    ``EXPECTED_LANDMARK_SIZE`` and passes it to the active model.

    Args:
        img: Frame to analyse; it is converted with ``COLOR_BGR2RGB`` first.

    Returns:
        The predicted label, or ``""`` when no landmarks were detected.
    """
    rgb_frame = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    results = holistic.process(rgb_frame)

    # NOTE(review): parts that were not detected are skipped rather than
    # zero-filled in place, so e.g. a lone right hand occupies the left-hand
    # positions of the vector.  Confirm this matches how the training
    # features were built before changing it.
    detected_parts = (
        results.pose_landmarks,
        results.left_hand_landmarks,
        results.right_hand_landmarks,
    )
    coords = []
    for part in detected_parts:
        if not part:
            continue
        for point in part.landmark:
            coords.extend([point.x, point.y, point.z])

    if not coords:
        return ""

    features = np.array(coords)
    shortfall = EXPECTED_LANDMARK_SIZE - len(features)
    if shortfall > 0:
        features = np.pad(features, (0, shortfall))
    elif shortfall < 0:
        features = features[:EXPECTED_LANDMARK_SIZE]

    scores = model.predict(features.reshape(1, -1))
    return labels[np.argmax(scores)]

def update_frame():
    """Update webcam feed and predictions.

    Reads one frame from ``cap``, runs the active model on it, writes any
    non-empty prediction into ``text_box``, draws the annotated frame on
    ``canvas`` and schedules the next call with ``root.after``.
    """
    success, img = cap.read()
    if not success:
        # A single failed read must not stop the loop for good: retry a bit
        # later instead of returning without re-scheduling.
        root.after(100, update_frame)
        return

    img_output = img.copy()
    prediction = ""
    mode_text = ""

    if current_model == "cnn":
        prediction, bbox = predict_cnn(img)
        # Guarded so an empty label list cannot raise inside the frame loop.
        is_alpha = bool(labels) and labels[0].isalpha()
        mode_text = f"Mode: {'Alphabets' if is_alpha else 'Numbers'}"

        # Draw bounding box and prediction if hand detected
        if bbox:
            x, y, w, h = bbox
            cv2.rectangle(img_output, (x, y), (x + w, y + h), (0, 255, 0), 2)
            cv2.putText(img_output, prediction, (x, y - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    elif current_model == "mlp":
        prediction = predict_mlp(img)
        mode_text = "Mode: Words"

    # Display mode and update text box
    cv2.putText(img_output, mode_text, (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    if prediction:
        text_box.delete(0, tk.END)
        text_box.insert(0, prediction)

    # Display webcam feed
    img_output = cv2.cvtColor(img_output, cv2.COLOR_BGR2RGB)
    img_tk = ImageTk.PhotoImage(image=Image.fromarray(img_output))

    # Reuse one canvas item for every frame; creating a new item per frame
    # would pile up items on the canvas for as long as the app runs.
    image_item = getattr(canvas, "image_item", None)
    if image_item is None:
        canvas.image_item = canvas.create_image(0, 0, anchor=tk.NW, image=img_tk)
    else:
        canvas.itemconfig(image_item, image=img_tk)
    # Keep a reference on the widget so the image is not garbage-collected.
    canvas.img_tk = img_tk

    root.after(10, update_frame)

# ===== GUI Components =====
def _record_gesture():
    """Append the gesture shown in the text box to the recording.

    Blank entries are skipped so they do not add empty items to the list.
    """
    gesture = text_box.get().strip()
    if gesture:
        recorded_gestures.append(gesture)
        update_recorded_text()


def _speak_recorded():
    """Queue the recorded gestures as one sentence and run the speech engine."""
    engine.say(" ".join(recorded_gestures))
    # NOTE(review): runAndWait is called from the Tk callback, so the window
    # presumably does not refresh until speech has finished — confirm.
    engine.runAndWait()


def _clear_recorded():
    """Empty the recording and refresh its label."""
    recorded_gestures.clear()
    update_recorded_text()


# Control buttons
tk.Button(button_frame, text="Record Gesture", font=('Arial', 14),
          command=_record_gesture,
          bg="lightblue").pack(side=tk.LEFT, padx=5)
tk.Button(button_frame, text="Speak", font=('Arial', 14),
          command=_speak_recorded,
          bg="lightgreen").pack(side=tk.LEFT, padx=5)
tk.Button(button_frame, text="Clear", font=('Arial', 14),
          command=_clear_recorded,
          bg="salmon").pack(side=tk.LEFT, padx=5)

# Model switch buttons
tk.Button(mode_frame, text="Alphabets", font=('Arial', 14),
          command=lambda: load_model_cnn('asl_alphabet_model.h5', list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")),
          bg="lightyellow").pack(side=tk.LEFT, padx=5)
tk.Button(mode_frame, text="Numbers", font=('Arial', 14),
          command=lambda: load_model_cnn('asl_numeric_model.h5', list("0123456789")),
          bg="lightyellow").pack(side=tk.LEFT, padx=5)
tk.Button(mode_frame, text="Words", font=('Arial', 14),
          command=lambda: load_model_mlp('best_asl_model2.keras', [
              'Bathroom', 'Call', 'Done', 'Drink', 'Eat', 'Father', 'Friend',
              'Good', 'Hello', 'Help', 'I', 'I love you', 'Later', 'More',
              'Morning', 'Mother', 'Need', 'Night', 'No', 'Ok', 'Pain', 'Peace',
              'Please', 'Pray', 'Repeat', 'See', 'Silence', 'Sorry', 'Stop',
              'Thank you', 'Want', 'Water', 'What', 'When', 'Where', 'Who',
              'Why', 'Yes', 'You']),
          bg="lightyellow").pack(side=tk.LEFT, padx=5)

def update_recorded_text():
    """Show the recorded gestures, joined by single spaces, in the label."""
    joined = " ".join(recorded_gestures)
    recorded_text.configure(text=joined)

# Initialize with Alphabets model by default
# (shows a dialog; if the model file is missing, current_model stays None and
# the feed is displayed without predictions)
load_model_cnn('asl_alphabet_model.h5', list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"))

# Start the frame update loop
# (each call that displays a frame re-schedules itself via root.after)
update_frame()

# Close handler
def on_closing():
    """Release the camera and the MediaPipe Holistic instance, then close the window."""
    try:
        cap.release()
        # The Holistic solution holds native resources until it is closed.
        holistic.close()
    finally:
        # Always tear the window down, even if a release step raised.
        root.destroy()

# Route the window's close button through on_closing, then enter the Tk
# event loop.
root.protocol("WM_DELETE_WINDOW", on_closing)
root.mainloop()

In [None]:
import cv2
import numpy as np
import math
import os
import tkinter as tk
from tkinter import messagebox
from PIL import Image, ImageTk
import pyttsx3
from cvzone.HandTrackingModule import HandDetector
import mediapipe as mp
from tensorflow.keras.models import load_model

# ===== Constants =====
# Side length, in pixels, of the square image that predict_cnn feeds to the model.
IMG_SIZE = 100
# Margin, in pixels, added around the detected hand's bounding box before cropping.
OFFSET = 20
# Length of the flat feature vector predict_mlp pads/truncates to
# (presumably 33 pose + 21 left-hand + 21 right-hand landmarks — TODO confirm).
EXPECTED_LANDMARK_SIZE = 225  # For Words model (75 landmarks * 3 values)

# ===== Global Variables =====
# Which prediction path update_frame uses: "cnn", "mlp", or None before a load.
current_model = None
# Keras model currently used for prediction (set by load_model_cnn / load_model_mlp).
model = None
# Class names, indexed by the position of the model's highest output.
labels = []
# Capture device at index 0; frames are read from it in update_frame.
cap = cv2.VideoCapture(0)
# Hand detector used by the CNN path; limited to one hand.
detector = HandDetector(maxHands=1)
# Gestures stored by the "Record Gesture" button, in recording order.
recorded_gestures = []
# Text-to-speech engine used by the "Speak" button.
engine = pyttsx3.init()
engine.setProperty('rate', 150)  # Slower speech rate

# Initialize MediaPipe Holistic (for Words model)
mp_holistic = mp.solutions.holistic
holistic = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5)

# ===== GUI Setup =====
# Every widget below is packed right after it is created, so the order of
# these statements is also the order in which the widgets are laid out.
root = tk.Tk()
root.title("ASL Recognition System")
root.geometry("1000x800")

# Webcam display
canvas = tk.Canvas(root, width=640, height=480)
canvas.pack(pady=10)

# Recognized gesture text box
# (update_frame overwrites its content with each non-empty prediction)
text_box = tk.Entry(root, font=('Arial', 24), width=20)
text_box.pack(pady=10)

# Recorded gestures label
# (recorded_label is the static caption; recorded_text shows the recorded list)
recorded_label = tk.Label(root, text="Recorded Gestures:", font=('Arial', 14))
recorded_label.pack(pady=5)
recorded_text = tk.Label(root, text="", font=('Arial', 14), wraplength=300, height=4, relief="solid")
recorded_text.pack(pady=5)

# Button frame
# (parent of the Record / Speak / Clear buttons)
button_frame = tk.Frame(root)
button_frame.pack(pady=10)

# Mode switch frame
# (parent of the Alphabets / Numbers / Words buttons)
mode_frame = tk.Frame(root)
mode_frame.pack(pady=10)

# ===== Core Functions =====
def load_model_cnn(model_path, label_list):
    """Load CNN model for Alphabets/Numbers and make it the active model.

    Args:
        model_path: Path of the saved Keras model file.
        label_list: Class names, indexed by the model's output position.

    On success the module-level ``model``, ``labels`` and ``current_model``
    are replaced and an info dialog is shown.  If the file is missing or
    cannot be loaded, an error dialog is shown and the previously active
    model (if any) is left untouched.
    """
    global model, labels, current_model
    if not os.path.exists(model_path):
        messagebox.showerror("Error", "Model file not found!")
        return

    try:
        loaded = load_model(model_path)
    except Exception as exc:
        # GUI boundary: a corrupt or incompatible file must be reported to the
        # user instead of propagating out of a button callback / start-up call.
        messagebox.showerror("Error", f"Could not load model: {exc}")
        return

    # Switch the globals only after the load succeeded, so a failure never
    # leaves a half-updated state behind.
    model = loaded
    labels = label_list
    current_model = "cnn"

    # Guarded lookup: an empty label list must not raise after the switch.
    is_alpha = bool(label_list) and label_list[0].isalpha()
    mode_name = 'Alphabets' if is_alpha else 'Numbers'
    messagebox.showinfo("Model Loaded", f"Switched to {mode_name} mode")

def load_model_mlp(model_path, label_list):
    """Load MLP model for Words and make it the active model.

    Args:
        model_path: Path of the saved Keras model file.
        label_list: Class names, indexed by the model's output position.

    On success the module-level ``model``, ``labels`` and ``current_model``
    are replaced and an info dialog is shown.  If the file is missing or
    cannot be loaded, an error dialog is shown and the previously active
    model (if any) is left untouched.
    """
    global model, labels, current_model
    if not os.path.exists(model_path):
        messagebox.showerror("Error", "Model file not found!")
        return

    try:
        loaded = load_model(model_path)
    except Exception as exc:
        # GUI boundary: a corrupt or incompatible file must be reported to the
        # user instead of propagating out of a button callback / start-up call.
        messagebox.showerror("Error", f"Could not load model: {exc}")
        return

    # Switch the globals only after the load succeeded.
    model = loaded
    labels = label_list
    current_model = "mlp"
    messagebox.showinfo("Model Loaded", "Switched to Words mode")

def predict_cnn(img):
    """Predict using CNN (Alphabets/Numbers) with bounding box.

    The hand's bounding box is cropped with an ``OFFSET`` margin, scaled to
    fit an ``IMG_SIZE`` x ``IMG_SIZE`` white square while keeping its aspect
    ratio, divided by 255 and passed to the active model.

    Args:
        img: Frame to search for a hand.

    Returns:
        A ``(prediction, bbox)`` tuple.  ``prediction`` is the predicted
        label, or ``""`` when no hand was found, the box is degenerate, or
        the prediction step raised.  ``bbox`` is ``(x, y, w, h)`` of the
        hand, or ``None`` when no hand was found.
    """
    hands, img = detector.findHands(img, draw=False)  # We'll draw the box ourselves
    if not hands:
        return "", None

    x, y, w, h = hands[0]['bbox']
    bbox = (x, y, w, h)
    prediction = ""

    if w <= 0 or h <= 0:
        # Degenerate box: there is nothing to classify, and h / w below would
        # raise ZeroDivisionError out of the frame loop.
        return prediction, bbox

    # Crop window: bounding box plus margin, clipped to the frame.
    img_crop = img[max(0, y - OFFSET):min(y + h + OFFSET, img.shape[0]),
                   max(0, x - OFFSET):min(x + w + OFFSET, img.shape[1])]
    img_white = np.ones((IMG_SIZE, IMG_SIZE, 3), np.uint8) * 255

    try:
        if h / w > 1:
            # Taller than wide: fill the full height, centre horizontally.
            k = IMG_SIZE / h
            w_cal = math.ceil(k * w)
            img_resize = cv2.resize(img_crop, (w_cal, IMG_SIZE))
            w_gap = math.ceil((IMG_SIZE - w_cal) / 2)
            img_white[:, w_gap:w_gap + w_cal] = img_resize
        else:
            # Wider than tall (or square): fill the full width, centre vertically.
            k = IMG_SIZE / w
            h_cal = math.ceil(k * h)
            img_resize = cv2.resize(img_crop, (IMG_SIZE, h_cal))
            h_gap = math.ceil((IMG_SIZE - h_cal) / 2)
            img_white[h_gap:h_gap + h_cal, :] = img_resize

        img_white = np.expand_dims(img_white / 255.0, axis=0)
        predictions = model.predict(img_white)
        predicted_class = np.argmax(predictions)
        prediction = labels[predicted_class]
    except Exception as e:
        # Best effort: a failed frame is reported and yields an empty prediction.
        print("Prediction error:", e)

    return prediction, bbox

def predict_mlp(img):
    """Classify the sign visible in ``img`` with the MLP (Words).

    Runs MediaPipe Holistic on the frame, flattens the x/y/z values of the
    detected pose, left-hand and right-hand landmarks (in that order) into
    one vector, pads it with zeros or truncates it to
    ``EXPECTED_LANDMARK_SIZE`` and passes it to the active model.

    Args:
        img: Frame to analyse; it is converted with ``COLOR_BGR2RGB`` first.

    Returns:
        The predicted label, or ``""`` when no landmarks were detected.
    """
    results = holistic.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    # NOTE(review): parts that were not detected are skipped rather than
    # zero-filled in place, so e.g. a lone right hand occupies the left-hand
    # positions of the vector.  Confirm this matches how the training
    # features were built before changing it.
    flat_values = []
    for group in (results.pose_landmarks,
                  results.left_hand_landmarks,
                  results.right_hand_landmarks):
        if group:
            for lm in group.landmark:
                flat_values.extend([lm.x, lm.y, lm.z])

    if not flat_values:
        return ""

    vector = np.array(flat_values)
    count = len(vector)
    if count < EXPECTED_LANDMARK_SIZE:
        vector = np.pad(vector, (0, EXPECTED_LANDMARK_SIZE - count))
    elif count > EXPECTED_LANDMARK_SIZE:
        vector = vector[:EXPECTED_LANDMARK_SIZE]

    best_index = np.argmax(model.predict(vector.reshape(1, -1)))
    return labels[best_index]

def update_frame():
    """Update webcam feed and predictions.

    Reads one frame from ``cap``, runs the active model on it, writes any
    non-empty prediction into ``text_box``, draws the annotated frame on
    ``canvas`` and schedules the next call with ``root.after``.
    """
    success, img = cap.read()
    if not success:
        # A single failed read must not stop the loop for good: retry a bit
        # later instead of returning without re-scheduling.
        root.after(100, update_frame)
        return

    img_output = img.copy()
    prediction = ""
    mode_text = ""

    if current_model == "cnn":
        prediction, bbox = predict_cnn(img)
        # Guarded so an empty label list cannot raise inside the frame loop.
        is_alpha = bool(labels) and labels[0].isalpha()
        mode_text = f"Mode: {'Alphabets' if is_alpha else 'Numbers'}"

        # Draw bounding box and prediction if hand detected
        if bbox:
            x, y, w, h = bbox
            cv2.rectangle(img_output, (x, y), (x + w, y + h), (0, 255, 0), 2)
            cv2.putText(img_output, prediction, (x, y - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    elif current_model == "mlp":
        prediction = predict_mlp(img)
        mode_text = "Mode: Words"

    # Display mode and update text box
    cv2.putText(img_output, mode_text, (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    if prediction:
        text_box.delete(0, tk.END)
        text_box.insert(0, prediction)

    # Display webcam feed
    img_output = cv2.cvtColor(img_output, cv2.COLOR_BGR2RGB)
    img_tk = ImageTk.PhotoImage(image=Image.fromarray(img_output))

    # Reuse one canvas item for every frame; creating a new item per frame
    # would pile up items on the canvas for as long as the app runs.
    image_item = getattr(canvas, "image_item", None)
    if image_item is None:
        canvas.image_item = canvas.create_image(0, 0, anchor=tk.NW, image=img_tk)
    else:
        canvas.itemconfig(image_item, image=img_tk)
    # Keep a reference on the widget so the image is not garbage-collected.
    canvas.img_tk = img_tk

    root.after(10, update_frame)

# ===== GUI Components =====
def _record_gesture():
    """Append the gesture shown in the text box to the recording.

    Blank entries are skipped so they do not add empty items to the list.
    """
    gesture = text_box.get().strip()
    if gesture:
        recorded_gestures.append(gesture)
        update_recorded_text()


def _speak_recorded():
    """Queue the recorded gestures as one sentence and run the speech engine."""
    engine.say(" ".join(recorded_gestures))
    # NOTE(review): runAndWait is called from the Tk callback, so the window
    # presumably does not refresh until speech has finished — confirm.
    engine.runAndWait()


def _clear_recorded():
    """Empty the recording and refresh its label."""
    recorded_gestures.clear()
    update_recorded_text()


# Control buttons
tk.Button(button_frame, text="Record Gesture", font=('Arial', 14),
          command=_record_gesture,
          bg="lightblue").pack(side=tk.LEFT, padx=5)
tk.Button(button_frame, text="Speak", font=('Arial', 14),
          command=_speak_recorded,
          bg="lightgreen").pack(side=tk.LEFT, padx=5)
tk.Button(button_frame, text="Clear", font=('Arial', 14),
          command=_clear_recorded,
          bg="salmon").pack(side=tk.LEFT, padx=5)

# Model switch buttons
tk.Button(mode_frame, text="Alphabets", font=('Arial', 14),
          command=lambda: load_model_cnn('alphabet_model.h5', list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")),
          bg="lightyellow").pack(side=tk.LEFT, padx=5)
tk.Button(mode_frame, text="Numbers", font=('Arial', 14),
          command=lambda: load_model_cnn('number_model.h5', [str(i) for i in range(10)]),
          bg="lightyellow").pack(side=tk.LEFT, padx=5)
tk.Button(mode_frame, text="Words", font=('Arial', 14),
          command=lambda: load_model_mlp('bothHands.h5', [
              'Am', 'Done', 'Good', 'Help', 'I', 'Later', 'Morning', 'Night',
              'Ok', 'Pray', 'See', 'Want', 'Water', 'When', 'You']),
          bg="lightyellow").pack(side=tk.LEFT, padx=5)

def update_recorded_text():
    """Show the recorded gestures, joined by single spaces, in the label."""
    joined = " ".join(recorded_gestures)
    recorded_text.configure(text=joined)

# Initialize with the Words model by default
# (shows a dialog; if the model file is missing, current_model stays None and
# the feed is displayed without predictions)
load_model_mlp('bothHands.h5', ['Am','Done','Good','Help','I','Later','Morning','Night','Ok','Pray','See','Want','Water','When','You'])

# Start the frame update loop
# (each call that displays a frame re-schedules itself via root.after)
update_frame()

# Close handler
def on_closing():
    """Release the camera and the MediaPipe Holistic instance, then close the window."""
    try:
        cap.release()
        # The Holistic solution holds native resources until it is closed.
        holistic.close()
    finally:
        # Always tear the window down, even if a release step raised.
        root.destroy()

# Route the window's close button through on_closing, then enter the Tk
# event loop.
root.protocol("WM_DELETE_WINDOW", on_closing)
root.mainloop()

