<a href="https://colab.research.google.com/github/dyllanesl/AI-EDGE-Project/blob/main/ClassiferTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Download Appropriate Packages

In [None]:
!pip install tensorflow pillow
!pip install mediapipe
!pip install transformers datasets torch torchvision accelerate -U
!pip install huggingface_hub
!pip install pandas

#Load dataset

In [None]:
from datasets import load_dataset

# Pull the ASL classifier dataset from the Hugging Face Hub by repository id.
dataset = load_dataset(path='raulit04/Classifier-ASL2')

#Define Preprocessing Functions



In [None]:
from transformers import ViTFeatureExtractor

# Build the ViT preprocessor from the project's Hub repository.
# To fall back to the stock preprocessor, load 'google/vit-base-patch16-224' instead.
# NOTE(review): the inference cell further down loads its preprocessor from
# 'google/vit-base-patch16-224' — confirm both checkpoints ship the same config.
feature_extractor = ViTFeatureExtractor.from_pretrained(
    'dyllanesl/ASL_Classifier',
)

# Define preprocessing function
def preprocess_function(examples):
    """Turn a batch of dataset rows into ViT model inputs.

    Every entry of ``examples['image']`` is converted to RGB, the whole batch
    is run through the module-level ``feature_extractor`` (PyTorch tensors
    requested), and the batch's ``'label'`` column is attached to the result.

    Args:
        examples: Batch mapping with at least the keys ``'image'`` and
            ``'label'``.

    Returns:
        The feature extractor's output with an added ``'label'`` entry.
    """
    rgb_images = []
    for img in examples['image']:
        rgb_images.append(img.convert("RGB"))

    encoded = feature_extractor(rgb_images, return_tensors="pt")
    encoded['label'] = examples['label']
    return encoded

# Run the preprocessing over the dataset, handing it batches of rows.
dataset = dataset.map(function=preprocess_function, batched=True)

#Load pretrained model

In [None]:
from transformers import ViTForImageClassification

# One output class per distinct value in the training split's 'label' column.
distinct_labels = dataset['train'].unique('label')
num_labels = len(distinct_labels)

# Start from the project's Hub checkpoint with a classification head sized to
# `num_labels`; weight-shape mismatches are ignored instead of raising.
model = ViTForImageClassification.from_pretrained(
    'dyllanesl/ASL_Classifier',
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
    low_cpu_mem_usage=False,
)



#Define training arguments and trainer & Train Model

In [None]:
from transformers import TrainingArguments, Trainer
import torch
# Training configuration: checkpoints are written under `output_dir`,
# logs under `logging_dir`.
training_args = TrainingArguments(
    # Where things are written
    output_dir='./vision_transformer_model_progress',
    logging_dir='./logs',
    logging_steps=10,
    # Schedule and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimisation
    weight_decay=0.01,
    # NOTE(review): the upload cell points at 'checkpoint-6', which suggests a
    # run of only a handful of steps; 500 warmup steps may then never finish —
    # confirm against the real dataset size.
    warmup_steps=500,
)

# Define data collator
def data_collator(features):
    """Collate a list of dataset rows into one training batch.

    Each row's ``'pixel_values'`` is converted to a tensor and the tensors are
    stacked along a new leading dimension. Each row's ``'label'`` is looked up
    in the module-level ``label_to_id`` mapping to obtain its integer class id.

    Args:
        features: Sequence of rows, each with ``'pixel_values'`` and
            ``'label'`` entries.

    Returns:
        Dict with the keys ``'pixel_values'`` and ``'labels'``, both tensors.
    """
    # Stored pixel values may not be tensors yet, so convert before stacking.
    pixel_tensors = [torch.tensor(row['pixel_values']) for row in features]
    batch_pixels = torch.stack(pixel_tensors)

    # Translate dataset labels into the integer ids used for training.
    class_ids = [label_to_id[row['label']] for row in features]
    batch_labels = torch.tensor(class_ids)

    return {'pixel_values': batch_pixels, 'labels': batch_labels}

# Give every distinct label an integer id, numbered in the order `unique`
# returns the labels.
label_list = dataset['train'].unique('label')
label_to_id = dict(zip(label_list, range(len(label_list))))


# Hold out 20% of the 'train' split for evaluation.
train_test_split = dataset['train'].train_test_split(test_size=0.2)
train_dataset, eval_dataset = train_test_split['train'], train_test_split['test']

# Wire the model, arguments, data splits and collator into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune, then save the final model into the training output directory.
trainer.train()
trainer.save_model("./vision_transformer_model_progress")

#Evaluate the Model

In [None]:
# Evaluate the model with the Trainer (which was given `eval_dataset`) and
# print whatever `evaluate()` returns.
# NOTE: the hand-detection cell further down rebinds the name `results`.
results = trainer.evaluate()
print(results)

#Test Model
NOTE: When the project is actually deployed, make sure to integrate the image capture into a loop.

In [None]:
from IPython.display import display, Javascript
from google.colab.output import eval_js
from base64 import b64decode

def take_photo(filename='photo.jpg', quality=0.8):
    """Capture one webcam frame in the Colab front end and save it to disk.

    A small JavaScript helper is injected into the notebook output. It shows a
    live video preview with a 'Capture' button, waits for the click, and
    returns the current frame as a JPEG data URL. The base64 payload of that
    URL is decoded and written to ``filename``.

    Args:
        filename: Path of the file the decoded image bytes are written to.
        quality: Value forwarded to ``canvas.toDataURL('image/jpeg', quality)``.

    Returns:
        ``filename``, unchanged.
    """
    capture_script = Javascript('''
        async function takePhoto(quality) {
            const div = document.createElement('div');
            const capture = document.createElement('button');
            capture.textContent = 'Capture';
            div.appendChild(capture);

            const video = document.createElement('video');
            video.style.display = 'block';
            const stream = await navigator.mediaDevices.getUserMedia({video: true});

            document.body.appendChild(div);
            div.appendChild(video);
            video.srcObject = stream;
            await video.play();

            google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

            await new Promise((resolve) => capture.onclick = resolve);

            const canvas = document.createElement('canvas');
            canvas.width = video.videoWidth;
            canvas.height = video.videoHeight;
            canvas.getContext('2d').drawImage(video, 0, 0);
            stream.getVideoTracks()[0].stop();
            div.remove();
            return canvas.toDataURL('image/jpeg', quality);
        }
    ''')
    display(capture_script)

    # Blocks until the JavaScript promise resolves with the data URL.
    data_url = eval_js(f'takePhoto({quality})')

    # The data URL has the form '<header>,<base64 payload>'; keep the payload.
    encoded_frame = data_url.split(',')[1]
    with open(filename, 'wb') as photo_file:
        photo_file.write(b64decode(encoded_frame))
    return filename

# Capture photo: shows the webcam preview, waits for the 'Capture' click, and
# keeps the path of the saved JPEG (the default 'photo.jpg') for later cells.
photo_filename = take_photo()

In [24]:
import cv2
import mediapipe as mp
import numpy as np

# Layout and styling of the handedness caption drawn next to each hand.
MARGIN = 10  # pixels the caption is raised above the hand's top-most landmark
FONT_SIZE = FONT_THICKNESS = 1  # font scale and stroke thickness for cv2.putText
HANDEDNESS_TEXT_COLOR = (88, 205, 54)  # caption colour passed to cv2.putText

# Function to draw landmarks on an image
def draw_landmarks_on_image(rgb_image, detection_result):
    """Return a copy of ``rgb_image`` with every detected hand drawn on it.

    For each entry of ``detection_result.multi_hand_landmarks`` the landmarks
    and their connections are drawn with MediaPipe's default hand styles. The
    matching handedness label is then written at the hand's left-most x and
    ``MARGIN`` pixels above its top-most y.

    Args:
        rgb_image: Image array; it is copied, never modified in place.
        detection_result: Object exposing ``multi_hand_landmarks`` and
            ``multi_handedness`` sequences of equal length.

    Returns:
        The annotated copy of ``rgb_image``.
    """
    all_hand_landmarks = detection_result.multi_hand_landmarks
    all_handedness = detection_result.multi_handedness
    annotated_image = np.copy(rgb_image)

    for hand_index, hand_landmarks in enumerate(all_hand_landmarks):
        handedness = all_handedness[hand_index]

        # Skeleton: landmark points plus the connections between them.
        mp.solutions.drawing_utils.draw_landmarks(
            annotated_image,
            hand_landmarks,
            mp.solutions.hands.HAND_CONNECTIONS,
            mp.solutions.drawing_styles.get_default_hand_landmarks_style(),
            mp.solutions.drawing_styles.get_default_hand_connections_style()
        )

        # Landmark coordinates are scaled by the image size to get pixels.
        image_height, image_width, _ = annotated_image.shape
        left_edge = min(point.x for point in hand_landmarks.landmark)
        top_edge = min(point.y for point in hand_landmarks.landmark)
        caption_origin = (
            int(left_edge * image_width),
            int(top_edge * image_height) - MARGIN,
        )

        cv2.putText(
            annotated_image,
            f"{handedness.classification[0].label}",
            caption_origin,
            cv2.FONT_HERSHEY_DUPLEX,
            FONT_SIZE,
            HANDEDNESS_TEXT_COLOR,
            FONT_THICKNESS,
            cv2.LINE_AA,
        )

    return annotated_image

# Initialize MediaPipe Hands (single-image mode, up to two hands).
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=2, min_detection_confidence=0.5)

# Load the captured image. cv2.imread reports failure by returning None instead
# of raising, so check explicitly to get a clear error rather than a cryptic
# cvtColor failure on the next line.
image = cv2.imread(photo_filename)
if image is None:
    raise FileNotFoundError(f"Could not read captured photo: {photo_filename!r}")
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR; MediaPipe is fed RGB

# Process the image to detect hand landmarks
results = hands.process(image_rgb)

# Draw landmarks on the image when a hand was found; otherwise fall back to
# the untouched photo.
if results.multi_hand_landmarks:
    annotated_image = draw_landmarks_on_image(image_rgb, results)
    annotated_image_bgr = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)
else:
    annotated_image_bgr = image
    print("No hand landmarks detected.")

# Always (re)write the output file. The classification cell reads
# 'annotated_photo.jpg'; writing it only when a hand was detected left that
# cell with a missing file, or a stale one from an earlier capture.
cv2.imwrite('annotated_photo.jpg', annotated_image_bgr)


In [None]:
from transformers import ViTFeatureExtractor, ViTForImageClassification
import torch

# Inference-time preprocessor (stock ViT checkpoint) and the fine-tuned
# classifier that the training cell saved locally.
# NOTE(review): training preprocessed with the extractor loaded from
# 'dyllanesl/ASL_Classifier' — confirm the two preprocessor configs match.
feature_extractor = ViTFeatureExtractor.from_pretrained(
    'google/vit-base-patch16-224',
)
model = ViTForImageClassification.from_pretrained(
    './vision_transformer_model_progress',
)

# Ensure the model is in evaluation mode
model.eval()

In [26]:
from PIL import Image

# Load and preprocess the image
def preprocess_image(image_path):
    """Open an image file and return its ``pixel_values`` tensor.

    The image is converted to RGB and passed through the module-level
    ``feature_extractor`` with PyTorch tensors requested.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The ``'pixel_values'`` entry of the feature extractor's output.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    encoded = feature_extractor(images=rgb_image, return_tensors="pt")
    return encoded['pixel_values']

# Image to classify: the file written by the hand-detection cell.
# Replace with the path to your image.
image_path = '/content/annotated_photo.jpg'
pixel_values = preprocess_image(image_path=image_path)

In [None]:
# Forward pass with gradient tracking switched off.
with torch.no_grad():
    outputs = model(pixel_values)

# Index of the highest-scoring class along the last logits dimension.
preds = outputs.logits.argmax(dim=-1).item()

# Invert the label -> id mapping built in the training cell so the predicted
# id can be reported as a dataset label. This needs `label_to_id` to still be
# defined in the session.
id_to_label = {class_id: label for label, class_id in label_to_id.items()}
predicted_label = id_to_label[preds]

# Labels may not be strings, so convert explicitly before printing.
predicted_label_str = str(predicted_label)

print(f'Predicted label: {predicted_label_str}')

#Load CSV File and Create a Custom Dataset

#Save Model Progress

In [None]:
!pip install huggingface_hub

In [None]:
from google.colab import userdata
from huggingface_hub import login
# Authenticate with the Hugging Face Hub using the token that `userdata`
# holds under the name 'ASL_Token'.
login(token=userdata.get('ASL_Token'))

In [None]:
import os

from huggingface_hub import HfApi, create_repo
# Needed by the `except` clause below; without this import a missing
# repository surfaced as a NameError instead of triggering repo creation.
from huggingface_hub.utils import RepositoryNotFoundError

api = HfApi()
repo_id = "dyllanesl/ASL_Classifier"  # Your repository name
token = userdata.get('ASL_Token')  # Your Hugging Face token

# Create the repository if it doesn't exist. Only the lookup sits in the `try`
# so that nothing else can be mistaken for "repository not found".
try:
    temp = api.repo_info(repo_id, repo_type="model", token=token)
except RepositoryNotFoundError:
    create_repo(repo_id, repo_type="model", token=token)
else:
    print(temp)

# Define the path to the directory and the files
directory_path = "/content/vision_transformer_model_progress/checkpoint-6"
files = ["config.json", "model.safetensors", "optimizer.pt", "rng_state.pth",
         "scheduler.pt", "trainer_state.json", "training_args.bin"]

# Fail before the first upload if anything is missing (for example when the
# checkpoint number in `directory_path` does not match the latest run), so the
# repository never ends up with a partial set of files.
missing_files = [
    file_name for file_name in files
    if not os.path.isfile(os.path.join(directory_path, file_name))
]
if missing_files:
    raise FileNotFoundError(
        f"Missing checkpoint files in {directory_path}: {', '.join(missing_files)}"
    )

# Upload each file to the repository, one commit per file.
for file_name in files:
    file_path = os.path.join(directory_path, file_name)
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=file_name,
        repo_id=repo_id,
        repo_type="model",  # Change this to "dataset" if you're uploading to a dataset repository
        token=token,
        commit_message=f"Upload {file_name}"
    )

print("Files uploaded successfully.")