<a href="https://colab.research.google.com/github/dyllanesl/AI-EDGE-Project/blob/main/ClassifierTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Download Appropriate Packages

In [None]:
!pip install tensorflow pillow
!pip install mediapipe
!pip install transformers datasets torch torchvision accelerate optuna -U
!pip install huggingface_hub
!pip install pandas

#Load dataset

In [None]:
from datasets import load_dataset

# Fetch the ASL image dataset from the Hugging Face Hub
dataset = load_dataset(path='raulit04/ASL_Dataset1')

#Define Preprocessing Functions



In [None]:
from datasets import load_dataset
from transformers import ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer
from torchvision import transforms
from PIL import Image
import torch

# Image processor stored with the classifier checkpoint
image_processor = ViTImageProcessor.from_pretrained('dyllanesl/ASL_Classifier')

# Augmentation steps, listed in the order they are applied. The pipeline stops
# at ToTensor(); normalization is left to the image processor.
_augmentation_steps = [
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
    transforms.ToTensor(),
]
data_transforms = transforms.Compose(_augmentation_steps)

# Define preprocessing function
def preprocess_function(examples):
    """Augment a batch of images and run it through the image processor.

    Args:
        examples: Batch mapping with an 'image' entry (an iterable of images
            that support ``.convert("RGB")``) and a 'label' entry.

    Returns:
        The image processor's output with the batch's labels stored under
        the 'label' key.
    """
    # Force three channels first, then apply the augmentations image by image.
    augmented = []
    for picture in examples['image']:
        augmented.append(data_transforms(picture.convert("RGB")))
    # One stacked tensor so the processor handles the whole batch at once.
    batch = torch.stack(augmented)
    # do_rescale=False: the tensors are handed over without a second rescale
    # (NOTE(review): presumably because ToTensor already scaled them to [0, 1]).
    processed = image_processor(images=batch, return_tensors="pt", do_rescale=False)
    processed['label'] = examples['label']
    return processed

# Run preprocess_function over the dataset in batches.
# NOTE(review): map() stores its output, so the random augmentations appear to
# be drawn once here rather than once per epoch — confirm this is intended.
dataset = dataset.map(function=preprocess_function, batched=True)


#Load pretrained model

In [None]:
from transformers import ViTForImageClassification

# Number of distinct labels found in the training split
num_labels = len(dataset['train'].unique('label'))

# Load the pretrained checkpoint with a classification head of that size;
# ignore_mismatched_sizes allows the checkpoint's head to differ in size.
model = ViTForImageClassification.from_pretrained(
    'dyllanesl/ASL_Classifier',
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
    low_cpu_mem_usage=False,
)



#Define training arguments and trainer

In [None]:
from transformers import get_scheduler, EarlyStoppingCallback
import optuna

# Give every distinct training label a consecutive integer id, following the
# order in which unique() returns them
label_list = dataset['train'].unique('label')
label_to_id = dict(zip(label_list, range(len(label_list))))

# Define data collator
def data_collator(features):
    """Assemble a list of dataset rows into one batch for the model.

    Args:
        features: Sequence of rows, each providing 'pixel_values' and 'label'.

    Returns:
        Dict with 'pixel_values' (the per-row values converted to tensors and
        stacked along a new first dimension) and 'labels' (integer ids looked
        up in label_to_id).
    """
    # Rows carry plain values here, so convert each one before stacking.
    stacked_pixels = torch.stack(
        tuple(torch.tensor(row['pixel_values']) for row in features)
    )
    # Translate every raw label into its integer id.
    label_ids = [label_to_id[row['label']] for row in features]
    return {'pixel_values': stacked_pixels, 'labels': torch.tensor(label_ids)}

# Hold out 20% of the training split for validation
train_test_split = dataset['train'].train_test_split(test_size=0.2)
train_dataset, eval_dataset = train_test_split['train'], train_test_split['test']

# Size the classification head from the distinct training labels, then load
# the pretrained checkpoint with that many outputs
num_labels = len(dataset['train'].unique('label'))
model = ViTForImageClassification.from_pretrained(
    'dyllanesl/ASL_Classifier',
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
    low_cpu_mem_usage=False,
)

# Optuna objective function for hyperparameter tuning
def objective(trial):
    """Train and evaluate the classifier for one Optuna trial.

    Samples a learning rate and an epoch count, fine-tunes a freshly loaded
    copy of the pretrained checkpoint with them, and reports the validation
    loss. Checkpoints and the final model are written under
    './vision_transformer_model_progress'; logs go to './logs'.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The 'eval_loss' value reported by ``trainer.evaluate()``.
    """
    import math  # function-scope import: only the step count below needs it

    # Suggest hyperparameters. suggest_float(..., log=True) replaces the
    # deprecated suggest_loguniform and samples from the same distribution.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    num_train_epochs = trial.suggest_int('num_train_epochs', 3, 8)
    # NOTE(review): a run with fewer than 500 optimizer steps never finishes
    # warming up — confirm this value suits the dataset size.
    warmup_steps = 500

    # Start every trial from the same pretrained weights. Reusing one shared
    # model would let each trial continue from the previous trial's weights,
    # making the trial results incomparable.
    trial_model = ViTForImageClassification.from_pretrained(
        'dyllanesl/ASL_Classifier',
        num_labels=num_labels,
        ignore_mismatched_sizes=True,
        low_cpu_mem_usage=False,
    )

    training_args = TrainingArguments(
        output_dir='./vision_transformer_model_progress',
        num_train_epochs=num_train_epochs,
        # NOTE(review): recent transformers releases rename this argument to
        # `eval_strategy`; switch if the installed version rejects it.
        evaluation_strategy='steps',  # Evaluate at fixed step intervals
        save_strategy='steps',        # Save checkpoints at fixed step intervals
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=warmup_steps,
        weight_decay=0.01,
        learning_rate=learning_rate,
        logging_dir='./logs',
        logging_steps=10,
        save_steps=200,  # Save a checkpoint every 200 steps
        eval_steps=200,  # Evaluate every 200 steps (matches save_steps)
        load_best_model_at_end=True,  # Restore the best checkpoint when training ends
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        fp16=torch.cuda.is_available(),  # Mixed precision only when a CUDA device exists
    )

    # The scheduler advances once per optimizer step (one batch), so its
    # horizon is batches-per-epoch times epochs, not examples times epochs.
    steps_per_epoch = math.ceil(len(train_dataset) / training_args.train_batch_size)
    num_training_steps = steps_per_epoch * num_train_epochs

    # Define optimizer and learning rate scheduler
    optimizer = torch.optim.AdamW(trial_model.parameters(), lr=learning_rate)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_training_steps,
    )

    # Initialize Trainer; training stops early after 3 evaluations without
    # improvement in eval_loss.
    trainer = Trainer(
        model=trial_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        optimizers=(optimizer, lr_scheduler),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()
    eval_result = trainer.evaluate()

    # Save the final model
    trainer.save_model("./vision_transformer_model_progress")

    return eval_result['eval_loss']

# Run the search: 10 trials, minimizing the loss returned by objective
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

# Report the winning trial
_best_trial = study.best_trial
print(f"Best trial: {_best_trial.number}")
print(f"Best trial value (eval_loss): {_best_trial.value}")
print(f"Best hyperparameters: {_best_trial.params}")


#Save Model (if results are improved)

#Test Model
NOTE: In the deployed project, run the image capture in a loop rather than taking a single photo.

In [None]:
from IPython.display import display, Javascript
from google.colab.output import eval_js
from base64 import b64decode

def take_photo(filename='photo.jpg', quality=0.8):
    """Capture one webcam frame in the Colab front end and save it as a JPEG.

    Displays a video preview with a 'Capture' button, waits for the click,
    and writes the decoded frame to disk.

    Args:
        filename: Path of the file to write.
        quality: JPEG quality forwarded to the browser's ``canvas.toDataURL``.

    Returns:
        The ``filename`` that was written.
    """
    capture_script = Javascript('''
        async function takePhoto(quality) {
            const div = document.createElement('div');
            const capture = document.createElement('button');
            capture.textContent = 'Capture';
            div.appendChild(capture);

            const video = document.createElement('video');
            video.style.display = 'block';
            const stream = await navigator.mediaDevices.getUserMedia({video: true});

            document.body.appendChild(div);
            div.appendChild(video);
            video.srcObject = stream;
            await video.play();

            google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

            await new Promise((resolve) => capture.onclick = resolve);

            const canvas = document.createElement('canvas');
            canvas.width = video.videoWidth;
            canvas.height = video.videoHeight;
            canvas.getContext('2d').drawImage(video, 0, 0);
            stream.getVideoTracks()[0].stop();
            div.remove();
            return canvas.toDataURL('image/jpeg', quality);
        }
    ''')
    display(capture_script)
    # The script returns a data URL; the base64 payload follows the first comma.
    data_url = eval_js(f'takePhoto({quality})')
    encoded_payload = data_url.split(',')[1]
    jpeg_bytes = b64decode(encoded_payload)
    with open(filename, 'wb') as out_file:
        out_file.write(jpeg_bytes)
    return filename

# Take one webcam photo using take_photo's default filename and quality
photo_filename = take_photo(filename='photo.jpg', quality=0.8)

In [None]:
import cv2
import mediapipe as mp
import numpy as np

# Styling for the handedness caption drawn next to each detected hand
HANDEDNESS_TEXT_COLOR = (88, 205, 54)  # colour tuple passed to cv2.putText
FONT_THICKNESS = 1
FONT_SIZE = 1
MARGIN = 10  # pixels between the topmost landmark and the caption

# Function to draw landmarks on an image
def draw_landmarks_on_image(rgb_image, detection_result):
    """Return a copy of ``rgb_image`` with hand landmarks and handedness labels drawn.

    Args:
        rgb_image: Image array of shape (height, width, channels). It is
            copied, never modified.
        detection_result: Object exposing ``multi_hand_landmarks`` and
            ``multi_handedness``. Both may be None or empty when no hand was
            detected.

    Returns:
        A new array: the annotated image, or an unmodified copy of the input
        when there are no hands to draw.
    """
    annotated_image = np.copy(rgb_image)
    hand_landmarks_list = detection_result.multi_hand_landmarks
    handedness_list = detection_result.multi_handedness

    # With nothing detected the landmark list is None (or empty); return the
    # plain copy instead of failing on len(None).
    if not hand_landmarks_list:
        return annotated_image

    height, width, _ = annotated_image.shape

    for hand_landmarks, handedness in zip(hand_landmarks_list, handedness_list):
        mp.solutions.drawing_utils.draw_landmarks(
            annotated_image,
            hand_landmarks,
            mp.solutions.hands.HAND_CONNECTIONS,
            mp.solutions.drawing_styles.get_default_hand_landmarks_style(),
            mp.solutions.drawing_styles.get_default_hand_connections_style()
        )

        # Landmark coordinates are scaled by the image size here, so the
        # caption is anchored at the hand's top-left corner in pixels.
        x_min = min(landmark.x for landmark in hand_landmarks.landmark)
        y_min = min(landmark.y for landmark in hand_landmarks.landmark)

        label = f"{handedness.classification[0].label}"
        (_, text_height), _ = cv2.getTextSize(
            label, cv2.FONT_HERSHEY_DUPLEX, FONT_SIZE, FONT_THICKNESS
        )
        # Clamp the anchor so the caption stays on the canvas when the hand
        # touches the top or left edge of the image.
        text_x = max(int(x_min * width), 0)
        text_y = max(int(y_min * height) - MARGIN, text_height)

        cv2.putText(annotated_image, label,
                    (text_x, text_y), cv2.FONT_HERSHEY_DUPLEX,
                    FONT_SIZE, HANDEDNESS_TEXT_COLOR, FONT_THICKNESS, cv2.LINE_AA)

    return annotated_image

# Initialize MediaPipe Hands (single still image, up to two hands)
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=2, min_detection_confidence=0.5)

# Load the captured image. cv2.imread returns None instead of raising when the
# file is missing or unreadable, so fail here with a clear message rather than
# inside cvtColor.
image = cv2.imread(photo_filename)
if image is None:
    raise FileNotFoundError(f"Could not read captured image: {photo_filename!r}")
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Process the image to detect hand landmarks
results = hands.process(image_rgb)

# Draw landmarks on the image when any hand was found; otherwise fall back to
# the unannotated capture.
if results.multi_hand_landmarks:
    annotated_image = draw_landmarks_on_image(image_rgb, results)
    annotated_image_bgr = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)
else:
    annotated_image_bgr = image
    print("No hand landmarks detected.")

# Always (re)write the output file so later cells never read a stale
# annotated_photo.jpg left over from an earlier capture.
cv2.imwrite('annotated_photo.jpg', annotated_image_bgr)


In [None]:
from transformers import ViTFeatureExtractor, ViTForImageClassification
import torch

# Load the image processor and model.
# ViTFeatureExtractor is deprecated in favour of ViTImageProcessor, and the
# processor is loaded from the same repository used during training so that
# inference preprocessing matches training preprocessing.
from transformers import ViTImageProcessor

# The name `feature_extractor` is kept because preprocess_image reads it.
feature_extractor = ViTImageProcessor.from_pretrained('dyllanesl/ASL_Classifier')
# NOTE(review): the checkpoint number depends on the training run — confirm
# that this directory exists before running.
model = ViTForImageClassification.from_pretrained('./vision_transformer_model_progress/checkpoint-198')

# Ensure the model is in evaluation mode
model.eval()

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

In [None]:
from PIL import Image

# Load and preprocess the image
def preprocess_image(image_path):
    """Open an image file and turn it into model input.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The 'pixel_values' entry produced by ``feature_extractor``, requested
        as PyTorch tensors.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    encoded = feature_extractor(images=rgb_image, return_tensors="pt")
    return encoded['pixel_values']

# Replace with the path to your image
image_path = '/content/annotated_photo.jpg'
pixel_values = preprocess_image(image_path=image_path)

In [None]:
# Run the model without tracking gradients
with torch.no_grad():
    outputs = model(pixel_values)

# Index of the highest-scoring class
preds = outputs.logits.argmax(dim=-1).item()

# Invert the training-time mapping (label -> id) to look the label up by id;
# label_to_id must still be defined from the training cells
id_to_label = {class_id: label for label, class_id in label_to_id.items()}
predicted_label = id_to_label[preds]

# Normalize to a string for display
predicted_label_str = str(predicted_label)

print(f'Predicted label: {predicted_label_str}')

#Load CSV File and Create a Custom Dataset

#Save Model Progress

In [None]:
!pip install huggingface_hub

In [None]:
from google.colab import userdata
from huggingface_hub import login
# Authenticate with the Hugging Face Hub using the 'ASL_Token' Colab secret
hf_token = userdata.get('ASL_Token')
login(token=hf_token)

In [None]:
from huggingface_hub import HfApi, create_repo
from huggingface_hub.utils import RepositoryNotFoundError
import os

api = HfApi()
repo_id = "dyllanesl/ASL_Classifier"  # Your repository name
token = userdata.get('ASL_Token')  # Your Hugging Face token

# Create the repository if it doesn't exist. RepositoryNotFoundError has to be
# imported (above); without it the except clause itself raises NameError.
try:
    temp = api.repo_info(repo_id, repo_type="model", token=token)
except RepositoryNotFoundError:
    create_repo(repo_id, repo_type="model", token=token)
else:
    print(temp)

# Define the path to the directory and the files
directory_path = "/content/vision_transformer_model_progress/checkpoint-198"
files = ["config.json", "model.safetensors", "optimizer.pt", "rng_state.pth",
         "scheduler.pt", "trainer_state.json", "training_args.bin"]

# Check every file up front so a missing one is reported before anything is
# uploaded, instead of leaving the repository partially updated.
missing = [file_name for file_name in files
           if not os.path.isfile(os.path.join(directory_path, file_name))]
if missing:
    raise FileNotFoundError(
        f"Missing checkpoint files in {directory_path}: {', '.join(missing)}"
    )

# Upload each file to the repository (one commit per file)
for file_name in files:
    file_path = os.path.join(directory_path, file_name)
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=file_name,
        repo_id=repo_id,
        repo_type="model",  # Change this to "dataset" if you're uploading to a dataset repository
        token=token,
        commit_message=f"Upload {file_name}"
    )

print("Files uploaded successfully.")