<a href="https://colab.research.google.com/github/dyllanesl/AI-EDGE-Project/blob/main/ASL_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Restart Runtime
Quick way to restart the runtime

In [None]:
import os
os._exit(00)



#Install dependencies
Install dependices and necessary libaries

In [None]:
# Install essential libraries
!pip install torch==2.3.0 torchvision==0.18.1
!pip install diffusers transformers datasets accelerate
!pip install mediapipe opencv-python
!pip install diffusers datasets transformers accelerate

# Download the Mediapipe hand landmarker model
!wget -q https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task


#Load Specific pre-trained diffusion model
From hugging Face Model Hub , this provides an easy way to load and interact with pre-trained diffusion models.

In [11]:
import torch
from torch import nn
from diffusers import UNet2DModel

class ClassConditionedUnet(nn.Module):
    def __init__(self, model_name, num_classes=26, class_emb_size=4):
        super().__init__()
        self.class_emb = nn.Embedding(num_classes, class_emb_size)

        # Load the model configuration from the pre-trained model
        self.model = UNet2DModel.from_pretrained(model_name)

        # Update the in_channels to include the class embedding size
        self.model.config.in_channels = 3 + class_emb_size

        # Reinitialize the first convolutional layer to match the updated configuration
        old_conv = self.model.conv_in
        self.model.conv_in = nn.Conv2d(
            3 + class_emb_size,
            old_conv.out_channels,
            kernel_size=old_conv.kernel_size,
            stride=old_conv.stride,
            padding=old_conv.padding
        )

        # Initialize the new weights
        with torch.no_grad():
            self.model.conv_in.weight[:, :3, :, :] = old_conv.weight
            if self.model.conv_in.weight.size(1) > 3:
                nn.init.normal_(self.model.conv_in.weight[:, 3:, :, :], 0, 0.02)
            if self.model.conv_in.bias is not None:
                self.model.conv_in.bias = old_conv.bias

    def forward(self, x, t, class_labels):
        bs, ch, w, h = x.shape
        #print(f"Input shape before concatenation: {x.shape}")
        class_cond = self.class_emb(class_labels).view(bs, -1, 1, 1).expand(bs, -1, w, h)
        net_input = torch.cat((x, class_cond), 1)
        #print(f"Input shape after concatenation: {net_input.shape}")
        return self.model(net_input, t)

    def save_pretrained(self, save_directory):
        # Save the underlying model
        self.model.save_pretrained(save_directory)
        # Save the class embedding layer
        torch.save(self.class_emb.state_dict(), f"{save_directory}/class_emb.pth")

    @classmethod
    def from_pretrained(cls, model_name, save_directory, num_classes=26, class_emb_size=4):
        # Load the underlying model
        model = cls(model_name, num_classes, class_emb_size)
        model.model = UNet2DModel.from_pretrained(save_directory)
        # Load the class embedding layer
        model.class_emb.load_state_dict(torch.load(f"{save_directory}/class_emb.pth"))
        return model

In [3]:
model_name = "google/ddpm-cifar10-32"  # Example smaller model

# Instantiate the custom class with the pre-trained model configuration
custom_model = ClassConditionedUnet(model_name)

# Move the model to GPU if available
custom_model.to("cuda")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/143M [00:00<?, ?B/s]

ClassConditionedUnet(
  (class_emb): Embedding(26, 4)
  (model): UNet2DModel(
    (conv_in): Conv2d(7, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (time_proj): Timesteps()
    (time_embedding): TimestepEmbedding(
      (linear_1): Linear(in_features=128, out_features=512, bias=True)
      (act): SiLU()
      (linear_2): Linear(in_features=512, out_features=512, bias=True)
    )
    (down_blocks): ModuleList(
      (0): DownBlock2D(
        (resnets): ModuleList(
          (0-1): 2 x ResnetBlock2D(
            (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
            (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (time_emb_proj): Linear(in_features=512, out_features=128, bias=True)
            (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (nonlinearity): SiLU()
     

#Train Model Here
In this cell we upload the csv file to then use to combine with our images then push the final dataset combined to HuggingFace

In [4]:
# Convert the dataset to a pandas DataFrame
# Load the CSV file with labels
from google.colab import files
import pandas as pd

# Upload the CSV file
uploaded = files.upload()

csv_filename = list(uploaded.keys())[0]

labels_df = pd.read_csv("ASL_Sheet.csv")



Saving ASL_Sheet.csv to ASL_Sheet.csv


#Pull images from hugging face
Here we get our images from a current dataset that only has images to prepare to combine with the labels

In [5]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("ascar17/ASL")

print(dataset)
print(dataset["train"][0])

Downloading readme: 0.00B [00:00, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/24 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/24 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image'],
        num_rows: 24
    })
})
{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=130x133 at 0x7C134AA91AE0>}


#Testing
Here we see if this was done correctly by trying to display the first image to check if this was successful, while also displaying the label column to further confirm this

In [18]:
#Import io and PIL
from io import BytesIO
from PIL import Image

#Print the first item in the dataset
image = dataset["train"][0]['image']
image

#display
labels_df['label']

0     A
1     B
2     C
3     D
4     E
5     F
6     G
7     H
8     I
9     K
10    L
11    M
12    N
13    O
14    P
15    Q
16    R
17    S
18    T
19    U
20    V
21    W
22    X
23    Y
Name: label, dtype: object

#Push to Hugging Face
Here we upload our secret token, must use your own

In [19]:
from google.colab import userdata
from huggingface_hub import login
login(token = userdata.get('ASL_Token'))


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


#Install datasets/Combine images and labels
Here we finally combine our images and labels through a dictonary

In [6]:
!pip install datasets



In [7]:
from datasets import Dataset
#combine the images with the labels
image_dataset = Dataset.from_dict({'image': dataset["train"]['image'], 'label': labels_df['label']})
#Display
image_dataset

Dataset({
    features: ['image', 'label'],
    num_rows: 24
})

#Final Push to huggingface
Push to hugging face new dataset to save it and have it there :)

In [None]:
#Send it to huggingface
image_dataset.push_to_hub("ascar17/ASL2")

In [8]:
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor, Normalize

dataset = load_dataset('ascar17/ASL2')

Downloading readme:   0%|          | 0.00/310 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/24 [00:00<?, ? examples/s]

In [9]:
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor, Normalize

# Define preprocessing steps
preprocess = Compose([
    Resize((128, 128)),
    ToTensor(),
    Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
])

def preprocess_function(example):
    try:
        image = example['image']
        image = image.convert("RGB")
        image = preprocess(image)
        label = example['label']
        label = ord(label) - ord('A')
        return {"pixel_values": image, "label": label}
    except Exception as e:
        print(f"Error processing image: {e}")
        return {"pixel_values": None, "label": None}

# Apply the preprocessing function to the dataset
processed_dataset = dataset.map(preprocess_function, remove_columns=["image"])

# Check if 'pixel_values' column exists
print(f"Processed dataset length: {len(processed_dataset['train'])}")

if len(processed_dataset['train']) > 0:
    print(processed_dataset['train'][0])

# Format the dataset for PyTorch if it contains data
if len(processed_dataset['train']) > 0:
    processed_dataset['train'].set_format(type='torch', columns=['pixel_values', 'label'])

    # Inspect the preprocessed dataset
    print(processed_dataset['train'][0])
else:
    print("Processed dataset is empty.")


Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Processed dataset length: 24
{'label': 0, 'pixel_values': [[[0.9764705896377563, 0.9764705896377563, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709,

In [10]:
from torch.optim import AdamW
from torch.utils.data import DataLoader
from accelerate import Accelerator
from torch.cuda.amp import autocast, GradScaler
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

custom_model.to(device)  # Move model to the correct device


# Initialize Accelerator
accelerator = Accelerator()

# Initialize the optimizer
optimizer = AdamW(custom_model.parameters(), lr=1e-5)

# Reduce batch size to minimize memory usage
train_dataloader = DataLoader(processed_dataset['train'], batch_size=1, shuffle=True)

# Prepare the model, optimizer, and dataloader with Accelerator
custom_model, optimizer, train_dataloader = accelerator.prepare(custom_model, optimizer, train_dataloader)

# Number of epochs and accumulation steps
num_epochs = 3
accumulation_steps = 4  # Adjust as necessary for your hardware

for epoch in range(num_epochs):
    custom_model.train()
    epoch_loss = 0
    for step, batch in enumerate(train_dataloader):
        with accelerator.accumulate(custom_model):
            images = batch["pixel_values"].to(device)
            labels = batch["label"].to(device)

            # Generate noisy images
            noise = torch.randn_like(images).to(device)
            noisy_images = images + noise * 0.1

            # Generate timesteps tensor
            timesteps = torch.randint(0, 999, (images.shape[0],), device=device)

            # Forward pass with mixed precision

            outputs = custom_model(images, t=timesteps, class_labels=labels)
            # Compute loss manually
            loss = torch.nn.functional.mse_loss(outputs.sample, noisy_images) / accumulation_steps  # Scale loss

            # Backward pass
            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            epoch_loss += loss.item()

            # Clear unused variables
            del images, noise, noisy_images, outputs, loss
            torch.cuda.empty_cache()

        # Clear cache after each step to free up memory
        torch.cuda.empty_cache()

    # Clear cache after each epoch to free up memory
    torch.cuda.empty_cache()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss/len(train_dataloader)}")

# Save the fine-tuned model
custom_model.save_pretrained("fine_tuned_class_conditioned_unet")

Input shape before concatenation: torch.Size([1, 3, 128, 128])
Input shape after concatenation: torch.Size([1, 7, 128, 128])
Input shape before concatenation: torch.Size([1, 3, 128, 128])
Input shape after concatenation: torch.Size([1, 7, 128, 128])
Input shape before concatenation: torch.Size([1, 3, 128, 128])
Input shape after concatenation: torch.Size([1, 7, 128, 128])
Input shape before concatenation: torch.Size([1, 3, 128, 128])
Input shape after concatenation: torch.Size([1, 7, 128, 128])
Input shape before concatenation: torch.Size([1, 3, 128, 128])
Input shape after concatenation: torch.Size([1, 7, 128, 128])
Input shape before concatenation: torch.Size([1, 3, 128, 128])
Input shape after concatenation: torch.Size([1, 7, 128, 128])
Input shape before concatenation: torch.Size([1, 3, 128, 128])
Input shape after concatenation: torch.Size([1, 7, 128, 128])
Input shape before concatenation: torch.Size([1, 3, 128, 128])
Input shape after concatenation: torch.Size([1, 7, 128, 128])


#Display the noisy/denoisy data
Work on
Take into account label only is taking into account image atm

In [12]:
import torchvision

# Load the validation dataset
val_dataset = load_dataset("ascar17/ASL", split='train')
val_dataset = val_dataset.map(preprocess_function)
val_dataset.set_format(type='torch', columns=['pixel_values'])

# DataLoader for validation data
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)

# Evaluation loop
custom_model.eval()
with torch.no_grad():
    for step, batch in enumerate(val_dataloader):
        images = batch["pixel_values"].to("cuda")
        noise = torch.randn_like(images).to("cuda")
        noisy_images = images + noise * 0.1
        timesteps = torch.tensor([10] * noisy_images.shape[0], dtype=torch.long, device="cuda")

        with autocast():
            outputs = model(noisy_images, timestep=timesteps)

        # Visualize or save the outputs
        # Example: save the noisy and denoised images for comparison
        torchvision.utils.save_image(noisy_images, f'noisy_{step}.png')
        torchvision.utils.save_image(outputs.sample, f'denoised_{step}.png')

Repo card metadata block was not found. Setting CardData to empty.


Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'
Error processing image: 'label'


NameError: name 'model' is not defined

#Single Image generation
dont run untill above complete



In [None]:
output_dir = "/content/generated_images"
generated_images = os.listdir(output_dir)
print("Generated Images:", generated_images)
import matplotlib.pyplot as plt

# Function to display an image
def display_image(image_path):
    img = Image.open(image_path)
    plt.imshow(img)
    plt.axis('off')  # Hide axes
    plt.show()

# Display a specific generated image
image_to_display = os.path.join(output_dir, generated_images[0])  # Change index to display a different image
display_image(image_to_display)


Multiple image geneation
dont run ethier atm

In [None]:
import matplotlib.pyplot as plt

# Display multiple images in a grid
def display_images(image_paths, cols=3, rows=3):
    fig, axes = plt.subplots(rows, cols, figsize=(15, 15))
    for ax, img_path in zip(axes.flatten(), image_paths):
        img = Image.open(img_path)
        ax.imshow(img)
        ax.axis('off')
    plt.tight_layout()
    plt.show()

# Get paths for a few generated images
image_paths = [os.path.join(output_dir, img) for img in generated_images[:9]]  # Adjust the number of images as needed
display_images(image_paths)

##MediaPipe begins

**Description:** Activates camera and captures images (frames) for the MediaPipe
to detect and visualize

**Goals:**
- Loop function to take a picture every 5 seconds until deactivate
  - This would be the constant changing of frames
-Implement the capturing of videos to detext ASL letters such as Z

**Current functionality:**
- Can manually take a screenshot of the image

In [None]:
#Activates camera, captures images
from IPython.display import display, Javascript
from google.colab.output import eval_js
from base64 import b64decode

#Function to take the actual photo from code snippet
def take_photo(filename='photo.jpg', quality=0.8):
  js = Javascript('''
    async function takePhoto(quality) {
      const div = document.createElement('div');
      const capture = document.createElement('button');
      capture.textContent = 'Capture';
      div.appendChild(capture);

      const video = document.createElement('video');
      video.style.display = 'block';
      const stream = await navigator.mediaDevices.getUserMedia({video: true});

      document.body.appendChild(div);
      div.appendChild(video);
      video.srcObject = stream;
      await video.play();

      // Resize the output to fit the video element.
      google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

      // Wait for Capture to be clicked.
      await new Promise((resolve) => capture.onclick = resolve);

      const canvas = document.createElement('canvas');
      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      canvas.getContext('2d').drawImage(video, 0, 0);
      stream.getVideoTracks()[0].stop();
      div.remove();
      return canvas.toDataURL('image/jpeg', quality);
    }
    ''')
  display(js)
  data = eval_js('takePhoto({})'.format(quality))
  binary = b64decode(data.split(',')[1])
  with open(filename, 'wb') as f:
    f.write(binary)
  return filename

Description: Used to Visualize the "Hand Landmark Detection" Solution

In [None]:
from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import numpy as np

MARGIN = 10 # pixels
FONT_SIZE = 1
FONT_THICKNESS = 1
FONT_COLOR = (88, 205, 54) # RGB formula for vibrant green
HANDEDNESS_TEXT_COLOR = (88, 205, 54) # Define HANDEDNESS_TEXT_COLOR


def draw_landmarks_on_image(rgb_image, detection_result):
  hand_landmarks_list = detection_result.hand_landmarks
  handedness_list = detection_result.handedness
  annotated_image = np.copy(rgb_image)


  # Loop through the detected hands to visualize.
  for idx in range(len(hand_landmarks_list)):
    hand_landmarks = hand_landmarks_list[idx]
    handedness = handedness_list[idx]

    # Draw the hand landmarks.
    hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
    hand_landmarks_proto.landmark.extend([
      landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z) for landmark in hand_landmarks
    ])
    solutions.drawing_utils.draw_landmarks(
      annotated_image,
      hand_landmarks_proto,
      solutions.hands.HAND_CONNECTIONS,
      solutions.drawing_styles.get_default_hand_landmarks_style(),
      solutions.drawing_styles.get_default_hand_connections_style())

    # Get the top left corner of the detected hand's bounding box.
    height, width, _ = annotated_image.shape
    x_coordinates = [landmark.x for landmark in hand_landmarks]
    y_coordinates = [landmark.y for landmark in hand_landmarks]
    text_x = int(min(x_coordinates) * width)
    text_y = int(min(y_coordinates) * height) - MARGIN

    # Draw handedness (left or right hand) on the image.
    cv2.putText(annotated_image, f"{handedness[0].category_name}",
                (text_x, text_y), cv2.FONT_HERSHEY_DUPLEX,
                FONT_SIZE, HANDEDNESS_TEXT_COLOR, FONT_THICKNESS, cv2.LINE_AA)

  return annotated_image

Description: Uploading / Collecting images to process

In [None]:
# #Import libraries
# import cv2 #OpenCV library that processes images and videos
# from google.colab.patches import cv2_imshow #allows cv2 to work in colab

# '''
# # Code for uploading an image manually
# from google.colab import files

# uploaded = files.upload()

# for filename in uploaded:
#   content = uploaded[filename]
#   with open(filename, 'wb') as f:
#     f.write(content)

# if len(uploaded.keys()):
#   IMAGE_FILE = next(iter(uploaded))
#   print('Uploaded file:', IMAGE_FILE)
# '''

# # img = cv2.imread(IMAGE_FILE)
# # cv2_imshow(img)


In [None]:
#Import libraries
import cv2 #OpenCV library that processes images and videos
from google.colab.patches import cv2_imshow #allows cv2 to work in colab
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
# import google.colab as files

def main():

  # STEP 1: Take the photo and save it
  filename = take_photo()
  print('Saved to {}'.format(filename))

  '''
  Uncomment this code to upload an image from computer and uncomment the
  files library from google.colab and change
  '''
  # uploaded = files.upload()

  # for imageName in uploaded:
  #   content = uploaded[imageName]
  #   with open(imageName, 'wb') as f:
  #     f.write(content)

  # if len(uploaded.keys()):
  #   filename = next(iter(uploaded))
  #   print('Uploaded file:', filename)

  # # prints the image you uploaded
  # img = cv2.imread(filename)
  # cv2_imshow(img)

  # STEP 2: Create an HandLandmarker object.
  base_options = python.BaseOptions(model_asset_path='hand_landmarker.task')
  options = vision.HandLandmarkerOptions(base_options=base_options,
                                       num_hands=2)
  detector = vision.HandLandmarker.create_from_options(options)

  # STEP 3: Load the input image.
  image = mp.Image.create_from_file(filename)

  # STEP 4: Detect hand landmarks from the input image.
  detection_result = detector.detect(image)

  # STEP 5: Process the classification result. In this case, visualize it.
  annotated_image = draw_landmarks_on_image(image.numpy_view(), detection_result)
  cv2_imshow(cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR))


In [None]:
main()