<a href="https://colab.research.google.com/github/dyllanesl/AI-EDGE-Project/blob/main/ASL_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Restart Runtime
Quick way to restart the runtime

In [None]:
import os
os._exit(00)



#Install dependencies
Install dependices and necessary libaries

In [45]:
# Install essential libraries
!pip install torch==2.3.0 torchvision==0.18.1
!pip install diffusers transformers datasets accelerate
!pip install mediapipe opencv-python
!pip install diffusers datasets transformers accelerate

# Download the Mediapipe hand landmarker model
!wget -q https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task


Collecting torchvision==0.18.1
  Using cached torchvision-0.18.1-cp310-cp310-manylinux1_x86_64.whl (7.0 MB)
INFO: pip is looking at multiple versions of torchvision to determine which version is compatible with other requirements. This could take a while.
[31mERROR: Cannot install torch==2.3.0 and torchvision==0.18.1 because these package versions have conflicting dependencies.[0m[31m
[0m
The conflict is caused by:
    The user requested torch==2.3.0
    torchvision 0.18.1 depends on torch==2.3.1

To fix this you could try to:
1. loosen the range of package versions you've specified
2. remove package versions to allow pip attempt to solve the dependency conflict

[31mERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts[0m[31m


#Load Specific pre-trained diffusion model
From hugging Face Model Hub , this provides an easy way to load and interact with pre-trained diffusion models.

In [39]:
import torch
from torch import nn
from diffusers import UNet2DModel

class ClassConditionedUnet(nn.Module):
    def __init__(self, model_name, num_classes=26, class_emb_size=4):
        super().__init__()
        self.class_emb = nn.Embedding(num_classes, class_emb_size)

        # Load the model configuration from the pre-trained model
        self.model = UNet2DModel.from_pretrained(model_name)

        # Update the in_channels to include the class embedding size
        self.model.config.in_channels = 3 + class_emb_size

        # Reinitialize the first convolutional layer to match the updated configuration
        old_conv = self.model.conv_in
        self.model.conv_in = nn.Conv2d(
            3 + class_emb_size,
            old_conv.out_channels,
            kernel_size=old_conv.kernel_size,
            stride=old_conv.stride,
            padding=old_conv.padding
        )

        # Initialize the new weights
        with torch.no_grad():
            self.model.conv_in.weight[:, :3, :, :] = old_conv.weight
            if self.model.conv_in.weight.size(1) > 3:
                nn.init.normal_(self.model.conv_in.weight[:, 3:, :, :], 0, 0.02)
            if self.model.conv_in.bias is not None:
                self.model.conv_in.bias = old_conv.bias

    def forward(self, x, t, class_labels):
        bs, ch, w, h = x.shape
        print(f"Input shape before concatenation: {x.shape}")
        class_cond = self.class_emb(class_labels).view(bs, -1, 1, 1).expand(bs, -1, w, h)
        net_input = torch.cat((x, class_cond), 1)
        print(f"Input shape after concatenation: {net_input.shape}")
        return self.model(net_input, t)


In [40]:
model_name = "google/ddpm-cifar10-32"  # Example smaller model

# Instantiate the custom class with the pre-trained model configuration
custom_model = ClassConditionedUnet(model_name)

# Move the model to GPU if available
# custom_model.to("cuda")


#Train Model Here
In this cell we upload the csv file to then use to combine with our images then push the final dataset combined to HuggingFace

In [6]:
# Convert the dataset to a pandas DataFrame
# Load the CSV file with labels
from google.colab import files
import pandas as pd

# Upload the CSV file
uploaded = files.upload()

csv_filename = list(uploaded.keys())[0]

labels_df = pd.read_csv("ASL_Sheet.csv")



Saving ASL_Sheet.csv to ASL_Sheet (1).csv


#Pull images from hugging face
Here we get our images from a current dataset that only has images to prepare to combine with the labels

In [7]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("ascar17/ASL")

print(dataset)
print(dataset["train"][0])

Downloading readme: 0.00B [00:00, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/24 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/24 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image'],
        num_rows: 24
    })
})
{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=130x133 at 0x7FC540379390>}


#Testing
Here we see if this was done correctly by trying to display the first image to check if this was successful, while also displaying the label column to further confirm this

In [None]:
#Import io and PIL
from io import BytesIO
from PIL import Image

#Print the first item in the dataset
image = dataset["train"][0]['image']
image

#display
labels_df['label']

#Push to Hugging Face
Here we upload our secret token, must use your own

In [125]:
from google.colab import userdata
from huggingface_hub import login
login(token = userdata.get('ASL_Token'))


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


#Install datasets/Combine images and labels
Here we finally combine our images and labels through a dictonary

In [9]:
!pip install datasets

In [10]:
from datasets import Dataset
#combine the images with the labels
image_dataset = Dataset.from_dict({'image': dataset["train"]['image'], 'label': labels_df['label']})
#Display
image_dataset

Dataset({
    features: ['image', 'label'],
    num_rows: 24
})

#Final Push to huggingface
Push to hugging face new dataset to save it and have it there :)

In [141]:
#Send it to huggingface
image_dataset.push_to_hub("ascar17/ASL2")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ascar17/ASL2/commit/08e4502f9e91ff9e35adcd620dbaf6b8fac3d163', commit_message='Upload dataset', commit_description='', oid='08e4502f9e91ff9e35adcd620dbaf6b8fac3d163', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor, Normalize

dataset = load_dataset('ascar17/ASL2')

Downloading readme:   0%|          | 0.00/310 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/24 [00:00<?, ? examples/s]

In [41]:
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor, Normalize

# Define preprocessing steps
preprocess = Compose([
    Resize((128, 128)),
    ToTensor(),
    Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
])

def preprocess_function(example):
    try:
        image = example['image']
        image = image.convert("RGB")
        image = preprocess(image)
        label = example['label']
        label = ord(label) - ord('A')
        return {"pixel_values": image, "label": label}
    except Exception as e:
        print(f"Error processing image: {e}")
        return {"pixel_values": None, "label": None}

# Apply the preprocessing function to the dataset
processed_dataset = dataset.map(preprocess_function, remove_columns=["image"])

# Check if 'pixel_values' column exists
print(f"Processed dataset length: {len(processed_dataset['train'])}")

if len(processed_dataset['train']) > 0:
    print(processed_dataset['train'][0])

# Format the dataset for PyTorch if it contains data
if len(processed_dataset['train']) > 0:
    processed_dataset['train'].set_format(type='torch', columns=['pixel_values', 'label'])

    # Inspect the preprocessed dataset
    print(processed_dataset['train'][0])
else:
    print("Processed dataset is empty.")


Processed dataset length: 24
{'label': 0, 'pixel_values': [[[0.9764705896377563, 0.9764705896377563, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709, 0.9843137264251709,

In [43]:
from torch.optim import AdamW
from torch.utils.data import DataLoader
from accelerate import Accelerator
from torch.cuda.amp import autocast, GradScaler
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

custom_model.to(device)  # Move model to the correct device


# Initialize Accelerator
accelerator = Accelerator()

# Initialize the optimizer
optimizer = AdamW(custom_model.parameters(), lr=1e-5)

# Reduce batch size to minimize memory usage
train_dataloader = DataLoader(processed_dataset['train'], batch_size=1, shuffle=True)

# Prepare the model, optimizer, and dataloader with Accelerator
custom_model, optimizer, train_dataloader = accelerator.prepare(custom_model, optimizer, train_dataloader)

# Number of epochs and accumulation steps
num_epochs = 3
accumulation_steps = 4  # Adjust as necessary for your hardware

for epoch in range(num_epochs):
    custom_model.train()
    for step, batch in enumerate(train_dataloader):
        with accelerator.accumulate(custom_model):
            images = batch["pixel_values"].to(device)
            labels = batch["label"].to(device)

            # Generate noisy images
            noise = torch.randn_like(images).to(device)
            noisy_images = images + noise * 0.1

            # Generate timesteps tensor
            timesteps = torch.randint(0, 999, (images.shape[0],), device=device)

            # Forward pass with mixed precision

            outputs = custom_model(images, t=timesteps, class_labels=labels)
            # Compute loss manually
            loss = torch.nn.functional.mse_loss(outputs.sample, noisy_images) / accumulation_steps  # Scale loss

            # Backward pass
            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Clear unused variables
            del images, noise, noisy_images, outputs, loss
            torch.cuda.empty_cache()

        # Clear cache after each step to free up memory
        torch.cuda.empty_cache()

    # Clear cache after each epoch to free up memory
    torch.cuda.empty_cache()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss/len(train_dataloader)}")

# Save the fine-tuned model
custom_model.save_pretrained("fine_tuned_class_conditioned_unet")

Input shape before concatenation: torch.Size([1, 3, 128, 128])
Input shape after concatenation: torch.Size([1, 7, 128, 128])
Input shape before concatenation: torch.Size([1, 3, 128, 128])
Input shape after concatenation: torch.Size([1, 7, 128, 128])
Input shape before concatenation: torch.Size([1, 3, 128, 128])
Input shape after concatenation: torch.Size([1, 7, 128, 128])
Input shape before concatenation: torch.Size([1, 3, 128, 128])
Input shape after concatenation: torch.Size([1, 7, 128, 128])
Input shape before concatenation: torch.Size([1, 3, 128, 128])
Input shape after concatenation: torch.Size([1, 7, 128, 128])
Input shape before concatenation: torch.Size([1, 3, 128, 128])
Input shape after concatenation: torch.Size([1, 7, 128, 128])
Input shape before concatenation: torch.Size([1, 3, 128, 128])
Input shape after concatenation: torch.Size([1, 7, 128, 128])
Input shape before concatenation: torch.Size([1, 3, 128, 128])
Input shape after concatenation: torch.Size([1, 7, 128, 128])


AttributeError: 'ClassConditionedUnet' object has no attribute 'save_pretrained'

In [183]:
from torch.optim import AdamW
from torch.utils.data import DataLoader
from accelerate import Accelerator
from torch.cuda.amp import autocast, GradScaler
import torch

# Initialize Accelerator
accelerator = Accelerator()

# Initialize the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Reduce batch size to minimize memory usage
train_dataloader = DataLoader(dataset["train"], batch_size=1, shuffle=True)

# Prepare the model, optimizer, and dataloader with Accelerator
model, optimizer, train_dataloader = accelerator.prepare(model, optimizer, train_dataloader)

# Mixed precision scaler
scaler = GradScaler()

# Number of epochs and accumulation steps
num_epochs = 3
accumulation_steps = 4  # Adjust as necessary for your hardware

for epoch in range(num_epochs):
    custom_model.train()
    for step, batch in enumerate(train_dataloader):
        with accelerator.accumulate(model):
            images = batch["pixel_values"].to(accelerator.device)
            labels = batch["label"].to(accelerator.device)
            # Generate noisy images
            noise = torch.randn_like(images).to(accelerator.device)
            noisy_images = images + noise * 0.1

            # Generate timesteps tensor
            #timesteps = torch.tensor([10] * noisy_images.shape[0], dtype=torch.long, device=accelerator.device)  # Example timestep value 10
            timesteps = torch.randint(0, 999, (images.shape[0],), device=accelerator.device)

            # Forward pass with mixed precision
            with autocast():
                outputs = custom_model(noisy_images, timestep=timesteps, class_labels=labels)
                # Compute loss manually
                loss = torch.nn.functional.mse_loss(outputs.sample, noisy_images) / accumulation_steps  # Scale loss

            # Backward pass with mixed precision
            scaler.scale(loss).backward()

            if (step + 1) % accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            #if step % 100 == 0:
                #print(f"Epoch: {epoch}, Step: {step}, Loss: {loss.item()}")
                #Comment out if using CPU
                #print(torch.cuda.memory_summary())

            # Clear unused variables
            del images, noise, noisy_images, outputs, loss
            torch.cuda.empty_cache()

        # Clear cache after each step to free up memory
        torch.cuda.empty_cache()

    # Clear cache after each epoch to free up memory
    torch.cuda.empty_cache()

# Save the fine-tuned model
custom_model.save_pretrained("fine_tuned_class_conditioned_unet")

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'PIL.PngImagePlugin.PngImageFile'>

In [None]:
import torchvision

# Load the validation dataset
val_dataset = load_dataset("ascar17/ASL", split='train')
val_dataset = val_dataset.map(preprocess_function)
val_dataset.set_format(type='torch', columns=['pixel_values'])

# DataLoader for validation data
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)

# Evaluation loop
model.eval()
with torch.no_grad():
    for step, batch in enumerate(val_dataloader):
        images = batch["pixel_values"].to("cuda")
        noise = torch.randn_like(images).to("cuda")
        noisy_images = images + noise * 0.1
        timesteps = torch.tensor([10] * noisy_images.shape[0], dtype=torch.long, device="cuda")

        with autocast():
            outputs = model(noisy_images, timestep=timesteps)

        # Visualize or save the outputs
        # Example: save the noisy and denoised images for comparison
        torchvision.utils.save_image(noisy_images, f'noisy_{step}.png')
        torchvision.utils.save_image(outputs.sample, f'denoised_{step}.png')

In [None]:
from torch.optim import AdamW
from torch.utils.data import DataLoader
from accelerate import Accelerator

accelerator = Accelerator()

optimizer = AdamW(pipeline.unet.parameters(), lr=1e-5)
train_dataloader = DataLoader(dataset["train"], batch_size=4, shuffle=True)

pipeline.unet, optimizer, train_dataloader = accelerator.prepare(pipeline.unet, optimizer, train_dataloader)

num_epochs = 3
for epoch in range(num_epochs):
    pipeline.unet.train()
    for step, batch in enumerate(train_dataloader):
        with accelerator.accumulate(pipeline.unet):
            images = batch["pixel_values"].to(accelerator.device)

            # Generate noisy images and compute the loss
            noise = torch.randn_like(images).to(accelerator.device)
            noisy_images = images + noise * 0.1
            outputs = pipeline.unet(noisy_images, noise)

            loss = outputs.loss

            accelerator.backward(loss)
            optimizer.step()
            optimizer.zero_grad()

        if step % 100 == 0:
            print(f"Epoch: {epoch}, Step: {step}, Loss: {loss.item()}")

# Save the fine-tuned model
pipeline.unet.save_pretrained("fine_tuned_stable_diffusion")


In [None]:
import os
from PIL import Image
from diffusers import DiffusionPipeline
import torch

# Supported image formats
supported_formats = (".png", ".jpg", ".jpeg", ".bmp", ".gif")

# Load the existing CSV data
csv_path = os.path.join(extract_path, 'pre_ASL_dataset.csv')
df = pd.read_csv(csv_path)

# Define the output directory for new images
output_dir = "/content/generated_images"
os.makedirs(output_dir, exist_ok=True)

# Load the existing CSV data
df = pd.read_csv(pre_csv_path)

# Create a list to store new data
new_data = {
    'image_path': [],
    'prompt': [],
    'ddim_steps': [],
    'plms': [],
    'scale': [],
    'H': [],
    'W': [],
    'seed': [],
    'label': []
}

# Initialize a counter for the new image filenames
counter = 1

# Iterate over the existing data to generate new images
for index, row in df.iterrows():
    image_name = os.path.basename(row['image_path'])  # Get the image name
    label = row['label']
    ddim_steps = row['ddim_steps']
    plms = row['plms']
    scale = row['scale']
    H = row['H']
    W = row['W']
    seed = row['seed']
    input_image_path = os.path.join(dataset_path, image_name)


    # Skip non-image files and directories
    if os.path.isdir(input_image_path) or not image_name.lower().endswith(supported_formats):
        continue

    try:

        input_image = Image.open(input_image_path).convert("RGB")  # Open and convert the image to RGB
        # Set the seed for reproducibility
        generator = torch.manual_seed(seed)
        # Create a more specific prompt based on the label
        prompt = f"Generate a realistic hand gesture representing the American sign language letter {label}, with the hand clearly showing the gesture in a neutral background"

        # Generate an output image using the model
        output = pipeline(prompt=prompt, image=input_image)  # Run the image through the pipeline with a prompt
        output_image = output.images[0]  # Get the generated image from the output

        # Save the output image
        output_image_name = f"{label}{counter}.png"
        output_image_path = os.path.join(output_dir, output_image_name)  # Define the path to save the output image
        output_image.save(output_image_path)  # Save the generated image

        # Add the new data to the list
        new_data['image_path'].append(output_image_path)
        new_data['prompt'].append(prompt)
        new_data['ddim_steps'].append(50)
        new_data['plms'].append(False)
        new_data['scale'].append(7.5)
        new_data['H'].append(512)
        new_data['W'].append(512)
        new_data['seed'].append(random.randint(0, 4294967295))  # Random seed for reproducibility
        new_data['label'].append(label)

        # Increment the counter
        counter += 1

    except Exception as e:
        print(f"Error processing {input_image_path}: {e}")  # Print any errors encountered during processing

# Create a new DataFrame with the generated data
new_df = pd.DataFrame(new_data)

# Call the function to append new data to the post_ASL_dataset.csv
post_df = append_to_csv(post_csv_path, new_df)

# Display the updated DataFrame to verify
print("Updated post_ASL_dataset.csv:")
print(post_df.head())


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Potential NSFW content was detected in one or more images. A black image will be returned instead. Try again with a different prompt and/or seed.


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Updated post_ASL_dataset.csv:
                           image_path  \
0  /content/generated_images/'Q'1.png   
1  /content/generated_images/'G'2.png   
2  /content/generated_images/'K'3.png   
3  /content/generated_images/'E'4.png   
4  /content/generated_images/'Y'5.png   

                                              prompt ddim_steps   plms  scale  \
0  Generate a realistic hand gesture representing...          5  False    7.5   
1  Generate a realistic hand gesture representing...          5  False    7.5   
2  Generate a realistic hand gesture representing...          5  False    7.5   
3  Generate a realistic hand gesture representing...          5  False    7.5   
4  Generate a realistic hand gesture representing...          5  False    7.5   

     H    W        seed label  
0  512  512  4072520777   'Q'  
1  512  512   706380401   'G'  
2  512  512  1828062252   'K'  
3  512  512  3303763210   'E'  
4  512  512  3663669297   'Y'  


In [None]:
output_dir = "/content/generated_images"
generated_images = os.listdir(output_dir)
print("Generated Images:", generated_images)
import matplotlib.pyplot as plt

# Function to display an image
def display_image(image_path):
    img = Image.open(image_path)
    plt.imshow(img)
    plt.axis('off')  # Hide axes
    plt.show()

# Display a specific generated image
image_to_display = os.path.join(output_dir, generated_images[0])  # Change index to display a different image
display_image(image_to_display)


In [None]:
import matplotlib.pyplot as plt

# Display multiple images in a grid
def display_images(image_paths, cols=3, rows=3):
    fig, axes = plt.subplots(rows, cols, figsize=(15, 15))
    for ax, img_path in zip(axes.flatten(), image_paths):
        img = Image.open(img_path)
        ax.imshow(img)
        ax.axis('off')
    plt.tight_layout()
    plt.show()

# Get paths for a few generated images
image_paths = [os.path.join(output_dir, img) for img in generated_images[:9]]  # Adjust the number of images as needed
display_images(image_paths)

**Description:** Activates camera and captures images (frames) for the MediaPipe
to detect and visualize

**Goals:**
- Loop function to take a picture every 5 seconds until deactivate
  - This would be the constant changing of frames
-Implement the capturing of videos to detext ASL letters such as Z

**Current functionality:**
- Can manually take a screenshot of the image

In [None]:
#Activates camera, captures images
from IPython.display import display, Javascript
from google.colab.output import eval_js
from base64 import b64decode

#Function to take the actual photo from code snippet
def take_photo(filename='photo.jpg', quality=0.8):
  js = Javascript('''
    async function takePhoto(quality) {
      const div = document.createElement('div');
      const capture = document.createElement('button');
      capture.textContent = 'Capture';
      div.appendChild(capture);

      const video = document.createElement('video');
      video.style.display = 'block';
      const stream = await navigator.mediaDevices.getUserMedia({video: true});

      document.body.appendChild(div);
      div.appendChild(video);
      video.srcObject = stream;
      await video.play();

      // Resize the output to fit the video element.
      google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

      // Wait for Capture to be clicked.
      await new Promise((resolve) => capture.onclick = resolve);

      const canvas = document.createElement('canvas');
      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      canvas.getContext('2d').drawImage(video, 0, 0);
      stream.getVideoTracks()[0].stop();
      div.remove();
      return canvas.toDataURL('image/jpeg', quality);
    }
    ''')
  display(js)
  data = eval_js('takePhoto({})'.format(quality))
  binary = b64decode(data.split(',')[1])
  with open(filename, 'wb') as f:
    f.write(binary)
  return filename

Description: Used to Visualize the "Hand Landmark Detection" Solution

In [None]:
from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import numpy as np

MARGIN = 10 # pixels
FONT_SIZE = 1
FONT_THICKNESS = 1
FONT_COLOR = (88, 205, 54) # RGB formula for vibrant green
HANDEDNESS_TEXT_COLOR = (88, 205, 54) # Define HANDEDNESS_TEXT_COLOR


def draw_landmarks_on_image(rgb_image, detection_result):
  hand_landmarks_list = detection_result.hand_landmarks
  handedness_list = detection_result.handedness
  annotated_image = np.copy(rgb_image)


  # Loop through the detected hands to visualize.
  for idx in range(len(hand_landmarks_list)):
    hand_landmarks = hand_landmarks_list[idx]
    handedness = handedness_list[idx]

    # Draw the hand landmarks.
    hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
    hand_landmarks_proto.landmark.extend([
      landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z) for landmark in hand_landmarks
    ])
    solutions.drawing_utils.draw_landmarks(
      annotated_image,
      hand_landmarks_proto,
      solutions.hands.HAND_CONNECTIONS,
      solutions.drawing_styles.get_default_hand_landmarks_style(),
      solutions.drawing_styles.get_default_hand_connections_style())

    # Get the top left corner of the detected hand's bounding box.
    height, width, _ = annotated_image.shape
    x_coordinates = [landmark.x for landmark in hand_landmarks]
    y_coordinates = [landmark.y for landmark in hand_landmarks]
    text_x = int(min(x_coordinates) * width)
    text_y = int(min(y_coordinates) * height) - MARGIN

    # Draw handedness (left or right hand) on the image.
    cv2.putText(annotated_image, f"{handedness[0].category_name}",
                (text_x, text_y), cv2.FONT_HERSHEY_DUPLEX,
                FONT_SIZE, HANDEDNESS_TEXT_COLOR, FONT_THICKNESS, cv2.LINE_AA)

  return annotated_image

Description: Uploading / Collecting images to process

In [None]:
# #Import libraries
# import cv2 #OpenCV library that processes images and videos
# from google.colab.patches import cv2_imshow #allows cv2 to work in colab

# '''
# # Code for uploading an image manually
# from google.colab import files

# uploaded = files.upload()

# for filename in uploaded:
#   content = uploaded[filename]
#   with open(filename, 'wb') as f:
#     f.write(content)

# if len(uploaded.keys()):
#   IMAGE_FILE = next(iter(uploaded))
#   print('Uploaded file:', IMAGE_FILE)
# '''

# # img = cv2.imread(IMAGE_FILE)
# # cv2_imshow(img)


In [None]:
#Import libraries
import cv2 #OpenCV library that processes images and videos
from google.colab.patches import cv2_imshow #allows cv2 to work in colab
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
# import google.colab as files

def main():

  # STEP 1: Take the photo and save it
  filename = take_photo()
  print('Saved to {}'.format(filename))

  '''
  Uncomment this code to upload an image from computer and uncomment the
  files library from google.colab and change
  '''
  # uploaded = files.upload()

  # for imageName in uploaded:
  #   content = uploaded[imageName]
  #   with open(imageName, 'wb') as f:
  #     f.write(content)

  # if len(uploaded.keys()):
  #   filename = next(iter(uploaded))
  #   print('Uploaded file:', filename)

  # # prints the image you uploaded
  # img = cv2.imread(filename)
  # cv2_imshow(img)

  # STEP 2: Create an HandLandmarker object.
  base_options = python.BaseOptions(model_asset_path='hand_landmarker.task')
  options = vision.HandLandmarkerOptions(base_options=base_options,
                                       num_hands=2)
  detector = vision.HandLandmarker.create_from_options(options)

  # STEP 3: Load the input image.
  image = mp.Image.create_from_file(filename)

  # STEP 4: Detect hand landmarks from the input image.
  detection_result = detector.detect(image)

  # STEP 5: Process the classification result. In this case, visualize it.
  annotated_image = draw_landmarks_on_image(image.numpy_view(), detection_result)
  cv2_imshow(cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR))


In [None]:
main()