In [1]:
import json
import os
import shutil

import numpy as np
import torch
import torch.nn.functional as F
import torchvision.transforms as transforms
import wandb
import yaml
from datasets import load_dataset, DatasetDict, load_metric
from PIL import Image
from roboflow import Roboflow
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torchvision import transforms, datasets
from transformers import ViTForImageClassification, ViTImageProcessor, TrainingArguments, Trainer, AutoImageProcessor

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


## Dataset


In [2]:
class ChordsDataset(Dataset):
    """Image-classification dataset backed by a COCO-style annotation file.

    Each sample is a dict with the (optionally transformed) RGB image under
    ``"image"`` and the annotation's ``category_id`` as a tensor under
    ``"label"``.

    Only images that have at least one annotation are exposed as samples.
    When an image has several annotations, the last one in the file wins.

    Args:
        root_dir: Directory that the ``file_name`` entries are relative to.
        annotation_file: Path to a JSON file containing ``images``,
            ``annotations`` and ``categories`` lists.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, root_dir, annotation_file, transform=None):
        self.root_dir = root_dir
        self.transform = transform

        with open(annotation_file, 'r') as f:
            self.coco = json.load(f)

        self.annotations = self.coco['annotations']
        self.categories = self.coco['categories']

        # Image records indexed by their COCO id
        self.images = {img['id']: img for img in self.coco['images']}

        self.nr_of_classes = len(self.categories)

        # Create a mapping from image_id to file name and category
        self.image_to_label = {}
        for annotation in self.annotations:
            img = self.images[annotation['image_id']]
            self.image_to_label[img["id"]] = {
                "file_name": img["file_name"],
                "category": annotation["category_id"]
            }

        # Positional index -> image id. COCO ids need not be contiguous or
        # start at zero, and unannotated images have no label, so samples
        # are addressed through this list (kept in the order of ``images``).
        self.image_ids = [
            image_id for image_id in self.images if image_id in self.image_to_label
        ]

    def __len__(self):
        """Number of labelled samples."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Return the sample at position ``idx``.

        Raises:
            IndexError: If ``idx`` is outside ``range(len(self))``.
        """
        metadata = self.image_to_label[self.image_ids[idx]]
        img_path = os.path.join(self.root_dir, metadata["file_name"])
        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        # Convert labels to tensor
        return {
            "image": image,
            "label": torch.tensor(metadata["category"]),
        }

In [3]:
def download_roboflow_data(config):
    """
    Download dataset from RoboFlow.

    Connection settings are read from ``config['data']['roboflow']``
    (``workspace``, ``project``, ``version``, ``version_download`` and
    ``api_key``). If ``api_key`` is absent or empty there, the
    ``ROBOFLOW_API_KEY`` environment variable is used instead, so the secret
    does not have to live in the config file.

    The downloaded folder is moved to ``<config['data']['path']>/<dataset.name>``
    unless that path already exists.

    Returns:
        Tuple ``(dataset, dest_path)``: the object returned by the RoboFlow
        download call and the destination directory path.

    Raises:
        KeyError: If a required config entry is missing, or no API key is
            available from either the config or the environment.
    """
    roboflow_config = config['data']['roboflow']

    api_key = roboflow_config.get("api_key") or os.environ.get("ROBOFLOW_API_KEY")
    if not api_key:
        raise KeyError("api_key")

    roboflow = Roboflow(api_key=api_key)
    project = roboflow.workspace(roboflow_config["workspace"]).project(roboflow_config["project"])
    version = project.version(roboflow_config["version"])
    dataset = version.download(model_format=roboflow_config["version_download"])

    dest_path = os.path.join(config['data']['path'], dataset.name)

    # An existing destination is left untouched; in that case the fresh
    # download stays at ``dataset.location``.
    if not os.path.exists(dest_path):
        shutil.move(src=dataset.location, dst=dest_path)

    print(f"Dataset downloaded and extracted to {config['data']['path']}")
    return dataset, dest_path

In [4]:
def load_config(config_path):
    """Parse the YAML file at ``config_path`` and return the loaded object."""
    with open(config_path, 'r') as stream:
        parsed = yaml.safe_load(stream)
    return parsed

In [5]:
def create_transform(aug_config, processor):
    """Build the torchvision preprocessing pipeline.

    The pipeline is currently fixed: resize to 224x224, then convert to a
    tensor. ``aug_config`` and ``processor`` are accepted but not read while
    the augmentation and normalization steps below stay disabled.

    Returns:
        A ``transforms.Compose`` of the active steps.
    """
    # Disabled config-driven augmentations, kept for reference:
    # if 'random_resize_crop' in aug_config:
    #     transform_list.append(transforms.RandomResizedCrop(**aug_config['random_resize_crop']))
    # if 'random_horizontal_flip' in aug_config:
    #     transform_list.append(transforms.RandomHorizontalFlip(aug_config['random_horizontal_flip']))
    # if 'color_jitter' in aug_config:
    #     transform_list.append(transforms.ColorJitter(**aug_config['color_jitter']))
    # if 'random_rotation' in aug_config:
    #     transform_list.append(transforms.RandomRotation(aug_config['random_rotation']))

    resize_step = transforms.Resize((224, 224))
    to_tensor_step = transforms.ToTensor()
    # Disabled normalization step:
    # transforms.Normalize(mean=processor.image_mean, std=processor.image_std)

    return transforms.Compose([resize_step, to_tensor_step])

In [6]:
def get_transforms(config, processor):
    """Return ``(train_transform, val_transform)`` built from ``config['data']``.

    ``train_augmentation`` must be present; ``val_augmentation`` is optional
    and defaults to an empty dict.
    """
    data_cfg = config['data']
    train_aug = data_cfg['train_augmentation']
    val_aug = data_cfg.get('val_augmentation', {})

    return create_transform(train_aug, processor), create_transform(val_aug, processor)

In [7]:
def load_data(data_dir, transform):
    """Wrap ``data_dir`` in a torchvision ``ImageFolder`` that applies ``transform``."""
    image_folder = datasets.ImageFolder(root=data_dir, transform=transform)
    return image_folder

In [8]:
def organize_images_by_class(src_ds_path, ds_final_path):
    """Regroup a split dataset into one folder per class letter.

    Files directly inside the ``train``, ``valid`` and ``test`` sub-folders of
    ``src_ds_path`` are moved to ``ds_final_path/<LETTER>/``, where
    ``<LETTER>`` is the upper-cased first character of the file name. Files
    whose first character is not alphabetic are not moved. Split folders that
    do not exist are skipped. Afterwards ``src_ds_path`` is deleted
    entirely, including anything that was not moved.

    Args:
        src_ds_path: Root directory that contains the split sub-folders.
        ds_final_path: Directory that receives the per-letter folders;
            created if missing.

    Raises:
        FileNotFoundError: If ``src_ds_path`` is not an existing directory.
    """
    if not os.path.isdir(src_ds_path):
        raise FileNotFoundError(f"Source dataset directory not found: {src_ds_path}")

    # List of subdirectories to process
    sub_dirs = ['train', 'valid', 'test']

    os.makedirs(ds_final_path, exist_ok=True)

    for sub_dir in sub_dirs:
        current_dir = os.path.join(src_ds_path, sub_dir)

        # Not every export ships all three splits
        if not os.path.isdir(current_dir):
            continue

        for entry in os.listdir(current_dir):
            src_path = os.path.join(current_dir, entry)
            if not os.path.isfile(src_path):
                continue

            # The first letter of the file name is used as the class
            first_letter = entry[0].upper()
            if not first_letter.isalpha():
                continue

            # Create the directory for this letter if it doesn't exist
            letter_dir = os.path.join(ds_final_path, first_letter)
            os.makedirs(letter_dir, exist_ok=True)

            # Move the file to the new directory
            shutil.move(src_path, os.path.join(letter_dir, entry))

    shutil.rmtree(src_ds_path)

    # NOTE(review): the original author observed a stray copy of the Project
    # directory after running this and left the cleanup below disabled;
    # cause not established here.
    # shutil.rmtree("src/classification/Project")
    print("Image organization complete!")

In [9]:
# YAML files holding the run settings and the Weights & Biases settings
f_run_config, f_wandb_config = "config.yml", "wandb.yml"

In [10]:
# Parse both YAML files: run settings first, then the Weights & Biases settings
config, wandb_config = [load_config(path) for path in (f_run_config, f_wandb_config)]

In [11]:
# Folder name under datasets/ that receives the class-organized images
dataset_name = "Guitar-Chords"

# Download data from RoboFlow if specified, then regroup it by class.
# Without a download there is no raw export to reorganize (`location` would
# be undefined), so the organized folder is expected to exist already.
if config['data'].get('use_roboflow', False):
    _, location = download_roboflow_data(config)
    organize_images_by_class(location, "datasets/" + dataset_name)

loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in Guitar-Chord-1 to coco:: 100%|██████████| 166698/166698 [00:22<00:00, 7340.65it/s]





Extracting Dataset Version Zip to Guitar-Chord-1 in coco:: 100%|██████████| 2525/2525 [00:01<00:00, 1389.08it/s]


Dataset downloaded and extracted to ./Project/src/classification/dataset
Image organization complete!


In [12]:
# Start the Weights & Biases run; a generated id suffix is appended to the run name
wandb.require("core")
wandb_init_kwargs = dict(
    project=wandb_config["project"],
    name=wandb_config['name'] + "-" + wandb.util.generate_id(),
    config=wandb_config,
    entity=wandb_config["entity"],
)
wandb.init(**wandb_init_kwargs)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdhimitrios-duka1[0m ([33mhwga-cj[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
 # Load pre-trained model and processor
# NOTE(review): `model` and `processor` are both re-created further down in
# this notebook (with label mappings), so this first load looks redundant.
pretrained_weights = config['model']['pretrained_weights']
model = ViTForImageClassification.from_pretrained(pretrained_weights)
processor = ViTImageProcessor.from_pretrained(pretrained_weights)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Build the training and base (validation) preprocessing pipelines from the run config
train_transform, base_transform = get_transforms(config=config, processor=processor)

In [15]:
# Fixed seed so the stratified splits are identical on every run
SPLIT_SEED = 42

# Load the ds
ds_all = load_dataset("imagefolder", data_dir="datasets/Guitar-Chords")

# Split the data
# 70% train, 30% held out
ds_train_holdout = ds_all['train'].train_test_split(
    test_size=0.3, stratify_by_column="label", seed=SPLIT_SEED
)
# 30% held out --> 15% valid, 15% test
ds_valid_test = ds_train_holdout['test'].train_test_split(
    test_size=0.5, stratify_by_column="label", seed=SPLIT_SEED
)
ds = DatasetDict({
    'train': ds_train_holdout['train'],
    'test': ds_valid_test['test'],
    'valid': ds_valid_test['train']
})

# Drop the intermediates; only `ds` is used from here on
del ds_all, ds_train_holdout, ds_valid_test

ds

Downloading data: 100%|██████████| 2517/2517 [00:00<00:00, 85562.66files/s]
Generating train split: 2517 examples [00:00, 4669.24 examples/s]


DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 1761
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 378
    })
    valid: Dataset({
        features: ['image', 'label'],
        num_rows: 378
    })
})

In [16]:
# Label feature of the training split; its `names` list is used later to size
# the classifier head and build the id <-> name maps.
labels = ds['train'].features['label']
labels

ClassLabel(names=['A', 'B', 'C', 'D', 'E', 'F', 'G'], id=None)

In [17]:
def transform(batch):
    """Turn a batch of raw examples into model inputs.

    Every entry of ``batch['image']`` is converted to RGB and resized to
    224x224, the result is passed through the global ``processor`` with
    ``return_tensors='pt'``, and ``batch['label']`` is copied onto the
    processor output under ``'label'``.
    """
    to_224 = transforms.Resize((224, 224))

    rgb_resized = []
    for img in batch['image']:
        rgb_resized.append(to_224(img.convert("RGB")))

    inputs = processor(rgb_resized, return_tensors='pt')

    # Carry the labels along with the processed inputs
    inputs['label'] = batch['label']
    return inputs


# Apply `transform` lazily, when examples are accessed
prepared_ds = ds.with_transform(transform)

In [18]:
def collate_fn(batch):
    """Merge a list of per-example dicts into one batch dict.

    Stacks each example's ``'pixel_values'`` tensor along a new first
    dimension and collects the ``'label'`` values into a tensor returned
    under ``'labels'``.
    """
    stacked_pixels = torch.stack([example['pixel_values'] for example in batch])
    label_ids = torch.tensor([example['label'] for example in batch])
    return {'pixel_values': stacked_pixels, 'labels': label_ids}

# NOTE(review): `load_metric` is reported as deprecated by the installed
# `datasets` version (see the warning this cell prints); the suggested
# replacement is `evaluate.load` from the separate Evaluate library.
metric = load_metric("accuracy")

load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate


In [19]:
def compute_metrics(p):
    """Compute the metric for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (reference class ids).

    Returns:
        Whatever the global ``metric.compute`` returns for the arg-max
        predictions against the references.
    """
    # Highest-scoring class per example; requires `numpy` imported as `np`
    # at the top of the notebook.
    predicted_ids = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=predicted_ids, references=p.label_ids)

In [20]:
model_name_or_path = 'google/vit-base-patch16-224-in21k'

# Label maps with string ids, built from the dataset's class names
id2label = {str(idx): name for idx, name in enumerate(labels.names)}
label2id = {name: idx for idx, name in id2label.items()}

processor = AutoImageProcessor.from_pretrained(model_name_or_path)
model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=config['training']['output_dir'],
    num_train_epochs=config['training']['num_epochs'],
    per_device_train_batch_size=config['training']['batch_size'],
    per_device_eval_batch_size=config['training']['batch_size'],
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=float(config['training']['learning_rate']),
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="wandb",
    # Keep the raw 'image' column: the on-the-fly dataset transform needs it
    remove_unused_columns=False,
    logging_steps=500,
    save_total_limit=1,
    # fp16 mixed precision is only enabled when a CUDA device is present,
    # so the notebook also runs on CPU-only machines
    fp16=torch.cuda.is_available()
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["valid"],
    compute_metrics=compute_metrics,
    tokenizer=processor
)

# Train the model; the wandb run is closed even if training raises
try:
    trainer.train()

    # # Save the fine-tuned model
    # trainer.save_model(config['training']['final_model_path'])
finally:
    # Close wandb run
    wandb.finish()

  0%|          | 1/555 [00:35<5:27:00, 35.42s/it]