<a href="https://colab.research.google.com/github/docmhvr/UAV-Based-Powerline-Problem-Inspection-Using-Machine-Learning/blob/main/Powerline_Inspection_dataset_to_HF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
print("Hello Hugging Face")

Hello Hugging Face


### Convert a YOLOv8 dataset to a Hugging Face dataset for fault detection in Powerline Components

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [3]:
import os
import zipfile
import yaml
from google.colab import drive
from datasets import Dataset, DatasetDict, Features, Sequence, Value, ClassLabel, Image as HFImage
from huggingface_hub import HfApi

# Step 1: Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Step 2: Define paths
drive_path = '/content/drive/MyDrive/HuggingFace'  # Path to your Google Drive
zip_file_path = os.path.join(drive_path, 'data.zip')  # Replace with your zip file name
extract_dir = '/content/data'  # Directory to extract files to

# Step 3: Extract the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# Update extract dir path for data folder
extract_dir = os.path.join(extract_dir, 'data')

In [11]:
print(extract_dir)

/content/data/data


In [12]:
# Step 4: Load data.yaml file
data_yaml_path = os.path.join(extract_dir, 'data.yaml')
with open(data_yaml_path, 'r') as stream:
    data_config = yaml.safe_load(stream)

# Step 5: Get paths from the YAML file
train_dir = os.path.join(extract_dir, 'train')
val_dir = os.path.join(extract_dir, 'valid')
test_dir = os.path.join(extract_dir, 'test')

In [13]:
print(train_dir)
print(val_dir)
print(test_dir)

/content/data/data/train
/content/data/data/valid
/content/data/data/test


In [14]:
# Helper function to convert YOLO format to COCO format
def yolo_to_coco(bbox, img_width, img_height):
    x_center, y_center, width, height = bbox
    x_min = (x_center - width / 2) * img_width
    y_min = (y_center - height / 2) * img_height
    x_max = (x_center + width / 2) * img_width
    y_max = (y_center + height / 2) * img_height
    return [x_min, y_min, x_max, y_max]

In [18]:
from PIL import Image
import os

def load_yolo_data(image_dir, label_dir):
    data = []
    for image_file in os.listdir(image_dir):
        img_path = os.path.join(image_dir, image_file)
        label_file = os.path.splitext(image_file)[0] + '.txt'
        label_path = os.path.join(label_dir, label_file)

        if os.path.exists(label_path):
            # Load the image using PIL directly
            image = Image.open(img_path)
            img_width, img_height = image.size

            with open(label_path, 'r') as f:
                lines = f.readlines()

            bboxes = []
            labels = []

            for line in lines:
                parts = line.strip().split()
                if len(parts) >= 5:  # Ensure that the line has at least 5 parts (class + bbox coordinates)
                    class_id = int(parts[0])
                    bbox = list(map(float, parts[1:5]))  # Convert bbox coordinates to float

                    # Convert YOLO format to COCO format
                    coco_bbox = yolo_to_coco(bbox, img_width, img_height)

                    bboxes.append(coco_bbox)
                    labels.append(class_id)
                else:
                    print(f"Skipping invalid label in file: {label_path}")

            if bboxes and labels:
                data.append({"image": image, "bboxes": bboxes, "labels": labels})

    return data

In [21]:
from datasets import Dataset, Features, Value, Sequence, ClassLabel

def load_datasets(train_dir, val_dir, test_dir, class_names):
    # Load YOLO data for train, val, and test
    train_data = load_yolo_data(os.path.join(train_dir, "images"), os.path.join(train_dir, "labels"))
    val_data = load_yolo_data(os.path.join(val_dir, "images"), os.path.join(val_dir, "labels"))
    test_data = load_yolo_data(os.path.join(test_dir, "images"), os.path.join(test_dir, "labels"))

    # Convert list of dictionaries to dictionary of lists
    def convert_to_dict(data_list):
        return {
            "image": [item["image"] for item in data_list],
            "bboxes": [item["bboxes"] for item in data_list],
            "labels": [item["labels"] for item in data_list]
        }

    train_dict = convert_to_dict(train_data)
    val_dict = convert_to_dict(val_data)
    test_dict = convert_to_dict(test_data)

    # Define the dataset features
    features = Features({
        "image": HFImage(),
        "bboxes": Sequence(Sequence(Value("float32"))),
        "labels": Sequence(ClassLabel(names=class_names)),
    })

    # Create Hugging Face datasets
    train_dataset = Dataset.from_dict(train_dict, features=features)
    val_dataset = Dataset.from_dict(val_dict, features=features)
    test_dataset = Dataset.from_dict(test_dict, features=features)

    return {
        "train": train_dataset,
        "val": val_dataset,
        "test": test_dataset
    }


In [24]:
# Load the class names from data.yaml
class_names = data_config['names']
print(class_names)

# Load the datasets
dataset = load_datasets(train_dir, val_dir, test_dir, class_names)

['Broken Cable', 'Broken Insulator', 'Cable', 'Insulators', 'Tower', 'Vegetation']


In [26]:
# Save each dataset (train, val, test) to disk
dataset['train'].save_to_disk('/content/huggingface_dataset/train')
dataset['val'].save_to_disk('/content/huggingface_dataset/val')
dataset['test'].save_to_disk('/content/huggingface_dataset/test')

Saving the dataset (0/1 shards):   0%|          | 0/1794 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/77 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/41 [00:00<?, ? examples/s]

In [27]:
!pip install datasets huggingface_hub



In [30]:
from huggingface_hub import login

from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')

# Replace 'your_token' with your Hugging Face token
login(token=HF_TOKEN)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [31]:
from datasets import DatasetDict
from huggingface_hub import HfApi, HfFolder
import os

# Load datasets
datasets = load_datasets(train_dir, val_dir, test_dir, class_names)

# Create a DatasetDict
dataset_dict = DatasetDict({
    'train': datasets['train'],
    'validation': datasets['val'],
    'test': datasets['test']
})

In [32]:
# Define the repository name
repo_name = "docmhvr/powerline-components-and-faults"

# Push dataset to the Hugging Face Hub
def push_to_hub(repo_name, dataset_dict):
    dataset_dict.push_to_hub(repo_name)

# Push the dataset
push_to_hub(repo_name, dataset_dict)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1794 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/41 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

In [33]:
readme_content = """
# Powerline Components and Faults Dataset

## Overview

The **Powerline Components and Faults Dataset** is a dataset designed for object detection tasks involving powerline components and associated faults. It provides images of powerline infrastructure along with annotated bounding boxes for various components and faults.

This dataset is useful for training and evaluating models on powerline inspection, maintenance, and safety applications.

## Dataset Structure

The dataset is organized into the following directories:

- `train/`: Contains training images and their corresponding annotation files.
- `validation/`: Contains validation images and their corresponding annotation files.
- `test/`: Contains test images and their corresponding annotation files.

Each image file has a corresponding `.txt` file in the same directory, which contains the annotations in YOLO format.

## Data Format

### Images

- Format: JPEG/PNG
- Resolution: Various resolutions

### Annotations

Annotations are provided in YOLO format, where each line in a `.txt` file corresponds to an object in the image. The format is:

```
class_id x_center y_center width height
```

- `class_id`: The ID of the object class.
- `x_center`, `y_center`: The center of the bounding box (normalized between 0 and 1).
- `width`, `height`: The dimensions of the bounding box (normalized between 0 and 1).

## Usage

You can use this dataset with popular machine learning frameworks and libraries. Below is an example of how to load the dataset using the Hugging Face `datasets` library:

```python
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("docmhvr/powerline-components-and-faults")

# Access the train, validation, and test splits
train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']
```

## License

This dataset is provided under the [MIT License](https://opensource.org/licenses/MIT). See the [LICENSE](LICENSE) file for more details.

## Acknowledgements

This dataset was created as part of the research work on powerline inspection and fault detection. Data was collected using DJI Mini drone and manually compiled and annotated using Roboflow.

## Research reference

You can find the related Research work published in IEEE, full text avaliable on researchgate here,

[Research Paper](https://www.researchgate.net/publication/381461493_UAV-Based_Powerline_Problem_Inspection_and_Classification_using_Machine_Learning_Approaches)

## Contribution

If you would like to contribute to this dataset, please feel free to open an issue or submit a pull request on the [GitHub repository](https://github.com/docmhvr/UAV-Based-Powerline-Problem-Inspection-Using-Machine-Learning).
"""

In [36]:
with open("README.md", "w") as f:
  f.write(readme_content)

In [38]:
from huggingface_hub import HfApi, Repository, create_repo, upload_file

# Initialize the Hugging Face API
api = HfApi()

# Create or update the README.md file in the repository
upload_file( path_or_fileobj="README.md", path_in_repo="README.md", repo_id=repo_name, repo_type="dataset" )

print("README.md file has been updated successfully!")

- empty or missing yaml metadata in repo card


README.md file has been updated successfully!
