# Script to create and push a dataset to the Hugging Face Hub

In [1]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict, Features, Value, Image
from huggingface_hub import notebook_login

In [2]:
# Directory that holds the images and the caption CSV files.
# Set the CAT_FIGURES_DATA_DIR environment variable to point at a different
# location; when it is unset, the original local checkout path is used so the
# notebook keeps working unchanged on the author's machine.
data_dir = os.environ.get(
    'CAT_FIGURES_DATA_DIR',
    '/Users/fabian.fuerst/Documents/GitHub/Fine-tuned-LLaVA-Vision-and-Language/data_prep/data',
)

# Paths to the train and test caption CSV files inside data_dir
train_csv_file = os.path.join(data_dir, 'train_captions.csv')
test_csv_file = os.path.join(data_dir, 'test_captions.csv')

In [3]:
# Read an image file from disk as raw bytes
def load_image(image_path):
    """Return the complete contents of the file at ``image_path`` as bytes.

    The file is opened in binary mode and read in one go; the bytes are
    returned untouched (no decoding or validation happens here).

    Args:
        image_path: Path of the file to read.

    Returns:
        The file contents as a ``bytes`` object.
    """
    with open(image_path, mode='rb') as handle:
        raw_bytes = handle.read()
    return raw_bytes

In [4]:
# Create a Hugging Face dataset from a captions CSV file
def create_dataset(csv_file, split_name):
    """Build a ``Dataset`` of image bytes and captions from a CSV file.

    The CSV is read with pandas and must provide an ``image_name`` column and
    a ``captions`` column. Every ``image_name`` is joined onto the
    notebook-level ``data_dir`` and read from disk with ``load_image``.

    Args:
        csv_file: Path to the captions CSV file.
        split_name: Name of the split (e.g. ``'train'``); forwarded to
            ``Dataset.from_pandas`` as its ``split`` argument.

    Returns:
        A ``Dataset`` with an ``image`` column (``Image`` feature) and a
        ``captions`` column (string).

    Raises:
        KeyError: If the CSV lacks the ``image_name`` or ``captions`` column.
    """
    # Load the csv file
    df = pd.read_csv(csv_file)

    # Fail early with a readable message instead of a bare column lookup error
    missing = [col for col in ('image_name', 'captions') if col not in df.columns]
    if missing:
        raise KeyError(f"{csv_file} is missing required column(s): {missing}")

    # Load the image data
    df['image'] = df['image_name'].map(lambda image_name: load_image(os.path.join(data_dir, image_name)))

    # Keep only the columns declared in the features below; this drops
    # 'image_name' and any extra CSV columns that would not match the schema
    df = df[['image', 'captions']]

    # Define the features
    features = Features({
        'image': Image(),
        'captions': Value('string')
    })

    # preserve_index=False keeps the pandas index out of the dataset
    # (no '__index_level_0__' column)
    dataset = Dataset.from_pandas(
        df,
        features=features,
        split=split_name,
        preserve_index=False,
    )
    return dataset

In [5]:
# Build one dataset per split (train first, then test) from its captions CSV
train_dataset, test_dataset = [
    create_dataset(csv_path, split)
    for csv_path, split in (
        (train_csv_file, 'train'),
        (test_csv_file, 'test'),
    )
]

# Bundle both splits into a single DatasetDict keyed by split name
splits = {'train': train_dataset, 'test': test_dataset}
dataset_dict = DatasetDict(splits)

In [8]:
# Log in to the Hugging Face Hub using the helper imported from
# huggingface_hub; it renders the login widget shown in this cell's output.
# NOTE(review): presumably required before the push_to_hub call below can
# upload under the account — confirm whether a cached token is enough.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
# Target dataset repository on the Hub, given as '<namespace>/<dataset name>'
repo_id = 'fuerstfabian/cat_figures'
# Upload every split in dataset_dict; the call's return value is shown as the cell output
dataset_dict.push_to_hub(repo_id)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/31 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/fuerstfabian/cat_figures/commit/09f9d4cb09dda390d9c069cdcf137364678e67bf', commit_message='Upload dataset', commit_description='', oid='09f9d4cb09dda390d9c069cdcf137364678e67bf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/fuerstfabian/cat_figures', endpoint='https://huggingface.co', repo_type='dataset', repo_id='fuerstfabian/cat_figures'), pr_revision=None, pr_num=None)