In [4]:
!pip -q install datasets huggingface_hub tqdm pandas matplotlib tqdm

In [2]:
from huggingface_hub import login, HfApi

# Function to setup Hugging Face authentication
def setup_huggingface_auth(token=None):
    """
    Log in to the Hugging Face Hub and report which account is active.

    Args:
        token: Hugging Face access token. When omitted (``None``), the token
            is read interactively through ``getpass`` so it is not echoed.

    Returns:
        bool: ``True`` when both ``login`` and the follow-up ``whoami`` lookup
        succeed; ``False`` when either raises (the error is printed, not
        re-raised).
    """
    if token is None:
        import getpass
        token = getpass.getpass("Enter your Hugging Face token: ")

    try:
        login(token)
        # whoami() doubles as a check that the token is actually usable.
        account = HfApi().whoami()
        print(f"Successfully logged in as: {account['name']}")
    except Exception as err:
        print(f"Error logging in: {err}")
        return False
    return True

# Setup Hugging Face authentication (no token is passed, so the function prompts for one)
setup_huggingface_auth()

Enter your Hugging Face token: ··········


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Successfully logged in as: dolphinium


True

In [5]:
# dataset_access.py
import os
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
from huggingface_hub import HfApi, login
from tqdm.auto import tqdm

# Configuration settings
# Hub dataset ID passed to load_dataset() in download_wikiart_dataset().
DATASET_NAME = "huggan/wikiart"
# Default output folder for save_curated_dataset(); that function deletes and
# recreates this folder on every call.
OUTPUT_DIR = "curated_dataset"
# Default target repository for upload_to_hf().
HF_DATASET_REPO = "dolphinium/flux-impressionism-dataset"  # Change to your username
# Target number of images in the curated dataset.
DATASET_SIZE = 1000
# Seed given to np.random.seed() before sampling in create_curated_dataset().
RANDOM_SEED = 42

# Genre composition targets: share of DATASET_SIZE per genre (shares sum to 1.0).
# The genre IDs in the trailing comments mirror the genre_to_id mapping inside
# create_curated_dataset().
COMPOSITION = {
    "landscape": 0.3,    # Genre ID: 4
    "portrait": 0.3,     # Genre ID: 6
    "cityscape": 0.2,    # Genre ID: 1
    "still_life": 0.2,   # Genre ID: 9
}

# Create output directory (no-op if it already exists)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Function to download the dataset
def download_wikiart_dataset():
    """Load the dataset named by ``DATASET_NAME`` and report its train-split size.

    Returns:
        The object returned by ``load_dataset(DATASET_NAME)``; it must expose a
        ``'train'`` entry that supports ``len()``.
    """
    print("Loading WikiArt dataset...")
    wikiart = load_dataset(DATASET_NAME)
    train_size = len(wikiart['train'])
    print(f"Dataset loaded with {train_size} images")
    return wikiart

In [6]:
# dataset_filtering.py
import numpy as np

def analyze_wikiart_dataset(dataset):
    """Count how many training examples fall under each genre ID and style ID.

    Only the ``genre`` and ``style`` fields are needed. When the train split
    offers ``select_columns`` (as Hugging Face ``Dataset`` objects do), the
    split is first narrowed to those two columns so that iterating does not
    materialise the ``image`` column for every row. Iterating full rows is
    presumably what made the original scan too slow to finish.

    Args:
        dataset: Mapping with a ``'train'`` entry whose rows are dict-like and
            expose ``'genre'`` and ``'style'`` keys.

    Returns:
        tuple[dict, dict]: ``(genres, styles)``, each mapping an ID to the
        number of rows carrying it, in first-seen order.
    """
    train_split = dataset['train']

    # Plain sequences of dicts (no select_columns) are iterated as-is.
    if hasattr(train_split, 'select_columns'):
        label_rows = train_split.select_columns(['genre', 'style'])
    else:
        label_rows = train_split

    genres = {}
    styles = {}
    for item in tqdm(label_rows, desc="Analyzing dataset"):
        genre_id = item['genre']
        style_id = item['style']
        genres[genre_id] = genres.get(genre_id, 0) + 1
        styles[style_id] = styles.get(style_id, 0) + 1

    print(f"Found {len(genres)} unique genres and {len(styles)} unique styles")
    return genres, styles

def filter_impressionism_artworks(dataset):
    """Return the training examples whose style is Impressionism (ID: 12).

    The scan reads only the ``style`` column when the train split offers
    ``select_columns`` (as Hugging Face ``Dataset`` objects do); full rows,
    including their images, are then fetched only for the matching indices.
    The original looped over every full row just to compare one integer.

    Args:
        dataset: Mapping with a ``'train'`` entry that is iterable, indexable
            by integer position, and whose rows expose a ``'style'`` key.

    Returns:
        list: The matching rows, in their original order.
    """
    impressionism_style_id = 12
    train_split = dataset['train']

    # Plain sequences of dicts (no select_columns) are scanned as-is.
    if hasattr(train_split, 'select_columns'):
        style_rows = train_split.select_columns(['style'])
    else:
        style_rows = train_split

    matching_indices = [
        idx
        for idx, item in enumerate(tqdm(style_rows, desc="Filtering Impressionism works"))
        if item['style'] == impressionism_style_id
    ]

    # Only now load the full rows, and only for the matches.
    impressionism_works = [train_split[idx] for idx in matching_indices]

    print(f"Found {len(impressionism_works)} Impressionism artworks")
    return impressionism_works

def create_curated_dataset(impressionism_works):
    """Sample a genre-balanced subset of the given artworks.

    Works are grouped by their ``genre`` ID (landscape=4, portrait=6,
    cityscape=1, still_life=9; every other genre is ignored). The per-genre
    target is ``int(DATASET_SIZE * share)`` for each share in ``COMPOSITION``.
    If the targets add up to less than ``DATASET_SIZE`` (because of integer
    truncation, or because a genre holds fewer works than its target), the
    missing slots are handed, in ``COMPOSITION`` order, to genres that still
    have unused works. Sampling is without replacement and seeded with
    ``RANDOM_SEED`` via ``np.random.seed``.

    Args:
        impressionism_works: Iterable of dict-like items exposing a
            ``'genre'`` key.

    Returns:
        list: The sampled items, genre by genre in ``COMPOSITION`` order. It
        holds fewer than ``DATASET_SIZE`` items only when the four genres
        together do not contain enough works.
    """
    # Map genre names to IDs, plus the reverse map for O(1) lookups
    genre_to_id = {
        "landscape": 4,
        "portrait": 6,
        "cityscape": 1,
        "still_life": 9
    }
    id_to_genre = {id_val: genre_name for genre_name, id_val in genre_to_id.items()}

    # Group works by genre
    genre_groups = {genre: [] for genre in genre_to_id}
    for item in tqdm(impressionism_works, desc="Grouping by genre"):
        genre_name = id_to_genre.get(item['genre'])
        if genre_name is not None:
            genre_groups[genre_name].append(item)

    # Print found counts for each genre
    for genre, works in genre_groups.items():
        print(f"Found {len(works)} Impressionism {genre} artworks")

    # Initial per-genre targets
    selected_counts = {
        genre: int(DATASET_SIZE * percentage)
        for genre, percentage in COMPOSITION.items()
    }

    # Cap each target at what is actually available
    for genre in list(selected_counts):
        available = len(genre_groups[genre])
        if available < selected_counts[genre]:
            print(f"Warning: Not enough {genre} images (need {selected_counts[genre]}, have {available})")
            selected_counts[genre] = available

    # Hand any missing slots (truncation remainder or capped genres) to genres
    # with spare works, so the total reaches DATASET_SIZE whenever possible.
    shortfall = DATASET_SIZE - sum(selected_counts.values())
    for genre in list(selected_counts):
        if shortfall <= 0:
            break
        spare = len(genre_groups[genre]) - selected_counts[genre]
        extra = min(spare, shortfall)
        if extra > 0:
            selected_counts[genre] += extra
            shortfall -= extra
    if shortfall > 0:
        print(
            f"Warning: only {sum(selected_counts.values())} of "
            f"{DATASET_SIZE} requested images are available"
        )

    # Sample images from each genre (same seed and call order as before, so
    # results are unchanged whenever no genre is short of works)
    np.random.seed(RANDOM_SEED)
    curated_dataset = []
    for genre, count in selected_counts.items():
        pool = genre_groups[genre]
        sampled = np.random.choice(len(pool), size=count, replace=False)
        curated_dataset.extend(pool[idx] for idx in sampled)

    print(f"Created curated dataset with {len(curated_dataset)} images")

    # Final composition report
    final_composition = {}
    for item in curated_dataset:
        genre_name = id_to_genre.get(item['genre'])
        if genre_name is not None:
            final_composition[genre_name] = final_composition.get(genre_name, 0) + 1

    print("Final dataset composition:")
    for genre, count in final_composition.items():
        percentage = count / len(curated_dataset) * 100
        print(f"  - {genre}: {count} images ({percentage:.1f}%)")

    return curated_dataset

In [7]:
# dataset_upload.py
import os
from datasets import Dataset, Features, Image, ClassLabel, Value
import numpy as np
import shutil

def save_curated_dataset(curated_dataset, output_dir=OUTPUT_DIR):
    """Write the curated images and a metadata CSV under ``output_dir``.

    ``output_dir`` is deleted first if it exists, so each call starts from an
    empty folder. Images go to ``<output_dir>/images/img_00000.jpg`` and so
    on; ``<output_dir>/metadata.csv`` gets one row per image with the columns
    ``file_name``, ``artist``, ``genre`` and ``style``.

    Args:
        curated_dataset: Iterable of dict-like items with ``"image"``,
            ``"artist"``, ``"genre"`` and ``"style"`` keys. ``item["image"]``
            must provide ``save(path)``; it is presumably a PIL image.
        output_dir: Destination folder. WARNING: removed recursively if it
            already exists.

    Returns:
        tuple[str, str]: ``(images_dir, metadata_path)``.
    """
    # Clear output directory if it exists
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    # Locations for images and metadata
    images_dir = os.path.join(output_dir, "images")
    os.makedirs(images_dir, exist_ok=True)
    metadata_path = os.path.join(output_dir, "metadata.csv")

    # Modes Pillow's JPEG writer accepts; anything else (e.g. RGBA, P, LA)
    # would make img.save() raise, so those images are converted to RGB first.
    jpeg_writable_modes = {"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"}
    metadata_columns = ["file_name", "artist", "genre", "style"]

    # Create metadata entries
    metadata = []

    for i, item in enumerate(tqdm(curated_dataset, desc="Saving images")):
        # Save the image
        img = item["image"]
        mode = getattr(img, "mode", None)
        if mode is not None and mode not in jpeg_writable_modes:
            img = img.convert("RGB")
        img_filename = f"img_{i:05d}.jpg"
        img_path = os.path.join(images_dir, img_filename)
        img.save(img_path)

        # Create metadata entry
        metadata.append({
            "file_name": img_filename,
            "artist": item["artist"],
            "genre": item["genre"],
            "style": item["style"]
        })

    # Save metadata as CSV; explicit columns keep the header even when the
    # dataset is empty, so the file can always be read back.
    pd.DataFrame(metadata, columns=metadata_columns).to_csv(metadata_path, index=False)
    print(f"Curated dataset saved to {output_dir}")
    return images_dir, metadata_path

def create_hf_dataset(images_dir, metadata_path):
    """Build a Hugging Face ``Dataset`` from the saved images and metadata CSV.

    Args:
        images_dir: Folder holding the image files named in the CSV.
        metadata_path: CSV with ``file_name``, ``artist``, ``genre`` and
            ``style`` columns.

    Returns:
        The ``Dataset`` produced by ``Dataset.from_dict`` with an ``image``
        column (file paths typed as ``Image()``) and ``int64`` ``artist``,
        ``genre`` and ``style`` columns.
    """
    label_columns = ("artist", "genre", "style")
    metadata_df = pd.read_csv(metadata_path)

    # Column data: resolved image paths first, then the label columns.
    dataset_dict = {
        "image": [
            os.path.join(images_dir, file_name)
            for file_name in metadata_df["file_name"]
        ]
    }
    for column in label_columns:
        dataset_dict[column] = metadata_df[column].tolist()

    # Matching schema, in the same column order.
    schema = {"image": Image()}
    for column in label_columns:
        schema[column] = Value("int64")
    features = Features(schema)

    hf_dataset = Dataset.from_dict(dataset_dict, features=features)
    print(f"Created Hugging Face dataset with {len(hf_dataset)} examples")
    return hf_dataset

def upload_to_hf(hf_dataset, repo_id=HF_DATASET_REPO):
    """Upload the dataset to the Hugging Face Hub as a public dataset repo.

    Args:
        hf_dataset: Object providing ``push_to_hub`` (e.g. the result of
            ``create_hf_dataset``).
        repo_id: Target repository, ``"<user>/<name>"``. Defaults to
            ``HF_DATASET_REPO``.

    Note:
        The repository is pushed with ``private=False``, i.e. it is publicly
        visible.
    """
    # Login to Hugging Face (you'll need to have a token)
    login()

    # Push to hub
    hf_dataset.push_to_hub(
        repo_id,
        private=False,
        token=None,  # Will use the token from login()
    )

    print(f"Dataset uploaded to Hugging Face Hub: {repo_id}")
    # Fixed: the outer string was missing its f-prefix, so the message showed
    # a literal "{repo_id}" instead of the repository name.
    print(f"You can access it with: load_dataset('{repo_id}')")

In [9]:
# Download the WikiArt dataset
dataset = download_wikiart_dataset()

Loading WikiArt dataset...


Resolving data files:   0%|          | 0/72 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/72 [00:00<?, ?files/s]

train-00043-of-00072.parquet:   0%|          | 0.00/473M [00:00<?, ?B/s]

train-00045-of-00072.parquet:   0%|          | 0.00/452M [00:00<?, ?B/s]

train-00046-of-00072.parquet:   0%|          | 0.00/458M [00:00<?, ?B/s]

train-00047-of-00072.parquet:   0%|          | 0.00/481M [00:00<?, ?B/s]

train-00048-of-00072.parquet:   0%|          | 0.00/491M [00:00<?, ?B/s]

train-00049-of-00072.parquet:   0%|          | 0.00/489M [00:00<?, ?B/s]

train-00050-of-00072.parquet:   0%|          | 0.00/472M [00:00<?, ?B/s]

train-00051-of-00072.parquet:   0%|          | 0.00/515M [00:00<?, ?B/s]

train-00052-of-00072.parquet:   0%|          | 0.00/514M [00:00<?, ?B/s]

train-00053-of-00072.parquet:   0%|          | 0.00/509M [00:00<?, ?B/s]

train-00054-of-00072.parquet:   0%|          | 0.00/462M [00:00<?, ?B/s]

train-00055-of-00072.parquet:   0%|          | 0.00/453M [00:00<?, ?B/s]

train-00056-of-00072.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

train-00057-of-00072.parquet:   0%|          | 0.00/405M [00:00<?, ?B/s]

train-00058-of-00072.parquet:   0%|          | 0.00/359M [00:00<?, ?B/s]

train-00059-of-00072.parquet:   0%|          | 0.00/304M [00:00<?, ?B/s]

train-00060-of-00072.parquet:   0%|          | 0.00/449M [00:00<?, ?B/s]

train-00061-of-00072.parquet:   0%|          | 0.00/438M [00:00<?, ?B/s]

train-00062-of-00072.parquet:   0%|          | 0.00/447M [00:00<?, ?B/s]

train-00063-of-00072.parquet:   0%|          | 0.00/435M [00:00<?, ?B/s]

train-00064-of-00072.parquet:   0%|          | 0.00/446M [00:00<?, ?B/s]

train-00065-of-00072.parquet:   0%|          | 0.00/439M [00:00<?, ?B/s]

train-00066-of-00072.parquet:   0%|          | 0.00/448M [00:00<?, ?B/s]

train-00067-of-00072.parquet:   0%|          | 0.00/436M [00:00<?, ?B/s]

train-00068-of-00072.parquet:   0%|          | 0.00/474M [00:00<?, ?B/s]

train-00069-of-00072.parquet:   0%|          | 0.00/454M [00:00<?, ?B/s]

train-00070-of-00072.parquet:   0%|          | 0.00/486M [00:00<?, ?B/s]

train-00071-of-00072.parquet:   0%|          | 0.00/367M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/81444 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/45 [00:00<?, ?it/s]

Dataset loaded with 81444 images


In [10]:
# Analyze dataset
genres, styles = analyze_wikiart_dataset(dataset)

Analyzing dataset:   0%|          | 0/81444 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Filter for Impressionism
impressionism_works = filter_impressionism_artworks(dataset)

Filtering Impressionism works:   0%|          | 0/81444 [00:00<?, ?it/s]

In [None]:
# Create curated dataset
curated_dataset = create_curated_dataset(impressionism_works)

In [None]:
# Save dataset to disk
images_dir, metadata_path = save_curated_dataset(curated_dataset)

In [None]:
# Create HF dataset
hf_dataset = create_hf_dataset(images_dir, metadata_path)

In [None]:
# Upload to HF (publishes publicly to HF_DATASET_REPO; run only when the dataset is ready)
upload_to_hf(hf_dataset)