# WikiArt Impressionism Dataset Curation

This notebook processes the WikiArt dataset to create a curated subset of Impressionist paintings, prints summary statistics for the result, and uploads it to the Hugging Face Hub.

**Requirements:**
- Google Colab with a high-RAM runtime (recommended: 25–50 GB)
- Google Drive, mounted at `/content/drive`, for cache storage
- Hugging Face access token with write permission (needed for the upload step)

In [None]:
# Install required packages
!pip install datasets pillow huggingface-hub tqdm

In [None]:
# Attach Google Drive to the Colab filesystem.
from google.colab import drive

# Directory under which the Drive contents are exposed after mounting.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Set up Hugging Face credentials
import os
from getpass import getpass

from huggingface_hub import login

# Never paste the token into the notebook source: it would be saved (and
# shared) together with the file. Instead, read it from the HF_TOKEN
# environment variable when set, otherwise prompt for it without echoing.
# Create a token at https://huggingface.co/settings/tokens
# NOTE(review): the upload step pushes to the Hub, so the token presumably
# needs write permission - confirm.
HF_TOKEN = (os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")).strip()

# Fail early with a clear message instead of sending an empty token to the Hub.
if not HF_TOKEN:
    raise ValueError("A Hugging Face token is required to continue.")

login(token=HF_TOKEN)

In [None]:
# Clone the repository and set up paths
import os

# Clone if not exists
if not os.path.exists('fine-tuning'):
    !git clone https://github.com/your-username/fine-tuning.git

# Add project root to Python path
import sys
sys.path.append('fine-tuning')

In [None]:
# Import and run the curator
from src.data.dataset_curator import DatasetCurator

# Initialize with Google Drive cache directory
curator = DatasetCurator(cache_dir='/content/drive/MyDrive/fine-tuning/cache')

# Process dataset
dataset = curator.process_dataset()

# Print dataset statistics
print("Dataset statistics:")
print(f"Total images: {len(dataset)}")
# Only the 'genre' column is needed for the counts, so select it before
# converting to pandas instead of converting every column of the dataset.
# NOTE(review): assumes process_dataset() returns a Hugging Face
# `datasets.Dataset` (select_columns needs a reasonably recent `datasets`
# release) - confirm against DatasetCurator.
genre_counts = dataset.select_columns(['genre']).to_pandas()['genre'].value_counts()
print("\nGenre distribution:")
print(genre_counts)

In [None]:
# Publish the curated dataset to the Hugging Face Hub.
# Destination repository ("<user-or-org>/<dataset-name>") and its visibility;
# both are exposed as Colab form fields.
repo_name = "your-username/impressionism-curated"  # @param {type:"string"}
private = True  # @param {type:"boolean"}

# Push the dataset and report the value returned by the curator.
repo_url = curator.upload_to_hub(
    dataset,
    repo_name,
    private=private,
)
print(f"Dataset uploaded to: {repo_url}")