# WikiArt Impressionism Dataset Curation

This notebook processes the WikiArt dataset to create a curated subset of Impressionist paintings.

**Requirements:**
- High-RAM runtime (recommended: 25-50GB)
- Google Drive mounted for cache storage
- Hugging Face access token with write permission (required for the upload step at the end of this notebook)

In [None]:
# Install required packages
!pip install datasets pillow huggingface-hub tqdm

In [None]:
# Mount Google Drive
# The project directory and dataset cache used later live under this mount.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Set up Hugging Face credentials
import os
from getpass import getpass

from huggingface_hub import login

# Preferred: leave the placeholder untouched and supply the token through the
# HF_TOKEN environment variable or the hidden prompt below, so the secret is
# never saved inside the notebook file.
# Tokens are created at https://huggingface.co/settings/tokens
HF_TOKEN = "your_token_here"  # @param {type:"string"}

_PLACEHOLDER_TOKEN = "your_token_here"

HF_TOKEN = HF_TOKEN.strip()
if not HF_TOKEN or HF_TOKEN == _PLACEHOLDER_TOKEN:
    # Never attempt a login with the placeholder: fall back to the environment,
    # then to an interactive prompt that does not echo the token.
    HF_TOKEN = (
        os.environ.get("HF_TOKEN", "").strip()
        or getpass("Hugging Face token: ").strip()
    )

if not HF_TOKEN:
    raise ValueError(
        "No Hugging Face token provided; set HF_TOKEN or enter it at the prompt."
    )

login(HF_TOKEN)

In [None]:
# Clone the repository and set up paths
import os
import subprocess
import sys
from pathlib import Path

# Create project directory if it doesn't exist
project_dir = Path('/content/drive/MyDrive/fine-tuning')
project_dir.mkdir(parents=True, exist_ok=True)

# Clone if not exists. subprocess.run(check=True) raises CalledProcessError
# when git fails, so a failed clone stops the notebook here rather than
# resurfacing later as an import error. Passing arguments as a list also
# avoids shell quoting problems with the path.
if not (project_dir / '.git').exists():
    subprocess.run(
        ['git', 'clone', 'https://github.com/your-username/fine-tuning.git', '.'],
        cwd=project_dir,
        check=True,
    )

# Add project root to Python path (guarded so re-running this cell does not
# append duplicate entries)
project_root = str(project_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
# Import required libraries
import logging
from src.data.dataset_curator import DatasetCurator
import pandas as pd
import matplotlib.pyplot as plt

# Configure logging
# force=True removes any handlers already attached to the root logger before
# applying this configuration. Without it, basicConfig() does nothing when the
# root logger already has handlers (notebook runtimes may pre-install one),
# and the INFO level and format below would never take effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    force=True,
)

# Initialize with Google Drive cache directory
cache_dir = project_dir / 'cache'
curator = DatasetCurator(cache_dir=str(cache_dir))

# Run the curation pipeline, then report the dataset size and how the curated
# images are distributed across genres. Any failure is logged and re-raised so
# the notebook stops at this cell.
try:
    dataset = curator.process_dataset()

    # Headline number first
    print("\nDataset Statistics:")
    print(f"Total images: {len(dataset)}")

    # Tabular view for analysis.
    # NOTE(review): to_pandas() converts every column; if the dataset carries
    # image data this may need a lot of RAM - confirm against DatasetCurator.
    df = dataset.to_pandas()

    # How many images fall under each genre
    genre_counts = df['genre'].value_counts()
    print("\nGenre Distribution:")
    print(genre_counts)

    # Bar chart of the same counts, drawn on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(10, 6))
    genre_counts.plot.bar(ax=ax)
    ax.set_title('Genre Distribution in Curated Dataset')
    ax.set_xlabel('Genre ID')
    ax.set_ylabel('Count')
    fig.tight_layout()
    plt.show()

except Exception as err:
    logging.error(f"Error processing dataset: {err}")
    raise

In [None]:
# Upload to Hugging Face Hub
# Destination repository and visibility (editable via the form fields).
repo_name = "your-username/impressionism-curated"  # @param {type:"string"}
private = True  # @param {type:"boolean"}

# Push the curated dataset; failures are logged and re-raised so the cell
# reports the error.
try:
    print(f"\nUploading dataset to {repo_name}...")
    repo_url = curator.upload_to_hub(dataset, repo_name, private=private)
    print(f"Successfully uploaded dataset to: {repo_url}")
except Exception as err:
    logging.error(f"Error uploading to Hugging Face Hub: {err}")
    raise