## ENV SETUP

In [1]:
# Install required packages
!pip install -q torch transformers diffusers accelerate datasets peft gradio pillow matplotlib tensorboard numpy pandas scikit-learn scipy tqdm huggingface-hub

# Import necessary libraries
import torch
from huggingface_hub import login, HfApi
from diffusers import DiffusionPipeline
import os
import sys
from IPython.display import display, HTML

# Report the PyTorch build and, when a GPU is visible, its name and memory usage
print(f"PyTorch version: {torch.__version__}")
cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    gpu_index = 0
    print(f"CUDA device: {torch.cuda.get_device_name(gpu_index)}")
    # Byte counts are converted to (decimal) gigabytes for display
    allocated_gb = torch.cuda.memory_allocated(gpu_index) / 1e9
    print(f"CUDA memory allocated: {allocated_gb:.2f} GB")
    reserved_gb = torch.cuda.memory_reserved(gpu_index) / 1e9
    print(f"CUDA memory cached: {reserved_gb:.2f} GB")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Function to setup Hugging Face authentication
def setup_huggingface_auth(token=None):
    """
    Log in to the Hugging Face Hub and report the authenticated account.

    Args:
        token: Access token to log in with. When None, the token is read
            interactively (without echo) via getpass.

    Returns:
        True if login and the follow-up whoami() lookup both succeeded,
        False if either raised (the error is printed, not re-raised).
    """
    if token is None:
        import getpass as _getpass
        token = _getpass.getpass("Enter your Hugging Face token: ")

    try:
        login(token)
        # whoami() confirms the token is actually accepted by the Hub
        account = HfApi().whoami()
        print(f"Successfully logged in as: {account['name']}")
    except Exception as err:
        print(f"Error logging in: {err}")
        return False
    else:
        return True

# Setup Hugging Face authentication.
# Called without a token, so the function prompts for one via getpass.
# As the last expression in the cell, the boolean result is displayed as output.
setup_huggingface_auth()

Enter your Hugging Face token: ··········
Successfully logged in as: dolphinium


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


True

In [3]:
def verify_environment():
    """
    Verify that all necessary components are properly set up.

    Returns:
        dict mapping component name to a bool, in this order:
        - "cuda": torch reports a CUDA device as available
        - "huggingface": HfApi().whoami() succeeded
        - "memory": GPU 0 has more than 10 GiB total memory (False without CUDA)
        - "disk_space": more than 50 GiB free on the "/" filesystem
    """
    import shutil

    gib = 1024 ** 3

    status = {
        "cuda": False,
        "huggingface": False,
        "memory": False,
        "disk_space": False
    }

    # Check CUDA
    status["cuda"] = torch.cuda.is_available()

    # Check Hugging Face authentication
    try:
        HfApi().whoami()
        status["huggingface"] = True
    except Exception:
        # Any failure here means "not authenticated". Unlike a bare except,
        # this still lets KeyboardInterrupt / SystemExit propagate.
        pass

    # Check available memory (only meaningful when a GPU is present)
    if status["cuda"]:
        total_memory = torch.cuda.get_device_properties(0).total_memory
        status["memory"] = total_memory > 10 * gib

    # Check available disk space
    try:
        status["disk_space"] = shutil.disk_usage("/").free > 50 * gib
    except OSError:
        # disk_usage could not query the filesystem; leave the check as failed
        pass

    return status

# Run the checks and print one line per component
status = verify_environment()
print("\nEnvironment Status:")
for name in status:
    mark = '✅' if status[name] else '❌'
    print(f"{name}: {mark}")

# One hint per component, printed only for the components that failed
failure_hints = (
    ("cuda", "- CUDA is not available. GPU acceleration will not work."),
    ("huggingface", "- Hugging Face authentication failed. Please check your token."),
    ("memory", "- Insufficient GPU memory. Training may be limited."),
    ("disk_space", "- Low disk space. This might affect dataset storage and training."),
)
if not all(status.values()):
    print("\nWarning: Some components are not properly set up!")
    for key, hint in failure_hints:
        if not status[key]:
            print(hint)


Environment Status:
cuda: ✅
huggingface: ✅
memory: ✅
disk_space: ✅


## Dataset access and storage components

In [4]:
import os
from pathlib import Path
import requests
import json
from tqdm.auto import tqdm
from datasets import load_dataset, Dataset
import pandas as pd

class DatasetConfig:
    """Filesystem layout for the WikiArt Impressionism dataset.

    Attributes:
        base_path: Root data directory.
        dataset_path: ``base_path / 'wikiart'``.
        metadata_path: ``dataset_path / 'metadata.json'`` (file, not created here).
        images_path: ``dataset_path / 'images'``.
    """
    def __init__(self, base_path='/content/data'):
        """Resolve all paths under ``base_path`` and create the directories.

        Args:
            base_path: Root data directory (str or Path). Defaults to
                '/content/data', the location used throughout this notebook.
        """
        self.base_path = Path(base_path)
        self.dataset_path = self.base_path / 'wikiart'
        self.metadata_path = self.dataset_path / 'metadata.json'
        self.images_path = self.dataset_path / 'images'

        # Create necessary directories (idempotent; parents are created too)
        self.dataset_path.mkdir(parents=True, exist_ok=True)
        self.images_path.mkdir(parents=True, exist_ok=True)

    def setup_paths(self):
        """Print the configured paths and return them as strings.

        The directories themselves are created in ``__init__``; this method
        only reports them.

        Returns:
            dict with keys 'base', 'dataset', 'metadata', 'images' mapping to
            the corresponding path as a str.
        """
        paths = {
            'base': str(self.base_path),
            'dataset': str(self.dataset_path),
            'metadata': str(self.metadata_path),
            'images': str(self.images_path)
        }
        print("Dataset directories created:")
        for name, path in paths.items():
            print(f"- {name}: {path}")
        return paths

# Initialize dataset configuration.
# Constructing DatasetConfig creates the wikiart/ and wikiart/images/ directories;
# setup_paths() then prints the resolved locations and returns them as strings.
dataset_config = DatasetConfig()
paths = dataset_config.setup_paths()

Dataset directories created:
- base: /content/data
- dataset: /content/data/wikiart
- metadata: /content/data/wikiart/metadata.json
- images: /content/data/wikiart/images


In [None]:
def load_wikiart_dataset(style_id=12):
    """
    Load the WikiArt dataset from Hugging Face Hub and keep a single style.

    Args:
        style_id: Integer value of the 'style' column to keep. Defaults to 12,
            which this notebook treats as Impressionism.

    Returns:
        The filtered dataset, or None if loading or filtering raised
        (the error is printed, not re-raised).
    """
    try:
        # Load the full train split; files are cached under the data directory
        dataset = load_dataset(
            "huggan/wikiart",
            split="train",
            cache_dir=str(dataset_config.base_path)
        )

        # Filter on the 'style' column only. Passing input_columns hands the
        # predicate just that value instead of the whole example, so the
        # images do not have to be decoded for every row.
        impressionism_dataset = dataset.filter(
            lambda style: style == style_id,
            input_columns=["style"]
        )

        print(f"Total images in dataset: {len(impressionism_dataset)}")
        return impressionism_dataset

    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

def _to_builtin(value):
    """Convert a NumPy scalar to the equivalent Python value; pass others through."""
    return value.item() if hasattr(value, 'item') else value


def analyze_dataset(dataset):
    """
    Analyze the dataset, print basic statistics and save a metadata summary.

    Prints the row count and, for the 'artist' and 'year' columns when present,
    the head of their value counts. Writes a JSON summary to
    ``dataset_config.metadata_path`` with the keys 'total_images', 'artists',
    'year_range' and 'styles'. Columns that are absent produce an empty list
    (or None for 'year_range') instead of raising.

    Args:
        dataset: Dataset to analyze, or None (in which case nothing happens).
    """
    if dataset is None:
        return

    # The statistics below never look at pixels, so drop the image column
    # before converting; otherwise every image payload is copied into pandas.
    if 'image' in dataset.column_names:
        dataset = dataset.remove_columns(['image'])

    # Convert to pandas for easier analysis
    df = dataset.to_pandas()

    def unique_values(column):
        # .tolist() already yields plain Python values, safe for json.dump
        return df[column].unique().tolist() if column in df.columns else []

    print("\nDataset Statistics:")
    print(f"Total number of images: {len(df)}")
    if 'artist' in df.columns:
        print("\nArtists distribution:")
        print(df['artist'].value_counts().head())
    # NOTE(review): the code assumes a 'year' column exists — confirm against
    # the dataset schema; it is skipped here when missing.
    if 'year' in df.columns:
        print("\nYears distribution:")
        print(df['year'].value_counts().sort_index().head())

    if 'year' in df.columns and len(df) > 0:
        # min()/max() return NumPy scalars for numeric columns, which
        # json.dump cannot serialize; convert them to built-in types first.
        year_range = [_to_builtin(df['year'].min()), _to_builtin(df['year'].max())]
    else:
        year_range = None

    # Save metadata
    metadata = {
        'total_images': len(df),
        'artists': unique_values('artist'),
        'year_range': year_range,
        'styles': unique_values('style')
    }

    with open(dataset_config.metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)

    print(f"\nMetadata saved to: {dataset_config.metadata_path}")

# Load and analyze dataset.
# load_wikiart_dataset() returns None on failure, so the analysis is skipped then.
# `dataset` stays a module-level variable that later cells read.
dataset = load_wikiart_dataset()
if dataset is not None:
    analyze_dataset(dataset)

Resolving data files:   0%|          | 0/72 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/45 [00:00<?, ?it/s]

Filter:   0%|          | 0/81444 [00:00<?, ? examples/s]

In [None]:
from datasets import load_dataset, Dataset
from huggingface_hub import HfApi
import pandas as pd

def _image_pixel_count(image):
    """Return width * height for one 'image' cell, or 0 if it cannot be read.

    Accepts either an object exposing a PIL-style ``size`` tuple, or a dict
    with 'bytes' / 'path' entries (NOTE(review): this is the form image cells
    are expected to take after ``to_pandas()`` — confirm for this dataset).
    """
    from io import BytesIO
    from PIL import Image

    try:
        if isinstance(image, dict):
            source = BytesIO(image['bytes']) if image.get('bytes') else image.get('path')
            # Opening is enough to read the dimensions from the file header
            with Image.open(source) as opened:
                width, height = opened.size
        else:
            width, height = image.size
    except Exception:
        # Unreadable images count as zero pixels and are filtered out below
        return 0
    return width * height


def create_curated_dataset(seed=None):
    """Create a curated Impressionism dataset.

    Reads the module-level ``dataset``, keeps style 12, keeps images with at
    least 1024 * 1024 pixels dated 1860-1900, then samples up to a fixed number
    of rows per subject category.

    Args:
        seed: Optional random state for the per-category sampling. None (the
            default) keeps sampling non-deterministic.

    Returns:
        A ``Dataset`` built from the selected rows.
    """

    # Filter for Impressionism (only the 'style' value is handed to the predicate)
    impressionism = dataset.filter(
        lambda style: style == 12,
        input_columns=['style']
    )

    # Convert to pandas for easier filtering
    df = impressionism.to_pandas()

    # Add quality metrics: total pixel count, so it is comparable to an integer
    # threshold (a raw (width, height) tuple is not)
    df['resolution'] = df['image'].apply(_image_pixel_count)

    # Filter criteria
    # NOTE(review): assumes a numeric 'year' column exists — confirm.
    high_quality = df[
        (df['resolution'] >= 1024 * 1024) &  # Minimum resolution
        (df['year'] >= 1860) &               # Core Impressionist period
        (df['year'] <= 1900)
    ]

    # Stratified sampling by subject
    categories = {
        'landscape': 400,    # 40%
        'cityscape': 200,    # 20%
        'portrait': 200,     # 20%
        'still_life': 200    # 20%
    }

    selected_rows = []
    for category, count in categories.items():
        # NOTE(review): assumes 'genre' holds strings; if it holds integer
        # class labels they must be mapped to names first — confirm.
        category_df = high_quality[
            high_quality['genre'].str.contains(category, case=False, na=False)
        ]
        selected_rows.extend(
            category_df.sample(min(count, len(category_df)), random_state=seed).index
        )

    # A row matching more than one category must only be included once
    selected_rows = list(dict.fromkeys(selected_rows))

    # Create final dataset
    curated_df = df.loc[selected_rows]

    return Dataset.from_pandas(curated_df)

def push_to_hub(dataset, repo_name="flux-impressionism-dataset"):
    """Push dataset to Hugging Face Hub as a public repository.

    ``Dataset.push_to_hub`` accepts neither ``description`` nor ``tags``, so
    the description is sent as the commit description and the tags are written
    to the dataset card metadata in a second, best-effort step.

    Args:
        dataset: Object exposing ``push_to_hub(repo_id, private=..., commit_description=...)``.
        repo_name: Target repository, either "name" (current user's namespace)
            or "namespace/name".
    """
    description = """
        A curated dataset of high-quality Impressionist paintings for fine-tuning
        text-to-image models. Contains ~1000 carefully selected images representing
        key characteristics of Impressionist art.
        """
    tags = ['computer-vision', 'art', 'impressionism']

    dataset.push_to_hub(
        repo_name,
        private=False,
        commit_description=description
    )

    # Tagging is secondary: the data is already uploaded at this point, so a
    # failure here is reported instead of raised.
    try:
        from huggingface_hub import metadata_update

        # metadata_update needs the fully qualified "namespace/name" id
        if "/" in repo_name:
            repo_id = repo_name
        else:
            repo_id = f"{HfApi().whoami()['name']}/{repo_name}"
        metadata_update(repo_id, {"tags": tags}, repo_type="dataset", overwrite=True)
    except Exception as e:
        print(f"Warning: dataset uploaded, but tags could not be applied: {e}")

# Create dataset card content.
# Markdown text describing the curated dataset (summary, composition, quality
# criteria, usage, source). It is only assigned here; no code in the cells
# shown in this section reads `dataset_card` or uploads it.
dataset_card = """
# Flux Impressionism Dataset

## Dataset Summary
A carefully curated collection of high-quality Impressionist paintings, specifically designed for fine-tuning text-to-image models. This dataset contains approximately 1000 images selected to represent the key characteristics of Impressionist art.

## Why This Dataset?
- Optimized size for efficient fine-tuning
- High-quality, curated images
- Balanced representation of subjects
- Focus on core Impressionist characteristics

## Content
- Total Images: ~1000
- Categories:
  - Landscapes (40%)
  - Urban Scenes (20%)
  - Portraits (20%)
  - Nature Close-ups/Still Life (20%)

## Quality Criteria
- Minimum resolution: 1024x1024
- Clear Impressionist techniques
- Varied lighting conditions
- Representative color palettes
- Core Impressionist period (1860-1900)

## Usage
Primarily intended for fine-tuning text-to-image models to generate Impressionist-style artwork.

## Source
Derived from WikiArt dataset, filtered and curated for optimal fine-tuning.
"""

In [None]:
from PIL import Image
import torch
from torchvision import transforms
from torch.utils.data import DataLoader

class ImageProcessor:
    """Handle image preprocessing for the model.

    Attributes:
        image_size: Side length, in pixels, that images are resized to.
        transform: Pipeline that resizes to (image_size, image_size), converts
            to a tensor and normalizes each channel with mean 0.5 / std 0.5.
    """
    def __init__(self, image_size=1024):
        """Build the transform pipeline for square images of ``image_size``."""
        self.image_size = image_size
        self.transform = transforms.Compose([
            transforms.Resize((image_size, image_size)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        ])

    def preprocess_image(self, image_path):
        """Preprocess a single image.

        Args:
            image_path: Path of the image file to load.

        Returns:
            The transformed image, or None if loading or transforming failed
            (the error is printed, not re-raised).
        """
        try:
            # The context manager closes the underlying file once the RGB copy
            # has been transformed, instead of leaving the handle open.
            with Image.open(image_path) as img:
                return self.transform(img.convert('RGB'))
        except Exception as e:
            print(f"Error processing image {image_path}: {e}")
            return None

    def validate_image(self, image_path):
        """Validate image file.

        Args:
            image_path: Path of the image file to check.

        Returns:
            True if the file opens and passes ``verify()``, False otherwise.
        """
        try:
            with Image.open(image_path) as img:
                img.verify()
            return True
        except Exception:
            # Any failure marks the file invalid. Unlike a bare except, this
            # still lets KeyboardInterrupt stop a long validation run.
            return False

# Initialize image processor with the default image_size (1024).
# validate_dataset_images() below reads this module-level instance.
image_processor = ImageProcessor()

def _is_loadable(image):
    """Return True if an already-opened image object can be fully decoded."""
    try:
        image.load()
        return True
    except Exception:
        return False


def validate_dataset_images(dataset):
    """Validate all images in the dataset and print a summary.

    Images that carry a file path are checked from disk through
    ``image_processor.validate_image``. Images without one are checked by
    decoding the in-memory object instead of being reported as invalid.
    NOTE(review): images decoded from in-memory bytes are expected to have an
    empty ``filename`` — confirm for this dataset.

    Args:
        dataset: Iterable of examples, each with an 'image' entry.
    """
    print("Validating dataset images...")
    valid_images = 0
    invalid_images = []

    for idx, example in enumerate(tqdm(dataset)):
        image = example['image']
        image_path = getattr(image, 'filename', '')
        if image_path:
            is_valid = image_processor.validate_image(image_path)
            label = image_path
        else:
            # No backing file to reopen; validate the object itself and use
            # the row index to identify it in the report.
            is_valid = _is_loadable(image)
            label = f"<in-memory image at index {idx}>"

        if is_valid:
            valid_images += 1
        else:
            invalid_images.append(label)

    print("\nValidation complete:")
    print(f"- Valid images: {valid_images}")
    print(f"- Invalid images: {len(invalid_images)}")

    if invalid_images:
        # Show only the first few entries to keep the cell output short
        print("\nInvalid images paths:")
        for path in invalid_images[:5]:
            print(f"- {path}")
        if len(invalid_images) > 5:
            print(f"... and {len(invalid_images) - 5} more")

# Validate dataset images.
# `dataset` is the module-level variable assigned from load_wikiart_dataset()
# in an earlier cell; it is None when loading failed, so validation is skipped.
if dataset is not None:
    validate_dataset_images(dataset)