In [None]:
import pandas as pd

# Location of the CSV file that describes the art dataset, given relative to
# the notebook's working directory.
csv_file_path = "../datasets/art_dataset/artists.csv"

# Load the description file into a pandas DataFrame. Later cells read the
# 'genre' and 'name' columns from this frame.
df = pd.read_csv(csv_file_path)

# Show the first 15 rows as a quick sanity check of the loaded data.
df.head(15)


In [None]:
# Split the comma-separated genre strings into one flat list of genre names.
# Rows without a genre (NaN) are dropped first: splitting them yields NaN,
# which cannot be iterated. Empty fragments (e.g. from a trailing comma) are
# skipped because an empty genre name would match every artist later on.
all_genres = [
    genre.strip()
    for genre_list in df['genre'].dropna().astype(str).str.split(',')
    for genre in genre_list
    if genre.strip()
]

# Get unique genres
unique_genres = set(all_genres)

# Sets have no stable iteration order, so sort to keep the printed output
# identical between kernel restarts.
print("All possible genres:")
for genre in sorted(unique_genres):
    print(genre)

In [None]:
import os 
import glob

def preprocess_artist_names(artists):
    """Convert artist names to the spelling used for the image folders.

    Every space in a name is replaced by an underscore; nothing else is
    altered.

    Args:
        artists: Iterable of artist name strings.

    Returns:
        A new list with the converted names, in the same order as the input.
    """
    folder_names = []
    for full_name in artists:
        # Splitting on a single space and re-joining with '_' swaps each
        # space for an underscore, including consecutive ones.
        folder_names.append('_'.join(full_name.split(' ')))
    return folder_names

# Get artists by genre
def get_artists_by_genre(df, genre, verbose = False):
    """Return the folder-style names of all artists tagged with ``genre``.

    The 'genre' column holds comma-separated genre names. An artist is
    selected only when one of those names equals ``genre`` exactly (after
    stripping surrounding whitespace). A plain substring/regex search would
    also return e.g. 'Post-Impressionism' artists when asked for
    'Impressionism', and would interpret regex metacharacters in ``genre``.

    Args:
        df: DataFrame with a 'genre' column and a 'name' column.
        genre: Genre name to look up; treated as a literal string.
        verbose: When True, print the genre and the matching artists.

    Returns:
        List of matching artist names, passed through
        ``preprocess_artist_names``, in DataFrame row order.
    """
    wanted_genre = genre.strip()

    def _has_genre(genre_field):
        # Missing genres (NaN/None) are not strings and never match.
        if not isinstance(genre_field, str):
            return False
        return wanted_genre in (part.strip() for part in genre_field.split(','))

    # astype(bool) keeps the mask usable for indexing even when df is empty
    # and map() returns an object-dtype Series.
    mask = df['genre'].map(_has_genre).astype(bool)
    artists = df.loc[mask, 'name'].to_list()
    processed_artists = preprocess_artist_names(artists)
    if verbose:
        print(f"Artists with {genre} works:")
        print(processed_artists)
    return processed_artists

# Get images for artists of a specific style
def get_style_images(images_path, artists):
    """Collect the paths of all '*.jpg' files in the given artists' folders.

    Args:
        images_path: Directory that contains one sub-folder per artist.
        artists: Iterable of artist folder names inside ``images_path``.

    Returns:
        List of image paths, grouped by artist in the order given and sorted
        within each artist. Artists without a folder or without matching
        files contribute nothing.
    """
    all_images = []
    for artist_name in artists:
        artist_directory = os.path.join(images_path, artist_name)
        # Escape the directory so glob characters in a folder name
        # ('[', ']', '*', '?') are matched literally rather than as wildcards.
        pattern = os.path.join(glob.escape(artist_directory), '*.jpg')
        # glob returns matches in arbitrary order; sort so that seeded
        # sampling downstream is reproducible.
        all_images.extend(sorted(glob.glob(pattern)))
    return all_images

# Look up the artists of the three genres of interest, printing each result.
expressionism_artists, surrealism_artists, abstractionism_artists = (
    get_artists_by_genre(df, style_name, verbose=True)
    for style_name in ('Expressionism', 'Surrealism', 'Abstractionism')
)

In [None]:
import shutil
import random

def _copy_images(image_paths, target_dir):
    """Copy every file in ``image_paths`` into ``target_dir`` under its base name."""
    for image_path in image_paths:
        image_name = os.path.basename(image_path)
        shutil.copyfile(image_path, os.path.join(target_dir, image_name))


def create_full_balanced_ds(unique_genres, images_path, total_images, val_size,
                            seed=None,
                            destination_dir="../datasets/art_dataset/train",
                            destination_val_dir="../datasets/art_dataset/validation"):
    """Create train/validation image folders with every style equally represented.

    For each genre, up to ``total_images // len(unique_genres)`` images are
    sampled at random from the folders of that genre's artists. The pooled
    sample is shuffled, the first ``val_size`` images become the validation
    set and the rest the training set, and both sets are copied flat (by
    base name) into their destination directories.

    An image is selected at most once, even when its artist belongs to
    several genres. Without this, the same file could end up in both the
    training and the validation set and the reported counts would exceed the
    number of files actually written. Genres are processed in sorted order,
    so an image shared between genres counts towards the first of them.

    The artist table is read from the notebook-level DataFrame ``df``.
    Files that share a base name in different artist folders would overwrite
    each other in the destination directory.

    Args:
        unique_genres: Iterable of genre names.
        images_path: Directory containing one sub-folder per artist.
        total_images: Target number of images across all genres.
        val_size: Number of sampled images to set aside for validation.
        seed: Optional seed for reproducible sampling and shuffling. When
            None, the module-level ``random`` generator is used.
        destination_dir: Directory to create for the training images.
        destination_val_dir: Directory to create for the validation images.

    Raises:
        ValueError: If ``unique_genres`` is empty, ``val_size`` is negative,
            or ``val_size`` exceeds the number of images sampled.
        FileExistsError: If either destination directory already exists.
    """
    # Sort so the processing order does not depend on set iteration order.
    genres = sorted(unique_genres)
    total_styles = len(genres)
    if total_styles == 0:
        raise ValueError("unique_genres must contain at least one genre.")
    if val_size < 0:
        raise ValueError("val_size must be non-negative.")

    # Check both destinations before creating anything, so a failure never
    # leaves a half-created output that blocks the next run.
    for directory in (destination_dir, destination_val_dir):
        if os.path.exists(directory):
            raise FileExistsError(f"Destination directory '{directory}' already exists.")

    # A private generator keeps a seeded run from disturbing global state.
    rng = random if seed is None else random.Random(seed)

    # Calculate number of images to sample from each style
    num_images_per_style = total_images // total_styles

    dataset = []
    already_selected = set()
    for genre in genres:
        genre_artists = get_artists_by_genre(df, genre)
        style_images = get_style_images(images_path, genre_artists)
        # Drop images already taken for another genre; sort so that a seeded
        # sample does not depend on set ordering.
        candidates = sorted(set(style_images) - already_selected)
        sampled_images = rng.sample(candidates, min(num_images_per_style, len(candidates)))
        already_selected.update(sampled_images)
        dataset.extend(sampled_images)

    if val_size > len(dataset):
        raise ValueError(
            f"val_size ({val_size}) exceeds the number of sampled images ({len(dataset)})."
        )

    # Shuffle so the validation slice is a random mix of all styles.
    rng.shuffle(dataset)
    val_dataset = dataset[:val_size]
    train_dataset = dataset[val_size:]

    # Only touch the filesystem once the split is known to be valid.
    os.makedirs(destination_dir)
    os.makedirs(destination_val_dir)
    _copy_images(train_dataset, destination_dir)
    _copy_images(val_dataset, destination_val_dir)

    print(f"Training dataset created with {len(train_dataset)} images!")
    print(f"Validation dataset created with {len(val_dataset)} images!")

# Build the training and validation folders with every art style equally
# represented. Only 200 images are held out for validation because little
# training data is available for pre-training our PetsGAN.
create_full_balanced_ds(
    unique_genres=unique_genres,
    images_path='../datasets/art_dataset/images',
    total_images=3680,
    val_size=200,
)