In [4]:
from google.colab import drive

In [5]:
import pandas as pd
import numpy as np
import os
import re

In [6]:
# Step 1: Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
painters_to_keep = [
    'claude monet', 'pierre auguste renoir', 'vincent van gogh',
    'paul cezanne', 'pablo picasso', 'georges braque',
    'salvador dali', 'rene magritte'
]

In [None]:
# Define the dataset path
dataset_path = '/content/drive/MyDrive/colab_data/wikiart_dataset'

# Initialize data storage
data = {'painting': [], 'normalized_painting': [], 'genre': []}

# Traverse through the directory structure
for root, _, files in os.walk(dataset_path):
    for filename in files:
        # Check if it's a file
        if os.path.isfile(os.path.join(root, filename)):
            # Get the genre as the subfolder name directly under `wikiart_dataset`
            genre = os.path.basename(os.path.normpath(root)).lower()

            # Append the original painting name
            data['painting'].append(filename)

            # Apply the regex function to normalize the painting name
            normalized_name = re.sub(r'[^a-zA-Z0-9]', ' ', filename.lower())
            data['normalized_painting'].append(normalized_name)

            # Append the genre
            data['genre'].append(genre)

# Create a DataFrame from the collected data
data_df = pd.DataFrame(data)

# Display the DataFrame
print(data_df.head())

                                            painting  \
0                   albert-gleizes_acrobats-1916.jpg   
1  albert-gleizes_portrait-of-igor-stravinsky-191...   
2         albert-gleizes_woman-with-animals-1914.jpg   
3                       georges-braque_a-girl(1).jpg   
4          georges-braque_bottle-and-fishes-1910.jpg   

                                 normalized_painting              genre  
0                   albert gleizes acrobats 1916 jpg  analytical_cubism  
1  albert gleizes portrait of igor stravinsky 191...  analytical_cubism  
2         albert gleizes woman with animals 1914 jpg  analytical_cubism  
3                       georges braque a girl 1  jpg  analytical_cubism  
4          georges braque bottle and fishes 1910 jpg  analytical_cubism  


In [None]:
# Save the DataFrame to a CSV file
output_path = '/content/drive/MyDrive/colab_data/painting_data.csv'
data_df.to_csv(output_path, index=False)

print(f"Data saved to {output_path}")

Data saved to /content/drive/MyDrive/colab_data/painting_data.csv


In [None]:
# Create a regex pattern from the painters_to_keep list
painters_pattern = '|'.join([painter.replace(' ', '.*') for painter in painters_to_keep])

# Filter rows where 'painter' column matches any painter in painters_to_keep
filtered_df = data_df[data_df['normalized_painting'].str.contains(painters_pattern, case=False, na=False)]


# Display the filtered DataFrame
print(filtered_df)

                                                painting  \
3                           georges-braque_a-girl(1).jpg   
4              georges-braque_bottle-and-fishes-1910.jpg   
5       georges-braque_castle-at-la-roche-guyon-1909.jpg   
6      georges-braque_clarinet-and-bottle-of-rum-on-a...   
7            georges-braque_fruitdish-and-glass-1912.jpg   
...                                                  ...   
58691  salvador-dali_vir-et-mulier-in-paradiso-volupt...   
58692    salvador-dali_proelium-magnum-in-caelo-1967.jpg   
58693  salvador-dali_planctus-david-in-mortem-saul-19...   
58694  salvador-dali_rhinocerotic-portrait-of-vermeer...   
58695    sam-francis_hommage-a-vincent-van-gogh-1989.jpg   

                                     normalized_painting  \
3                           georges braque a girl 1  jpg   
4              georges braque bottle and fishes 1910 jpg   
5       georges braque castle at la roche guyon 1909 jpg   
6      georges braque clarinet and bott

In [None]:
filtered_df.shape

(6561, 3)

In [None]:
filtered_df.head()

Unnamed: 0,painting,normalized_painting,genre
3,georges-braque_a-girl(1).jpg,georges braque a girl 1 jpg,analytical_cubism
4,georges-braque_bottle-and-fishes-1910.jpg,georges braque bottle and fishes 1910 jpg,analytical_cubism
5,georges-braque_castle-at-la-roche-guyon-1909.jpg,georges braque castle at la roche guyon 1909 jpg,analytical_cubism
6,georges-braque_clarinet-and-bottle-of-rum-on-a...,georges braque clarinet and bottle of rum on a...,analytical_cubism
7,georges-braque_fruitdish-and-glass-1912.jpg,georges braque fruitdish and glass 1912 jpg,analytical_cubism


In [None]:
# Define a function to find the painter's name in each row
def find_painter(normalized_name):
    for painter in painters_to_keep:
        if painter.replace(' ', '') in normalized_name.replace(' ', ''):
            return painter
    return None

In [None]:
# Apply the function to create a 'painter' column
filtered_df['painter'] = filtered_df['normalized_painting'].apply(lambda x: find_painter(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['painter'] = filtered_df['normalized_painting'].apply(lambda x: find_painter(x))


In [None]:
filtered_df.head()

Unnamed: 0,painting,normalized_painting,genre,painter
3,georges-braque_a-girl(1).jpg,georges braque a girl 1 jpg,analytical_cubism,georges braque
4,georges-braque_bottle-and-fishes-1910.jpg,georges braque bottle and fishes 1910 jpg,analytical_cubism,georges braque
5,georges-braque_castle-at-la-roche-guyon-1909.jpg,georges braque castle at la roche guyon 1909 jpg,analytical_cubism,georges braque
6,georges-braque_clarinet-and-bottle-of-rum-on-a...,georges braque clarinet and bottle of rum on a...,analytical_cubism,georges braque
7,georges-braque_fruitdish-and-glass-1912.jpg,georges braque fruitdish and glass 1912 jpg,analytical_cubism,georges braque


In [None]:
filtered_df['painter'].unique()

array(['georges braque', 'pablo picasso', 'paul cezanne', 'rene magritte',
       'salvador dali', 'pierre auguste renoir', 'claude monet',
       'vincent van gogh', None], dtype=object)

In [None]:
filtered_df.shape

(6561, 4)

In [None]:
# Save the DataFrame to a CSV file
output_path = '/content/drive/MyDrive/colab_data/filtered_data.csv'
filtered_df.to_csv(output_path, index=False)

print(f"Data saved to {output_path}")

Data saved to /content/drive/MyDrive/colab_data/filtered_data.csv


Before we delete the paintings, we'll count the number of paintings left in the folder.

In [None]:
# Define the path to your dataset folder
dataset_path = '/content/drive/MyDrive/colab_data/wikiart_dataset'
# Get the list of filenames to keep from 'filtered_df'
filenames_to_keep = set(filtered_df['painting'].tolist())  # Convert to set for faster lookups
delete=[]
# Traverse through the directory structure
for root, _, files in os.walk(dataset_path):
    for filename in files:
      # Check if the file is not in filenames_to_keep
        if filename not in filenames_to_keep:
           file_path = os.path.join(root, filename)  # Use 'root' to get the correct path
           delete.append(file_path)
           #os.remove(file_path)  # Delete the file
           #print(f"Deleted: {file_path}")  # Optional: print deleted files

#print("Files not in filtered_df have been deleted.")


In [None]:
len(delete)

52135

We count the total number of paintings.

In [None]:
file_count = 0
for root, _, files in os.walk(dataset_path):
    for filename in files:
        # Check if it's a file
        if os.path.isfile(os.path.join(root, filename)):
            file_count += 1

In [None]:
print(file_count)

58696


In [None]:
filtered_df.shape

(6561, 4)

The numbers mathc so we can delete the paintings.

In [None]:
# Define the path to your dataset folder
dataset_path = '/content/drive/MyDrive/colab_data/wikiart_dataset'
# Get the list of filenames to keep from 'filtered_df'
filenames_to_keep = set(filtered_df['painting'].tolist())  # Convert to set for faster lookups
delete=[]
# Traverse through the directory structure
for root, _, files in os.walk(dataset_path):
    for filename in files:
      # Check if the file is not in filenames_to_keep
        if filename not in filenames_to_keep:
           file_path = os.path.join(root, filename)  # Use 'root' to get the correct path
           os.remove(file_path)  # Delete the file
           print(f"Deleted: {file_path}")  # Optional: print deleted files

print("Files not in filtered_df have been deleted.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Deleted: /content/drive/MyDrive/colab_data/wikiart_dataset/Realism/isaac-levitan_stormy-day-1897.jpg
Deleted: /content/drive/MyDrive/colab_data/wikiart_dataset/Realism/isaac-levitan_street-in-yalta-1886.jpg
Deleted: /content/drive/MyDrive/colab_data/wikiart_dataset/Realism/isaac-levitan_stubbled-field.jpg
Deleted: /content/drive/MyDrive/colab_data/wikiart_dataset/Realism/isaac-levitan_study-to-above-the-eternal-tranquility-1892.jpg
Deleted: /content/drive/MyDrive/colab_data/wikiart_dataset/Realism/isaac-levitan_summer-evening-edge-of-village-1899.jpg
Deleted: /content/drive/MyDrive/colab_data/wikiart_dataset/Realism/isaac-levitan_summer-evening-river.jpg
Deleted: /content/drive/MyDrive/colab_data/wikiart_dataset/Realism/isaac-levitan_summer-evening.jpg
Deleted: /content/drive/MyDrive/colab_data/wikiart_dataset/Realism/isaac-levitan_sunny-autumn-day-1897.jpg
Deleted: /content/drive/MyDrive/colab_data/wikiart_dataset/Realis

We'll count the number of paintings left in the folder.

In [None]:
file_count = 0
for root, _, files in os.walk(dataset_path):
    for filename in files:
        # Check if it's a file
        if os.path.isfile(os.path.join(root, filename)):
            file_count += 1

In [None]:
print(file_count)

6561
