# Import

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_theme(style="darkgrid")
import matplotlib.pyplot as plt
import os
from shutil import copyfile

In [11]:
from neuralart.data import *

# Get Data

In [14]:
# All three locations sit under the same WikiArt folder, so name it once and
# derive the rest. csv_path and image_path are handed to get_data further down.
wikiart_root = "../raw_data/wikiart"
csv_path = wikiart_root
image_path = wikiart_root + "/wikiart"
sample_path = wikiart_root

In [59]:
# Load the image metadata table through the get_data helper (star-imported from neuralart.data).
# NOTE(review): create_csv=True presumably rebuilds the CSV on every run — confirm in neuralart.data.
data = get_data(csv_path, image_path, create_csv=True)

In [16]:
# Sanity check: print the table's (rows, columns), then display its first row.
print(data.shape)
data.head(1)

(81446, 9)


Unnamed: 0,path,movement,artist,title,image,genre,cs-split-genre,cs-split-style,cs-split-artist
0,Early_Renaissance/filippo-lippi_two-saints.jpg,Early_Renaissance,filippo-lippi,two-saints.jpg,filippo-lippi_two-saints.jpg,religious_painting,val,val,


In [24]:
# Regrouping rules for movement labels, written as {new label: [original labels]}.
# When the flattened mapping is passed to get_dataset as `class_`, a new label
# of None means "drop these classes"; a string means "rename them to it".
_class_groups = {
    None: ["Abstract_Expressionism", "Action_painting"],
    "Abstract": ["Color_Field_Painting"],
    "Cubism": ["Synthetic_Cubism", "Analytical_Cubism"],
    "Baroque": ["Rococo"],
    "Renaissance": ["Early_Renaissance",
                    "High_Renaissance",
                    "Mannerism_Late_Renaissance",
                    "Northern_Renaissance"],
    "Impressionism": ["Post_Impressionism", "Symbolism"],
}

# Flatten to {original label: new label or None}.
class_ = {original: regrouped
          for regrouped, originals in _class_groups.items()
          for original in originals}

In [58]:
def _sample_per_class(frame, target, n, random_state, replace):
    """Draw ``n`` rows from every ``target`` class of ``frame``.

    An empty frame is returned unchanged: group-wise sampling of an empty
    frame raises on some pandas versions.
    """
    if frame.empty:
        return frame
    return frame.groupby(by=target).sample(n=n,
                                           random_state=random_state,
                                           replace=replace)


def get_dataset(data, target="movement", class_=None, n=None, strategy='drop', random_state=123, output_path=None):
    """Return a filtered, relabelled and optionally re-sampled copy of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table. It is copied first and never modified.
    target : str
        Name of the column holding the class label. When it is ``'genre'``,
        rows with a missing genre are removed.
    class_ : dict or None
        Relabelling rules keyed by original label. A falsy value (e.g.
        ``None``) drops every row carrying that label; any other value
        replaces the label. Labels not listed are kept as they are.
    n : int or None
        Per-class sample size. When falsy, no sampling (and no saving) is done.
    strategy : {'drop', 'replace', 'max'}
        Only used when ``n`` is set.
        ``'replace'``: draw ``n`` rows per class with replacement.
        ``'drop'``: discard classes with fewer than ``n`` rows, then draw
        ``n`` rows per remaining class without replacement.
        ``'max'``: cap classes with more than ``n`` rows at ``n`` rows and
        keep smaller classes whole.
    random_state : int
        Seed forwarded to the group-wise sampling.
    output_path : str or None
        When set (and ``n`` is set), the result is passed to ``save_csv``
        with the file name
        ``<basename(output_path)>-<target>-class_<n classes>-n_<n>.csv``.

    Returns
    -------
    pandas.DataFrame
        The resulting table.

    Raises
    ------
    ValueError
        If ``n`` is set and ``strategy`` is not one of the three known values.
    """
    strategies = ('replace', 'drop', 'max')
    if n and strategy not in strategies:
        # A mistyped strategy used to be ignored, silently returning unsampled data.
        raise ValueError(f"Unknown strategy {strategy!r}; expected one of {strategies}")

    data_tmp = data.copy()

    if target == 'genre':
        data_tmp = data_tmp.dropna(axis=0, subset=[target])

    if class_:
        class2drop = [key for key, val in class_.items() if not val]
        class2keep = {key: val for key, val in class_.items() if val}
        # .copy() makes the filtered frame independent, so the column assignment
        # below does not trigger SettingWithCopyWarning.
        data_tmp = data_tmp[~data_tmp[target].isin(class2drop)].copy()
        data_tmp[target] = data_tmp[target].map(lambda label: class2keep.get(label, label))

    if n:
        # Size of the class each row belongs to (NaN for rows with a missing label).
        class_sizes = data_tmp[target].map(data_tmp[target].value_counts())

        if strategy == 'replace':
            data_tmp = _sample_per_class(data_tmp, target, n, random_state, replace=True)

        elif strategy == 'drop':
            # ">= n": a class with exactly n rows can still supply n rows, so it is kept.
            big_enough = (class_sizes >= n).to_numpy()
            data_tmp = _sample_per_class(data_tmp[big_enough], target, n, random_state,
                                         replace=False)

        else:  # strategy == 'max'
            oversized = (class_sizes > n).to_numpy()
            data2keep = data_tmp[~oversized]
            data2sample = _sample_per_class(data_tmp[oversized], target, n, random_state,
                                            replace=False)
            data_tmp = pd.concat([data2keep, data2sample])

        # NOTE(review): saving only happens when n is set, as in the original
        # layout — confirm whether output_path should also work without sampling.
        if output_path:
            save_csv(data_tmp, output_path,f"{os.path.basename(output_path)}-{target}-class_{data_tmp[target].nunique()}-n_{n}.csv")

    return data_tmp
        
        
        
        

In [None]:
# Movement merge rules, written as {merged label: [original labels]}.
_merge_groups = {
    "Abstract": ["Abstract_Expressionism", "Action_painting", "Color_Field_Painting"],
    "Cubism": ["Synthetic_Cubism", "Analytical_Cubism"],
    "Baroque": ["Rococo"],
    "Renaissance": ["Early_Renaissance",
                    "High_Renaissance",
                    "Mannerism_Late_Renaissance",
                    "Northern_Renaissance"],
    "Impressionism": ["Post_Impressionism", "Symbolism"],
}

# Flatten to {original label: merged label}.
merge = {original: merged
         for merged, originals in _merge_groups.items()
         for original in originals}

# Load the metadata again, this time passing the `merge` mapping to get_data.
# NOTE(review): presumably get_data relabels movements according to `merge` — confirm in neuralart.data.
data_merge = get_data(csv_path, image_path, create_csv=True, merge=merge)

# Data Visualization

In [None]:
# Number of non-null entries in each split column; `path` stands for all images.
col = ['cs-split-artist','cs-split-genre','cs-split-style','path']

# Display label for each counted column. Labels are attached by column name,
# so every bar keeps the right label whatever order the counts sort into
# (a fixed label list would silently mislabel bars if the ranking changed).
target_labels = {'path': 'total',
                 'cs-split-style': 'cs-movement (style)',
                 'cs-split-genre': 'cs-genre',
                 'cs-split-artist': 'cs-artist'}

# Compute the counts once, largest first, and reuse them for bars and annotations.
target_counts = (data[col].count()
                 .sort_values(ascending=False)
                 .rename(index=target_labels))

fig, ax = plt.subplots(1,1,figsize=(15,10))
sns.barplot(y=target_counts.index,
            x=target_counts.values,
            order=target_counts.index,
            ax=ax);

# Write each bar's value at its tip.
for i, v in enumerate(target_counts.values):
    ax.text(v, i , str(v), color='blue', fontweight='bold')

ax.set_title("Number of images per target");
ax.set_xlabel("Count");
ax.set_ylabel("Target");

In [None]:
# Distinct classes available for each candidate target, largest first.
col = ['artist','genre','movement']
classes_per_target = data[col].nunique().sort_values(ascending=False)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15,10))
sns.barplot(x=classes_per_target.values,
            y=classes_per_target.index,
            order=classes_per_target.index,
            ax=ax);

# Write each bar's value at its tip.
for row, n_classes in enumerate(classes_per_target.values):
    ax.text(n_classes, row, str(n_classes), color='blue', fontweight='bold')

ax.set(title="Number of classes per target (all images)",
       xlabel="Count",
       ylabel="Target");

In [None]:
# Images per movement; value_counts already orders them most frequent first.
movement_counts = data["movement"].value_counts()

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15,10));
sns.countplot(y="movement", data=data, order=movement_counts.index, ax=ax);

# Write each bar's value at its tip.
for row, n_images in enumerate(movement_counts.values):
    ax.text(n_images, row, str(n_images), color='blue', fontweight='bold')

ax.set(title="Number of images per movement (style)",
       ylabel="Movement (style)");

In [None]:
# Images per movement in the merged table, most frequent first.
merged_movement_counts = data_merge["movement"].value_counts()

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15,10));
sns.countplot(y="movement", data=data_merge, order=merged_movement_counts.index, ax=ax);

# Write each bar's value at its tip.
for row, n_images in enumerate(merged_movement_counts.values):
    ax.text(n_images, row, str(n_images), color='blue', fontweight='bold')

ax.set(title="Number of images per merged movement",
       ylabel="Movement (style)");

In [None]:
# Images per genre, most frequent first.
genre_counts = data["genre"].value_counts()

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15,10));
sns.countplot(y="genre", data=data, order=genre_counts.index, ax=ax);

# Write each bar's value at its tip.
for row, n_images in enumerate(genre_counts.values):
    ax.text(n_images, row, str(n_images), color='blue', fontweight='bold')

ax.set(title="Number of images per genre");

In [None]:
# Distinct artists in each movement, largest first (grouped once, reused below).
artists_per_movement = (data.groupby("movement")
                        .artist.nunique()
                        .sort_values(ascending=False))

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15,10));
sns.barplot(x=artists_per_movement.values,
            y=artists_per_movement.index,
            order=artists_per_movement.index,
            ax=ax);

# Write each bar's value at its tip.
for row, n_artists in enumerate(artists_per_movement.values):
    ax.text(n_artists, row, str(n_artists), color='blue', fontweight='bold')

ax.set(title="Number of artists per movement", xlabel="Count");

In [None]:
# Keep only the rows whose cs-split-genre is filled in, then count them per movement.
data2 = data[data["cs-split-genre"].notna()]
genre_split_counts = data2["movement"].value_counts()

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15,10));
sns.countplot(y="movement", data=data2, order=genre_split_counts.index, ax=ax);

# Write each bar's value at its tip.
for row, n_images in enumerate(genre_split_counts.values):
    ax.text(n_images, row, str(n_images), color='blue', fontweight='bold')

ax.set(title="Number of images with genre per movement (style)",
       ylabel="Movement (style)");

In [None]:
# Abstract Expressionism rows that have a cs-split-genre value (first 100).
# Both conditions are combined into one mask over `data`; chaining two [] filters
# applied a full-length mask to an already-filtered frame, which pandas only
# tolerates by re-aligning the mask and emitting a warning.
data.loc[data["cs-split-genre"].notnull() & (data["movement"] == "Abstract_Expressionism")].head(100)

In [None]:
def show_samples(df, sample_size=10, root_path='../raw_data/wikiart/wikiart/', random_state=None):
    """Display randomly chosen images from ``df``, one per row, each with a caption.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``path``, ``movement``, ``title`` and ``artist``.
    sample_size : int
        Number of images to show. It is capped at ``len(df)``; nothing is
        drawn when that leaves zero images.
    root_path : str
        Folder that the ``path`` column is relative to.
    random_state : int or None
        Seed forwarded to ``DataFrame.sample``; ``None`` gives a different
        draw on every call.

    Returns
    -------
    None
    """
    sample_size = min(sample_size, len(df))
    if sample_size <= 0:
        return

    # Sample whole rows so that each caption is built from the same row as its
    # image. (Indexing the columns with the loop counter captioned images with
    # the metadata of unrelated rows, and failed on frames whose index does not
    # contain 0..sample_size-1.)
    sampled = df[['path', 'movement', 'title', 'artist']].sample(n=sample_size,
                                                                 random_state=random_state)

    plt.figure(constrained_layout=True, figsize=(10, 5 * sample_size))
    rows = sampled.itertuples(index=False, name=None)
    for position, (path, movement, title, artist) in enumerate(rows, start=1):
        plt.subplot(sample_size, 1, position)
        image = plt.imread(os.path.join(root_path, path))
        label = f'{movement} - {title} by {artist}'
        plt.text(x=10, y=-2, s=label)
        plt.imshow(image)