In [1]:
import pandas as pd
import glob
import os
import re
import numpy as np
import pathlib
from PIL import Image
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras import Sequential, layers
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Rescaling
%load_ext autoreload

In [2]:
# Location of the dataset's metadata CSV, relative to this notebook.
csv_path = "../raw_data/yourpaintings/painting_dataset_2021.csv"
df_yourpaintings = pd.read_csv(csv_path)

In [3]:
# Preview the first rows of the loaded metadata.
df_yourpaintings.head()

Unnamed: 0,Image URL,Web page URL,Subset,Labels
0,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/and-the-co...,'test',' cow'
1,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/0-6-00-6-0...,'train',' train'
2,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/044t-locom...,'train',' train'
3,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/080-locomo...,'test',' train'
4,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/17th-and-2...,'test',' horse'


In [4]:
# Count the distinct raw label strings. A single string can name several
# classes, so this is the number of label combinations, not of classes.
df_yourpaintings['Labels'].nunique()

89

# Find the number of unique classes

In [5]:
# A class name is any run of letters inside a raw label string.
words_pattern = '[a-z]+'

# Flatten the class names found in every row's "Labels" value into one list.
classes_list = [
    word
    for text in df_yourpaintings["Labels"]
    for word in re.findall(words_pattern, text, flags=re.IGNORECASE)
]

# De-duplicate to obtain the set of classes present in the dataset.
unique_classes = set(classes_list)
print(f"The list of unique class labels contains {len(unique_classes)} labels, namely: {unique_classes}")

The list of unique class labels contains 10 labels, namely: {'bird', 'diningtable', 'dog', 'aeroplane', 'horse', 'train', 'boat', 'cow', 'sheep', 'chair'}


## Encode data for the relevant categories

### Start by adding the list of labels to a `label_classes` column

In [6]:
# Parse each raw "Labels" string into the list of class names it contains and
# store that list in a new `label_classes` column.
#
# This is done as a single vectorised column assignment rather than a
# row-by-row `.at[...] = list` write: assigning list values cell by cell into
# a column that does not exist yet is slow and depends on how the installed
# pandas version treats sequence values in scalar setters.
df_yourpaintings["label_classes"] = df_yourpaintings["Labels"].str.findall(
    words_pattern, flags=re.IGNORECASE
)

In [7]:
# Sanity check: each cell should hold a Python list of class names.
sample_labels = df_yourpaintings.loc[10, 'label_classes']
print(type(sample_labels))
print(sample_labels)
print(df_yourpaintings.loc[15, 'label_classes'])

<class 'list'>
['aeroplane', 'horse']
['chair', 'diningtable', 'dog']


### Apply MultiLabelBinarizer on the `label_classes` column

In [8]:
# One-hot encode the per-row lists of class names: one column per class,
# 1 where the class appears in the row's list and 0 otherwise.
mlb = MultiLabelBinarizer()

test = df_yourpaintings['label_classes']
encoded_labels = mlb.fit_transform(test)

# Keep the original row index so the encoded rows line up with the source rows.
res = pd.DataFrame(encoded_labels, index=test.index, columns=mlb.classes_)

res

Unnamed: 0,aeroplane,bird,boat,chair,cow,diningtable,dog,horse,sheep,train
0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
8624,0,0,1,0,0,0,0,0,0,0
8625,0,0,1,0,0,0,0,0,0,0
8626,0,1,0,0,0,0,0,0,0,0
8627,0,1,0,0,0,0,0,0,0,0


# Preprocess the images

In [None]:
# Load every .jpg in the dataset folder as a 256x256 RGB array so that all
# images end up with the same shape.

imgs = []
img_path = "../raw_data/yourpaintings"

# os.listdir returns entries in arbitrary order; sorting makes the order of
# `imgs` reproducible from one run to the next.
for file in sorted(os.listdir(img_path)):
    if file.endswith(".jpg"):
        # The context manager closes the underlying file handle once the pixel
        # data has been copied out, instead of leaving one open per image.
        with Image.open(os.path.join(img_path, file)) as image:
            # Force three channels: greyscale, palette or RGBA files would
            # otherwise produce arrays whose shape differs from (256, 256, 3).
            image = image.convert("RGB").resize((256, 256))
            imgs.append(np.array(image))

In [None]:

# Draft of a label-aware loader. Only the assignments below execute; the rest
# of the cell is a commented-out template kept for reference.

#os.chdir("../raw_data/yourpaintings")
#for file in glob.glob("*.jpg"):
#    print(file)
data_path = "../raw_data/yourpaintings"
# List the .jpg files directly inside data_path. The listing used to join
# data_path with `cl`, a name that only exists inside the commented-out loop
# below, so the cell raised NameError. Sorted for a reproducible order.
images_path = sorted(elt for elt in os.listdir(data_path) if elt.endswith('.jpg'))
# NOTE(review): these three class names do not appear among the labels found in
# this dataset earlier in the notebook — confirm whether this mapping is still needed.
classes = {'daisy':0, 'dandelion':1, 'rose':2}
# NOTE(review): re-initialising `imgs` discards any images loaded into it by the
# preceding cell.
imgs = []
labels = []
# for (cl, i) in classes.items():
#     images_path = [elt for elt in os.listdir(os.path.join(data_path, cl)) if elt.find('.jpg')>0]
#     for img in tqdm(images_path[:300]):
#         path = os.path.join(data_path, cl, img)
#         if os.path.exists(path):
#             image = Image.open(path)
#             image = image.resize((256, 256))
#             imgs.append(np.array(image))
#             labels.append(i)

# X = np.array(imgs)
# num_classes = len(set(labels))
# y = to_categorical(labels, num_classes)

# # Finally we shuffle:
# p = np.random.permutation(len(X))
# X, y = X[p], y[p]

# first_split = int(len(imgs) /6.)
# second_split = first_split + int(len(imgs) * 0.2)
# X_test, X_val, X_train = X[:first_split], X[first_split:second_split], X[second_split:]
# y_test, y_val, y_train = y[:first_split], y[first_split:second_split], y[second_split:]

# return X_train, y_train, X_val, y_val, X_test, y_test, num_classes

# Build a basic CNN architecture for benchmarking

In [None]:
# Benchmark CNN: two convolution / max-pooling stages followed by a dense head.
# One output unit per class found by the MultiLabelBinarizer fitted earlier.
num_classes = len(mlb.classes_)

benchmark_model = Sequential()
# Input is a 4D array (batch_size, height, width, channels). The spatial size
# matches the 256x256 resize applied in the preprocessing cell; the previous
# 150x150 input shape did not.
benchmark_model.add(layers.Conv2D(128, kernel_size=7, activation='relu', input_shape=(256,256,3)))
benchmark_model.add(layers.MaxPooling2D(pool_size=(4,4), strides=(2,2)))
benchmark_model.add(layers.Conv2D(64, kernel_size=5, activation='relu'))
benchmark_model.add(layers.MaxPooling2D(pool_size=(4,4), strides=(2,2)))
benchmark_model.add(layers.Flatten())
benchmark_model.add(layers.Dense(128,activation='relu'))
# A painting can carry several labels at once, so each class gets an
# independent sigmoid score instead of competing in a softmax, and the loss is
# binary cross-entropy computed per class.
benchmark_model.add(layers.Dense(num_classes,activation='sigmoid'))
benchmark_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
benchmark_model.summary()

## Build a homemade CNN as baseline model

In [None]:
def load_own_model(num_classes=10):
    """Build and compile a small CNN for multi-label image classification.

    The network rescales 256x256 RGB inputs to [0, 1], applies three
    convolution / max-pooling stages, and ends in a dense head with one
    sigmoid unit per class.

    Args:
        num_classes: Number of output units, one per class. Defaults to 10,
            the number of distinct labels found in this notebook's dataset.

    Returns:
        A compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    # Scale pixel values from [0, 255] down to [0, 1] inside the model itself.
    model.add(Rescaling(1./255, input_shape=(256,256,3)))

    model.add(layers.Conv2D(16, kernel_size=10, activation='relu'))
    model.add(layers.MaxPooling2D(3))

    model.add(layers.Conv2D(32, kernel_size=8, activation="relu"))
    model.add(layers.MaxPooling2D(3))

    model.add(layers.Conv2D(32, kernel_size=6, activation="relu"))
    model.add(layers.MaxPooling2D(3))

    model.add(layers.Flatten())
    model.add(layers.Dense(100, activation='relu'))
    # The targets are multi-hot vectors (an image may have several labels), so
    # every class gets its own sigmoid probability rather than a softmax over
    # mutually exclusive classes.
    model.add(layers.Dense(num_classes, activation='sigmoid'))

    opt = optimizers.Adam(learning_rate=1e-4)
    # Binary cross-entropy treats each class as an independent yes/no decision,
    # which is what multi-hot targets require.
    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    return model

In [None]:
# Instantiate the homemade CNN and print its layer-by-layer summary.
model_homemade = load_own_model()
model_homemade.summary()

In [None]:
# Stop training once validation accuracy has not improved for 5 epochs and
# restore the weights from the best epoch.
es = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# NOTE(review): X_train / y_train / X_val / y_val are not assigned by any active
# cell visible in this notebook — confirm where they are created.
history = model_homemade.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=100,
    validation_data=(X_val, y_val),
    callbacks=[es],
)

In [None]:
# Evaluate the fitted model on X_test / y_test and display the returned metrics.
# NOTE(review): `res` also names the one-hot label DataFrame built earlier in
# this notebook; this assignment overwrites it.
res = model_homemade.evaluate(X_test, y_test)
res

In [None]:
# The last entry of the evaluation result is the accuracy metric.
test_accuracy = res[-1]
# Convert to percent first and let the format spec do the rounding:
# round(x, 2) * 100 re-introduces float noise (0.57 * 100 == 56.99999999999999).
print(f"test_accuracy = {test_accuracy * 100:.0f} %")