In [1]:
import pandas as pd
import glob
import os
import re
import numpy as np
import pathlib
from PIL import Image
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras import Sequential, layers
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Rescaling
%load_ext autoreload

In [2]:
df_yourpaintings = pd.read_csv("../raw_data/yourpaintings/painting_dataset_2021.csv")

In [3]:
df_yourpaintings.head()

Unnamed: 0,Image URL,Web page URL,Subset,Labels
0,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/and-the-co...,'test',' cow'
1,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/0-6-00-6-0...,'train',' train'
2,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/044t-locom...,'train',' train'
3,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/080-locomo...,'test',' train'
4,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/17th-and-2...,'test',' horse'


In [4]:
df_yourpaintings['Labels'].nunique()

89

# Try to find number of unique classes

In [5]:
classes_list = []
words_pattern = '[a-z]+'

for index, row in df_yourpaintings.iterrows():
    text = row["Labels"]
    words_list = re.findall(words_pattern, text, flags=re.IGNORECASE)
    for word in words_list:
        classes_list.append(word)

unique_classes = set(classes_list)
print(f"The list of unique class labels contains {len(unique_classes)} labels, namely: {unique_classes}")

The list of unique class labels contains 10 labels, namely: {'aeroplane', 'sheep', 'train', 'chair', 'dog', 'boat', 'diningtable', 'bird', 'horse', 'cow'}


## Encode data for the relevant categories

### Start by adding the list of labels to a `label_classes` column

In [6]:
for index, row in df_yourpaintings.iterrows():
    text = row["Labels"]
    words_list = re.findall(words_pattern, text, flags=re.IGNORECASE)
    df_yourpaintings.at[index, "label_classes"] = words_list

In [7]:
print(type(df_yourpaintings.loc[10]['label_classes']))
print(df_yourpaintings.loc[10]['label_classes'])
print(df_yourpaintings.loc[15]['label_classes'])

<class 'list'>
['aeroplane', 'horse']
['chair', 'diningtable', 'dog']


### Apply MultipleLabelBinarizer on the `label_classes` column

In [8]:
mlb = MultiLabelBinarizer()

test = df_yourpaintings['label_classes']

res = pd.DataFrame(mlb.fit_transform(test),
                   columns=mlb.classes_,
                   index=test.index)

res

Unnamed: 0,aeroplane,bird,boat,chair,cow,diningtable,dog,horse,sheep,train
0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
8624,0,0,1,0,0,0,0,0,0,0
8625,0,0,1,0,0,0,0,0,0,0
8626,0,1,0,0,0,0,0,0,0,0
8627,0,1,0,0,0,0,0,0,0,0


# Preprocess the images

In [9]:
#make sure images are all of the same size and apply some basic preprocessing

#os.chdir("../raw_data/yourpaintings")

imgs = []
img_path = "../raw_data/yourpaintings"

for file in os.listdir(img_path):
    if file.endswith(".jpg"):
        image = Image.open(os.path.join(img_path, file))
        image = image.resize((256, 256))
        imgs.append(np.array(image))

In [33]:
X = np.array(imgs)
X.shape

(8247, 256, 256, 3)

In [14]:
merged = pd.concat([df_yourpaintings, res], axis=1)
merged['Image URL']
#fname = url[url.rfind('/')+1:]

0    https://d3d00swyhr67nd.cloudfront.net/w1200h12...
1    https://d3d00swyhr67nd.cloudfront.net/w1200h12...
2    https://d3d00swyhr67nd.cloudfront.net/w1200h12...
3    https://d3d00swyhr67nd.cloudfront.net/w1200h12...
4    https://d3d00swyhr67nd.cloudfront.net/w1200h12...
Name: Image URL, dtype: object

In [20]:
merged["Image URL"] = merged["Image URL"].map(str)

In [135]:
merged.columns

Index(['Image URL', 'Web page URL', 'Subset', 'Labels', 'label_classes',
       'aeroplane', 'bird', 'boat', 'chair', 'cow', 'diningtable', 'dog',
       'horse', 'sheep', 'train'],
      dtype='object')

In [146]:
df_data = pd.DataFrame()

In [None]:
# should just create a new column for X and y with filename.jpg

In [185]:
def get_your_paintings():
    PATH = pathlib.Path("../raw_data")
    yourpaintings_path = PATH/"yourpaintings"
    yourpaintings_path.mkdir(parents=True, exist_ok=True)
    urls = df_yourpaintings['Image URL'].tolist(); urls[:5]
    for url in urls[:5]:
        fname = url[url.rfind('/')+1:]
        print(fname)
    return urls, yourpaintings_path

urls, yourpaintings_path = get_your_paintings()
urls[:20]
urls = pd.Series(df_yourpaintings['Image URL'], dtype="string")
urls.dtype
urls.head()

NID_QUB_QUB_264-001.jpg
GMIII_MOSI_A1978_72_3-001.jpg
NY_NRM_1979_7964-001.jpg
CHE_CRHC_PCF40-001.jpg
NOT_NTMAG_1997_31-001.jpg


0    https://d3d00swyhr67nd.cloudfront.net/w1200h12...
1    https://d3d00swyhr67nd.cloudfront.net/w1200h12...
2    https://d3d00swyhr67nd.cloudfront.net/w1200h12...
3    https://d3d00swyhr67nd.cloudfront.net/w1200h12...
4    https://d3d00swyhr67nd.cloudfront.net/w1200h12...
Name: Image URL, dtype: string

In [194]:
df_yourpaintings['Image URL'] = df_yourpaintings['Image URL'].astype("str")
print(df_yourpaintings.shape)

(8629, 5)


In [193]:
df_yourpaintings['Image URL'].value_counts()

https://d3d00swyhr67nd.cloudfront.net/w1200h1200/collection/OU/BLLI/OU_BLLI_4-001.jpg               193
nan                                                                                                 176
https://d3d00swyhr67nd.cloudfront.net/w1200h1200/collection/NID/QUB/NID_QUB_QUB_264-001.jpg           1
https://d3d00swyhr67nd.cloudfront.net/w1200h1200/collection/BCN/BHT/BCN_BHT_04-001.jpg                1
https://d3d00swyhr67nd.cloudfront.net/w1200h1200/collection/NY/MAG/NY_MAG_HARAG_41-001.jpg            1
                                                                                                   ... 
https://d3d00swyhr67nd.cloudfront.net/w944h944/collection/GAC/GAC/GAC_GAC_14965-001.jpg               1
https://d3d00swyhr67nd.cloudfront.net/w944h944/collection/STF/STKMG/STF_STKMG_1967_FA_37-001.jpg      1
https://d3d00swyhr67nd.cloudfront.net/w944h944/collection/TATE/TATE/TATE_TATE_T06878_10-001.jpg       1
https://d3d00swyhr67nd.cloudfront.net/w800h800/collection/STC/ED

In [171]:
test_lst = []
for url in urls:
    fname = url[url.rfind('/')+1:]
    test_lst.append(url)
    
print(len(test_lst))

8629


In [195]:
files_lst = []
for file in os.listdir(img_path):
    if file.endswith(".jpg"):
        files_lst.append(file)
        
print(type(files_lst[0]))
print(len(files_lst))

<class 'str'>
8247


In [153]:
for i in range(len(files_lst)):
    row = merged[merged['Image URL'].str.contains(files_lst[i])]
    df_data = pd.concat([df_data, row], axis=0, ignore_index=True)

In [157]:
df_data

Unnamed: 0,Image URL,Web page URL,Subset,Labels,label_classes,aeroplane,bird,boat,chair,cow,diningtable,dog,horse,sheep,train
0,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/the-bridge...,'test',' cow',[cow],0,0,0,0,1,0,0,0,0,0
1,https://d3d00swyhr67nd.cloudfront.net/w944h944...,https://artuk.org/discover/artworks/horses-in-...,'test',' horse',[horse],0,0,0,0,0,0,0,1,0,0
2,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/an-enormou...,'test',' sheep',[sheep],0,0,0,0,0,0,0,0,1,0
3,https://d3d00swyhr67nd.cloudfront.net/w944h944...,https://artuk.org/discover/artworks/still-life...,'test',' diningtable',[diningtable],0,0,0,0,0,1,0,0,0,0
4,https://d3d00swyhr67nd.cloudfront.net/w944h944...,https://artuk.org/discover/artworks/chinese-bo...,'test',' boat',[boat],0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8434,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/landscape-...,'validation',' sheep',[sheep],0,0,0,0,0,0,0,0,1,0
8435,https://d3d00swyhr67nd.cloudfront.net/w944h944...,https://artuk.org/discover/artworks/study-for-...,'test',' horse',[horse],0,0,0,0,0,0,0,1,0,0
8436,https://d3d00swyhr67nd.cloudfront.net/w1200h12...,https://artuk.org/discover/artworks/still-life...,'test',' diningtable',[diningtable],0,0,0,0,0,1,0,0,0,0
8437,https://d3d00swyhr67nd.cloudfront.net/w944h944...,https://artuk.org/discover/artworks/horse-in-a...,'train',' horse',[horse],0,0,0,0,0,0,0,1,0,0


In [156]:
#df_data = pd.DataFrame()
df_data
print(df_data.shape)
print(X.shape)
print(len(files_lst))

(8439, 15)
(8247, 256, 256, 3)
8247


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [11]:
def load_own_model():

    model = Sequential()
    model.add(Rescaling(1./255, input_shape=(256,256,3)))

    model.add(layers.Conv2D(16, kernel_size=10, activation='relu'))
    model.add(layers.MaxPooling2D(3))
    
    model.add(layers.Conv2D(32, kernel_size=8, activation="relu"))
    model.add(layers.MaxPooling2D(3))

    model.add(layers.Conv2D(32, kernel_size=6, activation="relu"))
    model.add(layers.MaxPooling2D(3))
    
    model.add(layers.Flatten())
    model.add(layers.Dense(100, activation='relu'))
    model.add(layers.Dense(10, activation='sigmoid'))
    
    opt = optimizers.Adam(learning_rate=1e-4)
    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    
    return model

#model.fit(X_train, y_train, verbose=0, epochs=100, validation_split=0.2)

es = EarlyStopping(monitor = 'val_accuracy', 
                   mode = 'max', 
                   patience = 5, 
                   verbose = 1, 
                   restore_best_weights = True)

history = model.fit(X_train, y_train,
                             validation_split=0.2,
                             batch_size = 16, 
                             epochs = 100, 
                             callbacks=[es])

NameError: name 'cl' is not defined

In [None]:
from sklearn.metrics import accuracy_score
yhat = model.predict(X_test)
yhat = yhat.round()
# calculate accuracy
acc = accuracy_score(y_test, yhat)
# store result
print('>%.3f' % acc)
results.append(acc)

# Build a basic CNN architecture for benchmarking

In [None]:
#random architecture
benchmark_model = Sequential()
# Input here is 4D array (batchsize, height, width, channels) - we have already created the train_generator with batch size 32
# 32 Images of size each 150x150 with 3 color channels will be input into this layer
benchmark_model.add(layers.Conv2D(128, kernel_size=7, activation='relu', input_shape=(150,150,3)))
benchmark_model.add(layers.MaxPooling2D(pool_size=(4,4), strides=(2,2)))
benchmark_model.add(layers.Conv2D(64, kernel_size=5, activation='relu'))
benchmark_model.add(layers.MaxPooling2D(pool_size=(4,4), strides=(2,2)))
benchmark_model.add(layers.Flatten())
benchmark_model.add(layers.Dense(128,activation='relu'))
benchmark_model.add(layers.Dense(6,activation='softmax'))
benchmark_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
benchmark_model.summary()

## Load VGG16 as baseline model

In [None]:
def load_own_model():

    model = Sequential()
    model.add(Rescaling(1./255, input_shape=(256,256,3)))

    model.add(layers.Conv2D(16, kernel_size=10, activation='relu'))
    model.add(layers.MaxPooling2D(3))
    
    model.add(layers.Conv2D(32, kernel_size=8, activation="relu"))
    model.add(layers.MaxPooling2D(3))

    model.add(layers.Conv2D(32, kernel_size=6, activation="relu"))
    model.add(layers.MaxPooling2D(3))
    
    model.add(layers.Flatten())
    model.add(layers.Dense(100, activation='relu'))
    model.add(layers.Dense(3, activation='softmax'))
    
    opt = optimizers.Adam(learning_rate=1e-4)
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    
    return model

In [None]:
model_homemade = load_own_model()
model_homemade.summary()

In [None]:
es = EarlyStopping(monitor = 'val_accuracy', 
                   mode = 'max', 
                   patience = 5, 
                   verbose = 1, 
                   restore_best_weights = True)

history = model_homemade.fit(X_train, y_train,
                             validation_data = (X_val, y_val),
                             batch_size = 16, 
                             epochs = 100, 
                             callbacks=[es])

In [None]:
res = model_homemade.evaluate(X_test, y_test)
res

In [None]:
test_accuracy = res[-1]
print(f"test_accuracy = {round(test_accuracy,2)*100} %")