# Introduction

# Read Data

Make sure to have the data saved locally at `../cs254-final-project/data`.

In [1]:
# Import libraries
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import os
import glob
from keras.callbacks import EarlyStopping, ModelCheckpoint
from PIL import Image
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from keras import regularizers, optimizers
from keras.layers import Activation, Dropout, Flatten, Dense, Conv2D, MaxPooling2D
from keras.preprocessing.image import ImageDataGenerator, array_to_img,img_to_array,load_img
from keras import backend as K
from sklearn.preprocessing import OneHotEncoder
from keras.layers import AveragePooling2D, MaxPooling2D, Dropout, GlobalMaxPooling2D, GlobalAveragePooling2D, Concatenate, BatchNormalization
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input

In [2]:
# Load the artist metadata table and report its (rows, columns) shape.
artists_csv_path = '../cs254-final-project/data/artists.csv'
artists = pd.read_csv(artists_csv_path)
print(artists.shape)

(50, 8)


# Data Processing

The following approach is adapted from [DeepArtist: Identify Artist from Art](https://www.kaggle.com/supratimhaldar/deepartist-identify-artist-from-art).

In [3]:
# `artists` is already loaded from artists.csv by the previous code cell and has
# not been modified since, so the file is not read a second time here; this
# cell only confirms the (rows, columns) shape before processing starts.
print(artists.shape)

In [4]:
# Creating a dataframe with the top 10 artists by number of paintings.
# The rows have to be ordered by the `paintings` column first: `head(10)` on
# the unsorted frame would simply return the first 10 rows of the CSV.
artists_top = artists.sort_values(by='paintings', ascending=False).head(10)
artists_top

Unnamed: 0,id,name,years,genre,nationality,bio,wikipedia,paintings
8,8,Vincent van Gogh,1853 – 1890,Post-Impressionism,Dutch,Vincent Willem van Gogh (Dutch: [ˈvɪnsɛnt ˈʋɪl...,http://en.wikipedia.org/wiki/Vincent_van_Gogh,877
30,30,Edgar Degas,1834 - 1917,Impressionism,French,Edgar Degas (US: or UK: ; born Hilaire-Germai...,http://en.wikipedia.org/wiki/Edgar_Degas,702
13,13,Pablo Picasso,1881 - 1973,Cubism,Spanish,Pablo Ruiz Picasso (; Spanish: [ˈpaβlo piˈkaso...,http://en.wikipedia.org/wiki/Pablo_Picasso,439
15,15,Pierre-Auguste Renoir,1841 - 1919,Impressionism,French,"Pierre-Auguste Renoir, commonly known as Augus...",http://en.wikipedia.org/wiki/Pierre-Auguste_Re...,336
19,19,Albrecht Dürer,1471 - 1528,Northern Renaissance,German,Albrecht Dürer (; German: [ˈʔalbʁɛçt ˈdyːʁɐ]; ...,http://en.wikipedia.org/wiki/Albrecht_Dürer,328
46,46,Paul Gauguin,1848 – 1903,"Symbolism,Post-Impressionism",French,"Eugène Henri Paul Gauguin (UK: , US: ; French:...",http://en.wikipedia.org/wiki/Paul_Gauguin,311
16,16,Francisco Goya,1746 - 1828,Romanticism,Spanish,Francisco José de Goya y Lucientes (; Spanish:...,http://en.wikipedia.org/wiki/Francisco_Goya,291
31,31,Rembrandt,1606 - 1669,Baroque,Dutch,Rembrandt Harmenszoon van Rijn (; Dutch: [ˈrɛm...,http://en.wikipedia.org/wiki/Rembrandt,262
20,20,Alfred Sisley,1839 - 1899,Impressionism,"French,British",Alfred Sisley (; French: [sislɛ]; 30 October 1...,http://en.wikipedia.org/wiki/Alfred_Sisley,259
32,32,Titian,1488 - 1576,"High Renaissance,Mannerism",Italian,Tiziano Vecelli or Tiziano Vecellio (pronounce...,http://en.wikipedia.org/wiki/Titian,255


## Exploring data and creating a DataFrame of all image paths with their associated artists

In [5]:
# Root folder of the image dataset.
images_dir = '../cs254-final-project/data/images/images'

# Entries found directly under the image root; they are named after the artists.
artists_dir = os.listdir(images_dir)

In [6]:
# Images DataFrame: one row per image file, with the artist it belongs to.
# Artist folder names are looked up with underscores in place of spaces.
artists_top_name = artists_top['name'].str.replace(' ', '_').values

# Collect every (path, artist) record first and build the frame once.
# Growing a DataFrame with pd.concat inside the loop re-copies all earlier
# rows on every iteration.
image_records = []
for name in artists_top_name:
    # glob returns matches in arbitrary order; sorting keeps the row order,
    # and therefore the seeded train/test split made from this frame,
    # reproducible from run to run.
    for path in sorted(glob.glob(images_dir + '/' + name + '/*')):
        image_records.append({'Path': path, 'Name': name})

# Passing `columns` keeps the expected schema even if no file was matched.
images_df = pd.DataFrame(image_records, columns=['Path', 'Name'])

# NOTE(review): the generators built from this frame report 9 classes although
# 10 artist names are queried, so presumably one folder name matches no files
# (possibly 'Albrecht_Dürer', e.g. a Unicode-normalisation mismatch) - verify.
images_df

Unnamed: 0,Path,Name
0,../cs254-final-project/data/images/images/Vinc...,Vincent_van_Gogh
1,../cs254-final-project/data/images/images/Vinc...,Vincent_van_Gogh
2,../cs254-final-project/data/images/images/Vinc...,Vincent_van_Gogh
3,../cs254-final-project/data/images/images/Vinc...,Vincent_van_Gogh
4,../cs254-final-project/data/images/images/Vinc...,Vincent_van_Gogh
...,...,...
3727,../cs254-final-project/data/images/images/Titi...,Titian
3728,../cs254-final-project/data/images/images/Titi...,Titian
3729,../cs254-final-project/data/images/images/Titi...,Titian
3730,../cs254-final-project/data/images/images/Titi...,Titian


In [7]:
# Create the train / validation / test image generators.

BATCH_SIZE = 64

# Size every image is resized to before it is fed to the network.
img_width, img_height = 277, 277

# 80/20 train/test split; the fixed random_state keeps it reproducible.
train_df = images_df.sample(frac=0.8, random_state=200)
test_df = images_df.drop(train_df.index)

# Keras image tensors are (height, width, channels) for channels_last and
# (channels, height, width) for channels_first.
if K.image_data_format() == 'channels_first':
    input_shape = (3, img_height, img_width)
else:
    input_shape = (img_height, img_width, 3)

# Fraction of train_df held out for validation.
VALIDATION_SPLIT = 0.15

# VGG16's `preprocess_input` already applies the normalisation the ImageNet
# weights expect and works on raw 0-255 pixel values. ImageDataGenerator
# applies `rescale` after the preprocessing function, so also passing
# rescale=1/255 would shrink the already-centred values a second time.
# `rescale` is therefore left out of every generator below.
train_generator = ImageDataGenerator(
    rotation_range=20,
    zoom_range=0.05,
    width_shift_range=0.05,
    height_shift_range=0.05,
    shear_range=0.05,
    horizontal_flip=True,
    fill_mode="nearest",
    validation_split=VALIDATION_SPLIT,
    preprocessing_function=preprocess_input,
)

# Validation images must not be augmented, otherwise the validation metrics
# are measured on distorted data. Using the same validation_split on the same
# dataframe selects the same held-out rows as the training generator excludes.
valid_generator = ImageDataGenerator(
    validation_split=VALIDATION_SPLIT,
    preprocessing_function=preprocess_input,
)

test_generator = ImageDataGenerator(preprocessing_function=preprocess_input)

train_gen = train_generator.flow_from_dataframe(
    train_df,
    x_col='Path',
    y_col='Name',
    class_mode='categorical',
    subset="training",
    shuffle=True,
    batch_size=BATCH_SIZE,
    target_size=(img_height, img_width),  # Keras expects (height, width)
    seed=42,
)

valid_gen = valid_generator.flow_from_dataframe(
    train_df,
    x_col='Path',
    y_col='Name',
    class_mode='categorical',
    subset="validation",
    shuffle=True,
    batch_size=BATCH_SIZE,
    target_size=(img_height, img_width),
    seed=42,
)

# No labels and no shuffling: one image per batch, in test_df order, so the
# predictions can later be matched back to the rows of test_df.
test_gen = test_generator.flow_from_dataframe(
    test_df,
    x_col='Path',
    class_mode=None,
    shuffle=False,
    batch_size=1,
    target_size=(img_height, img_width),
)


Found 2539 validated image filenames belonging to 9 classes.
Found 447 validated image filenames belonging to 9 classes.
Found 746 validated image filenames.


In [8]:
def create_model(input_shape, n_classes, optimizer='rmsprop', fine_tune=0):
    """Build and compile a classifier on top of an ImageNet-pretrained VGG16 base.

    Args:
        input_shape: Shape of one input image, passed to VGG16 as `input_shape`.
        n_classes: Number of units in the final softmax layer.
        optimizer: Optimizer name or instance handed to `model.compile`.
        fine_tune: Number of layers at the end of the VGG16 base that are left
            trainable. With 0 (or a negative value) the whole base is frozen.

    Returns:
        The compiled Keras model (categorical cross-entropy loss, accuracy metric).
    """
    conv_base = VGG16(include_top=False,
                      weights='imagenet',
                      input_shape=input_shape)

    if fine_tune > 0:
        # Freeze everything except the last `fine_tune` layers of the base.
        for layer in conv_base.layers[:-fine_tune]:
            layer.trainable = False
    else:
        for layer in conv_base.layers:
            layer.trainable = False

    # The head is built from tensorflow.keras (`layers`, `keras.Model`), the
    # same package VGG16 is imported from, so the function does not depend on
    # a `Model` import made in a later cell or on mixing two Keras packages.
    top_model = conv_base.output
    top_model = layers.Flatten(name="flatten")(top_model)
    top_model = layers.Dense(4096, activation='relu')(top_model)
    top_model = layers.Dense(1072, activation='relu')(top_model)
    top_model = layers.Dropout(0.2)(top_model)
    output_layer = layers.Dense(n_classes, activation='softmax')(top_model)

    model = keras.Model(inputs=conv_base.input, outputs=output_layer)

    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model
    

In [9]:
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import Model

# Number of whole batches per pass over each generator.
steps_train = train_gen.n // train_gen.batch_size
steps_valid = valid_gen.n // valid_gen.batch_size
steps_test = test_gen.n // test_gen.batch_size

optimizer = keras.optimizers.Adam(learning_rate=0.01)

# Take the class count from the generator instead of hard-coding it, so the
# softmax layer always matches the labels the generator actually produces.
n_classes = len(train_gen.class_indices)

n_epochs = 50

# fine_tune=0 keeps the whole VGG16 base frozen; only the new head is trained.
vgg = create_model(input_shape, n_classes, optimizer, fine_tune=0)

# Model checkpoint: keep only the best weights seen so far.
v1_checkpoint = ModelCheckpoint(filepath='v1_best_weights.hdf5',
                                save_best_only=True,
                                verbose=1)

# Early stopping: stop after 10 epochs without a lower validation loss and
# restore the best weights.
early_stop = EarlyStopping(monitor='val_loss',
                           patience=10,
                           restore_best_weights=True,
                           mode='min')

vgg.summary()


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 277, 277, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 277, 277, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 277, 277, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 138, 138, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 138, 138, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 138, 138, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 69, 69, 128)       0     

In [None]:
# Train the model. `Model.fit` accepts generators directly in TensorFlow 2;
# `fit_generator` is deprecated there.
model_history = vgg.fit(
    train_gen,
    steps_per_epoch=steps_train,
    validation_data=valid_gen,
    validation_steps=steps_valid,
    callbacks=[v1_checkpoint, early_stop],
    verbose=1,
    epochs=n_epochs,
)



Epoch 1/50


In [None]:
# Predict class probabilities for the test images with the model trained in
# this notebook (`vgg`). The previous `alexNet_model` is not defined anywhere
# in this notebook, and `predict_generator` is deprecated in favour of
# `Model.predict`.
# test_gen is unshuffled with batch_size=1, so running `steps_test` steps
# yields one prediction row per test image, in test_df order.
predict = vgg.predict(test_gen, steps=steps_test, verbose=1)

# test_gen is created with class_mode=None and therefore yields no labels, so
# it cannot be passed to `evaluate`; metrics are computed from `predict` instead.

In [None]:
from sklearn.metrics import classification_report

# Most probable class index for every test image.
predicted_class = np.argmax(predict, axis=1)
print(predicted_class)

indices = train_gen.class_indices.items()
y_true = test_df['Name']
print(indices)

# Lookup tables built from the generator's label mapping:
#   l: class index -> artist name, r: artist name -> class index.
l = {class_idx: artist for artist, class_idx in train_gen.class_indices.items()}
r = dict(train_gen.class_indices)

# Predicted indices translated to artist names; true names translated to indices.
prednames = [l[class_idx] for class_idx in predicted_class]
y_true = [r[artist] for artist in y_true]

# Per-file table of the predicted artist.
filenames = test_df['Path']
finaldf = pd.DataFrame({'Filename': filenames, 'Prediction': prednames})

# Confusion matrix and per-class metrics, both on class indices.
cm = confusion_matrix(y_true, predicted_class)
print(classification_report(y_true, predicted_class))



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# fmt='d' prints the integer counts in full; the default annotation format
# ('.2g') abbreviates counts of 100 or more to scientific notation.
ax = sns.heatmap(cm, annot=True, fmt='d')
# confusion_matrix puts the true classes on the rows and the predicted
# classes on the columns.
ax.set(title='Confusion matrix (test set)',
       xlabel='Predicted class index',
       ylabel='True class index')
plt.show()

In [None]:
import matplotlib.pyplot as plt

history = model_history.history
print(history.keys())

# The model is compiled with metrics=['accuracy'], so the history does not
# contain 'categorical_accuracy'. Pick whichever accuracy key is present
# instead of assuming one spelling.
acc_key = next(
    (key for key in ('accuracy', 'acc', 'categorical_accuracy') if key in history),
    None,
)
if acc_key is None:
    raise KeyError('No accuracy metric found in history: %s' % list(history.keys()))

accuracy = history[acc_key]
val_acc = history['val_' + acc_key]
loss = history['loss']
val_loss = history['val_loss']
epochs = range(len(accuracy))

# Accuracy curves.
plt.plot(epochs, accuracy, 'r', label="Training Accuracy")
plt.plot(epochs, val_acc, 'b', label="Validation Accuracy")
plt.title('Training and Validation Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()

# Loss curves on a separate figure.
plt.figure()
plt.plot(epochs, loss, 'r', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and Validation loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.show()

