In [1]:
#Import packages
from urllib.request import urlopen

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
#Open the Catalog
# 'ansi' is a Windows-only codec alias (for 'mbcs'), so it raises LookupError on
# other platforms. 'cp1252' names the Western-European Windows code page
# explicitly and is available everywhere.
# NOTE(review): assumes catalog.csv was saved in the Western-European ANSI code page -- confirm
Catalog=pd.read_csv('catalog.csv',encoding='cp1252')
# Fixed random_state so the same 10 preview rows are shown on every run
Catalog.sample(10,random_state=122)

Unnamed: 0,AUTHOR,BORN-DIED,TITLE,DATE,TECHNIQUE,LOCATION,URL,FORM,TYPE,SCHOOL,TIMELINE
26186,"MANTEGNA, Andrea","(b. 1431, Isola di Carturo, d. 1506, Mantova)",The Adoration of the Magi (detail),1460-64,Tempera on wood,"Galleria degli Uffizi, Florence",https://www.wga.hu/html/m/mantegna/05/1chapel3...,painting,religious,Italian,1451-1500
22171,"ISRAELS, Isaac Lazarus","(b. 1865, Amsterdam, d. 1934, Den Haag)",In the Dance Hall,1893,"Oil on canvas, 76 x 100 cm","Rijksmuseum Kröller-Müller, Otterlo",https://www.wga.hu/html/i/israels/isaac/danceh...,painting,genre,Dutch,1851-1900
41141,SPINELLO ARETINO,"(b. ca. 1345, Arezzo, d. 1410, Arezzo)",Four Evangelists with their Symbols (vault fre...,1388,Fresco,"Sacristy, San Miniato al Monte, Florence",https://www.wga.hu/html/s/spinello/spinello/mi...,painting,religious,Italian,1351-1400
33073,"PARLER, Peter","(b. 1330, Schwabisch-Gmünd, d. 1399, Prague)",Self-Portrait,1379-86,"Sandstone, life-size","St Vitus Cathedral, Prague",https://www.wga.hu/html/p/parler/selfporx.html,sculpture,portrait,German,1351-1400
40315,"SIGNORELLI, Luca","(b. ca. 1450, Cortona, d. 1523, Cortona)",Sermon and Deeds of the Antichrist (detail),1499-1502,Fresco,"Chapel of San Brizio, Duomo, Orvieto",https://www.wga.hu/html/s/signorel/brizio/1/1a...,painting,religious,Italian,1451-1500
34471,"PISANO, Nicola",(active 1258-1278),Pulpit,1260,"Marble, height 465 cm","Baptistry, Pisa",https://www.wga.hu/html/p/pisano/nicola/1pisa_...,sculpture,religious,Italian,1251-1300
6880,"BRIATI, Giuseppe","(b. 1686, Murano, d. 1772, Murano)",Chandelier,c. 1750,Glassware,"Ca' Rezzonico, Venice",https://www.wga.hu/html/b/briati/chandeli.html,glassware,other,Italian,1701-1750
34939,"PORTA, Giacomo della","(b. ca. 1533, Genova, d. 1602, Roma)",Atlas Fountain (detail),1602,Stone and bronze,"Villa Aldobrandini, Frascati",https://www.wga.hu/html/p/porta_g/zatlasfo.html,sculpture,other,Italian,1551-1600
3690,"BANTI, Cristiano","(b. 1824, Santa Croce sull'Arno, d. 1904, Mont...",Three Peasant Women,1881,"Oil on canvas, 73 x 34 cm","Galleria Palatina (Palazzo Pitti), Florence",https://www.wga.hu/html/b/banti/3women.html,painting,genre,Italian,1851-1900
40028,"SCOREL, Jan van","(b. 1495, Schoorl, d. 1562, Utrecht)",View of Bethlehem,1520-21,"Pen and brown ink, 173 x 298 mm","British Museum, London",https://www.wga.hu/html/s/scorel/z_view.html,graphics,religious,Dutch,1501-1550


Note that this database includes all kinds of art: in 'TECHNIQUE' we even see 'Glassware' and 'Stone and bronze'. Let's find out which techniques Van Gogh used and keep only those. To actually be able to train, we will only consider the techniques for which we have more than 10 samples. 

In [3]:
# Drop everything after the first comma in 'TECHNIQUE' (typically the painting size),
# so that e.g. "Oil on canvas, 76 x 100 cm" becomes "Oil on canvas".
technique_only=Catalog['TECHNIQUE'].str.split(",", n = 1).str[0]
Catalog=Catalog.assign(TECHNIQUE=technique_only)

# Count how often Van Gogh used each technique, most frequent first
is_vangogh=Catalog.AUTHOR=='GOGH, Vincent van'
vangogh_techniques=Catalog.loc[is_vangogh,['TECHNIQUE']]
technique_counts=vangogh_techniques.groupby('TECHNIQUE').agg(Count=('TECHNIQUE','count'))
Techniques_Vangogh=technique_counts.reset_index().sort_values(by='Count',ascending=False)
Techniques_Vangogh.head(6)

Unnamed: 0,TECHNIQUE,Count
13,Oil on canvas,293
15,Oil on canvas on panel,18
28,Pencil,14
0,Black chalk,10
5,Black pencil,10
12,Lithograph,9


In [4]:
# Keep only the techniques with strictly more than 10 Van Gogh works, as a plain list of names
used_often=Techniques_Vangogh['Count'].gt(10)
Techniques_Vangogh=Techniques_Vangogh.loc[used_often,'TECHNIQUE'].tolist()
print(Techniques_Vangogh)

['Oil on canvas', 'Oil on canvas on panel', 'Pencil']


We now format the Catalog to include only the information we care about. 

In [5]:
# Keep only the works made with one of Van Gogh's techniques, and only the columns we need
uses_vangogh_technique=Catalog['TECHNIQUE'].isin(Techniques_Vangogh)
Catalog=Catalog.loc[uses_vangogh_technique,['AUTHOR','URL']]
# Binary label: 1 for Van Gogh, 0 for every other author
Catalog['Vangogh']=(Catalog['AUTHOR']=='GOGH, Vincent van').astype('int64')
# Renumber rows from 0; the old labels are kept in an 'index' column
Catalog=Catalog.reset_index()

In [6]:
#Format the Catalog
Catalog['AUTHOR']=Catalog['AUTHOR'].astype('string')
Catalog['URL']=Catalog['URL'].astype('string')

Note that the Catalog gives us the URL of each painting's web page. We now need to modify it so that it points directly to the image file.

In [7]:
def _page_to_image_url(page_url):
    """Turn a painting's page URL into the URL of its image file."""
    return page_url.replace("/html/", "/art/").replace('.html','.jpg')


def _image_to_detail_url(image_url):
    """Point an image URL at the '/detail/' directory instead of '/art/'."""
    return image_url.replace("/art/", "/detail/")


Catalog['URL']=Catalog['URL'].apply(_page_to_image_url)
# NOTE(review): judging by the column name, '/detail/' presumably serves a smaller version of the image -- confirm
Catalog['URL_small']=Catalog['URL'].apply(_image_to_detail_url)

A helper function to download and decode an image

In [8]:
def get_image(url):
    """Download the image at `url` and decode it into an array.

    Parameters
    ----------
    url : str
        Address of an encoded image file (e.g. a JPEG).

    Returns
    -------
    numpy.ndarray or None
        The result of ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``. ``cv2.imdecode``
        gives ``None`` when the downloaded bytes cannot be decoded as an image.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the request fails.
    """
    # The context manager closes the HTTP response even if reading fails,
    # so connections are not leaked when this is called in a long loop.
    with urlopen(url) as resp:
        raw_bytes = resp.read()
    # imdecode expects the encoded file as a 1-D uint8 buffer
    image = np.asarray(bytearray(raw_bytes), dtype="uint8")
    return cv2.imdecode(image, cv2.IMREAD_COLOR)

We now take a subset of the Catalog, because we can't work with this many images on our laptops. We'll include all of Van Gogh's paintings and 1000 works by other artists. 

In [9]:
# Every Van Gogh row plus a random draw of 1000 rows by other authors
ind_vangogh=Catalog.loc[Catalog.Vangogh==1].index
# A fixed random_state makes the draw (and everything built on it) repeatable
ind_others=Catalog.loc[Catalog.Vangogh==0].sample(1000,random_state=122).index
ind_sample=list(ind_vangogh)+list(ind_others)
# These are index labels, so select by label (.loc) rather than by position (.iloc)
Catalog_Sample=Catalog.loc[ind_sample]
# The full catalog is not used again in this notebook section
del Catalog

In [10]:
#Now we'll split this set into train and test datasets
# train_test_split is imported in the notebook's import cell.
# Stratifying on the Vangogh label keeps the class ratio the same in both
# splits; the fixed random_state makes the split repeatable.
Catalog_train,Catalog_test=train_test_split(
    Catalog_Sample,
    stratify=Catalog_Sample.Vangogh,
    test_size=0.2,
    random_state=43202,
)

In [11]:
# Renumber the rows of both splits from 0; the previous labels are kept as an extra column
Catalog_train,Catalog_test=(split.reset_index() for split in (Catalog_train,Catalog_test))

In [12]:
# Keep only the columns used from here on, in the same order for both splits
kept_columns=['AUTHOR','URL','Vangogh','URL_small']
Catalog_train=Catalog_train[kept_columns]
Catalog_test=Catalog_test[kept_columns]
Catalog_train.head()

Unnamed: 0,AUTHOR,URL,Vangogh,URL_small
0,"GOGH, Vincent van",https://www.wga.hu/art/g/gogh_van/13/copies01.jpg,1,https://www.wga.hu/detail/g/gogh_van/13/copies...
1,"RUBENS, Peter Paul",https://www.wga.hu/art/r/rubens/22mythol/281my...,0,https://www.wga.hu/detail/r/rubens/22mythol/28...
2,"GOGH, Vincent van",https://www.wga.hu/art/g/gogh_van/18/2arles10.jpg,1,https://www.wga.hu/detail/g/gogh_van/18/2arles...
3,"GOGH, Vincent van",https://www.wga.hu/art/g/gogh_van/03/nuenen40.jpg,1,https://www.wga.hu/detail/g/gogh_van/03/nuenen...
4,"VERMEER, Johannes",https://www.wga.hu/art/v/vermeer/03c/25artpa5.jpg,0,https://www.wga.hu/detail/v/vermeer/03c/25artp...


In [13]:
# Download every test image and store the decoded arrays in an 'Image' column.
N=Catalog_test.shape[0]
# Iterate over the URL values themselves so the loop does not depend on the
# frame being labelled 0..N-1.
All_Images=[get_image(url) for url in Catalog_test.URL_small]
# An object-dtype Series built on the frame's own index holds one array per row,
# with no placeholder column or label-slice assignment needed.
Catalog_test['Image']=pd.Series(All_Images,index=Catalog_test.index,dtype='object')
# Persist the frame, images included, for later use
Catalog_test.to_pickle('Catalog_test.pkl',protocol=3)

In [14]:
# Download every training image and store the decoded arrays in an 'Image' column.
N=Catalog_train.shape[0]
# Iterate over the URL values themselves so the loop does not depend on the
# frame being labelled 0..N-1.
All_Images=[get_image(url) for url in Catalog_train.URL_small]
# An object-dtype Series built on the frame's own index holds one array per row,
# with no placeholder column or label-slice assignment needed.
Catalog_train['Image']=pd.Series(All_Images,index=Catalog_train.index,dtype='object')
# Persist the frame, images included, for later use
Catalog_train.to_pickle('Catalog_train.pkl',protocol=3)

In [None]:
# (scratch cell: the stray undefined name that was here raised NameError on Run All)