In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

In [2]:
big_artworks = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')

In [3]:
artworks = big_artworks.sample(frac=0.1)

In [4]:
artworks.columns

Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)',
       'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)',
       'Duration (sec.)'],
      dtype='object')

In [5]:
# Select Columns.
artworks = artworks[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                    'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']]

# Convert URL's to booleans.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

# Drop films and some other tricky rows.
artworks = artworks[artworks['Department']!='Film']
artworks = artworks[artworks['Department']!='Media and Performance Art']
artworks = artworks[artworks['Department']!='Fluxus Collection']

# Drop missing data.
artworks = artworks.dropna()

In [6]:
artworks.head()

Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm)
63467,Marc Chagall,(French),(Male),(1924),Prints & Illustrated Books,1949-09-29,True,True,37.3,27.5
84137,R. Crumb,(American),(Male),(1960),Drawings,2005-05-10,False,False,27.6226,21.2725
35292,Arshile Gorky,(American),(Male),1946,Drawings,1967-10-18,True,True,46.7,61.0
118931,"Yayoi Kusama, Harry Shunk, János Kender",(Japanese) (French) (Hungarian),(Female) (Male) (Male),1968,Photography,2013-10-24,True,True,20.320041,25.400051
20099,Pierre Bonnard,(French),(Male),1900,Prints & Illustrated Books,1964-10-06,False,False,8.1,15.5


In [7]:
# Get data types.
artworks.dtypes

Artist           object
Nationality      object
Gender           object
Date             object
Department       object
DateAcquired     object
URL                bool
ThumbnailURL       bool
Height (cm)     float64
Width (cm)      float64
dtype: object

In [8]:
artworks['DateAcquired'] = pd.to_datetime(artworks.DateAcquired)
artworks['YearAcquired'] = artworks.DateAcquired.dt.year
artworks['YearAcquired'].dtype

dtype('int64')

In [9]:
# Remove multiple nationalities, genders, and artists.
artworks.loc[artworks['Gender'].str.contains('\) \('), 'Gender'] = '\(multiple_persons\)'
artworks.loc[artworks['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to start date, cutting down number of distinct examples.
artworks['Date'] = pd.Series(artworks.Date.str.extract(
    '([0-9]{4})', expand=False))[:-1]

# Final column drops and NA drop.
X = artworks.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)

# Create dummies separately.
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
dates = pd.get_dummies(artworks.Date)

# Concat artists with other variables
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

Y = artworks.Department

In [10]:
# Establish and fit the model, with a single, 1000 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [11]:
mlp.score(X, Y)

0.59849653045489593

In [12]:
Y.value_counts()/len(Y)

Prints & Illustrated Books    0.527274
Photography                   0.225520
Architecture & Design         0.117001
Drawings                      0.096473
Painting & Sculpture          0.033732
Name: Department, dtype: float64

In [13]:
cross_val_score(mlp, X, Y, cv=5)

array([ 0.27491574,  0.32192771,  0.32289157,  0.53879518,  0.3997107 ])

# Assignment
Experiment with different hidden layer structures. Try this on a subset of the data to improve runtime. Feel free to manipulate other parameters as well.

In [14]:
# Version 1 - two layers
mlp = MLPClassifier(hidden_layer_sizes=(500,4,),max_iter=400)
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(500, 4), learning_rate='constant',
       learning_rate_init=0.001, max_iter=400, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [15]:
mlp.score(X, Y)

0.52727447956823437

In [16]:
cross_val_score(mlp, X, Y, cv=5)

array([ 0.5272027 ,  0.52722892,  0.52722892,  0.52722892,  0.52748312])

In [17]:
# Version 2 - two layers
mlp = MLPClassifier(hidden_layer_sizes=(250,8,),max_iter=400)
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(250, 8), learning_rate='constant',
       learning_rate_init=0.001, max_iter=400, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [18]:
mlp.score(X, Y)

0.52727447956823437

In [19]:
cross_val_score(mlp, X, Y, cv=5)

array([ 0.53635051,  0.52722892,  0.52722892,  0.52722892,  0.53953713])

In [20]:
# Version 3 - three layers
mlp = MLPClassifier(hidden_layer_sizes=(500,8,4,),max_iter=400)
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(500, 8, 4), learning_rate='constant',
       learning_rate_init=0.001, max_iter=400, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [21]:
mlp.score(X, Y)

0.52737085582112564

In [22]:
cross_val_score(mlp, X, Y, cv=5)

array([ 0.5272027 ,  0.52722892,  0.52722892,  0.52722892,  0.52748312])

In [23]:
# Version 4 - activation = 'logistic'
mlp = MLPClassifier(hidden_layer_sizes=(1000,),activation='logistic',max_iter=400)
mlp.fit(X, Y)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=400, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [24]:
mlp.score(X, Y)

0.61661526599845795

In [25]:
cross_val_score(mlp, X, Y, cv=5)

array([ 0.64419836,  0.62939759,  0.6260241 ,  0.55084337,  0.62391514])

In [26]:
# Version 5 - alpha = 0.001
mlp = MLPClassifier(hidden_layer_sizes=(1000,),alpha=0.001,max_iter=400)
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=400, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [27]:
mlp.score(X, Y)

0.51975713184271399

In [28]:
cross_val_score(mlp, X, Y, cv=5)

array([ 0.53972075,  0.26554217,  0.22843373,  0.39084337,  0.28399229])