In [9]:
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

import time
from datetime import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

In [10]:
# Read the Museum of Modern Art collection CSV over HTTP and report how long
# the download + parse took.
ARTWORKS_CSV_URL = 'https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv'

time_start = dt.now()
print('start time {}'.format(time_start))

artworks = pd.read_csv(ARTWORKS_CSV_URL)

time_end = dt.now()
print('end time {}'.format(time_end))

read_duration = time_end - time_start
print('Total time passed for read_csv is {}'.format(read_duration))

# Trailing expression: the notebook renders the column index of the raw frame.
artworks.columns

start time 2019-03-18 08:06:44.017013
end time 2019-03-18 08:06:53.726568
Total time passed for read_csv is 0:00:09.709555


Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)',
       'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)',
       'Duration (sec.)'],
      dtype='object')

In [11]:
# Select Columns.
# .copy() makes the selection an independent frame, so the column assignments
# below write to `artworks` itself rather than to a slice of the raw frame
# (which triggers SettingWithCopyWarning and is ambiguous view-vs-copy).
selected_columns = ['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                    'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']
artworks = artworks[selected_columns].copy()

# Convert URL's to booleans: True when a URL / thumbnail URL is present.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

# Drop films and some other tricky rows.
# A single isin() mask replaces three successive != filters; rows with a
# missing Department are kept here, exactly as before, and removed by dropna().
excluded_departments = ['Film', 'Media and Performance Art', 'Fluxus Collection']
artworks = artworks[~artworks['Department'].isin(excluded_departments)]

# Drop missing data (any row with a NaN in one of the selected columns).
artworks = artworks.dropna()

In [12]:
# Parse the acquisition date and keep its year as a numeric feature.
artworks['DateAcquired'] = pd.to_datetime(artworks.DateAcquired)
artworks['YearAcquired'] = artworks.DateAcquired.dt.year

# Remove multiple nationalities, genders, and artists.
# Raw strings are used so the regex escapes are valid Python; the replacement
# labels keep their literal backslashes so the resulting category values (and
# therefore the dummy column names) are unchanged.
multiple_entries_pattern = r'\) \('
artworks.loc[artworks['Gender'].str.contains(multiple_entries_pattern), 'Gender'] = r'\(multiple_persons\)'
artworks.loc[artworks['Nationality'].str.contains(multiple_entries_pattern), 'Nationality'] = r'\(multiple_nationalities\)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to start date, cutting down number of distinct examples.
# expand=False already yields a Series aligned with artworks' index. The
# previous trailing [:-1] slice dropped the last element, which index
# alignment then turned into NaN for the final row.
artworks['Date'] = artworks.Date.str.extract(r'([0-9]{4})', expand=False)

# Final column drops. No rows are dropped here: a row whose Date holds no
# four-digit year has NaN and simply gets all-zero date dummies below.
X = artworks.drop(columns=['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'])

# Create dummies separately.
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
dates = pd.get_dummies(artworks.Date)

# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

# Target: the department each artwork belongs to.
Y = artworks.Department

## Neural Network with two layers

In [13]:
time_start = dt.now()
print('Running MLPClassifier two layers (100,4)')
print('start time {}'.format(time_start))

# Establish and fit the model with two hidden layers: 100 units, then 4 units.
# random_state fixes the weight initialisation and batch shuffling so that
# re-running the cell reproduces the same fit.
mlp = MLPClassifier(hidden_layer_sizes=(100, 4), random_state=42)
mlp.fit(X, Y)

time_end = dt.now()
print('end time {}'.format(time_end))
print('Total time passed is {}'.format(time_end - time_start))

Running MLPClassifier two layers (100,4)
start time 2019-03-18 08:06:57.411779
end time 2019-03-18 08:08:02.536504
Total time passed is 0:01:05.124725


In [14]:
# Mean accuracy on the same X, Y the model was fitted on (not a held-out set).
# NOTE(review): the recorded value matches the 'Drawings & Prints' share shown
# by Y.value_counts(), so the network may be predicting only the majority
# class - confirm, and consider scaling the numeric features.
mlp.score(X, Y)

0.621883165360209

In [15]:
# Share of each department in the target; the largest share is the accuracy a
# majority-class baseline would reach.
Y.value_counts().div(len(Y))

Drawings & Prints        0.621874
Photography              0.227417
Architecture & Design    0.113177
Painting & Sculpture     0.033893
Media and Performance    0.003639
Name: Department, dtype: float64

In [16]:
# 5-fold cross-validation of the current mlp configuration on X, Y; the cell
# displays one score per fold.
cross_val_score(mlp, X, Y, cv=5)

array([0.62184482, 0.62184482, 0.62187383, 0.62190285, 0.62190285])

## Neural Network with three layers

In [17]:
time_start = dt.now()
# The label now matches the architecture: (60,6,2) is three hidden layers.
print('Running MLPClassifier three layers (60,6,2)')
print('start time {}'.format(time_start))

# Establish and fit the model with three hidden layers: 60, 6 and 2 units.
# random_state fixes the weight initialisation and batch shuffling so that
# re-running the cell reproduces the same fit.
mlp = MLPClassifier(hidden_layer_sizes=(60, 6, 2), random_state=42)
mlp.fit(X, Y)

time_end = dt.now()
print('end time {}'.format(time_end))
print('Total time passed is {}'.format(time_end - time_start))

Running MLPClassifier two layers (60,6,2)
start time 2019-03-18 08:12:34.880081
end time 2019-03-18 08:13:37.185645
Total time passed is 0:01:02.305564


In [18]:
# Mean accuracy of the three-layer model on the same X, Y it was fitted on
# (not a held-out set).
mlp.score(X, Y)

0.6218738335199702

In [19]:
# Department shares in the target, shown again for comparison with the score
# of the three-layer model.
Y.value_counts().div(len(Y))

Drawings & Prints        0.621874
Photography              0.227417
Architecture & Design    0.113177
Painting & Sculpture     0.033893
Media and Performance    0.003639
Name: Department, dtype: float64

In [None]:
time_start = dt.now()
print('Running cross_val_score')
print('start time {}'.format(time_start))

# Keep the fold scores: as a bare expression in the middle of the cell the
# result was neither stored nor displayed.
cv_scores = cross_val_score(mlp, X, Y, cv=5)

time_end = dt.now()
print('end time {}'.format(time_end))
print('Total time passed is {}'.format(time_end - time_start))

# Last expression of the cell, so the notebook displays one score per fold.
cv_scores

array([0.62184482, 0.62184482, 0.62187383, 0.62190285, 0.62190285])

## Neural Network with one layer

In [None]:
time_start = dt.now()
# The label now matches the architecture: (1000,) is one hidden layer.
print('Running MLPClassifier one layer (1000,)')
print('start time {}'.format(time_start))

# Establish and fit the model, with a single, 1000 perceptron layer.
# random_state fixes the weight initialisation and batch shuffling so that
# re-running the cell reproduces the same fit.
mlp = MLPClassifier(hidden_layer_sizes=(1000,), random_state=42)
mlp.fit(X, Y)

time_end = dt.now()
print('end time {}'.format(time_end))
print('Total time passed is {}'.format(time_end - time_start))

Running MLPClassifier two layers (60,6,2)
start time 2019-03-18 08:17:14.960101


In [None]:
# Mean accuracy of the one-layer model on the same X, Y it was fitted on
# (not a held-out set).
mlp.score(X, Y)

In [None]:
# Department shares in the target, shown again for comparison with the score
# of the one-layer model.
Y.value_counts().div(len(Y))

In [None]:
time_start = dt.now()
# This cell runs cross-validation, not a model fit, so the label says so.
print('Running cross_val_score')
print('start time {}'.format(time_start))

# Keep the fold scores: as a bare expression in the middle of the cell the
# result was neither stored nor displayed.
cv_scores = cross_val_score(mlp, X, Y, cv=5)

time_end = dt.now()
print('end time {}'.format(time_end))
print('Total time passed is {}'.format(time_end - time_start))

# Last expression of the cell, so the notebook displays one score per fold.
cv_scores