In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
%matplotlib inline

In [2]:
artworks = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')

In [3]:
artworks.columns

Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)',
       'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)',
       'Duration (sec.)'],
      dtype='object')

In [4]:
artworks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138374 entries, 0 to 138373
Data columns (total 29 columns):
Title                 138335 non-null object
Artist                136897 non-null object
ConstituentID         136897 non-null object
ArtistBio             132833 non-null object
Nationality           136897 non-null object
BeginDate             136897 non-null object
EndDate               136897 non-null object
Gender                136897 non-null object
Date                  135982 non-null object
Medium                127381 non-null object
Dimensions            127380 non-null object
CreditLine            135528 non-null object
AccessionNumber       138374 non-null object
Classification        138374 non-null object
Department            138374 non-null object
DateAcquired          131606 non-null object
Cataloged             138374 non-null object
ObjectID              138374 non-null int64
URL                   79563 non-null object
ThumbnailURL          69048 non-null

In [5]:
# Select Columns.
artworks = artworks[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                    'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']]

# Convert URL's to booleans.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

# Drop films and some other tricky rows.
artworks = artworks[artworks['Department']!='Film']
artworks = artworks[artworks['Department']!='Media and Performance Art']
artworks = artworks[artworks['Department']!='Fluxus Collection']

# Drop missing data.
artworks = artworks.dropna()

In [6]:
artworks.head()

Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm)
0,Otto Wagner,(Austrian),(Male),1896,Architecture & Design,1996-04-09,True,True,48.6,168.9
1,Christian de Portzamparc,(French),(Male),1987,Architecture & Design,1995-01-17,True,True,40.6401,29.8451
2,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,34.3,31.8
3,Bernard Tschumi,(),(Male),1980,Architecture & Design,1995-01-17,True,True,50.8,50.8
4,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,38.4,19.1


In [7]:
# Get data types.
artworks.dtypes

Artist           object
Nationality      object
Gender           object
Date             object
Department       object
DateAcquired     object
URL                bool
ThumbnailURL       bool
Height (cm)     float64
Width (cm)      float64
dtype: object

In [8]:
artworks['DateAcquired'] = pd.to_datetime(artworks.DateAcquired)
artworks['YearAcquired'] = artworks.DateAcquired.dt.year
artworks['YearAcquired'].dtype

dtype('int64')

In [9]:
# Remove multiple nationalities, genders, and artists.
artworks.loc[artworks['Gender'].str.contains('\) \('), 'Gender'] = '\(multiple_persons\)'
artworks.loc[artworks['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to start date, cutting down number of distinct examples.
artworks['Date'] = pd.Series(artworks.Date.str.extract(
    '([0-9]{4})', expand=False))[:-1]

# Final column drops and NA drop.
X = artworks.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)

# Create dummies separately.
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
dates = pd.get_dummies(artworks.Date)

# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

Y = artworks.Department

In [26]:
# Import the model.
from sklearn.neural_network import MLPClassifier


def MLPclassify(x,param):
    start_time = time.time()

    # Alright! We've done our prep, let's build the model.
    # Neural networks are hugely computationally intensive.
    # This may take several minutes to run.
  
    # Establish and fit the model, with a perceptron layers.
    mlp = MLPClassifier(param)
    mlp.fit(X, Y)

    print("--- {} seconds ---".format(round(time.time() - start_time, 2)))
    return mlp.score(X, Y)

In [11]:
score1 = MLPclassify(X, 10, 5)
print(score1)

--- 122.66 seconds ---
0.7635429825451125


In [12]:
score2 = MLPclassify(X, 100, 5)
print(score2)

--- 28.64 seconds ---
0.6225484320107698


In [14]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit only to the training data
scaler.fit(X)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [15]:
score3 = MLPclassify(X, 10, 5)
print(score3)

--- 127.7 seconds ---
0.7677107633862297


In [18]:
score4 = MLPclassify(X, 30,30,30)
print(score4)



--- 339.34 seconds ---
0.7942480936090953


In [27]:
score5 = MLPclassify(X, 10,10,5)
print(score5)

--- 81.58 seconds ---
0.725627241795834


In [42]:
start_time = time.time()
# Establish and fit the model, with a perceptron layers.
mlp = MLPClassifier(hidden_layer_sizes=[100,10,5], activation='identity')
mlp.fit(X, Y)

print("--- {} seconds ---".format(round(time.time() - start_time, 2)))
print(mlp.score(X, Y))

--- 86.38 seconds ---
0.7146822067108648


In [43]:
start_time = time.time()
# Establish and fit the model, with a perceptron layers.
mlp = MLPClassifier(hidden_layer_sizes=[100,10,5], activation='logistic')
mlp.fit(X, Y)

print("--- {} seconds ---".format(round(time.time() - start_time, 2)))
print(mlp.score(X, Y))



--- 229.84 seconds ---
0.7772265815898424


In [44]:
start_time = time.time()
# Establish and fit the model, with a perceptron layers.
mlp = MLPClassifier(hidden_layer_sizes=[100,10,10,5])
mlp.fit(X, Y)

print("--- {} seconds ---".format(round(time.time() - start_time, 2)))
print(mlp.score(X, Y))

--- 372.69 seconds ---
0.7259776304506182


In [46]:
start_time = time.time()
# Establish and fit the model, with a perceptron layers.
mlp = MLPClassifier(hidden_layer_sizes=[30,30,10,5], max_iter=500)
mlp.fit(X, Y)

print("--- {} seconds ---".format(round(time.time() - start_time, 2)))
print(mlp.score(X, Y))

--- 311.09 seconds ---
0.7943310803957547
