In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
%matplotlib inline

In [2]:
artworks = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')

In [3]:
artworks.columns

Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)',
       'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)',
       'Duration (sec.)'],
      dtype='object')

## Data Preprocessing 

In [4]:
# Select Columns.
artworks = artworks[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                    'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']]

# Convert URL's to booleans.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

# Drop films and some other tricky rows.
artworks = artworks[artworks['Department']!='Film']
artworks = artworks[artworks['Department']!='Media and Performance Art']
artworks = artworks[artworks['Department']!='Fluxus Collection']

# Drop missing data.
artworks = artworks.dropna()

In [5]:
artworks['DateAcquired'] = pd.to_datetime(artworks.DateAcquired)
artworks['YearAcquired'] = artworks.DateAcquired.dt.year
artworks['YearAcquired'].dtype

dtype('int64')

In [6]:
# Remove multiple nationalities, genders, and artists.
artworks.loc[artworks['Gender'].str.contains('\) \('), 'Gender'] = '\(multiple_persons\)'
artworks.loc[artworks['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to start date, cutting down number of distinct examples.
artworks['Date'] = pd.Series(artworks.Date.str.extract(
    '([0-9]{4})', expand=False))[:-1]

# Final column drops and NA drop.
X = artworks.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)

# Create dummies separately.
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
dates = pd.get_dummies(artworks.Date)

# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

Y = artworks.Department

# Drill: Playing with layers

- Now it's your turn. Using the space below, experiment with different hidden layer structures. You can try this on a subset of the data to improve runtime. See how things vary. See what seems to matter the most. Feel free to manipulate other parameters as well. It may also be beneficial to do some real feature selection work...

In [7]:
# Import the model.
from sklearn.neural_network import MLPClassifier

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

## Changing Hidden Layer Size

In [10]:
mlp_hls = MLPClassifier(hidden_layer_sizes=(30,))

mlp_hls.fit(X_train, Y_train)

print(mlp_hls.score(X_train, Y_train))

print(Y.value_counts()/len(Y))

print(cross_val_score(mlp_hls, X_test, Y_test, cv=3))

0.7473040933515315
Drawings & Prints        0.622496
Photography              0.226428
Architecture & Design    0.113016
Painting & Sculpture     0.033688
Media and Performance    0.004371
Name: Department, dtype: float64
[0.72991812 0.54448871 0.55096846]


## Changing Number of Layers

In [11]:
mlp_n = MLPClassifier(hidden_layer_sizes=(30,30,30))
mlp_n.fit(X_train, Y_train)
mlp_n.score(X_train, Y_train)
print(Y.value_counts()/len(Y))
cross_val_score(mlp_n, X_test, Y_test, cv=3)

Drawings & Prints        0.622496
Photography              0.226428
Architecture & Design    0.113016
Painting & Sculpture     0.033688
Media and Performance    0.004371
Name: Department, dtype: float64


array([0.74087187, 0.74413457, 0.71798561])

## Changing Alpha

In [12]:
mlp_a = MLPClassifier(hidden_layer_sizes=(30,),alpha=0.001)
mlp_a.fit(X_train, Y_train)
mlp_a.score(X_train, Y_train)
Y.value_counts()/len(Y)
cross_val_score(mlp_a, X_test, Y_test, cv=5)

array([0.67434999, 0.64281763, 0.72666913, 0.70798746, 0.71642066])

## Changing Activation Function

In [13]:
mlp_af = MLPClassifier(hidden_layer_sizes=(30,),activation='logistic')
mlp_af.fit(X_train, Y_train)
mlp_af.score(X_train, Y_train)
Y.value_counts()/len(Y)
cross_val_score(mlp_af, X_test, Y_test, cv=5)



array([0.74073391, 0.74515951, 0.73957949, 0.74063826, 0.74132841])