In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/Artworks.csv')

In [5]:
# Select Columns.
df = df[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                    'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']]

# Convert URL's to booleans.
df['URL'] = df['URL'].notnull()
df['ThumbnailURL'] = df['ThumbnailURL'].notnull()

# Drop films and some other tricky rows.
df = df[df['Department']!='Film']
df = df[df['Department']!='Media and Performance Art']
df = df[df['Department']!='Fluxus Collection']

# Drop missing data.
df = df.dropna()

In [6]:
df.head()

Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm)
0,Otto Wagner,(Austrian),(Male),1896,Architecture & Design,1996-04-09,True,True,48.6,168.9
1,Christian de Portzamparc,(French),(Male),1987,Architecture & Design,1995-01-17,True,True,40.6401,29.8451
2,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,34.3,31.8
3,Bernard Tschumi,(),(Male),1980,Architecture & Design,1995-01-17,True,True,50.8,50.8
4,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,38.4,19.1


In [8]:
df['DateAcquired'] = pd.to_datetime(df.DateAcquired)
df['YearAcquired'] = df.DateAcquired.dt.year

In [9]:
# Remove multiple nationalities, genders, and artists.
df.loc[df['Gender'].str.contains('\) \('), 'Gender'] = '\(multiple_persons\)'
df.loc[df['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'
df.loc[df['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to start date, cutting down number of distinct examples.
df['Date'] = pd.Series(df.Date.str.extract(
    '([0-9]{4})', expand=False))[:-1]

# Final column drops and NA drop.
X = df.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)

# Create dummies separately.
artists = pd.get_dummies(df.Artist)
nationalities = pd.get_dummies(df.Nationality)
dates = pd.get_dummies(df.Date)

# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

Y = df.Department

# Hidden_layer_sizes=(5, )

In [20]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=123)
mlp = MLPClassifier(hidden_layer_sizes=(5,))
mlp.fit(X_train, Y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [21]:
print('training set score: {}'.format(mlp.score(X_train, Y_train)))
print('test set score: {}'.format(mlp.score(X_test, Y_test)))

training set score: 0.7023738395073077
test set score: 0.6991589687026059


In [13]:
from sklearn.model_selection import cross_val_score

cross_val_score(mlp, X, Y, cv=5)

array([0.68863459, 0.70609863, 0.62277678, 0.62390845, 0.55694981])

# Hidden_layer_sizes=(3, )

In [23]:
mlp = MLPClassifier(hidden_layer_sizes=(3,))
mlp.fit(X_train, Y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(3,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [24]:
print('training set score: {}'.format(mlp.score(X_train, Y_train)))
print('test set score: {}'.format(mlp.score(X_test, Y_test)))

training set score: 0.6227479547752551
test set score: 0.6230065719931982


In [25]:
cross_val_score(mlp, X_train, Y_train, cv=5)

array([0.62270221, 0.62270221, 0.62277376, 0.62277376, 0.62278786])

In [26]:
mlp = MLPClassifier(hidden_layer_sizes=(2,))
mlp.fit(X_train, Y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(2,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [29]:
print('training set score: {}'.format(mlp.score(X_train, Y_train)))
print('test set score: {}'.format(mlp.score(X_test, Y_test)))

training set score: 0.6397531942274106
test set score: 0.6379429201709638


The lesser the amount of hidden layers, the faster the model runs.