## Drill: Playing with layers

Now it's your turn. Using the space below, experiment with different hidden layer structures. You can try this on a subset of the data to improve runtime. See how things vary. See what seems to matter the most. Feel free to manipulate other parameters as well. It may also be beneficial to do some real feature selection work...

#### Notes:
- Use the same row selection as original feature selection
- Use  cleaning / processing as from original 

Additional feature processing

- remove all samples without complete data.  
- reduce range of of data (year) features by transforming to years before present year
- BeginDate is actually when artist was born.  EndDate is when artist died.  (0) if still alive.  
- only use one nationality
- fit mlp classifier with and without artist feature.  

In [88]:
import numpy  as np
import pandas as pd
import re
pd.options.display.float_format = '{:.2f}'.format

In [8]:
art = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')
art = art[['Artist', 'Nationality', 'Gender', 'Date', 'Department','DateAcquired', 
                     'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)', 'BeginDate', 'EndDate']]
#cleanup the column labels
art.columns = [col.lower() for col in art.columns]
art.columns = ['artt', 'nlty', 'gndr', 'date', 'dpmt', 'Acquired','url', 'thml',
               'hght', 'wdth', 'born', 'dead']

In [9]:
art.tail(2)

Unnamed: 0,artt,nlty,gndr,date,dpmt,Acquired,url,thml,hght,wdth,born,dead
138372,Adrián Villar Rojas,(Argentine),(Male),2012,Drawings & Prints,2013-05-23,,,24.0,32.0,(1980),(0)
138373,Adrián Villar Rojas,(Argentine),(Male),2012,Drawings & Prints,2013-05-23,,,24.0,32.0,(1980),(0)


In [10]:
# Convert URL's to booleans.
art.url  = np.where(art.url.isna(),  0, 1)
art.thml = np.where(art.thml.isna(), 0, 1)
# Drop films and some other tricky rows.
art = art[art['dpmt']!='Film']
art = art[art['dpmt']!='Media and Performance Art']
art = art[art['dpmt']!='Fluxus Collection']
# Drop missing data.
art = art.dropna()
art.index = range(len(art))
art['acqd'] = pd.to_datetime(art.Acquired).dt.year

In [12]:
df = art.drop(['Acquired'], axis=1)

new cleaning, features from here down

In [13]:
# find rows with missing data to drop
A = [i for i,val in enumerate(df.gndr) if all([val != '(Male)', val != '(Female)'])]
B = [i for i,val in enumerate(df.born) if val == '()']
C = [i for i,val in enumerate(df.date) if val.lower().count('unknown')]
D = [i for i,val in enumerate(df.nlty) if val.lower().count('unknown')]
E = list(np.where(df.nlty.str.count(r"(\w+\s*\w+)") != 1)[0])
drp_row = []
for c in 'ABCDE':
    drp_row.extend(eval(c)); drp_row  = list(set(drp_row ))   
df = df.drop(drp_row, axis=0)
df.index = range(len(df))
# get single value for features
df.date = df.date.str.extract(r"(\d{4,4})").astype(float)
df.nlty = df.nlty.str.extract(r"(\w+\s*\w+)")
df.born = df.born.str.replace(r"\D+", '').astype(int)
df.dead = df.dead.str.replace(r"\D+", '').astype(int)
# features
df['male'] = np.where(df.gndr == '(Male)', 1, 0)
df['dead'] = np.where(df.dead == 0,        0, 1)
# reduce range of feature
df.date = df.date.apply(lambda x: 2019 - x)
df.born = df.born.apply(lambda x: 2019 - x)
df.acqd = df.acqd.apply(lambda x: 2019 - x)

df = df.dropna()
df.index = range(len(df))

In [14]:
# still 96k+ samples
df.tail(2)

Unnamed: 0,artt,nlty,gndr,date,dpmt,url,thml,hght,wdth,born,dead,acqd,male
96780,Adrián Villar Rojas,Argentine,(Male),7.0,Drawings & Prints,0,0,24.0,32.0,39,0,6,1
96781,Adrián Villar Rojas,Argentine,(Male),7.0,Drawings & Prints,0,0,24.0,32.0,39,0,6,1


In [15]:
assert df.isna().any().any() == False

### Pre-Process


In [17]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, Binarizer

In [27]:
# target labels are so long, had to change them
dct = {unq : re.match(r"(\w{3,5})", unq)[0].lower() for unq in df.dpmt.unique()}
y = df.dpmt.apply(lambda x: dct[x])

In [None]:
# with artist feature
#df1 = df.drop(['gndr', 'dpmt'], axis=1)
#fte_cat = ['artt', 'nlty']

In [39]:
# without artist feature
df1 = df.drop(['gndr', 'dpmt', 'artt'], axis=1)
fte_cat = ['nlty']
fte_bny = ['url', 'thml', 'dead', 'male']
fte_cts = ['date', 'hght', 'wdth', 'born', 'acqd']
fte = fte_cat + fte_bny + fte_cts

In [40]:
# Features classified as continous, binary and categorical for preprocessing
tfr_cts = Pipeline(steps=[('siz', SimpleImputer(strategy='constant', fill_value=0)),
                          ('mms', MinMaxScaler())])
tfr_bny = Pipeline(steps=[('siz', SimpleImputer(strategy='constant', fill_value=0)),
                          ('bnr', Binarizer())])
tfr_cat = Pipeline(steps=[('sin', SimpleImputer(strategy='constant', fill_value='na')),
                          ('ohe', OneHotEncoder(categories='auto'))])

ppr = ColumnTransformer(transformers=[('cts', tfr_cts, fte_cts), ('bny', tfr_bny, fte_bny), ('cat', tfr_cat, fte_cat)])


In [41]:
assert len(fte) == len(df1.columns)

In [42]:
X = ppr.fit_transform(df1)

### Multi-layer Perceptron Model

In [45]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

In [99]:
mlp = MLPClassifier(max_iter=300 )#hidden_layer_sizes=(100,)
mlp.fit(X, y)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=300, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

Multi-layer Perceptron Classifier

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

- score = 0.8302 without artist feature
- score = 0.9737 with artist feature 

With artist feature was kinda silly as there was almost no variance between artist and department.  

In [100]:
mlp.score(X,y)

0.8301956975470646

In [98]:
# probability by sample by target (percent)
df_prb = pd.DataFrame(mlp.predict_proba( X), columns=mlp.classes_).mul(100)
df_prb['max'] = df_prb.max(axis=1)
df_prb.head(9)

Unnamed: 0,archi,drawi,media,paint,photo,max
0,96.62,1.38,0.0,0.27,1.72,96.62
1,13.14,81.07,0.0,0.3,5.5,81.07
2,58.19,18.49,0.0,0.19,23.13,58.19
3,56.02,22.29,0.0,0.19,21.51,56.02
4,12.52,34.06,0.0,52.91,0.5,52.91
5,66.78,15.4,0.0,0.22,17.6,66.78
6,49.48,24.4,0.0,0.18,25.94,49.48
7,68.33,10.58,0.0,0.11,20.99,68.33
8,2.53,88.64,0.0,1.02,7.8,88.64


#### Scratch

snippets not needed, but not deleted

In [None]:
df.isna().any()

In [None]:
df.nationality.unique()

In [None]:
df.tail(2)

In [None]:
df.dtypes

In [None]:
rx1 = re.compile(r"Unknown+", re.IGNORECASE)     # word Unknown
rx2 = re.compile(r"(\d+)")    #  4 digits only group
rx3 = re.compile(r"(\w+)")         # matches at least 1 alphanumeric group
rx4 = re.compile(r"(\w+\s*\w+)") 

In [None]:
m = rx2.findall(t)
bool(m[0]) and bool(m[2])

In [None]:
row_drp = list(np.where(art.date == 'Unknown')[0])
df = art.drop(row_drp, axis=0)
df.index = range(len(df))

In [None]:
row_drp = list(np.where(df.artist.str.contains("Unknown"))[0])
df = df.drop(row_drp, axis=0)
df.index = range(len(df))

In [None]:
row_drp = list(np.where(df.gender == "()")[0])
df = df.drop(row_drp, axis=0)
df.index = range(len(df))

In [None]:
series = [df[col].apply(lambda x: re.findall(r'(\w+)', x)) for col in ['nationality', 'gender', 'born', 'living']]
assert series[0].shape == series[1].shape  
len(series[0])
    

In [None]:
for S in series:
    for i,s in enumerate(S):
        if not len(s):                    S[i] = 'unknown'
        elif len(s) == 1:                 S[i] = s[0]
        elif all([x == s[0] for x in s]): S[i] = s[0]
        else:                             S[i] = 'multiple'

In [None]:
for i in range(len(df)): 
    check = [x for x in df.iat[i,2]]
    if len(check) == 1: df.iat[i,2] = df.iat[i,2][0].lower()
    elif 'Male' in check and 'Female' in check: art.gender[i] = 'mixed'
    else:               df.iat[i,2] = df.iat[i,2][0].lower()

In [None]:

for i in range(len(df)):
    if len(df.iat[i,1]) == 1: df.iat[i,1] = df.iat[i,1][0]
    else: df.iat[i,1] == 'mixed'

In [None]:
for (i,val) in enumerate(art.gender):
    if len(val) == 1: art.gender[i] =art.gender[i][0].lower()
    elif 'Male' in art.gender[i] and 'Female' in art.gender[i]: art.gender[i] = 'mixed'
    else:             art.gender[i] =art.gender[i][0].lower()