# Data Preprocessing for Algorithm Design
## Art Museum

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

### Read the data

In [2]:
# Load the museum's collections table from CSV and preview its first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a row index
# that was saved into the file — confirm before relying on it.
collections = pd.read_csv(
    'data/collections.csv',
    sep=",",
    encoding="utf-8",
)
collections.head(n=5)

Unnamed: 0,CollectionID,CollectionName,CollDescription,NumObjects
0,1,Textiles and Furniture ...,,0
1,2,Southeast Asia ...,,264
2,3,South Arabia,,62
3,4,Roman Empire ...,<p>The Walters&rsquo; collection contains one ...,436
4,5,Renaissance Europe ...,<p>The Walters' collection of European Renaiss...,1021


In [3]:
# Load the exhibitions table from CSV and preview its first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a row index
# that was saved into the file — confirm before relying on it.
exhibitions = pd.read_csv(
    'data/exhibitions.csv',
    sep=",",
    encoding="utf-8",
)
exhibitions.head(n=5)

Unnamed: 0,ExhibitionID,ExhTitle,NonWAMExhibition,ExhibitionDisplayDate,ExhibBeginDate,ExhibEndDate,ExhibBeginYear,ExhibEndYear,Textblock,LocationID,WaltersArtworks
0,3628,Léon Bonvin (1834-1866),True,2022-2023,,,2022.0,2023.0,"Léon Bonvin (1834-1866). Fondation Custodia, P...",,"13074, 13074, 13074, 13074, 13074, 13074, 1307..."
1,3696,"Sargent, Whistler, and Venetian Glass: America...",True,2021-2023,,,2021.0,2023.0,"Sargent, Whistler, and Venetian Glass: America...",,"26856, 26856, 26856, 6053, 6053, 6053, 19004, ..."
2,3735,Hugo van der Goes,True,2021-2023,,,2021.0,2023.0,"Hugo van der Goes. Gemaldegalerie, Berlin. 202...",,26514
3,3786,Quiet Beauty: The Watercolors of Léon Bonvin,False,2023,,,2023.0,2023.0,Quiet Beauty: The Watercolors of Léon Bonvin. ...,,"13987, 13987, 13987, 13987, 17483, 17483, 1748..."
4,3581,"Caravans of Gold, Fragments in Time: Art, Cult...",True,2019-2022,,,2019.0,2022.0,"Caravans of Gold, Fragments in Time: Art, Cult...",,"21724, 21724, 21724, 21724, 21724, 21724, 2055..."


In [4]:
# Load the creators table from CSV and preview its first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a row index
# that was saved into the file — confirm before relying on it.
creators = pd.read_csv(
    'data/creators.csv',
    sep=",",
    encoding="utf-8",
)
creators.head(n=5)

Unnamed: 0,id,sort_name,name,CreatorURL,gender,biography,date,CreatorArt
0,6741,"'Abbasi, Riza",Riza 'Abbasi,https://art.thewalters.org/browse/creator/riza...,,,"(Persian, ca. 1565-1635)","37361, 34627, 33104, 26833, 25935, 9485, 4103,..."
1,4237,"'Abbasi, Shaykh",Shaykh 'Abbasi,https://art.thewalters.org/browse/creator/shay...,,,"(Persian, active 1650-1684)","33104, 25935, 19347, 987, 7066, 84294, 84260"
2,19270,"'Abd al-Razzaq, Abu Mansur Muhammad ibn",Abu Mansur Muhammad ibn 'Abd al-Razzaq,https://art.thewalters.org/browse/creator/abu-...,,,,"7066, 83514"
3,19395,'Ali,'Ali,https://art.thewalters.org/browse/creator/ali,,,,"19840, 90954, 90955"
4,16856,'Ali Sultan,Sultan 'Ali,https://art.thewalters.org/browse/creator/sult...,,,,"82840, 80051, 11787, 90218"


In [5]:
# Load the artworks table from CSV and preview its first five rows.
# The saved output of this cell echoes the read_csv statement, which is how
# Python prints the source line of a warning — presumably pandas' DtypeWarning
# about columns with mixed types (TODO confirm on a fresh run).
# low_memory=False makes pandas parse the file in one pass rather than in
# chunks, so each column's dtype is inferred consistently from all of its rows.
artworks = pd.read_csv(
    'data/art.csv',
    sep=",",
    encoding="utf-8",
    low_memory=False,
)
artworks.head()

  artworks = pd.read_csv('data/art.csv', sep=",", encoding="utf-8")


Unnamed: 0,ObjectID,ObjectNumber,SortNumber,ObjectName,DateBeginYear,DateEndYear,DateText,Title,Dimensions,Medium,...,LatitudeNumber,LongitudeNumber,RelatedObjects,Images,CollectionID,CollectionName,MuseumLocation,LocationString,Creators,Exhibitions
0,7,54.975,54.975,statuettes (statues),-425,-400,late 5th century BCE (Classical),Zeus,H: 13 7/8 in. (35.3 cm),bronze,...,,,,"PS1_54.975_Back_DD_T08.jpg, PS1_54.975_Fnt_DD_...",ROM,Ancient Art,To find out whether this object is currently o...,,"6291, 3705",
1,11,42.188,42.188,finger rings; beads; plaques (flat objects),-1479,-1185,"1479-1185 BCE (New Kingdom, Dynasty 18-19)",Unfinished Ring with Flat Bead,H: 3/8 x W: 1/2 (0.99 x 1.33 x 0.04 cm); Ring ...,carved yellow jasper and gold,...,,,,"CUR_42.188_SideA_DD_RS2009.jpg, CUR_42.188_Top...","EGY, JWL",Ancient Art,To find out whether this object is currently o...,,6182,2513
2,14,42.86,42.86,beads (pierced objects); spacers,-1388,-1350,"1388-1350 BCE (New Kingdom, Dynasty 18)",Spacer with Cartouche of Amenophis III,H: 7/8 x W: 1/4 x D: 1/16 in. (2.22 x 0.71 x 0...,Egyptian faience with blue and white glaze,...,,,,"CUR_42.86_SideA_DD_RS2009.jpg, CUR_42.86_Rev_D...","EGY, JWL",Ancient Art,To find out whether this object is currently o...,,6182,2513
3,22,48.1367,48.1367,plates,1513,1537,ca. 1525-1530 (Renaissance),Dish with Foliage Design,1 7/8 x 8 9/16 in. (4.7 x 21.7 cm),earthenware with tin glaze (maiolica) and lust...,...,,,,"PL9_48.1367_Fnt_SL.jpg, PL2_48.1367_Back_BW.jpg",REN,Renaissance and Baroque Art,To find out whether this object is currently o...,,33562,
4,35,W.863,W.863,miniatures (paintings),1775,1800,1775-1800,Krishna Steals the Gopis' Clothing,H: 9 13/16 x W: 7 5/16 in. (25 x 18.5 cm); Ima...,opaque watercolor and gold paint on paper,...,,,,PS1_W.863_Fnt_DD_T10.jpg,INT,South and Southeast Asian Art,To find out whether this object is currently o...,,2191,"1983, 2071"


### Select relevant features

In [6]:
# Columns kept for the collection table.
collection_features = ['CollectionID', 'CollectionName', 'CollDescription', 'NumObjects']
# Take an explicit copy so that later in-place edits to museum_collection are
# unambiguous and do not raise pandas' SettingWithCopyWarning.
museum_collection = collections[collection_features].copy()
museum_collection.columns

Index(['CollectionID', 'CollectionName', 'CollDescription', 'NumObjects'], dtype='object')

In [7]:
# Columns kept for the exhibition table.
# 'LocationID' and 'WaltersArtworks' are left out on purpose: they are foreign
# keys and should live in a separate table that models the relations.
exhibition_features = ['ExhibitionID', 'ExhTitle', 'ExhibBeginYear', 'ExhibEndYear']
# Take an explicit copy so that later in-place edits to museum_exhibition are
# unambiguous and do not raise pandas' SettingWithCopyWarning.
museum_exhibition = exhibitions[exhibition_features].copy()
museum_exhibition.columns

Index(['ExhibitionID', 'ExhTitle', 'ExhibBeginYear', 'ExhibEndYear'], dtype='object')

In [8]:
# Columns kept for the creator table.
# 'CreatorArt' is left out on purpose: it is a foreign key and should live in a
# separate table that models the relations.
creator_features = ['id', 'name', 'CreatorURL', 'gender', 'date']
# Take an explicit copy so that later in-place edits to museum_creator are
# unambiguous and do not raise pandas' SettingWithCopyWarning.
museum_creator = creators[creator_features].copy()
museum_creator.columns

Index(['id', 'name', 'CreatorURL', 'gender', 'date'], dtype='object')

In [9]:
# Columns kept for the artwork table.
# 'CollectionID', 'Creators' and 'Exhibitions' are left out on purpose: they are
# foreign keys and should live in a separate table that models the relations.
art_features = [
    'ObjectID', 'ObjectName', 'DateBeginYear',
    'DateEndYear', 'Title', 'Medium',
    'ResourceURL', 'Description', 'Images',
]
# Take an explicit copy so that later in-place edits to museum_artwork are
# unambiguous and do not raise pandas' SettingWithCopyWarning.
museum_artwork = artworks[art_features].copy()
museum_artwork.columns

Index(['ObjectID', 'ObjectName', 'DateBeginYear', 'DateEndYear', 'Title',
       'Medium', 'ResourceURL', 'Description', 'Images'],
      dtype='object')