In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import re
import seaborn

In [2]:
# Read the two input tables from CSV files in the current working directory.
artists, artworks = (
    pd.read_csv(csv_path) for csv_path in ("artists.csv", "artworks.csv")
)

In [3]:
# Index each table by its ID. drop=False keeps the ID as an ordinary column
# as well, because later cells still read artists['Artist ID'] and
# r['Artwork ID'] as columns.
artists = artists.set_index('Artist ID', drop=False)
artworks = artworks.set_index('Artwork ID', drop=False)

In [4]:
def find_year(datestr):
    """Extract the first four-digit year (1000-2999) from a free-form date.

    Parameters
    ----------
    datestr : str or missing value
        Date text such as "c. 1950" or "1887-1901". Non-string values are
        converted with ``str`` before searching.

    Returns
    -------
    int or None
        The first run of four digits starting with 1 or 2, as an int;
        ``-1`` when the text contains no such run; ``None`` when the input
        is missing (``None`` or any NaN).
    """
    # pd.isnull recognises every missing value. The identity test
    # `datestr is np.nan` only matches the np.nan singleton, so other NaN
    # floats (or None) would reach re.search and raise TypeError.
    if pd.isnull(datestr):
        return None
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'[1-2]\d\d\d', str(datestr))
    if match is None:
        return -1
    return int(match.group())

In [5]:
# Derive a 'Year' column by running find_year over every 'Date' entry.
artworks['Year'] = artworks['Date'].map(find_year)

In [6]:
# Count how many artworks each artist is credited on.
# artworks['Artist ID'] holds comma-separated artist IDs; workcount_arr is
# indexed directly by artist ID, so it needs max(id) + 1 slots.
workcount_arr = np.zeros(artists['Artist ID'].max() + 1, dtype=int)
for id_field in artworks['Artist ID']:
    # pd.isnull catches every missing value; `is np.nan` only matches the
    # np.nan singleton.
    if pd.isnull(id_field):
        continue
    for token in id_field.split(','):
        token = token.strip()
        if token == '':
            continue
        try:
            workcount_arr[int(token)] += 1
        except ValueError:
            # Parenthesised single-argument form works on Python 2 and 3.
            print('Invalid id %s' % id_field)
workcount = pd.Series(workcount_arr, name='Work Count')
# Look the counts up by the 'Artist ID' column instead of join(on='Artist ID').
# 'Artist ID' is both the index name and a column here, which recent pandas
# versions reject as an ambiguous join key, and a join also fails with a
# column-overlap error when this cell is re-run. assign() simply overwrites.
artists = artists.assign(
    **{'Work Count': workcount_arr[artists['Artist ID'].values]}
)

In [7]:
# Collect the artist-ID lists of every artwork credited to more than one artist.
collaboration = []
for id_field in artworks['Artist ID']:
    # pd.isnull catches every missing value; `is np.nan` only matches the
    # np.nan singleton.
    if pd.isnull(id_field):
        continue
    # Skip empty tokens (e.g. from a trailing comma) instead of crashing in
    # int(''), matching how the work-count cell treats them. int() itself
    # tolerates surrounding whitespace.
    artist_ids = [int(token) for token in id_field.split(',') if token.strip()]
    if len(artist_ids) > 1:
        collaboration.append(artist_ids)

In [8]:
# Number of multi-artist artworks collected above vs. number of artist rows.
len(collaboration), len(artists)

(6429, 15091)

In [9]:
# Build one undirected edge per pair of distinct artists sharing an artwork.
# The three lists are parallel: collab[k] is the pair (a 2-element set),
# collab_id[k] the artwork it came from, weight[k] the edge weight.
collab = []
collab_id = []
weight = []
work_count = artists['Work Count']  # hoisted: the lookup table never changes
# Iterating the two needed columns avoids building a full row Series per
# artwork, which is what iterrows() does.
for artwork_id, id_field in zip(artworks['Artwork ID'], artworks['Artist ID']):
    # pd.isnull catches every missing value; `is np.nan` only matches the
    # np.nan singleton.
    if pd.isnull(id_field):
        continue
    # Ignore empty tokens (e.g. a trailing comma) rather than crash in int('').
    tokens = [token for token in id_field.split(',') if token.strip()]
    if len(tokens) <= 1:
        continue
    # np.unique de-duplicates repeated credits and returns the IDs sorted.
    collaborators = np.unique([int(token) for token in tokens])
    # use work count as proxy for importance: every edge of this artwork gets
    # the summed work count of all its collaborators. .loc makes the
    # label-based (artist ID) lookup explicit.
    w = work_count.loc[collaborators].sum()
    for pos, first in enumerate(collaborators):
        for second in collaborators[pos + 1:]:
            collab.append(set((first, second)))
            collab_id.append(artwork_id)
            weight.append(w)

In [10]:
# Total number of artist-pair edges generated in the previous cell.
len(collab)

45397

In [11]:
# Turn the parallel edge lists into an edge table.
# Each element of collab is a 2-element set; tuple() fixes one (arbitrary but
# consistent) order so both endpoints are read from the same ordering.
pairs = [tuple(pair) for pair in collab]
# List comprehensions instead of np.array(map(...)): on Python 3 map() is a
# lazy iterator and np.array(map(...)) yields a 0-d object array that cannot
# be sliced.
artist1 = np.array([pair[0] for pair in pairs])
artist2 = np.array([pair[1] for pair in pairs])
# Build the frame column by column so each column keeps its own dtype;
# np.hstack coerces all four columns to a single common dtype and fails when
# there are no edges at all.
edges = pd.DataFrame(
    {
        'Source': artist1,
        'Target': artist2,
        'Artwork ID': collab_id,
        'Weight': weight,
    },
    columns=['Source', 'Target', 'Artwork ID', 'Weight'],
)

In [12]:
# Add each edge's artwork title, matched through the edge's 'Artwork ID'
# column against the artworks index.
edges = edges.join(artworks['Title'], on='Artwork ID')

In [13]:
# Export the edge list (without the 'Artwork ID' column) as UTF-8 CSV.
edges.to_csv(
    'collab_edges.csv',
    columns=['Source', 'Target', 'Weight', 'Title'],
    encoding='utf-8',
)

In [14]:
# Export the node list as UTF-8 CSV; the frame's index is written as the
# first column.
artists.to_csv(
    'collab_nodes.csv',
    columns=['Name', 'Birth Year', 'Work Count'],
    encoding='utf-8',
)

In [15]:
from sklearn.feature_extraction import DictVectorizer

In [16]:
# Columns used as clustering features.
artists_features = artists.loc[
    :, ['Nationality', 'Gender', 'Birth Year', 'Death Year']
]

In [17]:
# One {column: value} dict per artist that has all four features present.
featuredicts = [
    dict(row) for _idx, row in artists_features.dropna().iterrows()
]

In [18]:
# Fit the vectorizer on the feature dicts, then materialise the resulting
# matrix as a dense numpy array for clustering.
vec = DictVectorizer()
featurearray = vec.fit(featuredicts).transform(featuredicts).toarray()

In [19]:
from sklearn.cluster import KMeans

In [20]:
# Cluster the artists into 20 groups; random_state is fixed for repeatability.
# NOTE(review): featurearray is passed in as produced by the vectorizer, with
# no scaling step in this notebook -- confirm that is intended.
kmeans = KMeans(n_clusters=20, random_state=0)
kmeans.fit(featurearray)

In [21]:
# Pair every clustered artist's ID with its cluster label. The IDs come from
# the same dropna() selection that produced the feature dicts, so the rows
# line up with kmeans.labels_.
data = np.column_stack((artists_features.dropna().index, kmeans.labels_))
artists_cluster = pd.DataFrame(data, columns=['Artist ID', 'Cluster'])
artists_cluster = artists_cluster.set_index('Artist ID')

In [24]:
# Attach each artist's cluster label; artists dropped by dropna() get NaN.
# Both frames are indexed by artist ID (artists was indexed by 'Artist ID'
# near the top of the notebook), so a plain index join gives the same rows as
# join(on='Artist ID') while avoiding the key that is both an index name and
# a column, which recent pandas versions reject as ambiguous.
# Dropping any existing 'Cluster' column first keeps the cell re-runnable;
# a second join would otherwise fail with a column-overlap error.
artists = artists.drop('Cluster', axis=1, errors='ignore').join(artists_cluster)

In [25]:
# Per-cluster medians of the numeric columns, most prolific clusters first.
# numeric_only=True states explicitly that text columns are excluded; newer
# pandas raises on non-numeric columns instead of silently dropping them.
artists.groupby('Cluster').median(numeric_only=True).sort_values('Work Count', ascending=False)

Unnamed: 0_level_0,Artist ID,Birth Year,Death Year,Work Count
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
16.0,2274.0,1746.0,1828.0,181.0
19.0,5543.0,1886.0,1941.0,3.0
12.0,4963.0,1838.0,1907.0,3.0
4.0,4669.0,1900.0,1985.0,3.0
7.0,3907.0,1927.0,1974.0,3.0
8.0,5189.0,1936.0,2011.0,3.0
15.0,4590.0,1922.0,2007.0,3.0
5.0,4687.0,1909.0,1999.0,2.5
18.0,29857.0,1955.0,2002.0,2.0
17.0,5791.0,1804.0,1870.0,2.0
