Danish art 'motivs' from Wikidata
==================

'Motivs' (depictions) in Danish artworks present in Wikidata.

In [1]:
import pandas as pd
import sparql
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
# Formulation of a query to Wikidata 
service = sparql.Service("https://query.wikidata.org/bigdata/namespace/wdq/sparql")
statement = """
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX wd: <http://www.wikidata.org/entity/> 
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX v: <http://www.wikidata.org/prop/statement/>

SELECT ?artwork ?artworkLabel ?motiv ?motivLabel ?filename WHERE {
   ?artwork wdt:P31 wd:Q3305213 .
   ?artwork wdt:P180 ?motiv .        
   ?artwork wdt:P195 ?collection .
   ?collection wdt:P17 wd:Q35 .

   OPTIONAL {?artwork wdt:P18 ?filename } 

   SERVICE wikibase:label {
     bd:serviceParam wikibase:language "da,en" .
   } 
 }
"""

In [3]:
# Querying Wikidata and formatting it for a DataFrame
result = service.query(statement)
df = pd.DataFrame(result.fetchall(), columns=result.variables)

In [4]:
df.shape

(1407, 5)

In [5]:
# Show a bit of the download data
df.head()

Unnamed: 0,artwork,artworkLabel,motiv,motivLabel,filename
0,http://www.wikidata.org/entity/Q2015484,Et selskab af danske kunstnere i Rom,http://www.wikidata.org/entity/Q144,hund,http://commons.wikimedia.org/wiki/Special:File...
1,http://www.wikidata.org/entity/Q2015484,Et selskab af danske kunstnere i Rom,http://www.wikidata.org/entity/Q14748,bord,http://commons.wikimedia.org/wiki/Special:File...
2,http://www.wikidata.org/entity/Q2015484,Et selskab af danske kunstnere i Rom,http://www.wikidata.org/entity/Q15026,stol,http://commons.wikimedia.org/wiki/Special:File...
3,http://www.wikidata.org/entity/Q2015484,Et selskab af danske kunstnere i Rom,http://www.wikidata.org/entity/Q80151,hat,http://commons.wikimedia.org/wiki/Special:File...
4,http://www.wikidata.org/entity/Q2015484,Et selskab af danske kunstnere i Rom,http://www.wikidata.org/entity/Q358399,Wilhelm Marstrand,http://commons.wikimedia.org/wiki/Special:File...


In [6]:
# Set up feature matrix
motivs = set([item.value for item in df['motivLabel']])
artworks = set([item.value for item in df['artwork']])
feature_matrix = pd.DataFrame(0, index=artworks, columns=motivs)
artwork_mapper = {}
filename_mapper = {}
for n, row in df.iterrows():
    artwork = row['artwork'].value
    motiv = row['motivLabel'].value
    feature_matrix.ix[artwork, motiv] = 1 
    artwork_mapper[artwork] = row['artworkLabel'].value
    filename_mapper[artwork] = str(row['filename'])[51:]

In [7]:
feature_matrix.shape

(297, 578)

In [8]:
# Show a bit of the constructed feature matrix
feature_matrix.head()

Unnamed: 0,papegøje,Johan Nicolai Madvig,HMS Shannon,"mandolinist, mandolinspiller",Mursten,Adolf 8. af Holsten,ørering,"kristne kors, kristent kors",Arresø,skøjte,...,Borgund stavkirke,Porre,chibouk,Læsø,København,Sankthans,Leg,foredrag,Carl Ludvig Bendz,"Baby, baby"
http://www.wikidata.org/entity/Q21030364,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
http://www.wikidata.org/entity/Q17826909,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
http://www.wikidata.org/entity/Q20440827,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
http://www.wikidata.org/entity/Q20276869,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
http://www.wikidata.org/entity/Q19935962,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Some arbitrary scaling - more research needed here
scaled_feature_matrix = feature_matrix.copy()
scaled_feature_matrix = scaled_feature_matrix.divide((feature_matrix.sum(axis=1)) ** 0.9, axis='index')
scaled_feature_matrix = scaled_feature_matrix.divide((feature_matrix.sum(axis=0)) ** 0.05, axis='columns')

In [10]:
# Machine learning decomposition with non-negative matrix factorization
from sklearn.decomposition import NMF
decomposer = NMF(n_components=12)
decomposer.fit(scaled_feature_matrix)
transformed = decomposer.transform(scaled_feature_matrix)

In [11]:
# Show the results
for topic_id in range(decomposer.components_.shape[0]):
    indices = (-decomposer.components_[topic_id, :]).argsort()[:10]
    s = ""
    for index in indices:
        s += "%s (%f) " %  (scaled_feature_matrix.columns[index], 
                            decomposer.components_[topic_id, index])
    print(s + '\n')

mand (1.963331) bog (0.131437) Henri Matisse (0.125221) pels (0.124950) Korn (0.082426) dreng (0.081296) Skovl (0.080440) pibe (0.066412) hus (0.064825) pipekrave, pibekrave (0.064562) 

kvinde (1.289157) tørklæde (0.070042) blomst (0.067996) Læsning, læsning (0.067872) siddende (0.037851) Lampe, lampe (0.034136) kirke (0.033609) Døden (0.031372) Karrebæksminde (0.030374) gade (0.027582) 

træ (0.981421) kirke (0.320731) hus (0.121348) tamkvæg (0.104762) Gadekær, gadekær (0.102557) Grusvej (0.089316) Laterankirken (0.088193) skov (0.066877) Bænk, bænk (0.054400) Arresø (0.042793) 

himmel (0.949517) sky (0.730637) sejlskib (0.136593) strand (0.102515) vej (0.095980) Fugle (0.093045) hytte (0.089223) København (0.080454) Solnedgang, solnedgang (0.080072) Rug (0.078543) 

hund (1.436536) bog (0.107591) dreng (0.106830) barn (0.102863) smagssans (0.099697) kurv (0.091705) Bjørne (0.088362) haglgevær (0.059677) Jæger, jæger (0.059538) bjerg (0.048947) 

Ida Ilsted (1.190453) hat (0.403747)

In [32]:
topic_ids = transformed.shape[1]
for topic_id in range(topic_ids):
    print('\nTopic %d' % (topic_id + 1,))
    indices = (-decomposer.components_[topic_id, :]).argsort()[:5]
    s = ""
    for index in indices:
        s += "%s (%f) " %  (scaled_feature_matrix.columns[index], 
                            decomposer.components_[topic_id, index])
    print(s + '\n')
    indices = (-transformed[:, topic_id]).argsort()[:20]
    for index in indices:
        print("%f %s" % (transformed[index, topic_id], artwork_mapper[feature_matrix.index[index]]))



Topic 1
kvinde (1.972915) tørklæde (0.107192) blomst (0.104061) Læsning, læsning (0.103872) siddende (0.057891) 

0.407422 Modellen Maddalena
0.407422 Dame ved sit toilette
0.407422 Olga Buhre
0.232826 Portræt af gammel bondekone
0.231899 Læsende dame
0.231648 Tahitian Woman with a Flower
0.223739 Portræt af en ung pige. Kunstnerens søster, Anna Hammershøi
0.217047 Dame med hund
0.157807 Dame ved Karrebæksminde Strand
0.156689 Aften. Den gamle kone og døden
0.155477 Hjemvendt familie på et torv i måneskin
0.154856 Underkirken i klosteret San Benedetto i Subiaco
0.152461 I havedøren. Kunstnerens hustru
0.151615 Skumring. Kunstnerens hustru ved kakkelovnen
0.150184 Sommeraften ved Skagen, kunstnerens hustru med hund ved strandkanten
0.123173 Liggende kvinde
0.121477 Blind kone i sin stue
0.118949 "Mon han dog ikke skulle komme?"
0.118949 Pigen i køkkenet
0.102143 To veninder

Topic 2
himmel (1.046733) sky (0.805230) sejlskib (0.150535) strand (0.112983) vej (0.105755) 

0.314935 Udsigt 

In [40]:
# Write part of an HTML file with image grouped accoring to topic
with open('tmp.html', 'w') as f:
    topic_ids = transformed.shape[1]
    for topic_id in range(topic_ids):
        f.write('<h3>Emne %d</h3>\n' % (topic_id + 1,))
        indices = (-decomposer.components_[topic_id, :]).argsort()[:5]
        s = ""
        for index in indices:
            s += "%s. " %  (scaled_feature_matrix.columns[index],)
        f.write('Motiver: ' + s.encode('utf-8') + '<br/>\n')
        indices = (-transformed[:, topic_id]).argsort()[:15]
        for index in indices:
            qid = int(feature_matrix.index[index][32:])
            filename = filename_mapper[feature_matrix.index[index]]
            if filename:
                f.write("<a href='http://wikidata.org/wiki/Q%d'><img src='https://commons.wikimedia.org/w/thumb.php?f=%s&width=200'></a>\n" % (qid, filename,))


In [34]:
# Graph with 'motivs' as nodes
graph = nx.Graph()
for motiv in feature_matrix.columns:
    graph.add_node(motiv)
    
for image, row in feature_matrix.iterrows():
    motivs = row.index[row.nonzero()[0]].tolist()
    for i1 in range(len(motivs) - 1):
        for i2 in range(i1 + 1, len(motivs)):
            graph.add_edge(motivs[i1], motivs[i2])

In [35]:
# Good position layout is always a problem - here default spring layout is attempted
# pos = nx.layout.spectral_layout(graph)
pos = nx.layout.spring_layout(graph, iterations=50)

In [38]:
# Plotting the motiv graph
node_sizes = 50 * feature_matrix.sum(axis=0)[graph.nodes()]
nx.draw_networkx_nodes(graph, pos=pos, node_size=node_sizes, node_color='r', alpha=0.5, linewidths=0)
nx.draw_networkx_edges(graph, nodelist=graph.nodes(), pos=pos, alpha=0.05, color='r', linewidths=3)

positions = [pos[node] for node in graph.nodes()]
for i in np.argsort(node_sizes)[:-31:-1]:
    plt.text(positions[i][0], positions[i][1], graph.nodes()[i], 
             horizontalalignment='center', verticalalignment='center')

ax = plt.gca()
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
ax.set_position([0.0, 0.0, 1, 1])
ax.axis([-0.05, 1.05, -0.05, 1.05])

plt.text(0.5, 0.9, u'Motiver i danske kunstværker i Wikidata',
         fontsize=50,
         backgroundcolor=(1, 1, 1),
         horizontalalignment='center', verticalalignment='center')

<matplotlib.text.Text at 0x7f5a807f8bd0>

In [39]:
# Save the image in a file
plt.gcf().set_size_inches(18, 12, forward=True)
plt.savefig('Danish art motivs from Wikidata.png')

In [37]:
plt.show()