In [2]:
# requirements
!pip install -q --quiet rdflib sparqlwrapper owlrl gdown pandas wikipedia-api
!mkdir -p data
!pip freeze > requirements.txt

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/528.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m276.5/528.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m528.1/528.1 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import wikipediaapi
import gdown
import os
import owlrl
import csv
import pandas as pd
from __future__ import print_function
from rdflib import Graph, URIRef, BNode, Literal, FOAF as foaf, RDF as rdf, RDFS as rdfs, XSD as xsd,OWL as owl, Namespace
from SPARQLWrapper import SPARQLWrapper

In [4]:
# create directory for data
if not os.path.exists("./data/"):
    os.mkdir("./data/")

In [5]:
# Download data
gdown.download(id="1UILSMenvqzFMsIhV6l4ZV1opJpRQZnwC", output="./data/rym_top_5000_all_time.csv", quiet=True)
gdown.download(id="1UNJ2iDX-xoWIekSA0ZwoqNOk3eAZjqrp", output="./data/albumlist.csv", quiet=True)

'./data/albumlist.csv'

# **KRR Module 2 2022/23 Project**

The project consists in the creation of an RDF knowledge graph starting from tabular data (two CSV files loaded below).

You should cover all of the following steps.

### **`1. Create an RDFLib Graph from data contained in CSV files`**

In steps:
1) Load the CSV files in two pandas DataFrames and join them
2) Create an RDFLib `Graph` and add triples from the data of the dataframes

Useful links:

- [Pandas user guide](https://pandas.pydata.org/docs/user_guide/index.html) and [documentation](https://pandas.pydata.org/docs/reference/), e.g.:
    - [10 minutes to pandas](https://pandas.pydata.org/docs/user_guide/10min.html)
    - [Join dataframes](https://pandas.pydata.org/docs/user_guide/10min.html#join)
    - [Splitting and replacing strings](https://pandas.pydata.org/docs/user_guide/text.html#splitting-and-replacing-strings)
    - [Apply a function over an axis of a DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html?highlight=apply#pandas.DataFrame.apply)

In [6]:
df1 = pd.read_csv('data/albumlist.csv', encoding='mac_roman')
df1.head()

Unnamed: 0,Number,Year,Album,Artist,Genre,Subgenre
0,1,1967,Sgt. Pepper's Lonely Hearts Club Band,The Beatles,Rock,"Rock & Roll, Psychedelic Rock"
1,2,1966,Pet Sounds,The Beach Boys,Rock,"Pop Rock, Psychedelic Rock"
2,3,1966,Revolver,The Beatles,Rock,"Psychedelic Rock, Pop Rock"
3,4,1965,Highway 61 Revisited,Bob Dylan,Rock,"Folk Rock, Blues Rock"
4,5,1965,Rubber Soul,The Beatles,"Rock, Pop",Pop Rock


In [7]:
df2 = pd.read_csv('data/rym_top_5000_all_time.csv')
df2.head()

Unnamed: 0,Ranking,Album,Artist Name,Release Date,Genres,Descriptors,Average Rating,Number of Ratings,Number of Reviews
0,1.0,OK Computer,Radiohead,16 June 1997,"Alternative Rock, Art Rock","melancholic, anxious, futuristic, alienation, ...",4.23,70382,1531
1,2.0,Wish You Were Here,Pink Floyd,12 September 1975,"Progressive Rock, Art Rock","melancholic, atmospheric, progressive, male vo...",4.29,48662,983
2,3.0,In the Court of the Crimson King,King Crimson,10 October 1969,"Progressive Rock, Art Rock","fantasy, epic, progressive, philosophical, com...",4.3,44943,870
3,4.0,Kid A,Radiohead,3 October 2000,"Art Rock, Experimental Rock, Electronic","cold, melancholic, futuristic, atmospheric, an...",4.21,58590,734
4,5.0,To Pimp a Butterfly,Kendrick Lamar,15 March 2015,"Conscious Hip Hop, West Coast Hip Hop, Jazz Rap","political, conscious, poetic, protest, concept...",4.27,44206,379


In [8]:
df1.shape, df2.shape

((500, 6), (5000, 9))

In [9]:
df = pd.merge(df1, df2, on=["Album"])
# picked just the following columns to avoid repeated colmuns
df = df[["Number", "Album", "Artist", "Genres","Subgenre" , "Descriptors", "Ranking", "Release Date", "Average Rating", "Number of Ratings", "Number of Reviews"]]
df.head()

Unnamed: 0,Number,Album,Artist,Genres,Subgenre,Descriptors,Ranking,Release Date,Average Rating,Number of Ratings,Number of Reviews
0,1,Sgt. Pepper's Lonely Hearts Club Band,The Beatles,"Psychedelic Pop, Pop Rock","Rock & Roll, Psychedelic Rock","psychedelic, playful, melodic, male vocals, wa...",24.0,1 June 1967,4.13,43576,863
1,2,Pet Sounds,The Beach Boys,Baroque Pop,"Pop Rock, Psychedelic Rock","Wall of Sound, warm, bittersweet, love, romant...",20.0,16 May 1966,4.18,36305,727
2,3,Revolver,The Beatles,"Pop Rock, Psychedelic Pop","Psychedelic Rock, Pop Rock","psychedelic, melodic, male vocals, drugs, ecle...",11.0,5 August 1966,4.23,43178,1160
3,4,Highway 61 Revisited,Bob Dylan,"Folk Rock, Singer/Songwriter","Folk Rock, Blues Rock","poetic, cryptic, surreal, sarcastic, male voca...",35.0,30 August 1965,4.17,26338,604
4,5,Rubber Soul,The Beatles,Pop Rock,Pop Rock,"melodic, love, male vocals, bittersweet, roman...",73.0,3 December 1965,4.03,33508,721


Create an RDFLib `Graph` and add triples from the data of the dataframes**

For example, add triples like the following (but not only):

- `[album] dbp:artist [artist]`,
- `[album] rdfs:label [album_name]`,
- `[album] rdf:type [genre]`

Make sure you use URIs and Literals correctly.

In [10]:
from pandas._config import describe_option

rdf_graph = Graph()

ex = Namespace('http://example.org/')
dbr = Namespace('http://dbpedia.org/resource/')
dbo = Namespace('http://dbpedia.org/ontology/')
dbp = Namespace('http://dbpedia.org/property/')

rdf_graph.bind("ex", 'http://example.org/')
rdf_graph.bind("dbr", 'http://dbpedia.org/resource/')
rdf_graph.bind("dbo", 'http://dbpedia.org/ontology/')
rdf_graph.bind("dbp", 'http://dbpedia.org/property/')

triples = []

for index, row in df.iterrows():
  # replaced " "(space) with "_" to be able to store as a single element in the triple
  artist = row["Artist"].replace(" ","_").replace("\"","")
  album = row["Album"].replace(" ","_").replace("\"","")
  genres = row["Genres"].replace(" ","_").replace("\"","").split(",") + row["Subgenre"].replace(" ","_").replace("\"","").split(",")
  descriptors = row["Descriptors"].replace(" ","_").replace("\"","").split(",")

  triples.append((dbr[album], dbp.artist, dbr[artist]))
  triples.append((dbr[album], rdfs.label, Literal(album)))

  for genre in genres:
    triples.append((dbr[album], dbp.genre, dbp[genre]))

  for descriptor in descriptors:
    triples.append((dbr[album], dbp.description, dbp[descriptor]))


for triple in triples:
  rdf_graph.add(triple)

print(rdf_graph.serialize())


@prefix dbp: <http://dbpedia.org/property/> .
@prefix dbr: <http://dbpedia.org/resource/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://dbpedia.org/resource/#1_Record> rdfs:label "#1_Record" ;
    dbp:artist dbr:Big_Star ;
    dbp:description dbp:_bittersweet,
        dbp:_love,
        dbp:_male_vocals,
        dbp:_optimistic,
        dbp:_playful,
        dbp:_romantic,
        dbp:_sentimental,
        dbp:_summer,
        dbp:_uplifting,
        dbp:melodic ;
    dbp:genre dbp:Power_Pop,
        dbp:_Folk_Pop .

<http://dbpedia.org/resource/(What's_the_Story)_Morning_Glory?> rdfs:label "(What's_the_Story)_Morning_Glory?" ;
    dbp:artist dbr:Oasis ;
    dbp:description dbp:_anthemic,
        dbp:_bittersweet,
        dbp:_dense,
        dbp:_energetic,
        dbp:_lush,
        dbp:_male_vocals,
        dbp:_passionate,
        dbp:_uplifting,
        dbp:_warm,
        dbp:melodic ;
    dbp:genre dbp:Brit_Pop,
        dbp:Britpop .

dbr:1999 rdfs:label "1999

### **`2. Integrate your data with DBpedia's`**

You MUST extend the knowledge you already have on your local KG with the following from DBpedia:
- band members
- city of birth of members
- country of birth of members
- founding city of the band
- founding country of the band

You can also add more information regarding albums (optional)



Recommendations and hints:
- How to find IRIs from DBpedia that match the entities in the local KG?
  1) guessing the IRI with simple tricks:
     - the album 'Rubber Soul' on DBpedia is `dbr:Rubber_Soul`, and this 'translation' may work for other albums (don't do it manually)
     - the album 'Led Zeppelin' on DBpedia is `dbr:Led_Zeppelin_(album)` as the same name is also associated to the band
  2) through their `rdfs:label`: for instance, the artist [Amy Winehouse](https://en.wikipedia.org/wiki/Amy_Winehouse) has 'Amy Winehouse' as label on DBpedia and therfore after the matching the IRI of the entity can be extracted
  3) using the [DBpedia lookup endpoint](https://github.com/dbpedia/lookup). The following request tries to find the DBpedia entity for the artist [Fabrizio De Andrè](https://en.wikipedia.org/wiki/Fabrizio_De_Andr%C3%A9): https://lookup.dbpedia.org/api/search/KeywordSearch?QueryString=fabrizio%20de%20andre&format=json&MaxHits=5
- restrict the search to entities of type 'Album', 'Band', 'Country', etc, in order to avoid overloading DBPedia's endpoint (see [here](http://mappings.dbpedia.org/server/ontology/classes/) for a list of DBPedia classes) and thus avoid high latency or being banned
- note that sometimes useful properties to reach entities have the `dbp` prefix which is different from `dbo` ([more info](https://parklize.blogspot.com/2016/05/dbpedia-difference-between-dbo-and-dbp.html)). It's ok to use both
- here is a **non-exaustive** list of useful properties: `dbp:hometown`, `dbo:birthPlace`, `dbo:country`, `dbo:formerBandMember`
- how to find properties that link two entities of our interest?
  1) by checking the [DBpedia ontology](https://akswnc7.informatik.uni-leipzig.de/dstreitmatter/archivo/dbpedia.org/ontology/2023.05.12-020000/ontology_type=parsed.owl) manually
  2) by checking the DBpedia ontology with Protégé
  3) using the DBpedia resource pages and exploiting your knowledge (or Wikipedia information). For instance, let us consider [Metallica](https://en.wikipedia.org/wiki/Metallica), I know that [Dave Mustaine](https://en.wikipedia.org/wiki/Dave_Mustaine) was a member, therefore I ask myself: How are these two entities linked in the DBpedia KG? On the [DBpedia page of Metallica](https://dbpedia.org/page/Metallica) we can find that the property used is `dbo:formerBandMember` thus I can use this property to get band members
  4) running SPARQL queries like the following in the DBpedia endpoint:
      ```
      SELECT DISTINCT ?p
      WHERE {
        ?s a dbo:Band .
        ?o a dbo:MusicalArtist .
        ?s ?p ?o .
      }
      ```
  5) if you are curious enough you can check [ABSTAT](http://abstat.disco.unimib.it/about.html), a tool for KG exploration with a nice interface
- please notice that DBpedia is not perfect so is reccomendable to check the type of the candidates before including them

In [11]:
from SPARQLWrapper import SPARQLWrapper

sparql = SPARQLWrapper("https://dbpedia.org/sparql")

triples = []

for artist in df.Artist:
  # query to get some information for each artist
  query = f'''
    SELECT ?bandmember ?birthcity ?birthcountry ?bandcity ?bandcountry
    WHERE {{
      OPTIONAL
        {{
        ?p rdfs:label "{artist}"@en.
        ?p dbo:formerBandMember ?bandmember.
        ?bandmember dbo:birthPlace ?birthcity.
        ?birthcity dbo:country ?birthcountry.
        }}
        OPTIONAL
        {{
          ?p rdfs:label "{artist}"@en.
        ?p dbo:hometown ?bandcity.
        ?bandcity dbo:country ?bandcountry.
        }}
    }}
  '''
  artist = artist.replace(' ',"_").replace("\"","")

  sparql.setQuery(query)
  sparql.setReturnFormat("csv")
  results = sparql.query().convert()

  # convert the results to a pandas dataframe so i iterate through them easily

  with open("results.csv", 'wb') as f: f.write(results)

  df_scratch = pd.read_csv("results.csv")

  for index, row in df_scratch.iterrows():

    # try and except because some names are written strangely so will give me an error otherwise

    try:
      bandmember = row["bandmember"].split("/")[-1:]
      birthcity = row["birthcity"].split("/")[-1:]
      birthcountry = row["birthcountry"].split("/")[-1:]
      bandcity = row["bandcity"].split("/")[-1:]
      bandcountry = row["bandcountry"].split("/")[-1:]
      triples.append((dbr[artist], dbo.formerBandMember, dbr[bandmember[0]]))
      triples.append((dbr[bandmember[0]], dbo.birthPlace, dbr[birthcity[0]]))
      triples.append((dbr[birthcity[0]], dbo.country, dbr[birthcountry[0]]))
      triples.append((dbr[artist], dbo.hometown, dbr[bandcity[0]]))
      triples.append((dbr[bandcity[0]], dbo.country, dbr[bandcountry[0]]))
    except:
      pass


for triple in triples:
  rdf_graph.add(triple)
print(rdf_graph.serialize())


@prefix dbo: <http://dbpedia.org/ontology/> .
@prefix dbp: <http://dbpedia.org/property/> .
@prefix dbr: <http://dbpedia.org/resource/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://dbpedia.org/resource/#1_Record> rdfs:label "#1_Record" ;
    dbp:artist dbr:Big_Star ;
    dbp:description dbp:_bittersweet,
        dbp:_love,
        dbp:_male_vocals,
        dbp:_optimistic,
        dbp:_playful,
        dbp:_romantic,
        dbp:_sentimental,
        dbp:_summer,
        dbp:_uplifting,
        dbp:melodic ;
    dbp:genre dbp:Power_Pop,
        dbp:_Folk_Pop .

<http://dbpedia.org/resource/(What's_the_Story)_Morning_Glory?> rdfs:label "(What's_the_Story)_Morning_Glory?" ;
    dbp:artist dbr:Oasis ;
    dbp:description dbp:_anthemic,
        dbp:_bittersweet,
        dbp:_dense,
        dbp:_energetic,
        dbp:_lush,
        dbp:_male_vocals,
        dbp:_passionate,
        dbp:_uplifting,
        dbp:_warm,
        dbp:melodic ;
    dbp:genre dbp:Brit_Pop,
  

### **`3. Gathering Information from unstructured data`**

Knowledge can be gathered from unstructured data as well as structured data. A great encyclopaedic source is Wikipedia whose articles follow a very strict pattern. For instance, the first lines about a person are reserved for biographical data and his or her occupation, therefore we can exploit it to understand the role of a band member and add this information to our KG.

In steps:
1) get the article from Wikipedia and extract the summary. You can use [Wikipedia-api](https://pypi.org/project/Wikipedia-API/) (see the code below)
2) extract the band member role (guitarist, bassist, keyboardist, singer, drummer, etc) using rules
3) add this information to the KG

Recommendations and hints:
- some artists have multiple roles, add them all
- some artists have multiple bands, it is not required to understand the role for each band, keep it simple.
- extraction rules can be very simple (simple string matchers + if-else)  or more complex ones (e.g., regex, ChatGPT requests), it doesn't matter in terms of points.


In [12]:
import wikipediaapi

wiki_wiki = wikipediaapi.Wikipedia('en')
page_py = wiki_wiki.page('Kurt Cobain')

if page_py.exists():
    summary = page_py.summary[:200]
    print("Page - Summary: %s" % summary)
else:
    print("Page does not exist.")

Page - Summary: Kurt Donald Cobain (February 20, 1967 – c. April 5, 1994) was an American musician who was the co-founder, lead vocalist, guitarist and primary songwriter of the rock band Nirvana. Through his angst-f


In [13]:
# list of roles for extraction

roles = ["guitarist", "bassist", "keyboardist", "singer", "drummer", "songwriter", "vocalist", "pianist"]

bandmembers = []
triples = []

# retrieve every member from the graph

for s, p, o in rdf_graph.triples((None, dbo.formerBandMember, None)):
  member = o.split("/")[-1:]
  member = member[0]
  bandmembers.append((member))


for member in bandmembers:

  member_roles = []

  wiki_wiki = wikipediaapi.Wikipedia('en')
  page_py = wiki_wiki.page(member)

  if page_py.exists():
      summary = page_py.summary[:200]
  else:
      print("Page does not exist.")

  for word in summary.split(" "):
    if word in roles:
      member_roles.append(word)


  for role in member_roles:
    triples.append((dbr[member], dbo.memberrole, dbr[role]))


for triple in triples:
  rdf_graph.add(triple)
print(rdf_graph.serialize())


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        dbp:_Psychedelic_Rock .

dbr:Electric_Warrior rdfs:label "Electric_Warrior" ;
    dbp:artist dbr:T._Rex ;
    dbp:description dbp:_energetic,
        dbp:_love,
        dbp:_male_vocals,
        dbp:_melodic,
        dbp:_passionate,
        dbp:_playful,
        dbp:_sensual,
        dbp:_space,
        dbp:_warm,
        dbp:sexual ;
    dbp:genre dbp:Glam,
        dbp:Glam_Rock,
        dbp:_Classic_Rock .

dbr:Elephant rdfs:label "Elephant" ;
    dbp:artist dbr:The_White_Stripes ;
    dbp:description dbp:_female_vocals,
        dbp:_love,
        dbp:_male_vocals,
        dbp:_melodic,
        dbp:_minimalistic,
        dbp:_playful,
        dbp:_raw,
        dbp:_rebellious,
        dbp:_romantic,
        dbp:energetic ;
    dbp:genre dbp:Blues_Rock,
        dbp:Garage_Rock_Revival,
        dbp:_Alternative_Rock,
        dbp:_Garage_Rock .

<http://dbpedia.org/resource/Entertainment!> rdfs:label "Entertainmen

### **`4. Include a small ontology and a  genres taxonomy`**

Using RDFlib, add classes such as Person, MusicAlbum, and so on to your KG. Add the domain and range for each property you use.

This will be needed later for inferring the types of the entities of your KG.

In [14]:
# just adding triples to the graph
triples = [
    (dbr.Person, rdf.type, rdfs.Class),
    (dbr.Artist , rdf.type, rdfs.Class),
    (dbr.MusicAlbum, rdf.type, rdfs.Class),
    (dbr.Genre , rdf.type, rdfs.Class),
    (dbr.Subgenre, rdf.type, rdfs.Class),
    (dbr.Descriptors , rdf.type, rdfs.Class),
    (dbr.Country, rdf.type, rdfs.Class),
    (dbr.City, rdf.type, rdfs.Class),
    (dbr.Role, rdf.type, rdfs.Class),

    (dbp.artist , rdfs.domain, dbr.MusicAlbum),
    (dbp.artist, rdfs.range, dbr.Artist),

    (dbp.genre , rdfs.domain, dbr.MusicAlbum),
    (dbp.genre, rdfs.range, dbr.Genre),

    (dbp.description , rdfs.domain, dbr.MusicAlbum),
    (dbp.description, rdfs.range, dbr.Descriptors),

    (dbo.formerBandMember , rdfs.domain, dbr.Artist),
    (dbo.formerBandMember, rdfs.range, dbr.Person),

    (dbo.birthPlace, rdfs.domain, dbr.Person),
    (dbo.birthPlace, rdfs.range, dbr.City),

    (dbo.country, rdfs.domain, dbr.City),
    (dbo.country, rdfs.range, dbr.Country),

    (dbo.hometown, rdfs.domain, dbr.Artist),
    (dbo.hometown, rdfs.range, dbr.City),

    (dbo.memberrole, rdfs.domain, dbr.Person),
    (dbo.memberrole, rdfs.range, dbr.Role)
]

for triple in triples:
  rdf_graph.add(triple)
print(rdf_graph.serialize())

@prefix dbo: <http://dbpedia.org/ontology/> .
@prefix dbp: <http://dbpedia.org/property/> .
@prefix dbr: <http://dbpedia.org/resource/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

dbr:Artist a rdfs:Class .

dbr:City a rdfs:Class .

dbr:Country a rdfs:Class .

dbr:Descriptors a rdfs:Class .

dbr:Genre a rdfs:Class .

dbr:MusicAlbum a rdfs:Class .

dbr:Person a rdfs:Class .

dbr:Role a rdfs:Class .

dbr:Subgenre a rdfs:Class .

dbo:birthPlace rdfs:domain dbr:Person ;
    rdfs:range dbr:City .

dbo:country rdfs:domain dbr:City ;
    rdfs:range dbr:Country .

dbo:formerBandMember rdfs:domain dbr:Artist ;
    rdfs:range dbr:Person .

dbo:hometown rdfs:domain dbr:Artist ;
    rdfs:range dbr:City .

dbo:memberrole rdfs:domain dbr:Person ;
    rdfs:range dbr:Role .

dbp:artist rdfs:domain dbr:MusicAlbum ;
    rdfs:range dbr:Artist .

dbp:description rdfs:domain dbr:MusicAlbum ;
    rdfs:range dbr:Descriptors .

dbp:genre rdfs:domain dbr:MusicAlbum ;
    rdfs:range dbr:Genre .

<

Moreover, the first dataframe has 'Genre' and 'Subgenre' columns.
Extract a genres taxonomy from the dataset and include it in your RDF KG. This will be needed later for inferring genres from subgenres.

E.g.: add triples to the graph like:

- `[subgenre] rdfs:subClassOf [genre]`.
- `[album] rdf:type [genre]`.

This should be done for all genres and subgenres (with an iteration, not manually).

In [16]:
# iterate through every genre add the corresponding triple
# then for each genre iterate through all its subgenres

triples = []

for index, row in df.iterrows():
  album = row["Album"].replace(" ","_").replace("\"","")
  subgenres = row["Subgenre"].replace('/',',').replace(' ','').split(',')
  genres = row["Genres"].replace('/',',').replace(' ','').split(',')
  for genre in genres:
    for subgenre in subgenres:
      triples.append((dbr[subgenre], rdfs.subClassOf, dbr[genre]))
    triples.append((dbr[album], rdf.type, dbr[genre]))


for triple in triples:
    rdf_graph.add(triple)
print(rdf_graph.serialize())

@prefix dbo: <http://dbpedia.org/ontology/> .
@prefix dbp: <http://dbpedia.org/property/> .
@prefix dbr: <http://dbpedia.org/resource/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

dbr:Artist a rdfs:Class .

dbr:City a rdfs:Class .

dbr:Country a rdfs:Class ;
    rdfs:subClassOf dbr:Alt-Country,
        dbr:Americana,
        dbr:Blue-EyedSoul,
        dbr:ContemporaryFolk,
        dbr:Country,
        dbr:CountrySoul,
        dbr:OutlawCountry,
        dbr:Singer,
        dbr:Songwriter .

dbr:Descriptors a rdfs:Class .

dbr:Genre a rdfs:Class .

dbr:MusicAlbum a rdfs:Class .

dbr:Person a rdfs:Class .

dbr:Role a rdfs:Class .

dbr:Subgenre a rdfs:Class .

dbo:birthPlace rdfs:domain dbr:Person ;
    rdfs:range dbr:City .

dbo:country rdfs:domain dbr:City ;
    rdfs:range dbr:Country .

dbo:formerBandMember rdfs:domain dbr:Artist ;
    rdfs:range dbr:Person .

dbo:hometown rdfs:domain dbr:Artist ;
    rdfs:range dbr:City .

dbo:memberrole rdfs:domain dbr:Person ;
    rdfs

### **`5. Materialize inferences`**

Add to your KG inferences that you can make from the data on your KG (e.g., with SPARQL queries, or with OWLRL, ...).

**<ins>Some</ins>**  **<ins>examples</ins>**
- if you have that `[album] rdf:type [genre1]` and `[genre1] rdfs:subClassOf [genre2]`, add the triple `[album] rdf:type [genre2]`. Same for domains, ranges, subclasses, subproperties, OWL features, etc.
- if the nationality of a group member is missing, you can infer it from the nationality of the band

Run a few SPARQL queries on the graph to see the inferred triples.


In [17]:
# expand the graph with what can be inferred
owlrl.DeductiveClosure(owlrl.RDFS_Semantics).expand(rdf_graph)
print(rdf_graph.serialize())

@prefix dbo: <http://dbpedia.org/ontology/> .
@prefix dbp: <http://dbpedia.org/property/> .
@prefix dbr: <http://dbpedia.org/resource/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

dbr:Artist a rdfs:Class,
        rdfs:Resource ;
    rdfs:subClassOf dbr:Artist,
        rdfs:Resource .

dbr:City a rdfs:Class,
        rdfs:Resource ;
    rdfs:subClassOf dbr:City,
        rdfs:Resource .

dbr:Country a rdfs:Class,
        rdfs:Resource ;
    rdfs:subClassOf dbr:Alt-Country,
        dbr:Americana,
        dbr:Blue-EyedSoul,
        dbr:ContemporaryFolk,
        dbr:Country,
        dbr:CountrySoul,
        dbr:OutlawCountry,
        dbr:Singer,
        dbr:Songwriter,
        rdfs:Resource .

dbr:Descriptors a rdfs:Class,
        rdfs:Resource ;
    rdfs:subClassOf dbr:Descriptors,
        rdfs:Resource .

dbr:Genre a rdfs:Class,
        rdfs:Resource ;
    rdfs:subClassOf dbr:Genre,
        rdfs:Resource .

dbr:Mus

In [18]:
# some of the inferred triples
for s, p, o in rdf_graph.triples((None, rdf.type, dbr.Artist)):
 print(s)

http://dbpedia.org/resource/New_York_Dolls
http://dbpedia.org/resource/Elton_John
http://dbpedia.org/resource/Bjork
http://dbpedia.org/resource/The_Paul_Butterfield_Blues_Band
http://dbpedia.org/resource/The_Smashing_Pumpkins
http://dbpedia.org/resource/N.W.A
http://dbpedia.org/resource/My_Bloody_Valentine
http://dbpedia.org/resource/Sex_Pistols
http://dbpedia.org/resource/Bob_Marley_&_The_Wailers
http://dbpedia.org/resource/Santana
http://dbpedia.org/resource/Kraftwerk
http://dbpedia.org/resource/Big_Brother_&_the_Holding_Company
http://dbpedia.org/resource/Little_Richard
http://dbpedia.org/resource/T._Rex
http://dbpedia.org/resource/Guns_N'_Roses
http://dbpedia.org/resource/The_Kinks
http://dbpedia.org/resource/Sonic_Youth
http://dbpedia.org/resource/Metallica
http://dbpedia.org/resource/Elvis_Presley
http://dbpedia.org/resource/The_Jimi_Hendrix_Experience
http://dbpedia.org/resource/Patti_Smith
http://dbpedia.org/resource/Otis_Redding
http://dbpedia.org/resource/Red_Hot_Chili_Pepper

### **`6. Query the Graph`**

Query your graph in a way that:
- the usefulness of having added information from dbpedia is highlighted
- inference is exploited


**<ins>Some</ins>**  **<ins>examples</ins>** with the intentions:


| ID | Query | Intention/insight |
|----|------|-------------------|
| 1  | which bands have at least one member with a different nationality? | just curiosity |
| 2  | Which are the artist with many bands? | to spot session musicians, long-career artists, 'eclectic' artists |
| 3  | which are the countries with more [ Latin Rock ]( https://en.wikipedia.org/wiki/Latin_rock ) bands? and what about [ Black Metal ]( https://en.wikipedia.org/wiki/Black_metal )? | I suppose that latin rock is more popular in Latin America countries while Black Metal in Scandinavian countries, but I may be wrong! |
| 4  | Is there any artist who is a member of both a [ Britpop ]( https://en.wikipedia.org/wiki/Britpop ) band and a [ Hip hop ]( https://en.wikipedia.org/wiki/Hip_hop_music ) band | maybe to spot bands/artists with a strong musical contamination? e.g., [Damon Albarn ](https://en.wikipedia.org/wiki/Damon_Albarn) |
| 5  | Which is the city with more bands in absolute? and what if we consider only [ Grunge ]( https://en.wikipedia.org/wiki/Grunge )? | maybe to discover a music scene or just a very influential city. |
| 6  | Give me some band/artist from the same city of [ 2Pac ]( https://en.wikipedia.org/wiki/Tupac_Shakur ) | could be a way to find similar musical artists. What about adding also a time period and the musical genre? |
| 7  | which are the bands with Argentine and Spanish members? | maybe because I love both the Argentine and Spanish sound and I hope there is something that matches both? e.g., [Los Rodriguez](https://en.wikipedia.org/wiki/Los_Rodr%C3%ADguez) |




**Feel free to play and get cool insights**

In [19]:
# bands with atleast one member with a different nationality

query1 = f"""
    SELECT DISTINCT ?band
    WHERE {{
        ?band dbo:formerBandMember ?musician1.
        ?musician1 dbo:birthPlace/dbo:country ?country1.

        ?band dbo:formerBandMember ?musician2.
        ?musician2 dbo:birthPlace/dbo:country ?country2.

        FILTER(?musician1 != ?musician2 && ?country1 != ?country2)
    }}
"""

results = rdf_graph.query(query1)

for res in results:
  print(res)

(rdflib.term.URIRef('http://dbpedia.org/resource/The_Clash'),)
(rdflib.term.URIRef('http://dbpedia.org/resource/The_Pogues'),)
(rdflib.term.URIRef('http://dbpedia.org/resource/The_Velvet_Underground'),)
(rdflib.term.URIRef('http://dbpedia.org/resource/Fleetwood_Mac'),)
(rdflib.term.URIRef('http://dbpedia.org/resource/Derek_and_the_Dominos'),)
(rdflib.term.URIRef('http://dbpedia.org/resource/The_Who'),)
(rdflib.term.URIRef('http://dbpedia.org/resource/Ramones'),)
(rdflib.term.URIRef('http://dbpedia.org/resource/AC/DC'),)
(rdflib.term.URIRef('http://dbpedia.org/resource/The_Zombies'),)
(rdflib.term.URIRef('http://dbpedia.org/resource/The_Kinks'),)
(rdflib.term.URIRef('http://dbpedia.org/resource/Talking_Heads'),)
(rdflib.term.URIRef('http://dbpedia.org/resource/Metallica'),)
(rdflib.term.URIRef('http://dbpedia.org/resource/Funkadelic'),)
(rdflib.term.URIRef('http://dbpedia.org/resource/Kraftwerk'),)
(rdflib.term.URIRef('http://dbpedia.org/resource/The_Jesus_and_Mary_Chain'),)
(rdflib.ter

In [20]:
# artists associated with many bands

query2 = f"""
    SELECT ?musician
    WHERE {{
        ?band dbo:formerBandMember ?musician.
    }}
    GROUP BY ?musician
    HAVING (COUNT(?band) > 1)
"""

results = rdf_graph.query(query2)

for res in results:
  print(res)

(rdflib.term.URIRef('http://dbpedia.org/resource/Joe_Strummer'),)
(rdflib.term.URIRef('http://dbpedia.org/resource/Dave_Mason'),)
(rdflib.term.URIRef('http://dbpedia.org/resource/Jim_Rodford'),)


In [22]:
# countries with Latin rock bands

query3= f"""
    SELECT ?country (COUNT(?band) AS ?countband)
    WHERE {{
        ?band dbo:formerBandMember ?x.
        ?album dbp:artist ?band.
        ?album rdf:type dbr:LatinRock.
        ?band dbo:hometown ?city.
        ?city dbo:country ?country.
    }}
    GROUP BY ?country
    ORDER BY DESC(COUNT(?band))
"""
results = rdf_graph.query(query3)

for res in results:
  print(res)

(rdflib.term.URIRef('http://dbpedia.org/resource/United_Kingdom'), rdflib.term.Literal('102', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))
(rdflib.term.URIRef('http://dbpedia.org/resource/United_States'), rdflib.term.Literal('79', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))
(rdflib.term.URIRef('http://dbpedia.org/resource/Australia'), rdflib.term.Literal('16', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))
(rdflib.term.URIRef('http://dbpedia.org/resource/Republic_of_Ireland'), rdflib.term.Literal('3', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))


In [23]:
# artist who belong to both a brit pop and hip pop band

query4 = f"""
    SELECT ?musician
    WHERE {{
        ?band1 dbo:formerBandMember ?musician.
        ?band2 dbo:formerBandMember ?musician.
        ?album1 dbp:artist ?band1.
        ?album1 a dbr:BritPop.
        ?album2 dbp:artist ?band2.
        ?album2 a dbr:HipHop.
    }}
"""
results = rdf_graph.query(query4)

for res in results:
  print(res)

# this query has no output

In [24]:
# city with more bands

query5 = f"""
    SELECT ?city (COUNT(?band) AS ?countband)
    WHERE {{
        ?band dbo:hometown ?city.
    }}
    GROUP BY ?city
    ORDER BY DESC(COUNT(?band))
"""
results = rdf_graph.query(query5)

for res in results:
  print(res)

(rdflib.term.URIRef('http://dbpedia.org/resource/London'), rdflib.term.Literal('8', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))
(rdflib.term.URIRef('http://dbpedia.org/resource/New_York_City'), rdflib.term.Literal('6', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))
(rdflib.term.URIRef('http://dbpedia.org/resource/Los_Angeles'), rdflib.term.Literal('3', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))
(rdflib.term.URIRef('http://dbpedia.org/resource/Forest_Hills,_Queens'), rdflib.term.Literal('2', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))
(rdflib.term.URIRef('http://dbpedia.org/resource/California'), rdflib.term.Literal('2', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))
(rdflib.term.URIRef('http://dbpedia.org/resource/Massachusetts'), rdflib.term.Literal('2', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))
(rdflib.term.UR

In [25]:
# artists from the same city as 2pac
# from dbpedia lookup I found that 2pac is from New York City.
# otherwise in my knowledge graph there was no way of finding that out.

query6 = f"""
    SELECT ?band ?musician
    WHERE {{
      ?band dbo:hometown dbr:New_York_City.
      ?musician dbo:birthPlace dbr:New_York_City.
    }}
"""
results = rdf_graph.query(query6)

for res in results:
    print(res)


(rdflib.term.URIRef('http://dbpedia.org/resource/The_Velvet_Underground'), rdflib.term.URIRef('http://dbpedia.org/resource/Lou_Reed'))
(rdflib.term.URIRef('http://dbpedia.org/resource/The_Velvet_Underground'), rdflib.term.URIRef('http://dbpedia.org/resource/Moe_Tucker'))
(rdflib.term.URIRef('http://dbpedia.org/resource/The_Velvet_Underground'), rdflib.term.URIRef('http://dbpedia.org/resource/Johnny_Ramone'))
(rdflib.term.URIRef('http://dbpedia.org/resource/The_Velvet_Underground'), rdflib.term.URIRef('http://dbpedia.org/resource/Phife_Dawg'))
(rdflib.term.URIRef('http://dbpedia.org/resource/The_Velvet_Underground'), rdflib.term.URIRef('http://dbpedia.org/resource/Eddie_Hazel'))
(rdflib.term.URIRef('http://dbpedia.org/resource/The_Velvet_Underground'), rdflib.term.URIRef('http://dbpedia.org/resource/Cliff_Martinez'))
(rdflib.term.URIRef('http://dbpedia.org/resource/Ramones'), rdflib.term.URIRef('http://dbpedia.org/resource/Lou_Reed'))
(rdflib.term.URIRef('http://dbpedia.org/resource/Ram

In [26]:
# bands with both argentine and spanish members

query7 = f"""
    SELECT ?band
    WHERE {{
        ?band dbo:formerBandMember ?musician1.
        ?band dbo:formerBandMember ?musician2.
        ?musician1 dbo:birthPlace/dbo:country dbr:Argentina.
        ?musician2 dbo:birthPlace/dbo:country dbr:Spain.
    }}
"""
results = rdf_graph.query(query7)

for res in results:
    print(res)


### **`Notes`**
- attempts to enrich the data (e.g., using Wikipedia articles, other KGs and data sources) will be positively evaluated, but only if enrichment in steps 2 and 3 has been sufficiently explored
- sometimes matching local entities with DBpedia entities is difficult: do not worry too much about that, we will give more importance to the choices and motivations rather than the end result of the matching
- as you can see, the instructions leave enough freedom, however in case of non-obvious clarifications or technical issues contact me at renzo.alvaprincipe@unimib.it or use the course forum. Questions about how to do the exercise will be ignored
- plagiarism will be **<ins>severely</ins>** punished for both parties