# HowLongToBeat RDF Creator

We load the generated CSV files and serialize all the data into ***Turtle format (TTL)*** using the ***RDFLib*** Python library.

## Setup

We import all the necessary libraries and we set the paths to the input/output files. In particular, we create a TTL file for each type of data.

In [1]:
# Imports
import os
from pathlib import Path
import pandas as pd
from datetime import datetime

# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace

# RDFLib knows about some namespaces, like XSD
from rdflib.namespace import XSD

In [2]:
# Base directory: the current working directory of the notebook
absPath = str(Path.cwd())
datasetsPath = os.path.join(absPath, "cleaned_datasets")
rawdatasetsPath = os.path.join(absPath, "raw_datasets")
rdfPath = os.path.join(absPath, "rdf")

# Create the dataset and RDF directories if they do not exist yet.
# makedirs(exist_ok=True) replaces the separate "exists?" check followed by mkdir,
# so the cell can be re-run safely.
os.makedirs(datasetsPath, exist_ok=True)
os.makedirs(rdfPath, exist_ok=True)

# Setup datasets paths (input CSV files, all read from the cleaned datasets directory)
gamesPath = os.path.join(datasetsPath, "games_cleaned.csv")
vgchartzPath = os.path.join(datasetsPath, "vgchartz_cleaned.csv")
indiegamesdevelopersPath = os.path.join(datasetsPath, "indiegamesdevelopers_cleaned_seriesExplode.csv")
platformsPath = os.path.join(datasetsPath, "platforms.csv")
videoGameDevelopersPath = os.path.join(datasetsPath, "videogamesdevelopers_cleaned_seriesexplode.csv")

# Countries-Regions path
countriesRegionsPath = os.path.join(datasetsPath, "countries-regions.csv")

# Setup Turtle paths (output files, one TTL file per type of data)
genresTTLPath = os.path.join(rdfPath, "genres.ttl")
gamesTTLPath = os.path.join(rdfPath, "games.ttl")
companyTTLPath = os.path.join(rdfPath, "company.ttl")
platformsTTLPath = os.path.join(rdfPath, "platforms.ttl")
videoGameDevelopersTTLPath = os.path.join(rdfPath, "videoGameDevelopers.ttl")

In [3]:
# Country Ontology: namespace bound to the "countries" prefix in createGraph()
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")

# HLTB Ontology: namespace under which the classes, properties and individuals
# created in this notebook are minted (bound to the "hltb" prefix)
HLTB = Namespace("http://www.semanticweb.org/enrico/ontologies/2022/10/HLTB-db2unipd#")

In [4]:
def createGraph():
    """Return a new, empty RDFLib graph with the notebook's prefixes bound.

    The prefixes "xsd", "countries" and "hltb" are bound (in that order) to
    XSD, CNS and HLTB so that the serialized output is more readable.
    """
    graph = Graph()

    # Prefix -> namespace pairs used throughout the notebook
    prefix_bindings = (
        ("xsd", XSD),
        ("countries", CNS),
        ("hltb", HLTB),
    )
    for prefix, namespace in prefix_bindings:
        graph.bind(prefix, namespace)

    return graph

## Serialization

We serialize the data according to the following workflow:

1. Load the CSV file and iterate through it
2. Create a unique ID ourselves, derived from the name of the entity (e.g. the title of a game).
3. Add the node to the graph using the unique ID.
4. Add all the data properties.
5. Add all the object properties.
6. Serialize the data and save them into a TTL file.

### Games

Now serializing the Game class

In [5]:
# Start the Game section from a new, empty graph (prefixes are bound by createGraph)
g = createGraph()

In [6]:
# Load the cleaned games and VGChartz CSV files in memory, indexed by game title
games = pd.read_csv(gamesPath, sep=",", index_col="title")
vgchartz = pd.read_csv(vgchartzPath, sep=",", index_col="title")
# NOTE: the indie games developers CSV is not loaded here; it is read in the Company section.


In [7]:
def createGameID(title):
    """Build the ID of a game from its title.

    Every run of non-alphanumeric characters becomes a single "-", leading and
    trailing separators are dropped, and the result is lower-cased.
    """
    # Blank out everything that is not alphanumeric, then split on the blanks:
    # what remains are exactly the alphanumeric runs of the title.
    spaced = "".join(char if char.isalnum() else " " for char in title)
    words = spaced.split()
    return "-".join(words).lower()

In [8]:
# Show a structural summary of the games table before converting it
games.info()

# Turn every row of the games table into an hltb:Game individual
for title, row in games.iterrows():
    # The node IRI is the HLTB namespace followed by the ID built from the title
    game_node = HLTB[createGameID(title)]

    # Type assertion plus the title as a string literal
    g.add((game_node, RDF.type, HLTB.Game))
    g.add((game_node, HLTB["title"], Literal(title, datatype=XSD.string)))

    # Multiplayer focus: true when a co-op or a versus value is present for the game
    has_coop = pd.notnull(row["coop"])
    has_versus = pd.notnull(row["versus"])
    g.add((game_node, HLTB["multiplayerFocus"], Literal(has_coop or has_versus, datatype=XSD.boolean)))

<class 'pandas.core.frame.DataFrame'>
Index: 35922 entries, 688(I) Hunter/Killer to Yooka-Laylee and the Impossible Lair
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        35922 non-null  int64  
 1   id                35922 non-null  int64  
 2   main_story        17324 non-null  float64
 3   main_plus_extras  11631 non-null  float64
 4   completionist     13107 non-null  float64
 5   all_styles        21112 non-null  float64
 6   coop              183 non-null    float64
 7   versus            274 non-null    float64
 8   type              1314 non-null   object 
 9   developers        34080 non-null  object 
 10  publishers        32754 non-null  object 
 11  platforms         24285 non-null  object 
 12  genres            32843 non-null  object 
dtypes: float64(6), int64(2), object(5)
memory usage: 3.8+ MB


## TODO: all other data about games are still missing

In [9]:
# Save the data in the Turtle format
# Serialize the Game graph as Turtle and write it, UTF-8 encoded, to the games TTL file.
# NOTE(review): fp.write() needs text, so this assumes g.serialize() returns a str — confirm
# against the installed RDFLib version.
with open(gamesTTLPath, "w", encoding="utf-8") as fp:
    fp.write(g.serialize(format="turtle"))

print("Saved games TTL file.")

Saved games TTL file.


### Genre

Now serializing the Genre class

In [10]:
# Start the Genre section from a new, empty graph
g = createGraph()
# Re-read the games CSV, this time indexed by its "genres" column
genres = pd.read_csv(gamesPath, sep=",", index_col="genres")

def setGenreID(genre):
    """Split a raw genre cell into its genres and build an ID for each of them.

    Both ", " and "/" act as separators. The ID of a genre is its lower-cased
    name with apostrophes removed and spaces replaced by "-".

    Parameters:
        genre: the raw content of a "genres" cell (a string, or NaN/None when missing).

    Returns:
        A list of [name, ID] pairs, one per genre found in the cell; an empty
        list when the cell is missing.
    """
    # A missing cell is detected explicitly. The previous approach stripped the
    # text "nan" from the whole string, which also damaged real genre names that
    # merely contain those letters (e.g. "Financial" -> "Ficial").
    if genre is None or (not isinstance(genre, str) and pd.isna(genre)):
        return []

    # Treat "/" as one more separator, exactly like ", "
    names = str(genre).replace("/", ", ").split(", ")

    return [[name, name.lower().replace("'", "").replace(" ", "-")] for name in names]

In [11]:
# Visit every distinct value of the "genres" column once. The row content is never
# needed, and repeated values would only re-add triples the graph already contains.
for genre in genres.index.unique():
    for name, genreID in setGenreID(genre):
        # Skip the empty/blank entries produced by missing or malformed cells
        if name in ("", " "):
            continue

        Genre = HLTB[genreID]
        # Add triples using store's add() method.
        g.add((Genre, RDF.type, HLTB.Genre))
        # Add the name of the genre
        g.add((Genre, HLTB["name"], Literal(name, datatype=XSD.string)))

In [12]:
# Serialize the Genre graph as Turtle and write it, UTF-8 encoded, to the genres TTL file
with open(genresTTLPath, "w", encoding="utf-8") as fp:
    fp.write(g.serialize(format="turtle"))

### Platforms

In [13]:
# Start the Platform section from a new, empty graph
g = createGraph()
# Load the platforms CSV in memory, indexed by its "Platform" column
platforms = pd.read_csv(platformsPath, sep=",", index_col="Platform")

In [14]:
def setPlatformID(platform):
    """Build the ID of a platform: its lower-cased name with each space turned into "-"."""
    lowered = platform.lower()
    return "-".join(lowered.split(" "))

In [15]:
# Turn every row of the platforms table into an hltb:Platform individual
for platform, row in platforms.iterrows():
    Platform = HLTB[setPlatformID(platform)]
    # Add triples using store's add() method.
    g.add((Platform, RDF.type, HLTB.Platform))
    # Add the name of the platform
    g.add((Platform, HLTB["name"], Literal(platform, datatype=XSD.string)))

    # Add popularity if platform is popular
    # (explicit comparison, so that a missing value does not count as popular)
    if row["Popular"] == True:
        g.add((Platform, HLTB["popular"], Literal(True, datatype=XSD.boolean)))

    # Add release date if present
    if pd.notna(row["Release date"]):
        # "%m" is the month directive. The former "%M" is the minute directive: the
        # month was never parsed and every release date ended up in January.
        # A date-only format already yields midnight, so no further combine() is needed.
        releaseDate = datetime.strptime(row["Release date"], "%Y-%m-%d")
        g.add((Platform, HLTB["releaseDate"], Literal(releaseDate, datatype=XSD.dateTime)))

    # Add CPU information
    if pd.notna(row["CPU"]):
        g.add((Platform, HLTB["cpu"], Literal(row["CPU"], datatype=XSD.string)))

    # Add CPU bit information: keep only the part before the first "-"
    # (the column header itself contains the double quotes)
    if pd.notna(row["\"Bits\""]):
        bits = row["\"Bits\""].split("-")[0]
        g.add((Platform, HLTB["bits"], Literal(bits, datatype=XSD.int)))

    # Add acronym information
    if pd.notna(row["Acronym"]):
        g.add((Platform, HLTB["acronym"], Literal(row["Acronym"], datatype=XSD.string)))

In [16]:
# Serialize the Platform graph as Turtle and write it, UTF-8 encoded, to the platforms TTL file
with open(platformsTTLPath, "w", encoding="utf-8") as fp:
    fp.write(g.serialize(format="turtle"))

### Company

In [17]:
# Start the Company section from a new, empty graph
g = createGraph()
# Load the indie games developers CSV in memory (default integer index)
indiegamesdevelopers = pd.read_csv(indiegamesdevelopersPath, sep=",")

In [18]:
# Derive the company ID from the developer name: spaces become "-", then lower case
developer_names = indiegamesdevelopers['Developer']
indiegamesdevelopers['companyID'] = developer_names.replace(' ', '-', regex=True).str.lower()

# One hltb:Company individual per row, carrying the developer name as its official name
company_rows = zip(indiegamesdevelopers['companyID'], indiegamesdevelopers['Developer'])
for company_id, developer in company_rows:
    # The node IRI is the HLTB namespace followed by the company ID
    company_node = HLTB[company_id]

    g.add((company_node, RDF.type, HLTB.Company))
    g.add((company_node, HLTB['officialName'], Literal(developer, datatype=XSD.string)))


### Video game developers

In [19]:
# Load the video game developers CSV in memory (default integer index)
videoGameDevelopers = pd.read_csv(videoGameDevelopersPath, sep=",")

In [22]:
# Derive the company ID from the developer name: spaces become "-", then lower case
videoGameDevelopers['companyID'] = videoGameDevelopers['Developer'].replace(' ', '-', regex=True).str.lower()

# One hltb:Company individual per row of the video game developers table
for index, row in videoGameDevelopers.iterrows():
    # Create the node to add to the Graph
    Company = HLTB[row['companyID']]

    # Add triples using store's add() method.
    g.add((Company, RDF.type, HLTB.Company))

    # Add the name of the company as a string literal.
    # The developer name used to be emitted as the value of hltb:indieDeveloper with
    # datatype xsd:boolean, which is not a valid boolean literal.
    # NOTE(review): if hltb:indieDeveloper has to be set for these companies, it needs
    # an actual true/false source — confirm which column (if any) provides it.
    g.add((Company, HLTB['officialName'], Literal(row['Developer'], datatype=XSD.string)))


In [23]:
# Serialize the Company graph as Turtle and write it, UTF-8 encoded, to the company TTL file.
# The graph holds the triples of both developer tables processed above, since no new
# graph is created between the two loops.
with open(companyTTLPath, "w", encoding="utf-8") as fp:
    fp.write(g.serialize(format="turtle"))

print("Saved company TTL file.")

Saved company TTL file.
