# HowLongToBeat RDF Creator

We load the generated CSV files and we serialize all the data into ***turtle format  (TTL)*** relying on ***RDFLib*** Python library.

## Setup

We import all the necessary libraries and we set the paths to the input/output files. In particular, we create a TTL file for each type of data.

In [50]:
# Imports
import os
from datetime import datetime
from pathlib import Path

import pandas as pd
# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# RDFLib knows about some namespaces, like XSD
from rdflib.namespace import XSD

In [51]:
absPath = str(Path(os.path.abspath(os.getcwd())).absolute())
datasetsPath = os.path.join(absPath, "cleaned_datasets")
rawdatasetsPath = os.path.join(absPath, "raw_datasets")
rdfPath = os.path.join(absPath, "rdf")

# Create dataset directory if not exists
if not os.path.exists(datasetsPath):
    os.mkdir(datasetsPath)

# Create RDF directory if not exists
if not os.path.exists(rdfPath):
    os.mkdir(rdfPath)

# Setup datasets paths
gamesPath = os.path.join(datasetsPath, "games_cleaned.csv")
vgchartzPath = os.path.join(datasetsPath, "vgchartz_cleaned.csv")
indiegamesdevelopersPath = os.path.join(datasetsPath, "indiegamesdevelopers_cleaned_seriesExplode.csv")
platformsPath = os.path.join(datasetsPath, "platforms.csv")
videoGameDevelopersPath = os.path.join(datasetsPath, "videogamesdevelopers_cleaned_seriesexplode.csv")
completionTimePath = os.path.join(datasetsPath, "completion_time.csv")

# Setup raw datasets
rawVGChartsPath = os.path.join(rawdatasetsPath, "vgchartz-7_7_2020.csv")
rawGamesPath = os.path.join(rawdatasetsPath, "games.csv")
rawCountriesRegionsPath = os.path.join(rawdatasetsPath,'countries-regions.csv')

# Countries-Regions path
countriesRegionsPath = os.path.join(datasetsPath, "countries-regions.csv")
countriesPath = os.path.join(absPath, "wikipedia-iso-country-codes.csv")

# Setup Turtle paths
genresTTLPath = os.path.join(rdfPath, "genres.ttl")
gamesTTLPath = os.path.join(rdfPath, "games.ttl")
companyTTLPath = os.path.join(rdfPath, "company.ttl")
platformsTTLPath = os.path.join(rdfPath, "platforms.ttl")
platformsSalesTTLPath = os.path.join(rdfPath, "platformsSales.ttl")

videoGameDevelopersTTLPath = os.path.join(rdfPath, "videoGameDevelopers.ttl")
statsTTLPath = os.path.join(rdfPath, "stats.ttl")
gameSalesTTLPath = os.path.join(rdfPath, "gameSales.ttl")
regionsTTLPath = os.path.join(rdfPath, "regions.ttl")
countriesTTLPath = os.path.join(rdfPath, "countries.ttl")


In [52]:
# Country Ontology
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")

# HLTB Ontology
HLTB = Namespace("http://www.semanticweb.org/enrico/ontologies/2022/10/HLTB-db2unipd#")

In [121]:
def createGraph():
    # Create the graph
    g = Graph()

    # Bind the namespaces to a prefix for more readable output
    g.bind("xsd", XSD)
    g.bind("countries", CNS)
    g.bind("hltb", HLTB)

    return g


#create game URI
def createGameID(title):
    # Replace all special chars with "-"
    gameID = ""
    for char in title:
        if char.isalnum():
            gameID += char
        elif len(gameID) > 0 and gameID[-1] != '-':
            gameID += '-'
    if len(gameID) > 0 and gameID[-1] == '-':
        gameID = gameID[:-1]
    #print(gameID.lower())
    return gameID.lower()


#Create genre URI
def setGenreID(genre):  ##first half is the original genres, second half are processed and lowercase
    genre = str(genre).replace("/", ", ").replace("nan", "")
    genre = genre.split(", ")
    list = []
    for i in range(len(genre)):
        list.append([])
        list[i].append(genre[i])
        list[i].append(genre[i].lower().replace("'", "").replace(" ", "-"))
    return (list)


def setPlatformID(platform):
    return platform.lower().replace(" ", "-").replace("/", "-").replace("&", "-")


def setCompanyID(company):
    if ' ' in company:
        return company.replace(' ', '-').lower()
    return company.lower()


def setDeveloperID(developer):
    return developer.replace(' ', '-', regex=True).str.lower()


def setCountryID(country):
    return country.replace(' ', '-', regex=True).str.lower()


def getCountry2Digits(country):

    rawCountriesRegions = pd.read_csv(rawCountriesRegionsPath, sep=",", index_col='name')
    if country in rawCountriesRegions.index:
        return rawCountriesRegions.iloc[rawCountriesRegions.index==country]['alpha-2']
        #return rawCountriesRegions[rawCountriesRegions.index==country]['alpha-2'].values
    return ''

        #return str(rawCountriesRegions[rawCountriesRegions.index==country]['alpha-2']).lower()

    #if country.lower() == 'us' or country.lower() == 'united states of america':
    #    return 'us'
    #if country.lower() == 'South Korea':
    #    return 'kr'

## Serialization

We serialize the data according to the following workflow:

1. Load the CSV file and iterate through it
2. Create a unique ID by ourself based on the name of the class.
3. Add the node to the graph using the unique ID.
4. Add all the data properties.
5. Add all the object properties.
6. Serialize the data and save them into a TTL file.

### Games

Now serializing the Game class

In [54]:
# Create Graph
g = createGraph()

In [55]:
# Load the CSV files in memory
games = pd.read_csv(gamesPath, sep=",", index_col="title")
vgchartz = pd.read_csv(vgchartzPath, sep=",")
platforms = pd.read_csv(platformsPath, sep=",")

merged = pd.merge(vgchartz, platforms, left_on='console', right_on='Acronym', how='left')

In [56]:
# Iterate over the games
for title, row in games.iterrows():
    # Create gameID from its title
    gameID = createGameID(title)

    # Create the node to add to the Graph
    Game = URIRef(HLTB[gameID])

    # Add triples using store's add() method.
    g.add((Game, RDF.type, HLTB.Game))

    # Add the title of the game
    g.add((Game, HLTB["officialName"], Literal(title, datatype=XSD.string)))

    # Add multiplayer focus
    g.add((Game, HLTB["multiplayerFocus"],
           Literal(pd.notnull(row["coop"]) or pd.notnull(row["versus"]), datatype=XSD.boolean)))

    #Add hltb id
    g.add((Game, HLTB["id"], Literal(row["id"], datatype=XSD.int)))

    #Add hasGenre object property
    for iterator in setGenreID(row["genres"]):
        if pd.notnull(iterator[1]) and iterator[1] != '':
            g.add((Game, HLTB["hasGenre"], URIRef(HLTB[iterator[1]])))

    #Add platform availability
    if pd.notna(row["platforms"]):
        for platform in row["platforms"].split(", "):
            g.add((Game, HLTB["releasedOn"], URIRef(HLTB[setPlatformID(platform)])))

            #Add Stats object property
            g.add((Game, HLTB["hasStats"],
                   URIRef(HLTB["stats-" + str(createGameID(title)) + "___" + str(setPlatformID(platform))])))

            # Add Sales object property
            game = merged.loc[(merged['title'] == title) & (merged['Platform'] == platform)]
            if not game.empty:
                if pd.notna(game["pal_sales"].iloc[0]):
                    GameSalesID = URIRef(
                        HLTB["sales-" + str(createGameID(title)) + "___" + str(setPlatformID(platform)) + "___" + "eu"])
                    g.add((Game, HLTB["sold"], GameSalesID))
                if pd.notna(game["na_sales"].iloc[0]):
                    GameSalesID = URIRef(
                        HLTB["sales-" + str(createGameID(title)) + "___" + str(setPlatformID(platform)) + "___" + "na"])
                    g.add((Game, HLTB["sold"], GameSalesID))
                if pd.notna(game["jp_sales"].iloc[0]):
                    GameSalesID = URIRef(
                        HLTB["sales-" + str(createGameID(title)) + "___" + str(setPlatformID(platform)) + "___" + "jp"])
                    g.add((Game, HLTB["sold"], GameSalesID))
                if pd.notna(game["other_sales"].iloc[0]):
                    GameSalesID = URIRef(HLTB["sales-" + str(createGameID(title)) + "___" + str(
                        setPlatformID(platform)) + "___" + "other"])
                    g.add((Game, HLTB["sold"], GameSalesID))
                if pd.notna(game["total_shipped"].iloc[0]):
                    GameSalesID = URIRef(HLTB["sales-" + str(createGameID(title)) + "___" + str(
                        setPlatformID(platform)) + "___" + "global"])
                    g.add((Game, HLTB["sold"], GameSalesID))

In [57]:
# Save the data in the Turtle format
with open(gamesTTLPath, "w", encoding="utf-8") as fp:
    fp.write(g.serialize(format="turtle"))

print("Saved games TTL file.")

Saved games TTL file.


### Genre

Now serializing the Genre class

In [58]:
# Create Graph
g = createGraph()

# Load the CSV files in memory
genres = pd.read_csv(gamesPath, sep=",", index_col="genres")

In [59]:
for genre, row in genres.iterrows():
    allGenres = setGenreID(genre)
    for iterator in allGenres:
        if not (iterator[0] == " " or iterator[0] == ""):
            Genre = URIRef(HLTB[iterator[1]])
            #Add triples using store's add() method.
            g.add((Genre, RDF.type, HLTB.Genre))
            # Add the name of the genre
            g.add((Genre, HLTB["name"], Literal(iterator[0], datatype=XSD.string)))

In [60]:
# Save genre data in the Turtle format
with open(genresTTLPath, "w", encoding="utf-8") as fp:
    fp.write(g.serialize(format="turtle"))

### Platforms

In [61]:
# Create Graph
g = createGraph()
# Load the CSV files in memory
platforms = pd.read_csv(platformsPath, sep=",", index_col="Platform")

In [62]:
manufacturerDict = {}

for platform, row in platforms.iterrows():
    Platform = URIRef(HLTB[setPlatformID(platform)])
    #Add triples using store's add() method.
    g.add((Platform, RDF.type, HLTB.Platform))
    # Add the name of the genre
    g.add((Platform, HLTB["name"], Literal(platform, datatype=XSD.string)))

    #Add popularity if platform is popular
    if row["Popular"]:
        g.add((Platform, HLTB["popular"], Literal(True, datatype=XSD.boolean)))

    #Add release date if present
    if pd.notna(row["Release date"]):
        time = datetime.combine(datetime.strptime(row["Release date"], '%Y-%M-%d'), datetime.min.time())
        g.add((Platform, HLTB["releaseDate"], Literal(time, datatype=XSD.dateTime)))

    #Add CPU information
    if pd.notna(row["CPU"]):
        g.add((Platform, HLTB["cpu"], Literal(row["CPU"], datatype=XSD.string)))

    #Add CPU bit information
    if pd.notna(row["\"Bits\""]):
        bits = row["\"Bits\""].split("-")[0]
        g.add((Platform, HLTB["bits"], Literal(bits, datatype=XSD.int)))

    #Add acronym information
    if pd.notna(row["Acronym"]):
        g.add((Platform, HLTB["acronym"], Literal(row["Acronym"], datatype=XSD.string)))

    # add the manufacturer name
    if pd.notna(row["Manufacturer"]) or pd.notnull(row["Manufacturer"]):
        manufacturerStr = row["Manufacturer"].strip()
        if ',' in manufacturerStr:
            manufacturerStr = manufacturerStr.replace(',', '/')
        manufacturerSplit = manufacturerStr.split('/')
        for elem in manufacturerSplit:
            elem = elem.strip()
            manufacturerName = ''
            manufacturerCountry = ''
            if '(' in elem:
                elemSplit = elem.split('(')
                manufacturerName = elemSplit[0].strip()
                manufacturerCountry = elemSplit[1].strip()
                if not manufacturerName[-1].isalnum():
                    manufacturerName = manufacturerName[:-1]
                if manufacturerCountry[-1] == ')':
                    manufacturerCountry = manufacturerCountry[:-1]
                manufacturerCountry = ''.join(c for c in manufacturerCountry if c.isalnum() or c == ' ')
            else:
                manufacturerName = elem
                if manufacturerName == 'Panasonic' or manufacturerName == 'Sega':
                    manufacturerCountry = 'Japan'

            manufacturerDict[manufacturerName] = manufacturerCountry
            #print(manufacturerCountry)
            if manufacturerName != '':
                manufacturer = URIRef(HLTB[setCompanyID(manufacturerName)])
                g.add((Platform, HLTB["createdBy"], manufacturer))

In [63]:
# Save data in the Turtle format
with open(platformsTTLPath, "w", encoding="utf-8") as fp:
    fp.write(g.serialize(format="turtle"))

### Stats


In [64]:
g = createGraph()
games = pd.read_csv(gamesPath, sep=",")
vgchartz = pd.read_csv(vgchartzPath, sep=",")
completionTime = pd.read_csv(completionTimePath, sep=",")
platforms = pd.read_csv(platformsPath, sep=",")

merged_temp = pd.merge(games, completionTime, left_on='id',
                       right_on='gameID')
merged_temp1 = pd.merge(vgchartz, platforms, left_on='console', right_on='Acronym', how='left')
merged = pd.merge(merged_temp, merged_temp1, left_on=["title", "platform"], right_on=["title", "Platform"], how='left')

In [65]:
for id, row in merged.iterrows():
    if pd.notna(row["title"]) and pd.notna(row["platform"]):
        StatsID = URIRef(HLTB[createGameID("stats-" + str(createGameID(row["title"]))) + "___" + str(
            setPlatformID(row["platform"]))])

        #Adding node type
        g.add((StatsID, HLTB.Type, HLTB.Stats))

        #Add Time information
        g.add((StatsID, HLTB["polledTime"], Literal(row["count_comp"], datatype=XSD.int)))
        g.add((StatsID, HLTB["mainTime"], Literal(row["comp_main"], datatype=XSD.int)))
        g.add((StatsID, HLTB["mainPlusTime"], Literal(row["comp_plus"], datatype=XSD.int)))
        g.add((StatsID, HLTB["completionistTime"], Literal(row["comp_100"], datatype=XSD.int)))
        g.add((StatsID, HLTB["slowestTime"], Literal(row["comp_low"], datatype=XSD.int)))
        g.add((StatsID, HLTB["fastestTime"], Literal(row["comp_high"], datatype=XSD.int)))

        #Add remaining stats
        if pd.notna(row["critic_score"]):
            g.add((StatsID, HLTB["criticScore"], Literal(row["critic_score"], datatype=XSD.float)))
        if pd.notna(row["user_score"]):
            g.add((StatsID, HLTB["userScore"], Literal(row["user_score"], datatype=XSD.float)))

        if pd.notna(row["release_date"]):
            time = datetime.combine(datetime.strptime(row["release_date"], '%Y-%M-%d'), datetime.min.time())
            g.add((StatsID, HLTB["releaseDate"], Literal(time, datatype=XSD.dateTime)))

        #Add "onPlatform" object property
        g.add((StatsID, HLTB["onPlatform"], URIRef(HLTB[setPlatformID(row["platform"])])))

In [66]:

# Save the data in the Turtle format
with open(statsTTLPath, "w", encoding="utf-8") as fp:
    fp.write(g.serialize(format="turtle"))

print("Saved company TTL file.")

Saved company TTL file.


### Game sales

In [67]:
g = createGraph()
vgchartz = pd.read_csv(vgchartzPath, sep=",")
platforms = pd.read_csv(platformsPath, sep=",")

merged = pd.merge(vgchartz, platforms, left_on='console', right_on='Acronym', how='left')
merged.to_csv("prova.csv")

In [68]:
for id, row in merged.iterrows():
    if pd.notna(row["Platform"]):

        if pd.notna(row["pal_sales"]):
            GameSalesID = URIRef(HLTB["sales-" + str(createGameID(row["title"])) + "___" + str(
                setPlatformID(row["Platform"])) + "___" + "eu"])
            g.add((GameSalesID, HLTB.Type, HLTB.Sale))
            g.add((GameSalesID, HLTB["unitsSold"], Literal(row["pal_sales"], datatype=XSD.float)))
            g.add((GameSalesID, HLTB["locatedIn"], URIRef(HLTB["eu"])))
            g.add((GameSalesID, HLTB["onPlatform"], URIRef(HLTB[setPlatformID(row["Platform"])])))

        if pd.notna(row["jp_sales"]):
            GameSalesID = URIRef(HLTB["sales-" + str(createGameID(row["title"])) + "___" + str(
                setPlatformID(row["Platform"])) + "___" + "jp"])
            g.add((GameSalesID, HLTB.Type, HLTB.Sale))
            g.add((GameSalesID, HLTB["unitsSold"], Literal(row["jp_sales"], datatype=XSD.float)))
            g.add((GameSalesID, HLTB["locatedIn"], URIRef(HLTB["jp"])))
            g.add((GameSalesID, HLTB["onPlatform"], URIRef(HLTB[setPlatformID(row["Platform"])])))

        if pd.notna(row["na_sales"]):
            GameSalesID = URIRef(HLTB["sales-" + str(createGameID(row["title"])) + "___" + str(
                setPlatformID(row["Platform"])) + "___" + "na"])
            g.add((GameSalesID, HLTB.Type, HLTB.Sale))
            g.add((GameSalesID, HLTB["unitsSold"], Literal(row["na_sales"], datatype=XSD.float)))
            g.add((GameSalesID, HLTB["locatedIn"], URIRef(HLTB["na"])))
            g.add((GameSalesID, HLTB["onPlatform"], URIRef(HLTB[setPlatformID(row["Platform"])])))

        if pd.notna(row["other_sales"]):
            GameSalesID = URIRef(HLTB["sales-" + str(createGameID(row["title"])) + "___" + str(
                setPlatformID(row["Platform"])) + "___" + "other"])
            g.add((GameSalesID, HLTB.Type, HLTB.Sale))
            g.add((GameSalesID, HLTB["unitsSold"], Literal(row["other_sales"], datatype=XSD.float)))
            g.add((GameSalesID, HLTB["locatedIn"], URIRef(HLTB["other"])))
            g.add((GameSalesID, HLTB["onPlatform"], URIRef(HLTB[setPlatformID(row["Platform"])])))

        if pd.notna(row["total_shipped"]):
            GameSalesID = URIRef(HLTB["sales-" + str(createGameID(row["title"])) + "___" + str(
                setPlatformID(row["Platform"])) + "___" + "global"])
            g.add((GameSalesID, HLTB.Type, HLTB.Sale))
            g.add((GameSalesID, HLTB["unitsSold"], Literal(row["total_shipped"], datatype=XSD.float)))
            g.add((GameSalesID, HLTB["locatedIn"], URIRef(HLTB["global"])))
            g.add((GameSalesID, HLTB["onPlatform"], URIRef(HLTB[setPlatformID(row["Platform"])])))

In [69]:
# Save the data in the Turtle format
with open(gameSalesTTLPath, "w", encoding="utf-8") as fp:
    fp.write(g.serialize(format="turtle"))

print("Saved game-sales TTL file.")

Saved game-sales TTL file.


### Company

In [70]:
# Create Graph
g = createGraph()
# Load the CSV files in memory

indiegamesdevelopers = pd.read_csv(indiegamesdevelopersPath, sep=",")

In [71]:
#companies=[]
#replace space with -, lower case
indiegamesdevelopers['companyID'] = setDeveloperID(indiegamesdevelopers['Developer'])
#indiegamesdevelopers['companyID'] = setCompanyID(indiegamesdevelopers['Developer'])


#indiegamesdevelopers.info()
for index, row in indiegamesdevelopers.iterrows():
    # Create the node to add to the Graph
    Company = URIRef(HLTB[row['companyID']])
    #companies.append(Company)

    # Add triples using store's add() method.
    g.add((Company, RDF.type, HLTB.Company))

    # Add the Company
    g.add((Company, HLTB["indieDeveloper"], Literal(pd.notnull(row['Developer']), datatype=XSD.boolean)))
    g.add((Company, HLTB['officialName'], Literal(row['Developer'], datatype=XSD.string)))


### video-game-developers

In [72]:
videoGameDevelopers = pd.read_csv(videoGameDevelopersPath, sep=",")

In [73]:
#replace space with -, lower case
videoGameDevelopers['companyID'] = setDeveloperID(videoGameDevelopers['Developer'])
#videoGameDevelopers['companyID'] = setCompanyID(videoGameDevelopers['Developer'])


#indiegamesdevelopers.info()
for index, row in videoGameDevelopers.iterrows():
    # Create the node to add to the Graph
    Company = URIRef(HLTB[row['companyID']])
    #companies.append(Company)

    # Add triples using store's add() method.
    g.add((Company, RDF.type, HLTB.Company))

    # Add the Company
    g.add((Company, HLTB['officialName'], Literal(row['Developer'], datatype=XSD.string)))


In [74]:
for manufacturer, country in manufacturerDict.items():
    manufacturerURI = URIRef(HLTB[setCompanyID(manufacturer)])
    g.add((manufacturerURI, RDF.type, HLTB.Company))
    g.add((manufacturerURI, HLTB['officialName'], Literal(manufacturer, datatype=XSD.string)))
    # add country of that manufacturer
    print(country)
    #g.add((manufacturerURI, HLTB['basedIn'], )))

Japan
Japan
South Korea
Canada
US
US
US
Japan
Japan
US
Japan
US
Netherlands
Japan
Brazil
Japan
US
US


In [75]:
# Save the data in the Turtle format
with open(companyTTLPath, "w", encoding="utf-8") as fp:
    fp.write(g.serialize(format="turtle"))

print("Saved company TTL file.")

Saved company TTL file.


Region

In [76]:
# Create Graph
g = createGraph()

In [77]:
# Create the node to add to the Graph
Region = URIRef(HLTB['eu'])
g.add((Region, RDF.type, HLTB.Region))
g.add((Region, HLTB[{'eu', 'jp', 'na', 'other', 'global'}], Literal('eu', datatype=XSD.string)))

Region = URIRef(HLTB['jp'])
g.add((Region, RDF.type, HLTB.Region))
g.add((Region, HLTB[{'eu', 'jp', 'na', 'other', 'global'}], Literal('jp', datatype=XSD.string)))

Region = URIRef(HLTB['na'])
g.add((Region, RDF.type, HLTB.Region))
g.add((Region, HLTB[{'eu', 'jp', 'na', 'other', 'global'}], Literal('na', datatype=XSD.string)))

Region = URIRef(HLTB['other'])
g.add((Region, RDF.type, HLTB.Region))
g.add((Region, HLTB[{'eu', 'jp', 'na', 'other', 'global'}], Literal('other', datatype=XSD.string)))

Region = URIRef(HLTB['global'])
g.add((Region, RDF.type, HLTB.Region))
g.add((Region, HLTB[{'eu', 'jp', 'na', 'other', 'global'}], Literal('global', datatype=XSD.string)))


<Graph identifier=N98c25f759f0947e0bb07665ebaeea017 (<class 'rdflib.graph.Graph'>)>

In [78]:
# Save the data in the Turtle format
with open(regionsTTLPath, "w", encoding="utf-8") as fp:
    fp.write(g.serialize(format="turtle"))

print("Saved Regions TTL file.")

Saved Regions TTL file.


Country

In [122]:
for val in manufacturerDict.values():
    print(val, getCountry2Digits(val))

Japan name
Japan    JP
Name: alpha-2, dtype: object
Japan name
Japan    JP
Name: alpha-2, dtype: object
South Korea 
Canada name
Canada    CA
Name: alpha-2, dtype: object
US 
US 
US 
Japan name
Japan    JP
Name: alpha-2, dtype: object
Japan name
Japan    JP
Name: alpha-2, dtype: object
US 
Japan name
Japan    JP
Name: alpha-2, dtype: object
US 
Netherlands name
Netherlands    NL
Name: alpha-2, dtype: object
Japan name
Japan    JP
Name: alpha-2, dtype: object
Brazil name
Brazil    BR
Name: alpha-2, dtype: object
Japan name
Japan    JP
Name: alpha-2, dtype: object
US 
US 


In [80]:
countries = pd.read_csv(countriesPath, sep=',', index_col='Name', keep_default_na=False, na_values=['_'])

rawCountriesRegions = pd.read_csv(rawCountriesRegionsPath, sep=",", index_col='name')

g = createGraph()

In [81]:
#use rawCountriesRegions to get all countries and save ttl file



count1 = 0
count2 = 0
countriesNotFound=list()
for index, row in countries.iterrows():  # iterate countriesregions
    countryName = index.strip()
    #print(countryName)

    if countryName.lower() not in rawCountriesRegions.index.str.lower():
        count1 += 1
        countriesNotFound.append(countryName)

    for elem in rawCountriesRegions.index:
        elem = elem.strip()
        if elem.lower() == countryName.lower():
            count2 += 1
        #else:
        #if countryName not in countriesNotFound:
        #countriesNotFound.append(countryName)


    #if (countryName == countriesregions.index).any():

    #if countryName not in countriesregions.index:
    #print(countryName)
#for elem in countriesregions.index:
#        if elem == countryName:
#            print(elem, '__aaaaaaa')
#else:
#    res = countriesregions.index.str.contains(countryName)
#    print(countryName, '____ciao')

#search inside countriesregions
#for countriesregionsName in countriesregions.index:
#if countryName in countriesregionsName:
#print(countryName, ' ____')
#print(' ')
#else:
#print(countryName)
#print((countriesregions.index==countryName).any())
print(countriesNotFound)
print('Fist count: ', count1)
print('Second count: ', count2)

['Bolivia', 'Cape Verde', 'Congo, the Democratic Republic of the', 'Czech Republic', 'Holy See (Vatican City State)', 'Iran', "Korea, Democratic People's Republic of", 'Korea, Republic of (South Korea)', 'Libyan Arab Jamahiriya', 'Macedonia, the former Yugoslav Republic of', 'Micronesia, Federated States of', 'Netherlands Antilles', 'Palestinian Territory, Occupied', 'Reunion', 'Russia', 'Saint Martin', 'Swaziland', 'Taiwan', 'Turks and Caicos', 'United Kingdom', 'Venezuela', 'Vietnam', 'Virgin Islands, British', 'Virgin Islands, U.S.']
Fist count:  24
Second count:  222


In [None]:
#countriesregions['countryID'] = setCountryID(countriesregions['Country'])

#countriesregions.info()
for index, row in rawCountriesRegions.iterrows():

    for c in countries.index:
        cName = c.strip()

        if (countries.index == cName).any():
            code = str(countries[countries.index == cName]['Alpha-2 code'][0]).lower()
            #Country = URIRef(CNS[row['countryID']])
            #g.add((Country, RDF.type, HLTB.Country))
            #g.add((Country, HLTB['subClassOf'], Literal(row['Region'], datatype=XSD.string)))

        g.add((HLTB[code], HLTB['subClassOf'], HLTB[row['Region']]))

In [None]:
# Load the CSV files in memory
countries = pd.read_csv(countriesPath, sep=',', index_col='Name', keep_default_na=False, na_values=['_'])
countriesregions = pd.read_csv(countriesRegionsPath, sep=",", index_col='Country', names=['Country', 'Region'])

rawCountriesRegions = pd.read_csv(rawCountriesRegionsPath, sep=",", index_col='name', names=['Country', 'Region'])

g = createGraph()

In [None]:
countriesregions['countryID'] = setCountryID(countriesregions['Country'])

#countriesregions.info()
for index, row in countriesregions.iterrows():

    for c in countries.index:
        cName = c.strip()

        if (countries.index == cName).any():
            code = str(countries[countries.index == cName]['Alpha-2 code'][0]).lower()
            #Country = URIRef(CNS[row['countryID']])
            #g.add((Country, RDF.type, HLTB.Country))
            #g.add((Country, HLTB['subClassOf'], Literal(row['Region'], datatype=XSD.string)))

        g.add((HLTB[code], HLTB['subClassOf'], HLTB[row['Region']]))

In [None]:
count1 = 0
count2 = 0
countriesNotFound=list()
for index, row in countries.iterrows():  # iterate countriesregions
    countryName = index.strip()
    #print(countryName)

    if countryName.lower() not in countriesregions.index.str.lower():
        count1 += 1
        countriesNotFound.append(countryName)

    for elem in countriesregions.index:
        elem = elem.strip()
        if elem.lower() == countryName.lower():
            count2 += 1
        #else:
            #if countryName not in countriesNotFound:
                #countriesNotFound.append(countryName)


    #if (countryName == countriesregions.index).any():

    #if countryName not in countriesregions.index:
        #print(countryName)
#for elem in countriesregions.index:
#        if elem == countryName:
#            print(elem, '__aaaaaaa')
#else:
#    res = countriesregions.index.str.contains(countryName)
#    print(countryName, '____ciao')

#search inside countriesregions
#for countriesregionsName in countriesregions.index:
#if countryName in countriesregionsName:
#print(countryName, ' ____')
#print(' ')
#else:
#print(countryName)
#print((countriesregions.index==countryName).any())
print(countriesNotFound)
print('Fist count: ', count1)
print('Second count: ', count2)

In [None]:
for index, row in countries.iterrows():  # iterate countriesregions
    countryName = index.strip()
    #print(countryName)
    if (countryName == countriesregions.index).any():
        code = str(countries[countryName == countriesregions.index]['Alpha-2 code']).lower()
        #print(code)
        #print(code)
        #Country = URIRef(CNS[row['countryID']])
        #g.add((Country, RDF.type, HLTB.Country))
        g.add((HLTB[code], HLTB['subClassOf'], HLTB[row['Region']]))
    else:
        print('ciao')

In [None]:
countriesregions['countryID'] = setCountryID(countriesregions['Country'])
#print(countries.index)
#countriesregions.info()
for index, row in countriesregions.iterrows():  # iterate countriesregions
    countryName = row['Country'].strip()
    #print(countryName)
    if countryName in countries.index:
        code = str(countries[countries.index == countryName]['Alpha-2 code'][0]).lower()
        print(code)
        #print(code)
        #Country = URIRef(CNS[row['countryID']])
        #g.add((Country, RDF.type, HLTB.Country))
        g.add((HLTB[code], HLTB['subClassOf'], HLTB[row['Region']]))

In [None]:
for c in countryName.split(','):  # iterate country
    #print(index, c)
    cName = c.strip()
    #print(cName)
    countryRegExToFind = '*[a-zA-Z].* .' + cName + '.* .*[a-zA-Z]'
    #countryRegExToFind=countryRegExToFind.join(cName)
    #countryRegExToFind=countryRegExToFind.join('.]')
    #print(countryRegExToFind)
    if cName in countries.index:
        code = str(countries[countries.index == cName]['Alpha-2 code'][0]).lower()
        print(code)
        #print(code)
        #Country = URIRef(CNS[row['countryID']])
        #g.add((Country, RDF.type, HLTB.Country))
        g.add((HLTB[code], HLTB['subClassOf'], HLTB[row['Region']]))

In [None]:
countriesregions['countryID'] = setCountryID(countriesregions['Country'])

#countriesregions.info()
for index, row in countriesregions.iterrows():

    for c in str(row['Country']).split(','):
        cName = c.strip()

        if (countries.index == cName).any():
            code = str(countries[countries.index == cName]['Alpha-2 code'][0]).lower()
            Country = URIRef(CNS[row['countryID']])
            g.add((Country, RDF.type, HLTB.Country))
            g.add((Country, HLTB['subClassOf'], Literal(row['Region'], datatype=XSD.string)))

In [None]:
# Save the data in the Turtle format
with open(countriesTTLPath, "w", encoding="utf-8") as fp:
    fp.write(g.serialize(format="turtle"))

print("Saved Regions TTL file.")

### Platform sales

In [None]:
g = createGraph()
platforms = pd.read_csv(platformsPath, sep=",")


In [None]:
for id, row in platforms.iterrows():
    if pd.notna(row["Platform"]):

        if pd.notna(row["Europe"]):
            PlatformSalesID = URIRef(HLTB["sales-" + str(setPlatformID(row["Platform"])) + "___" + "eu"])
            g.add((PlatformSalesID, HLTB.Type, HLTB.Sale))
            g.add((PlatformSalesID, HLTB["unitsSold"], Literal(row["Europe"], datatype=XSD.float)))
            g.add((PlatformSalesID, HLTB["locatedIn"], URIRef(HLTB["eu"])))
            g.add((PlatformSalesID, HLTB["onPlatform"], URIRef(HLTB[setPlatformID(row["Platform"])])))

        if pd.notna(row["Japan"]):
            PlatformSalesID = URIRef(HLTB["sales-" + str(setPlatformID(row["Platform"])) + "___" + "jp"])

            g.add((PlatformSalesID, HLTB.Type, HLTB.Sale))
            g.add((PlatformSalesID, HLTB["unitsSold"], Literal(row["Japan"], datatype=XSD.float)))
            g.add((PlatformSalesID, HLTB["locatedIn"], URIRef(HLTB["jp"])))
            g.add((PlatformSalesID, HLTB["onPlatform"], URIRef(HLTB[setPlatformID(row["Platform"])])))

        if pd.notna(row["North America"]):
            PlatformSalesID = URIRef(HLTB["sales-" + str(setPlatformID(row["Platform"])) + "___" + "na"])

            g.add((PlatformSalesID, HLTB.Type, HLTB.Sale))
            g.add((PlatformSalesID, HLTB["unitsSold"], Literal(row["North America"], datatype=XSD.float)))
            g.add((PlatformSalesID, HLTB["locatedIn"], URIRef(HLTB["na"])))
            g.add((PlatformSalesID, HLTB["onPlatform"], URIRef(HLTB[setPlatformID(row["Platform"])])))

        if pd.notna(row["Rest of World"]):
            PlatformSalesID = URIRef(HLTB["sales-" + str(setPlatformID(row["Platform"])) + "___" + "other"])

            g.add((PlatformSalesID, HLTB.Type, HLTB.Sale))
            g.add((PlatformSalesID, HLTB["unitsSold"], Literal(row["Rest of World"], datatype=XSD.float)))
            g.add((PlatformSalesID, HLTB["locatedIn"], URIRef(HLTB["other"])))
            g.add((PlatformSalesID, HLTB["onPlatform"], URIRef(HLTB[setPlatformID(row["Platform"])])))

        if pd.notna(row["Global"]):
            PlatformSalesID = URIRef(HLTB["sales-" + str(setPlatformID(row["Platform"])) + "___" + "global"])
            g.add((PlatformSalesID, HLTB.Type, HLTB.Sale))
            g.add((PlatformSalesID, HLTB["unitsSold"], Literal(row["Global"], datatype=XSD.float)))
            g.add((PlatformSalesID, HLTB["locatedIn"], URIRef(HLTB["global"])))
            g.add((PlatformSalesID, HLTB["onPlatform"], URIRef(HLTB[setPlatformID(row["Platform"])])))

In [None]:
# Save the data in the Turtle format
with open(platformsSalesTTLPath, "w", encoding="utf-8") as fp:
    fp.write(g.serialize(format="turtle"))

print("Saved platforms Sales TTL file.")