# Finance Knowledge Graph creation


In [1]:
# imports
import pandas as pd
from rdflib import Graph, URIRef, Namespace, Literal
from rdflib.namespace import FOAF, DCTERMS, XSD, RDF, SDO, RDFS, OWL
from urllib.parse import quote

In [2]:
g = Graph()

# namespaces
FINANCE = Namespace("https://w3id.org/finance/")

COMPANY = Namespace("https://finance.yahoo.com/quote/")
SECTOR = Namespace("https://finance.yahoo.com/sectors/")
INDUSTRY = Namespace("https://finance.yahoo.com/sectors/industry/")

JUSTETF = Namespace("https://www.justetf.com/en/")
# https://www.justetf.com/en/etf-profile.html?isin=

PERSON = Namespace("https://schema.org/person/")  # jobTitle (https://schema.org/jobTitle), name

#schema = Namespace("http://schema.org/")
#organization = Namespace("https://schema.org/Organization")

#company = Namespace("https://dbpedia.org/ontology/Company")
#industry = Namespace("https://dbpedia.org/ontology/industry")
#sector = Namespace("https://dbpedia.org/ontology/sector")
# https://dbpedia.org/page/Stock


# bind prefixes
g.bind("finance", FINANCE)
g.bind("company", COMPANY)
g.bind("sector", SECTOR)
g.bind("industry", INDUSTRY)
g.bind("justetf", JUSTETF)


# https://schema.org/Person
# https://schema.org/InvestmentFund
# https://dbpedia.org/page/Stock
# dbo:industy https://dbpedia.org/ontology/industry 

# https://schema.org/Organization

# Binding

In [3]:
# bind classes

g.add((FINANCE.Person, RDF.type, RDFS.Class))
g.add((FINANCE.InvestmentFund, RDF.type, RDFS.Class))
g.add((FINANCE.Company, RDF.type, RDFS.Class))
g.add((FINANCE.Industry, RDF.type, RDFS.Class))
g.add((FINANCE.Sector, RDF.type, RDFS.Class))
g.add((FINANCE.Job, RDF.type, RDFS.Class))
g.add((FINANCE.Address, RDF.type, RDFS.Class))
g.add((FINANCE.ContactInformation, RDF.type, RDFS.Class))
g.add((FINANCE.FinancialData, RDF.type, RDFS.Class))


<Graph identifier=N6b7fb8da321241adb2bf6bb22a103fb9 (<class 'rdflib.graph.Graph'>)>

In [4]:
# bind properties

# investmendfund
## name
g.add((FINANCE.hasName, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasName, RDFS.domain, FINANCE.InvestmentFund))
g.add((FINANCE.hasName, RDFS.range, XSD.string))

## company is in investment fund


# company
## name
g.add((FINANCE.hasName, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasName, RDFS.domain, FINANCE.Company))
g.add((FINANCE.hasName, RDFS.range, XSD.string))

## symbol
g.add((FINANCE.hasTicker, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasTicker, RDFS.domain, FINANCE.Company))
g.add((FINANCE.hasTicker, RDFS.range, XSD.string))

# use schema.org employee as property for person in company
#https://schema.org/employee
## employee
g.add((FINANCE.hasKeyExecutive, RDF.type, OWL.ObjectProperty))
g.add((FINANCE.hasKeyExecutive, RDFS.domain, FINANCE.Company))
g.add((FINANCE.hasKeyExecutive, RDFS.range, FINANCE.Person))

## url or sameAs website for the company
g.add((FINANCE.hasWebsite, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasWebsite, RDFS.domain, FINANCE.Company))
g.add((FINANCE.hasWebsite, RDFS.range, XSD.string))

## industry
g.add((FINANCE.hasIndustry, RDF.type, OWL.ObjectProperty))
g.add((FINANCE.hasIndustry, RDFS.domain, FINANCE.Company))
g.add((FINANCE.hasIndustry, RDFS.range, FINANCE.Industry))

# address
g.add((FINANCE.hasAddress, RDF.type, OWL.ObjectProperty))
g.add((FINANCE.hasAddress, RDFS.domain, FINANCE.Company))
g.add((FINANCE.hasAddress, RDFS.range, FINANCE.Address))

# contact information
g.add((FINANCE.hasContactInformation, RDF.type, OWL.ObjectProperty))
g.add((FINANCE.hasContactInformation, RDFS.domain, FINANCE.Company))
g.add((FINANCE.hasContactInformation, RDFS.range, FINANCE.ContactInformation))

# financial data
g.add((FINANCE.hasFinancialData, RDF.type, OWL.ObjectProperty))
g.add((FINANCE.hasFinancialData, RDFS.domain, FINANCE.Company))
g.add((FINANCE.hasFinancialData, RDFS.range, FINANCE.FinancialData))



#g.add((finance.hasJob, RDF.type, OWL.ObjectProperty))

<Graph identifier=N6b7fb8da321241adb2bf6bb22a103fb9 (<class 'rdflib.graph.Graph'>)>

In [5]:
# industry
## name
g.add((FINANCE.hasName, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasName, RDFS.domain, FINANCE.Industry))
g.add((FINANCE.hasName, RDFS.range, XSD.string))

<Graph identifier=N6b7fb8da321241adb2bf6bb22a103fb9 (<class 'rdflib.graph.Graph'>)>

In [6]:
# industry to sector
g.add((FINANCE.belongsToSector, RDF.type, OWL.ObjectProperty))
g.add((FINANCE.belongsToSector, RDFS.domain, FINANCE.Industry))
g.add((FINANCE.belongsToSector, RDFS.range, FINANCE.Sector))

<Graph identifier=N6b7fb8da321241adb2bf6bb22a103fb9 (<class 'rdflib.graph.Graph'>)>

In [7]:
# person
## name
g.add((FINANCE.hasName, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasName, RDFS.domain, FINANCE.Person))
g.add((FINANCE.hasName, RDFS.range, XSD.string))

## gender
g.add((FINANCE.hasGender, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasGender, RDFS.domain, FINANCE.Person))
g.add((FINANCE.hasGender, RDFS.range, XSD.string))

## job title
g.add((FINANCE.hasJobTitle, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasJobTitle, RDFS.domain, FINANCE.Person))
g.add((FINANCE.hasJobTitle, RDFS.range, XSD.string))

## age
g.add((FINANCE.hasAge, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasAge, RDFS.domain, FINANCE.Person))
g.add((FINANCE.hasAge, RDFS.range, XSD.integer))

## yearborn
g.add((FINANCE.hasYearBorn, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasYearBorn, RDFS.domain, FINANCE.Person))
g.add((FINANCE.hasYearBorn, RDFS.range, XSD.integer))

## totalPay
g.add((FINANCE.hasTotalPay, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasTotalPay, RDFS.domain, FINANCE.Person))
g.add((FINANCE.hasTotalPay, RDFS.range, XSD.float))

## exercisedValue
g.add((FINANCE.hasExercisedValue, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasExercisedValue, RDFS.domain, FINANCE.Person))
g.add((FINANCE.hasExercisedValue, RDFS.range, XSD.float))

## unexercisedValue
g.add((FINANCE.hasUnexercisedValue, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasUnexercisedValue, RDFS.domain, FINANCE.Person))
g.add((FINANCE.hasUnexercisedValue, RDFS.range, XSD.float))

<Graph identifier=N6b7fb8da321241adb2bf6bb22a103fb9 (<class 'rdflib.graph.Graph'>)>

In [8]:
# address
## street
g.add((FINANCE.hasStreet, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasStreet, RDFS.domain, FINANCE.Address))
g.add((FINANCE.hasStreet, RDFS.range, XSD.string))

## city
g.add((FINANCE.hasCity, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasCity, RDFS.domain, FINANCE.Address))
g.add((FINANCE.hasCity, RDFS.range, XSD.string))

## postal code
g.add((FINANCE.hasPostalCode, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasPostalCode, RDFS.domain, FINANCE.Address))
g.add((FINANCE.hasPostalCode, RDFS.range, XSD.string))

## country
g.add((FINANCE.hasCountry, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasCountry, RDFS.domain, FINANCE.Address))
g.add((FINANCE.hasCountry, RDFS.range, XSD.string))

## state
g.add((FINANCE.hasState, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasState, RDFS.domain, FINANCE.Address))
g.add((FINANCE.hasState, RDFS.range, XSD.string))

<Graph identifier=N6b7fb8da321241adb2bf6bb22a103fb9 (<class 'rdflib.graph.Graph'>)>

In [9]:
# contact information
## phone
#TODO put to company
g.add((FINANCE.hasPhone, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasPhone, RDFS.domain, FINANCE.ContactInformation))
g.add((FINANCE.hasPhone, RDFS.range, XSD.string))




<Graph identifier=N6b7fb8da321241adb2bf6bb22a103fb9 (<class 'rdflib.graph.Graph'>)>

In [10]:
# safe ontology
g.serialize(destination='data/ontology.ttl', format='turtle')

<Graph identifier=N6b7fb8da321241adb2bf6bb22a103fb9 (<class 'rdflib.graph.Graph'>)>

# insert data

In [11]:
import yfinance as yf
import pandas as pd
import regex as re

In [12]:
def prep_uri(namespace: Namespace, clazz: str):
    """
    This function prepares a URI for a given class. It removes special chars like & and -.
    After that it removes any amount of whitespaces and replaces them with a dash.
    :param namespace: 
    :param clazz: 
    :return: 
    """
    return namespace[quote(re.sub(r'\s+', '-', clazz.replace('&', '').replace('-', '').replace('.','').lower()))]

## industry and sector data

In [13]:
# read industry data
sectors_industries_df = pd.read_excel('data/sectors-industries.xlsx')
sectors_industries_df.head()

Unnamed: 0,Technology,Financial Services,Healthcare,Consumer Cyclical,Industrials,Communication Services,Consumer Defensive,Energy,Basic Materials,Real Estate,Utilities
0,Software - Infrastructure,Bank - Diversified,Drug Manufacturers - General,Internet Retail,Specialty Industrial Machinery,Internet Content & Information,Discount Stores,Oil & Gas Integrated,Specialty Chemicals,REIT - Specialty,Utilities - Regulated Electric
1,Semiconductors,Credit Services,Healthcare Plans,Auto Manufacturers,Aerospace & Defense,Telecom Services,Beverages - Non-Alcoholic,Oil & Gas E&P,Gold,REIT - Industrial,Utilities - Renewable
2,Consumer Electronics,Asset Management,Medical Devices,Restaurants,Railroads,Entertainment,Household & Personal Products,Oil & Gas Midstream,Building Materials,REIT - Retail,Utilities - Diversified
3,Software - Application,Banks - Regional,Biotechnology,Home Improvement Retail,Farm & Heavy Construction Machinery,Electronic Gaming & Multimedia,Packaged Foods,Oil & Gas Refining & Marketing,Copper,REIT - Residential,Utilities - Regulated Gas
4,Information Technology Services,Capital Markets,Diagnostics & Research,Travel Services,Building Products & Equipment,Advertising Agencies,Tobacco,Oil & Gas Equipment & Services,Steel,Real Estate Services,Utilities - Regulated Water


In [14]:
# create industry and sector triples
for sector in sectors_industries_df.columns:

    sector_uri = prep_uri(SECTOR, sector)

    g.add((sector_uri, RDF.type, FINANCE.Sector))
    g.add((sector_uri, RDFS.label, Literal(sector, datatype=XSD.string)))

    for industry in sectors_industries_df[sector]:
        if pd.isna(industry):
            continue

        industry_uri = prep_uri(Namespace(sector_uri + '/'), industry)

        g.add((industry_uri, RDF.type, FINANCE.Industry))
        g.add((industry_uri, RDFS.label, Literal(industry, datatype=XSD.string)))
        g.add((industry_uri, FINANCE.belongsToSector, sector_uri))

## company data

In [15]:
def is_valid_ticker(ticker: yf):
    """
    This function checks if a given ticker symbol is valid.
    :param ticker: 
    :return: boolean
    """
    try:
        if str(ticker.get_info()) == "{'trailingPegRatio': None}":
            return False
        return True
    except Exception as e:
        #print(e)
        return False

In [16]:
def grep_ticker_data(tickers: pd.Series):
    """
    This function accumulates relevant stock data from yahoo finance with the corresponding ticker symbol.
    :param tickers: column of ticker symbols
    :return: TODO
    """
    i = 0  # TODO remove
    invalid_tickers = []

    # get possible info data
    tickers_metadata_df = pd.DataFrame()  #columns=yf.Ticker('').get_fast_info())
    for ticker_value in tickers:
        ticker = yf.Ticker(str(ticker_value))

        if not is_valid_ticker(ticker):
            invalid_tickers.append(ticker_value)
            continue

        # deprecated
        # tickers_metadata_df.append(ticker.get_info(), ignore_index=True)

        #tickers_metadata_df = pd.concat([tickers_metadata_df, pd.DataFrame(ticker.get_info())])
        
        handle_info_data(ticker)
        
        # ticker.get_info()  is a dict get its keys
        #print(ticker.get_info().keys())

        #tickers_metadata_df.loc[str(ticker_value)] = new_row_dict
        # for info in ticker.get_fast_info():
        #     #info_data_dict = yf.Ticker(str(info)).get_info()
        #     tickers_metadata_df.append(info, ignore_index=True)

        # TODO remove
        # i = i + 1
        # if i == 2:
        #     break

    print(f"a total of {len(invalid_tickers)} invalid tickers. \n"
          f"{invalid_tickers}")

    return tickers_metadata_df

In [17]:
def handle_info_data(ticker: yf.Ticker):
    """
    This function handles the info data of a given ticker.
    :param ticker: 
    :return: 
    """
    ## handle persons
    info = ticker.get_info()
    # if shortname is not available, skip and print shortname
    if 'shortName' not in info:
        print(f"shortName not available for {ticker}")
        return
    company_uri = prep_uri(COMPANY, info['shortName'])
    
    g.add((company_uri, RDF.type, FINANCE.Company))
    g.add((company_uri, FINANCE.hasName, Literal(info['shortName'], datatype=XSD.string)))
    g.add((company_uri, FINANCE.hasTicker, Literal(info['symbol'], datatype=XSD.string),))
    g.add((company_uri, FINANCE.hasWebsite, Literal(info['website'], datatype=XSD.string)))
    
    
    
    for key in info.keys():
        if key == 'companyOfficers':
            for employee in info['companyOfficers']:
                name = re.sub(r'Mr. |Mrs. |Ms. ', '', employee['name'])
            
                person_uri = prep_uri(PERSON, name.lower().replace(' ', '-'))
                #print(person_uri)
                g.add((person_uri, RDF.type, FINANCE.Person))
                
                # remove Mr. and Mrs. from name
                g.add((person_uri, FINANCE.hasName, Literal(name, datatype=XSD.string)))
                # filter gender
                if 'Mr.' in employee['name']:
                    g.add((person_uri, FINANCE.hasGender, Literal('m', datatype=XSD.string)))
                if 'Mrs.' in employee['name'] or 'Ms.' in employee['name']:
                    g.add((person_uri, FINANCE.hasGender, Literal('w', datatype=XSD.string)))
                if 'title' in employee:
                    g.add((person_uri, FINANCE.hasJobTitle, Literal(employee['title'], datatype=XSD.string)))
                if 'totalPay' in employee:
                    g.add((person_uri, FINANCE.hasTotalPay, Literal(employee['totalPay'], datatype=XSD.float)))
                if 'exercisedValue' in employee:
                    g.add((person_uri, FINANCE.hasExercisedValue, Literal(employee['exercisedValue'], datatype=XSD.float)))
                if 'unexercisedValue' in employee:
                    g.add((person_uri, FINANCE.hasUnexercisedValue, Literal(employee['unexercisedValue'], datatype=XSD.float)))
                if 'yearBorn' in employee:
                    g.add((person_uri, FINANCE.hasYearBorn, Literal(employee['yearBorn'], datatype=XSD.integer)))
                if 'age' in employee:
                    g.add((person_uri, FINANCE.hasAge, Literal(employee['age'], datatype=XSD.integer)))
                # add employee to company
                g.add((company_uri, FINANCE.hasKeyExecutive, person_uri))
        
        # address
        # address_uri = prep_uri(FINANCE.Address, info['shortName'])
        # g.add((address_uri, RDF.type, FINANCE.Address))
        # if key == 'address1':
        #     g.add((address_uri, FINANCE.hasStreet, Literal(info['address1'], datatype=XSD.string)))
        # if key == 'city':
        #     g.add((address_uri, FINANCE.hasCity, Literal(info['city'], datatype=XSD.string)))
        # if key == 'zip':
        #     g.add((address_uri, FINANCE.hasPostalCode, Literal(info['zip'], datatype=XSD.string)))
        # if key == 'country':
        #     g.add((address_uri, FINANCE.hasCountry, Literal(info['country'], datatype=XSD.string)))
        # if key == 'state':
        #     g.add((address_uri, FINANCE.hasState, Literal(info['state'], datatype=XSD.string)))
        
        
    pass

In [18]:
# read companies in the msci world
companies_df = pd.read_excel('data/msci-world-updated.xlsx')

In [19]:
yf.Ticker('NONOF').get_info()

{'address1': 'Novo Alle 1',
 'city': 'Bagsvaerd',
 'zip': '2880',
 'country': 'Denmark',
 'phone': '45 44 44 88 88',
 'website': 'https://www.novonordisk.com',
 'industry': 'Biotechnology',
 'sector': 'Healthcare',
 'longBusinessSummary': 'Novo Nordisk A/S, together with its subsidiaries, engages in the research and development, manufacture, and distribution of pharmaceutical products in Europe, the Middle East, Africa, Mainland China, Hong Kong, Taiwan, North America, and internationally. It operates in two segments, Diabetes and Obesity Care, and Rare Disease. The Diabetes and Obesity care segment provides products for diabetes, obesity, cardiovascular, and other emerging therapy areas. The Rare Disease segment offers products in the areas of rare blood disorders, rare endocrine disorders, and hormone replacement therapy. The company also provides insulin pens, growth hormone pens, and injection needles. In addition, it offers smart solutions for diabetes treatment, such as smart ins

In [20]:
yf.Ticker('AAPL').get_info()

{'address1': 'One Apple Park Way',
 'city': 'Cupertino',
 'state': 'CA',
 'zip': '95014',
 'country': 'United States',
 'phone': '408 996 1010',
 'website': 'https://www.apple.com',
 'industry': 'Consumer Electronics',
 'sector': 'Technology',
 'longBusinessSummary': 'Apple Inc. designs, manufactures, and markets smartphones, personal computers, tablets, wearables, and accessories worldwide. The company offers iPhone, a line of smartphones; Mac, a line of personal computers; iPad, a line of multi-purpose tablets; and wearables, home, and accessories comprising AirPods, Apple TV, Apple Watch, Beats products, and HomePod. It also provides AppleCare support and cloud services; and operates various platforms, including the App Store that allow customers to discover and download applications and digital content, such as books, music, video, games, and podcasts. In addition, the company offers various services, such as Apple Arcade, a game subscription service; Apple Fitness+, a personalized

In [21]:
grep_ticker_data(companies_df['Ticker'])

404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/BRK/B?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=BRK%2FB&crumb=BNLUo1Q1nAD
404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/XTSLA?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=XTSLA&crumb=BNLUo1Q1nAD
404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/BF/B?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=BF%2FB&crumb=BNLUo1Q1nAD
404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/HEI/A?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=fin

shortName not available for yfinance.Ticker object <CRSLF>
shortName not available for yfinance.Ticker object <ASRRF>


404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/UHAL/B?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=UHAL%2FB&crumb=BNLUo1Q1nAD


shortName not available for yfinance.Ticker object <KDXRF>
shortName not available for yfinance.Ticker object <LTSSF>
a total of 5 invalid tickers. 
['BRK/B', 'XTSLA', 'BF/B', 'HEI/A', 'UHAL/B']


managed to reduce from around 500 invalid tickers to less than 10 because of fetching tickers from a different API.

In [22]:
# write code an api request to get the data

In [23]:
# safe kg
g.serialize(destination='data/finance-kg.ttl', format='turtle')

<Graph identifier=N6b7fb8da321241adb2bf6bb22a103fb9 (<class 'rdflib.graph.Graph'>)>