# Finance Knowledge Graph creation


In [93]:
# imports
import pandas as pd
from rdflib import Graph, URIRef, Namespace, Literal
from rdflib.namespace import FOAF, DCTERMS, XSD, RDF, SDO, RDFS, OWL
from urllib.parse import quote
import regex as re

In [94]:
# single graph that collects both the ontology (schema) and the instance data
g = Graph()

# --- namespaces ---
# project vocabulary: all classes and properties of the ontology live here
FINANCE = Namespace("https://w3id.org/finance/")

# instance namespaces that mirror Yahoo Finance URL paths
COMPANY = Namespace("https://finance.yahoo.com/quote/")
SECTOR = Namespace("https://finance.yahoo.com/sectors/")
INDUSTRY = Namespace("https://finance.yahoo.com/sectors/industry/")

# ETF pages, e.g. https://www.justetf.com/en/etf-profile.html?isin=
JUSTETF = Namespace("https://www.justetf.com/en/")

# schema.org Person; related terms: jobTitle (https://schema.org/jobTitle), name
# NOTE(review): this IRI has no trailing '/' or '#', so PERSON.<term> would be
# appended directly to "...Person" - confirm whether a class IRI was intended.
PERSON = Namespace("https://schema.org/Person")

# alternative vocabularies that were considered (currently not in use):
#   http://schema.org/ and https://schema.org/Organization
#   https://dbpedia.org/ontology/Company
#   https://dbpedia.org/ontology/industry
#   https://dbpedia.org/ontology/sector
#   https://dbpedia.org/page/Stock


# --- prefix bindings used when serializing (PERSON gets no prefix) ---
for prefix, bound_namespace in (
    ("finance", FINANCE),
    ("company", COMPANY),
    ("sector", SECTOR),
    ("industry", INDUSTRY),
    ("justetf", JUSTETF),
):
    g.bind(prefix, bound_namespace)


# https://schema.org/Person
# https://schema.org/InvestmentFund
# https://dbpedia.org/page/Stock
# dbo:industry https://dbpedia.org/ontology/industry

# https://schema.org/Organization

# Binding

In [95]:
# declare every ontology class as an rdfs:Class in the FINANCE vocabulary
ontology_classes = (
    FINANCE.Person,
    FINANCE.InvestmentFund,
    FINANCE.Company,
    FINANCE.Industry,
    FINANCE.Sector,
    FINANCE.Job,
    FINANCE.Address,
    FINANCE.ContactInformation,
    FINANCE.FinancialData,
)

for ontology_class in ontology_classes:
    g.add((ontology_class, RDF.type, RDFS.Class))


<Graph identifier=N782a8039f6ff4e90886a5fb08b59d13a (<class 'rdflib.graph.Graph'>)>

In [96]:
# bind properties

# name (used for investment funds and companies alike)
# RDFS interprets several rdfs:domain statements on ONE property as an
# intersection: every resource carrying finance:hasName would be inferred to be
# an instance of all listed classes at once. The property is therefore declared
# a single time here and no rdfs:domain is asserted for it in this cell.
g.add((FINANCE.hasName, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasName, RDFS.range, XSD.string))

# TODO: model "company is in investment fund"

# company: literal-valued properties, all with range xsd:string
#   hasSymbol  - ticker symbol
#   hasWebsite - url / sameAs website of the company
for company_literal_property in (FINANCE.hasSymbol, FINANCE.hasWebsite):
    g.add((company_literal_property, RDF.type, OWL.DatatypeProperty))
    g.add((company_literal_property, RDFS.domain, FINANCE.Company))
    g.add((company_literal_property, RDFS.range, XSD.string))

# company: object properties as (property, range class) pairs
# hasEmployee follows the idea of https://schema.org/employee (person in company)
company_object_properties = (
    (FINANCE.hasEmployee, FINANCE.Person),
    (FINANCE.hasIndustry, FINANCE.Industry),
    (FINANCE.hasAddress, FINANCE.Address),
    (FINANCE.hasContactInformation, FINANCE.ContactInformation),
    (FINANCE.hasFinancialData, FINANCE.FinancialData),
)
for company_object_property, range_class in company_object_properties:
    g.add((company_object_property, RDF.type, OWL.ObjectProperty))
    g.add((company_object_property, RDFS.domain, FINANCE.Company))
    g.add((company_object_property, RDFS.range, range_class))

# TODO: hasJob as an owl:ObjectProperty (FINANCE.Job is declared but unused so far)

<Graph identifier=N782a8039f6ff4e90886a5fb08b59d13a (<class 'rdflib.graph.Graph'>)>

In [ ]:
# industry
## name
# finance:hasName is also used for other classes. RDFS treats multiple
# rdfs:domain statements on one property as an intersection (a named resource
# would be inferred to belong to every listed class), so no additional
# rdfs:domain is asserted for industries; only type and range are (re)stated.
g.add((FINANCE.hasName, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasName, RDFS.range, XSD.string))

In [97]:
# industry to sector: object property linking an Industry to its Sector
for predicate, value in (
    (RDF.type, OWL.ObjectProperty),
    (RDFS.domain, FINANCE.Industry),
    (RDFS.range, FINANCE.Sector),
):
    g.add((FINANCE.belongsToSector, predicate, value))

<Graph identifier=N782a8039f6ff4e90886a5fb08b59d13a (<class 'rdflib.graph.Graph'>)>

In [98]:
# person
## name
# finance:hasName is also used for other classes. RDFS treats multiple
# rdfs:domain statements on one property as an intersection (a named resource
# would be inferred to belong to every listed class), so no additional
# rdfs:domain is asserted for persons; only type and range are (re)stated.
g.add((FINANCE.hasName, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasName, RDFS.range, XSD.string))

## person-specific datatype properties as (property, literal datatype) pairs
person_properties = (
    (FINANCE.hasJobTitle, XSD.string),          # job title
    (FINANCE.hasAge, XSD.integer),              # age
    (FINANCE.hasYearBorn, XSD.integer),         # yearborn
    (FINANCE.hasTotalPay, XSD.float),           # totalPay
    (FINANCE.hasExercisedValue, XSD.float),     # exercisedValue
    (FINANCE.hasUnexercisedValue, XSD.float),   # unexercisedValue
)
for person_property, literal_datatype in person_properties:
    g.add((person_property, RDF.type, OWL.DatatypeProperty))
    g.add((person_property, RDFS.domain, FINANCE.Person))
    g.add((person_property, RDFS.range, literal_datatype))

<Graph identifier=N782a8039f6ff4e90886a5fb08b59d13a (<class 'rdflib.graph.Graph'>)>

In [99]:
# address
# every address part is a string-valued datatype property on FINANCE.Address
address_properties = (
    FINANCE.hasStreet,      # street
    FINANCE.hasCity,        # city
    FINANCE.hasPostalCode,  # postal code
    FINANCE.hasCountry,     # country
    FINANCE.hasState,       # state
)
for address_property in address_properties:
    g.add((address_property, RDF.type, OWL.DatatypeProperty))
    g.add((address_property, RDFS.domain, FINANCE.Address))
    g.add((address_property, RDFS.range, XSD.string))

<Graph identifier=N782a8039f6ff4e90886a5fb08b59d13a (<class 'rdflib.graph.Graph'>)>

In [100]:
# contact information
## phone: string-valued datatype property on FINANCE.ContactInformation
#TODO put to company
for predicate, value in (
    (RDF.type, OWL.DatatypeProperty),
    (RDFS.domain, FINANCE.ContactInformation),
    (RDFS.range, XSD.string),
):
    g.add((FINANCE.hasPhone, predicate, value))




<Graph identifier=N782a8039f6ff4e90886a5fb08b59d13a (<class 'rdflib.graph.Graph'>)>

In [101]:
# save the ontology as Turtle (at this point the graph holds only the schema triples added above)
g.serialize(format='turtle', destination='data/ontology.ttl')

<Graph identifier=N782a8039f6ff4e90886a5fb08b59d13a (<class 'rdflib.graph.Graph'>)>

In [102]:
# financial data


In [103]:
# insert data
# company

# insert data

In [104]:
import yfinance as yf
import pandas as pd

In [105]:
def is_valid_ticker(ticker: "yf.Ticker") -> bool:
    """
    Check whether a ticker object refers to a symbol that has real info data.

    A ticker is reported as invalid when
    - fetching its info raises (the error is printed), or
    - the info is empty, or
    - the info is only the placeholder ``{'trailingPegRatio': None}``.

    :param ticker: ticker object exposing ``get_info()`` (a ``yf.Ticker``)
    :return: True if the info looks like real data, False otherwise
    """
    # info payload that was previously detected via its string representation
    placeholder_info = {'trailingPegRatio': None}
    try:
        info = ticker.get_info()
        # compare the mapping itself instead of its repr; empty info carries no data either
        if not info or info == placeholder_info:
            return False
        return True
    except Exception as e:
        # If an exception is raised, the ticker is invalid
        print(e)
        return False

In [106]:
# invalid ticker test: probe the validity check with a made-up symbol
bogus_ticker = yf.Ticker('ASDASDASd')
is_valid_ticker(bogus_ticker)


404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/ASDASDASD?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=ASDASDASD&crumb=BNLUo1Q1nAD


False

In [107]:
def grep_ticker_data(tickers: pd.Series, max_tickers: "int | None" = 2):
    """
    Accumulate stock metadata from yahoo finance for the given ticker symbols.

    Each symbol is wrapped in a ``yf.Ticker``; symbols rejected by
    ``is_valid_ticker`` are collected and reported via ``print``, all others
    contribute ``pd.DataFrame(ticker.get_info())`` to the result.

    :param tickers: column of ticker symbols
    :param max_tickers: stop after this many valid tickers were collected.
        Defaults to 2, the limit that was previously hard-coded for development;
        pass ``None`` to process every symbol.
    :return: DataFrame with the concatenated per-ticker info frames
        (an empty DataFrame if nothing was collected)
    """
    invalid_tickers = []
    ticker_frames = []

    for ticker_value in tickers:
        # checked before any lookup, so no request is made once the limit is reached
        if max_tickers is not None and len(ticker_frames) >= max_tickers:
            break

        ticker = yf.Ticker(str(ticker_value))

        if not is_valid_ticker(ticker):
            invalid_tickers.append(ticker_value)
            continue

        # NOTE(review): pd.DataFrame(<dict>) needs at least one list-like value to
        # derive rows and raises ValueError for an all-scalar dict - confirm the
        # info payload always contains one and that one row per list entry is intended.
        ticker_frames.append(pd.DataFrame(ticker.get_info()))

    print(f"a total of {len(invalid_tickers)} invalid tickers. \n"
          f"{invalid_tickers}")

    # pd.concat raises on an empty list, so keep returning an empty frame in that case
    if not ticker_frames:
        return pd.DataFrame()

    # concatenate once instead of re-copying the growing frame on every iteration
    return pd.concat(ticker_frames)

In [108]:
# load the sector/industry table: one column per sector, the cells of a column are its industries
# NOTE(review): the rendered output shows an 'Unnamed: 0' header - if that is a real
# column of the sheet (and not just the displayed index), the loop over the columns
# further down would treat it as a sector; confirm against the Excel file.
sectors_industries_df = pd.read_excel(io='data/sectors-industries.xlsx')
sectors_industries_df

Unnamed: 0,Technology,Financial Services,Healthcare,Consumer Cyclical,Industrials,Communication Services,Consumer Defensive,Energy,Basic Materials,Real Estate,Utilities
0,Software - Infrastructure,Bank - Diversified,Drug Manufacturers - General,Internet Retail,Specialty Industrial Machinery,Internet Content & Information,Discount Stores,Oil & Gas Integrated,Specialty Chemicals,REIT - Specialty,Utilities - Regulated Electric
1,Semiconductors,Credit Services,Healthcare Plans,Auto Manufacturers,Aerospace & Defense,Telecom Services,Beverages - Non-Alcoholic,Oil & Gas E&P,Gold,REIT - Industrial,Utilities - Renewable
2,Consumer Electronics,Asset Management,Medical Devices,Restaurants,Railroads,Entertainment,Household & Personal Products,Oil & Gas Midstream,Building Materials,REIT - Retail,Utilities - Diversified
3,Software - Application,Banks - Regional,Biotechnology,Home Improvement Retail,Farm & Heavy Construction Machinery,Electronic Gaming & Multimedia,Packaged Foods,Oil & Gas Refining & Marketing,Copper,REIT - Residential,Utilities - Regulated Gas
4,Information Technology Services,Capital Markets,Diagnostics & Research,Travel Services,Building Products & Equipment,Advertising Agencies,Tobacco,Oil & Gas Equipment & Services,Steel,Real Estate Services,Utilities - Regulated Water
5,Semiconductor Equipment & Materials,Insurance - Diversified,Medical Instruments & Supplies,Specialty Retail,Specialty Business Services,Broadcasting,Confectioners,Uranium,Agricultural Inputs,REIT - Healthcare Facilities,Utilities - Independent Power Producers
6,Computer Hardware,Financial Data & Stock Exchanges,Medical Care Facilities,Apparel Retail,Integrated Freight & Logistics,Publishing,Beverages - Wineries & Distilleries,Oil & Gas Drilling,Chemicals,REIT - Office,
7,Communication Equipment,Insurance Brokers,Drug Manufacturers - Specialty & Generic,Residential Construction,Waste Management,,Farm Products,Thermal Coal,Other Industrial Metals & Mining,REIT - Diversified,
8,Electronic Components,Insurance - Property & Casualty,Medical Distribution,Footwear & Accessories,Industrial Distribution,,Food Distribution,,Lumber & Wood Production,REIT - Mortgage,
9,Scientific & Technical Instruments,Insurance - Life,Health Information Services,Auto Parts,Conglomerates,,Grocery Stores,,Coking Coal,REIT - Hotel & Motel,


In [109]:
def prep_uri(namespace: Namespace, clazz: str):
    """
    Build a URI inside ``namespace`` from a human-readable label.

    The label is lower-cased, every '&' and '-' is removed, surrounding
    whitespace is dropped and each remaining run of whitespace becomes a single
    '-', e.g. "Oil & Gas E&P" -> "oil-gas-ep".

    :param namespace: namespace that is indexed with the generated slug
    :param clazz: label (e.g. a sector or industry name) to turn into a slug
    :return: ``namespace[slug]``
    """
    # remove special chars like & and - from the label
    cleaned = clazz.replace('&', '').replace('-', '').lower()
    # strip AFTER removing the characters, so a label with surrounding blanks or a
    # leading/trailing '&' or '-' does not end up with a stray hyphen in the slug
    slug = re.sub(r'\s+', '-', cleaned.strip())
    return namespace[slug]

In [110]:
# iterate through sectors_industries_df column by column:
# the column header is the sector, the non-empty cells below it are its industries
# (variable names are kept because they stay visible to later notebook cells)
for sector in sectors_industries_df.columns:

    sector_uri = prep_uri(SECTOR, sector)
    sector_label = Literal(sector, datatype=XSD.string)

    g.add((sector_uri, RDF.type, FINANCE.Sector))
    g.add((sector_uri, RDFS.label, sector_label))

    # industries are placed one path level below their sector URI
    sector_namespace = Namespace(sector_uri + '/')

    # dropna() skips the padding cells of sectors that have fewer industries
    for industry in sectors_industries_df[sector].dropna():
        industry_uri = prep_uri(sector_namespace, industry)

        for predicate, value in (
            (RDF.type, FINANCE.Industry),
            (RDFS.label, Literal(industry, datatype=XSD.string)),
            (FINANCE.belongsToSector, sector_uri),
        ):
            g.add((industry_uri, predicate, value))

In [111]:
# save the knowledge graph (ontology plus the sector/industry instances) as Turtle
g.serialize(format='turtle', destination='data/finance-kg.ttl')

<Graph identifier=N782a8039f6ff4e90886a5fb08b59d13a (<class 'rdflib.graph.Graph'>)>