# Finance Knowledge Graph creation


In [1]:
# imports
import pandas as pd
from rdflib import Graph, URIRef, Namespace, Literal
from rdflib.namespace import FOAF, DCTERMS, XSD, RDF, SDO, RDFS, OWL
from urllib.parse import quote
import numpy as np

In [2]:
g = Graph()

# namespaces
# ontology terms (classes and properties) live directly under FINANCE
FINANCE = Namespace("https://w3id.org/finance/")

# instance namespaces for companies and sectors
COMPANY = Namespace("https://finance.yahoo.com/quote/")
SECTOR = Namespace("https://finance.yahoo.com/sectors/")
#INDUSTRY = Namespace("https://finance.yahoo.com/sectors/industry/")

ETF = Namespace("https://w3id.org/finance/etf/")
# https://www.justetf.com/en/etf-profile.html?isin=

PERSON = Namespace("https://schema.org/person/")  # jobTitle (https://schema.org/jobTitle), name

FINANCIALDATA = Namespace("https://w3id.org/finance/financialdata/")
QUARTERLYFIGURES = Namespace("https://w3id.org/finance/financialdata/quarterlyfigures/")  # quarterly figures

#schema = Namespace("http://schema.org/")
#organization = Namespace("https://schema.org/Organization")

#company = Namespace("https://dbpedia.org/ontology/Company")
#industry = Namespace("https://dbpedia.org/ontology/industry")
#sector = Namespace("https://dbpedia.org/ontology/sector")
# https://dbpedia.org/page/Stock


# bind prefixes
g.bind("finance", FINANCE)
g.bind("company", COMPANY)
g.bind("sector", SECTOR)
#g.bind("industry", INDUSTRY)
g.bind("etf", ETF)
g.bind("person", PERSON)

# NOTE: the former "earningsdate" / "recommendation" bindings were dropped on purpose.
# They bound a prefix to a *class* IRI (FINANCE.EarningDate - misspelled, the class is
# FINANCE.EarningsDate - and FINANCE.Recommendation) instead of a namespace. Both classes
# are already reachable through the "finance" prefix.

g.bind("financialdata", FINANCIALDATA)
g.bind("quarterlyfigures", QUARTERLYFIGURES)

In [3]:


# https://schema.org/Person
# https://schema.org/InvestmentFund
# https://dbpedia.org/page/Stock
# dbo:industry https://dbpedia.org/ontology/industry

# https://schema.org/Organization

# Binding

## bind classes

In [4]:
# bind classes: every ontology class is declared as an rdfs:Class
for ontology_class in (
    FINANCE.Person,
    FINANCE.ETF,
    FINANCE.Company,
    FINANCE.Industry,
    FINANCE.Sector,
    FINANCE.Job,
    FINANCE.Address,
    FINANCE.ContactInformation,
    FINANCE.EarningsDate,
    FINANCE.Recommendation,
):
    g.add((ontology_class, RDF.type, RDFS.Class))
# g.add((FINANCE.FinancialData, RDF.type, RDFS.Class))
# g.add((FINANCE.QuarterlyFigures, RDF.type, RDFS.Class))


<Graph identifier=N569ccb52709b4607834e33db72a24389 (<class 'rdflib.graph.Graph'>)>

# bind properties
## investment fund

In [5]:
# bind properties

# etf
## name, ticker and currency are string-valued datatype properties of an ETF
for etf_string_property in (FINANCE.hasName, FINANCE.hasTicker, FINANCE.hasCurrency):
    g.add((etf_string_property, RDF.type, OWL.DatatypeProperty))
    g.add((etf_string_property, RDFS.domain, FINANCE.ETF))
    g.add((etf_string_property, RDFS.range, XSD.string))

# company
## object property pointing from an ETF to a company
for predicate, value in (
    (RDF.type, OWL.ObjectProperty),
    (RDFS.domain, FINANCE.ETF),
    (RDFS.range, FINANCE.Company),
):
    g.add((FINANCE.hasCompany, predicate, value))

<Graph identifier=N569ccb52709b4607834e33db72a24389 (<class 'rdflib.graph.Graph'>)>

## company

In [6]:
# company
## datatype properties of a company, paired with the XSD type of their values
## (name, ticker symbol, website url, full time employees, currency)
for company_property, value_type in (
    (FINANCE.hasName, XSD.string),
    (FINANCE.hasTicker, XSD.string),
    (FINANCE.hasWebsite, XSD.string),
    (FINANCE.hasFullTimeEmployees, XSD.integer),
    (FINANCE.hasCurrency, XSD.string),
):
    g.add((company_property, RDF.type, OWL.DatatypeProperty))
    g.add((company_property, RDFS.domain, FINANCE.Company))
    g.add((company_property, RDFS.range, value_type))

## object properties of a company, paired with the class they point to
## (key executives, industry, earnings dates, recommendations, address)
# use schema.org employee as property for person in company
#https://schema.org/employee
for company_relation, target_class in (
    (FINANCE.hasKeyExecutive, FINANCE.Person),
    (FINANCE.hasIndustry, FINANCE.Industry),
    (FINANCE.hasEarningsDate, FINANCE.EarningsDate),
    (FINANCE.hasRecommendation, FINANCE.Recommendation),
    (FINANCE.hasAddress, FINANCE.Address),  # TODO
):
    g.add((company_relation, RDF.type, OWL.ObjectProperty))
    g.add((company_relation, RDFS.domain, FINANCE.Company))
    g.add((company_relation, RDFS.range, target_class))

# # contact information
# # TODO
# g.add((FINANCE.hasContactInformation, RDF.type, OWL.ObjectProperty))
# g.add((FINANCE.hasContactInformation, RDFS.domain, FINANCE.Company))
# g.add((FINANCE.hasContactInformation, RDFS.range, FINANCE.ContactInformation))

# financial data
# g.add((FINANCE.hasFinancialData, RDF.type, OWL.ObjectProperty))
# g.add((FINANCE.hasFinancialData, RDFS.domain, FINANCE.Company))
# g.add((FINANCE.hasFinancialData, RDFS.range, FINANCE.FinancialData))

#g.add((finance.hasJob, RDF.type, OWL.ObjectProperty))

<Graph identifier=N569ccb52709b4607834e33db72a24389 (<class 'rdflib.graph.Graph'>)>

## earnings date

In [7]:
# earnings date has  ['EPS Estimate', 'Reported EPS', 'Surprise(%)'] plus the date itself
for earnings_property, value_type in (
    (FINANCE.hasEstimatedEPS, XSD.float),  # EPS Estimate
    (FINANCE.hasReportedEPS, XSD.float),   # Reported EPS
    (FINANCE.hasSurprise, XSD.float),      # Surprise(%)
    (FINANCE.hasDate, XSD.date),           # date
):
    g.add((earnings_property, RDF.type, OWL.DatatypeProperty))
    g.add((earnings_property, RDFS.domain, FINANCE.EarningsDate))
    g.add((earnings_property, RDFS.range, value_type))

<Graph identifier=N569ccb52709b4607834e33db72a24389 (<class 'rdflib.graph.Graph'>)>

## recommendation

In [8]:
# recommendation

# date
g.add((FINANCE.hasDate, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasDate, RDFS.domain, FINANCE.Recommendation))
g.add((FINANCE.hasDate, RDFS.range, XSD.date))

# integer counters: strong buy, buy, hold, sell, strong sell
for count_property in (
    FINANCE.hasStrongBuyCount,
    FINANCE.hasBuyCount,
    FINANCE.hasHoldCount,
    FINANCE.hasSellCount,
    FINANCE.hasStrongSellCount,
):
    g.add((count_property, RDF.type, OWL.DatatypeProperty))
    g.add((count_property, RDFS.domain, FINANCE.Recommendation))
    g.add((count_property, RDFS.range, XSD.integer))

<Graph identifier=N569ccb52709b4607834e33db72a24389 (<class 'rdflib.graph.Graph'>)>

## financial data

In [9]:
# earnings date class has to be created



In [10]:
# quarterly figures
# g.add((FINANCE.hasQuarterlyFigures, RDF.type, OWL.ObjectProperty))
# g.add((FINANCE.hasQuarterlyFigures, RDFS.domain, FINANCE.FinancialData))
# g.add((FINANCE.hasQuarterlyFigures, RDFS.range, FINANCE.QuarterlyFigures))



## quarterly figures

In [11]:
# # net income
# g.add((FINANCE.hasNetIncome, RDF.type, OWL.DatatypeProperty))
# g.add((FINANCE.hasNetIncome, RDFS.domain, FINANCE.QuarterlyFigures))
# g.add((FINANCE.hasNetIncome, RDFS.range, XSD.float))
# 
# # Net Income From Continuing Operations
# g.add((FINANCE.hasNetIncomeFromContinuingOperations, RDF.type, OWL.DatatypeProperty))
# g.add((FINANCE.hasNetIncomeFromContinuingOperations, RDFS.domain, FINANCE.QuarterlyFigures))
# g.add((FINANCE.hasNetIncomeFromContinuingOperations, RDFS.range, XSD.float))
# 
# # total revenue
# g.add((FINANCE.hasTotalRevenue, RDF.type, OWL.DatatypeProperty))
# g.add((FINANCE.hasTotalRevenue, RDFS.domain, FINANCE.QuarterlyFigures))
# g.add((FINANCE.hasTotalRevenue, RDFS.range, XSD.float))
# 
# # cost of revenue
# g.add((FINANCE.hasCostOfRevenue, RDF.type, OWL.DatatypeProperty))
# g.add((FINANCE.hasCostOfRevenue, RDFS.domain, FINANCE.QuarterlyFigures))
# g.add((FINANCE.hasCostOfRevenue, RDFS.range, XSD.float))
# 
# # gross profit
# g.add((FINANCE.hasGrossProfit, RDF.type, OWL.DatatypeProperty))
# g.add((FINANCE.hasGrossProfit, RDFS.domain, FINANCE.QuarterlyFigures))
# g.add((FINANCE.hasGrossProfit, RDFS.range, XSD.float))
# 
# # operating income
# g.add((FINANCE.hasOperatingIncome, RDF.type, OWL.DatatypeProperty))
# g.add((FINANCE.hasOperatingIncome, RDFS.domain, FINANCE.QuarterlyFigures))
# g.add((FINANCE.hasOperatingIncome, RDFS.range, XSD.float))
# 
# # Operating Expenses (including Research and Development, Selling General and Administration)
# g.add((FINANCE.hasOperatingExpense, RDF.type, OWL.DatatypeProperty))
# g.add((FINANCE.hasOperatingExpense, RDFS.domain, FINANCE.QuarterlyFigures))
# g.add((FINANCE.hasOperatingExpense, RDFS.range, XSD.float))
# 
# # pretax income
# g.add((FINANCE.hasPretaxIncome, RDF.type, OWL.DatatypeProperty))
# g.add((FINANCE.hasPretaxIncome, RDFS.domain, FINANCE.QuarterlyFigures))
# g.add((FINANCE.hasPretaxIncome, RDFS.range, XSD.float))
# 
# # net income available to common shareholders
# g.add((FINANCE.hasNetIncomeAvailableToCommonStockholders, RDF.type, OWL.DatatypeProperty))
# g.add((FINANCE.hasNetIncomeAvailableToCommonStockholders, RDFS.domain, FINANCE.QuarterlyFigures))
# g.add((FINANCE.hasNetIncomeAvailableToCommonStockholders, RDFS.range, XSD.float))
# 
# # Diluted EPS
# g.add((FINANCE.hasDilutedEPS, RDF.type, OWL.DatatypeProperty))
# g.add((FINANCE.hasDilutedEPS, RDFS.domain, FINANCE.QuarterlyFigures))
# g.add((FINANCE.hasDilutedEPS, RDFS.range, XSD.float))
# 
# # Basic EPS
# g.add((FINANCE.hasBasicEPS, RDF.type, OWL.DatatypeProperty))
# g.add((FINANCE.hasBasicEPS, RDFS.domain, FINANCE.QuarterlyFigures))
# g.add((FINANCE.hasBasicEPS, RDFS.range, XSD.float))
# 
# # diluted average shares
# g.add((FINANCE.hasDilutedAverageShares, RDF.type, OWL.DatatypeProperty))
# g.add((FINANCE.hasDilutedAverageShares, RDFS.domain, FINANCE.QuarterlyFigures))
# g.add((FINANCE.hasDilutedAverageShares, RDFS.range, XSD.float))
# 
# # basic average shares
# g.add((FINANCE.hasBasicAverageShares, RDF.type, OWL.DatatypeProperty))
# g.add((FINANCE.hasBasicAverageShares, RDFS.domain, FINANCE.QuarterlyFigures))
# g.add((FINANCE.hasBasicAverageShares, RDFS.range, XSD.float))
# 
# # date
# g.add((FINANCE.hasDate, RDF.type, OWL.DatatypeProperty))
# g.add((FINANCE.hasDate, RDFS.domain, FINANCE.QuarterlyFigures))
# g.add((FINANCE.hasDate, RDFS.range, XSD.date))

## industry

In [12]:
# industry
## name
# Declares FINANCE.hasName as a string-valued datatype property with FINANCE.Industry as a domain.
# NOTE(review): FINANCE.hasName is also given the rdfs:domain ETF, Company and Person in other
# cells of this notebook. RDFS reads multiple rdfs:domain statements conjunctively (a subject of
# the property is inferred to be an instance of every listed class) - confirm this is intended.
# NOTE(review): the sector/industry loading cell stores industry names as rdfs:label, not as
# FINANCE.hasName.
g.add((FINANCE.hasName, RDF.type, OWL.DatatypeProperty))
g.add((FINANCE.hasName, RDFS.domain, FINANCE.Industry))
g.add((FINANCE.hasName, RDFS.range, XSD.string))

<Graph identifier=N569ccb52709b4607834e33db72a24389 (<class 'rdflib.graph.Graph'>)>

## sector

In [13]:
# industry to sector
# Object property with FINANCE.Industry as domain and FINANCE.Sector as range.
g.add((FINANCE.belongsToSector, RDF.type, OWL.ObjectProperty))
g.add((FINANCE.belongsToSector, RDFS.domain, FINANCE.Industry))
g.add((FINANCE.belongsToSector, RDFS.range, FINANCE.Sector))

<Graph identifier=N569ccb52709b4607834e33db72a24389 (<class 'rdflib.graph.Graph'>)>

## person

In [14]:
# person
## datatype properties of a person, paired with the XSD type of their values
for person_property, value_type in (
    (FINANCE.hasName, XSD.string),              # name
    (FINANCE.hasGender, XSD.string),            # gender
    (FINANCE.hasJobTitle, XSD.string),          # job title
    (FINANCE.hasAge, XSD.integer),              # age
    (FINANCE.hasYearBorn, XSD.integer),         # yearborn
    (FINANCE.hasTotalPay, XSD.float),           # totalPay
    (FINANCE.hasExercisedValue, XSD.float),     # exercisedValue
    (FINANCE.hasUnexercisedValue, XSD.float),   # unexercisedValue
):
    g.add((person_property, RDF.type, OWL.DatatypeProperty))
    g.add((person_property, RDFS.domain, FINANCE.Person))
    g.add((person_property, RDFS.range, value_type))

<Graph identifier=N569ccb52709b4607834e33db72a24389 (<class 'rdflib.graph.Graph'>)>

In [15]:
# address
## street, city, postal code, country and state are all string-valued
for address_property in (
    FINANCE.hasStreet,
    FINANCE.hasCity,
    FINANCE.hasPostalCode,
    FINANCE.hasCountry,
    FINANCE.hasState,
):
    g.add((address_property, RDF.type, OWL.DatatypeProperty))
    g.add((address_property, RDFS.domain, FINANCE.Address))
    g.add((address_property, RDFS.range, XSD.string))

<Graph identifier=N569ccb52709b4607834e33db72a24389 (<class 'rdflib.graph.Graph'>)>

In [16]:
# contact information
## phone
# #TODO put to company
# g.add((FINANCE.hasPhone, RDF.type, OWL.DatatypeProperty))
# g.add((FINANCE.hasPhone, RDFS.domain, FINANCE.ContactInformation))
# g.add((FINANCE.hasPhone, RDFS.range, XSD.string))




In [17]:
# save ontology: write the graph as built so far (class and property declarations) to a Turtle file
g.serialize(destination='data/ontology.ttl', format='turtle')

<Graph identifier=N569ccb52709b4607834e33db72a24389 (<class 'rdflib.graph.Graph'>)>

# insert data

In [18]:
import yfinance as yf
import pandas as pd
import regex as re

In [19]:
def prep_uri(namespace: Namespace, clazz: str):
    """
    Build a URI inside ``namespace`` from a human-readable label.

    The characters ``&``, ``-`` and ``.`` are dropped, the label is lower-cased, every run of
    whitespace is collapsed into a single dash and the result is percent-encoded.
    :param namespace: namespace the slug is resolved in (via ``namespace[slug]``)
    :param clazz: label to convert, e.g. a sector or industry name
    :return: the value of ``namespace[slug]``
    """
    without_special_chars = clazz.translate(str.maketrans('', '', '&-.'))
    slug = re.sub(r'\s+', '-', without_special_chars.lower())
    return namespace[quote(slug)]

In [20]:
# insert etf data


## industry and sector data

In [21]:
# read industry data
# The sheet is consumed column-wise further down: every column header is treated as a sector
# and every non-empty cell of that column as one of its industries.
sectors_industries_df = pd.read_excel('data/sectors-industries.xlsx')
sectors_industries_df.head()

Unnamed: 0,Technology,Financial Services,Healthcare,Consumer Cyclical,Industrials,Communication Services,Consumer Defensive,Energy,Basic Materials,Real Estate,Utilities
0,Software - Infrastructure,Banks - Diversified,Drug Manufacturers - General,Internet Retail,Specialty Industrial Machinery,Internet Content & Information,Discount Stores,Oil & Gas Integrated,Specialty Chemicals,REIT - Specialty,Utilities - Regulated Electric
1,Semiconductors,Credit Services,Healthcare Plans,Auto Manufacturers,Aerospace & Defense,Telecom Services,Beverages - Non-Alcoholic,Oil & Gas E&P,Gold,REIT - Industrial,Utilities - Renewable
2,Consumer Electronics,Asset Management,Medical Devices,Restaurants,Railroads,Entertainment,Household & Personal Products,Oil & Gas Midstream,Building Materials,REIT - Retail,Utilities - Diversified
3,Software - Application,Banks - Regional,Biotechnology,Home Improvement Retail,Farm & Heavy Construction Machinery,Electronic Gaming & Multimedia,Packaged Foods,Oil & Gas Refining & Marketing,Copper,REIT - Residential,Utilities - Regulated Gas
4,Information Technology Services,Capital Markets,Diagnostics & Research,Travel Services,Building Products & Equipment,Advertising Agencies,Tobacco,Oil & Gas Equipment & Services,Steel,Real Estate Services,Utilities - Regulated Water


In [22]:
# create industry and sector triples

# (industry name, industry uri) records; converted into the lookup dataframe once after the
# loop instead of growing a dataframe row by row with pd.concat (quadratic, and concatenating
# with an empty frame is deprecated in recent pandas releases)
industry_records = []

for sector in sectors_industries_df.columns:

    sector_uri = prep_uri(SECTOR, sector)

    g.add((sector_uri, RDF.type, FINANCE.Sector))
    g.add((sector_uri, RDFS.label, Literal(sector, datatype=XSD.string)))

    # industry uris are nested below their sector: <sector_uri>/<industry-slug>
    industry_namespace = Namespace(sector_uri + '/')

    # columns have different lengths, so shorter ones are padded with NaN - skip those cells
    for industry in sectors_industries_df[sector].dropna():
        industry_uri = prep_uri(industry_namespace, industry)
        industry_records.append({'industry': industry, 'uri': industry_uri})

        g.add((industry_uri, RDF.type, FINANCE.Industry))
        g.add((industry_uri, RDFS.label, Literal(industry, datatype=XSD.string)))
        g.add((industry_uri, FINANCE.belongsToSector, sector_uri))

# lookup table used later to resolve a company's industry name to its uri
industry_uri_df = pd.DataFrame(industry_records, columns=['industry', 'uri'])

In [23]:
industry_uri_df

Unnamed: 0,industry,uri
0,Software - Infrastructure,https://finance.yahoo.com/sectors/technology/s...
1,Semiconductors,https://finance.yahoo.com/sectors/technology/s...
2,Consumer Electronics,https://finance.yahoo.com/sectors/technology/c...
3,Software - Application,https://finance.yahoo.com/sectors/technology/s...
4,Information Technology Services,https://finance.yahoo.com/sectors/technology/i...
...,...,...
140,Utilities - Renewable,https://finance.yahoo.com/sectors/utilities/ut...
141,Utilities - Diversified,https://finance.yahoo.com/sectors/utilities/ut...
142,Utilities - Regulated Gas,https://finance.yahoo.com/sectors/utilities/ut...
143,Utilities - Regulated Water,https://finance.yahoo.com/sectors/utilities/ut...


## company data

In [24]:
def is_valid_ticker(ticker: yf.Ticker):
    """
    This function checks if a given ticker symbol is valid.

    A ticker counts as invalid when its info lookup raises, returns nothing, or returns only
    the placeholder payload ``{'trailingPegRatio': None}``.
    :param ticker: ticker object exposing ``get_info()``
    :return: boolean
    """
    try:
        info = ticker.get_info()
    except Exception:
        # deliberate best effort: whatever the lookup raises, the symbol is treated as unusable
        return False
    # compare the payload itself instead of its string representation
    if not info or info == {'trailingPegRatio': None}:
        return False
    return True

In [25]:
def grep_ticker_data(tickers: pd.Series, etf_uri: str):
    """
    This function accumulates relevant stock data from yahoo finance with the corresponding ticker symbol.

    Every valid ticker is handed to ``handle_info_data`` together with ``etf_uri``; symbols that
    cannot be resolved are collected and printed at the end.
    :param tickers: column of ticker symbols
    :param etf_uri: uri of the ETF the tickers belong to (passed through to ``handle_info_data``)
    :return: an empty dataframe - no metadata is collected yet, the value is kept so existing
             callers keep working
    """
    invalid_tickers = []

    # placeholder result, see :return:
    tickers_metadata_df = pd.DataFrame()
    for ticker_value in tickers:
        # empty spreadsheet cells arrive as NaN; the '/' check below would fail on a non-string
        if pd.isna(ticker_value):
            continue
        symbol = str(ticker_value).strip()
        ticker = yf.Ticker(symbol)

        if not is_valid_ticker(ticker):
            if '/' not in symbol:
                invalid_tickers.append(ticker_value)
                continue
            # handling ticker like 'BRK/B', 'BF/B', 'HEI/A', 'UHAL/B': retry with a dash
            ticker = yf.Ticker(symbol.replace('/', '-'))
            if not is_valid_ticker(ticker):
                invalid_tickers.append(ticker_value)
                continue

        handle_info_data(ticker, etf_uri)

    print(f"a total of {len(invalid_tickers)} invalid tickers. \n"
          f"{invalid_tickers}")

    return tickers_metadata_df

In [26]:
def _add_company_earnings(ticker: yf.Ticker, company_uri: URIRef):
    """Add one FINANCE.EarningsDate node per row of ``ticker.earnings_dates`` and link it to the company."""
    # earnings column -> ontology property; all other columns are ignored
    eps_properties = {
        'EPS Estimate': FINANCE.hasEstimatedEPS,
        'Reported EPS': FINANCE.hasReportedEPS,
        'Surprise(%)': FINANCE.hasSurprise,
    }
    base_earnings_uri = company_uri + '/earnings/'
    try:
        earnings_df = ticker.earnings_dates
        if earnings_df is None:
            return
        for timestamp, row in earnings_df.iterrows():
            date = timestamp.date()
            earnings_uri = base_earnings_uri + quote(str(date))
            g.add((company_uri, FINANCE.hasEarningsDate, earnings_uri))
            # the class is declared as FINANCE.EarningsDate (with an "s")
            g.add((earnings_uri, RDF.type, FINANCE.EarningsDate))
            # FINANCE.hasDate is declared for EarningsDate in the ontology, so populate it
            g.add((earnings_uri, FINANCE.hasDate, Literal(date, datatype=XSD.date)))
            for column, eps_property in eps_properties.items():
                # missing figures (NaN) are left out instead of being stored as NaN literals
                if column in row.index and pd.notna(row[column]):
                    g.add((earnings_uri, eps_property, Literal(float(row[column]), datatype=XSD.float)))
    except KeyError:
        # best effort: the earnings lookup raised KeyError for this ticker
        print(f"skipping earnings_dates for {company_uri}")


def _add_company_recommendations(ticker: yf.Ticker, company_uri: URIRef):
    """Add one FINANCE.Recommendation node per row of ``ticker.get_recommendations()``."""
    # ontology property -> accepted column names
    # NOTE(review): the original code only looked for the spaced names; newer yfinance releases
    # appear to use the camelCase spelling - both are accepted here, verify against the
    # installed version
    count_properties = {
        FINANCE.hasStrongBuyCount: ('Strong Buy', 'strongBuy'),
        FINANCE.hasBuyCount: ('Buy', 'buy'),
        FINANCE.hasHoldCount: ('Hold', 'hold'),
        FINANCE.hasSellCount: ('Sell', 'sell'),
        FINANCE.hasStrongSellCount: ('Strong Sell', 'strongSell'),
    }
    base_recommendation_uri = company_uri + '/recommendation/'
    recommendation_df = ticker.get_recommendations()
    # nothing to add without data; rows are only used when a 'period' column exists
    if recommendation_df is None or 'period' not in recommendation_df:
        return
    for idx, row in recommendation_df.iterrows():
        recommendation_uri = base_recommendation_uri + quote(str(idx))
        g.add((company_uri, FINANCE.hasRecommendation, recommendation_uri))
        g.add((recommendation_uri, RDF.type, FINANCE.Recommendation))
        for count_property, column_names in count_properties.items():
            for column in column_names:
                if column in row.index and pd.notna(row[column]):
                    g.add((recommendation_uri, count_property, Literal(int(row[column]), datatype=XSD.integer)))
                    break


def _add_company_officers(info: dict, company_uri: URIRef):
    """Add a FINANCE.Person node for every entry of ``info['companyOfficers']`` and link it to the company."""
    # officer field -> (ontology property, XSD datatype)
    officer_fields = {
        'title': (FINANCE.hasJobTitle, XSD.string),
        'totalPay': (FINANCE.hasTotalPay, XSD.float),
        'exercisedValue': (FINANCE.hasExercisedValue, XSD.float),
        'unexercisedValue': (FINANCE.hasUnexercisedValue, XSD.float),
        'yearBorn': (FINANCE.hasYearBorn, XSD.integer),
        'age': (FINANCE.hasAge, XSD.integer),
    }
    for employee in info.get('companyOfficers') or []:
        raw_name = employee.get('name')
        if not raw_name:
            continue
        # remove Mr., Mrs. and Ms. from name (dots escaped so they only match a literal dot)
        name = re.sub(r'\b(?:Mr|Mrs|Ms)\. ', '', raw_name)

        # prep_uri lower-cases the name and turns whitespace into dashes itself; dashing the
        # name beforehand was undone by prep_uri, which strips '-' and glued the parts together
        person_uri = prep_uri(PERSON, name)
        g.add((person_uri, RDF.type, FINANCE.Person))
        g.add((person_uri, FINANCE.hasName, Literal(name, datatype=XSD.string)))

        # filter gender from the honorific
        if 'Mr.' in raw_name:
            g.add((person_uri, FINANCE.hasGender, Literal('m', datatype=XSD.string)))
        if 'Mrs.' in raw_name or 'Ms.' in raw_name:
            g.add((person_uri, FINANCE.hasGender, Literal('w', datatype=XSD.string)))

        for field, (person_property, datatype) in officer_fields.items():
            if employee.get(field) is not None:
                g.add((person_uri, person_property, Literal(employee[field], datatype=datatype)))

        # add employee to company
        g.add((company_uri, FINANCE.hasKeyExecutive, person_uri))


def _add_company_address(info: dict, company_uri: URIRef):
    """Add the company's FINANCE.Address node with whichever address fields ``info`` provides."""
    # info key -> ontology property
    address_fields = {
        'address1': FINANCE.hasStreet,
        'city': FINANCE.hasCity,
        'zip': FINANCE.hasPostalCode,
        'country': FINANCE.hasCountry,
        'state': FINANCE.hasState,
    }
    address_uri = company_uri + '/address/'
    g.add((company_uri, FINANCE.hasAddress, address_uri))
    g.add((address_uri, RDF.type, FINANCE.Address))
    for field, address_property in address_fields.items():
        if info.get(field) is not None:
            g.add((address_uri, address_property, Literal(info[field], datatype=XSD.string)))


def handle_info_data(ticker: yf.Ticker, etf_uri: str):
    """
    This function handles the info data of a given ticker.

    It adds the company node (linked to the ETF), its basic attributes, its industry, earnings
    dates, recommendations, key executives and address to the global graph ``g``.
    :param ticker: ticker whose ``get_info()`` payload and related tables are read
    :param etf_uri: uri of the ETF the company is attached to via FINANCE.hasCompany
    :return: None
    """
    info = ticker.get_info()
    # if shortname is not available, skip and print shortname
    if 'shortName' not in info:
        print(f"shortName not available for {ticker}")
        return
    company_uri = URIRef(COMPANY + quote(info['symbol']))
    g.add((etf_uri, FINANCE.hasCompany, company_uri))
    g.add((company_uri, RDF.type, FINANCE.Company))

    g.add((company_uri, FINANCE.hasName, Literal(info['shortName'], datatype=XSD.string)))
    g.add((company_uri, FINANCE.hasTicker, Literal(info['symbol'], datatype=XSD.string)))

    # optional attributes: info key -> (ontology property, XSD datatype)
    optional_fields = {
        'website': (FINANCE.hasWebsite, XSD.string),
        'fullTimeEmployees': (FINANCE.hasFullTimeEmployees, XSD.integer),
        'currency': (FINANCE.hasCurrency, XSD.string),
    }
    for field, (company_property, datatype) in optional_fields.items():
        if info.get(field) is not None:
            g.add((company_uri, company_property, Literal(info[field], datatype=datatype)))

    # resolve the industry name to the uri created while loading sectors and industries
    industry_matches = industry_uri_df.loc[industry_uri_df['industry'] == info.get('industry'), 'uri']
    if not industry_matches.empty:
        # wrap in URIRef so the object is an rdflib term even if pandas stored a plain string
        g.add((company_uri, FINANCE.hasIndustry, URIRef(industry_matches.iloc[0])))

    _add_company_earnings(ticker, company_uri)
    _add_company_recommendations(ticker, company_uri)
    _add_company_officers(info, company_uri)
    # the address is added once per company (it used to be re-added for every key of ``info``)
    _add_company_address(info, company_uri)

In [27]:
# # add quarterly figures
# quarterly_figures_df = ticker.quarterly_income_stmt
# if quarterly_figures_df is not None:
#     for time in quarterly_figures_df.columns:
#         date = time.date()
#         quarterly_figures_uri = financial_data_uri + quote(str(date))
# 
#         g.add((financial_data_uri, FINANCE.hasQuarterlyFigures, quarterly_figures_uri))
# 
#         g.add((quarterly_figures_uri, RDF.type, FINANCE.QuarterlyFigures))
#         g.add((quarterly_figures_uri, FINANCE.hasDate, Literal(date, datatype=XSD.date)))
# 
#         for key in quarterly_figures_df[time].keys():
#             if key == 'Net Income':
#                 g.add((quarterly_figures_uri, FINANCE.hasNetIncome, Literal(quarterly_figures_df[time][key], datatype=XSD.float)))
#             if key == 'Net Income From Continuing Operations':
#                 g.add((quarterly_figures_uri, FINANCE.hasNetIncomeFromContinuingOperations, Literal(quarterly_figures_df[time][key], datatype=XSD.float)))
#             if key == 'Total Revenue':
#                 g.add((quarterly_figures_uri, FINANCE.hasTotalRevenue, Literal(quarterly_figures_df[time][key], datatype=XSD.float)))
#             if key == 'Cost Of Revenue':
#                 g.add((quarterly_figures_uri, FINANCE.hasCostOfRevenue, Literal(quarterly_figures_df[time][key], datatype=XSD.float)))
#             if key == 'Gross Profit':
#                 g.add((quarterly_figures_uri, FINANCE.hasGrossProfit, Literal(quarterly_figures_df[time][key], datatype=XSD.float)))
#             if key == 'Operating Income':
#                 g.add((quarterly_figures_uri, FINANCE.hasOperatingIncome, Literal(quarterly_figures_df[time][key], datatype=XSD.float)))
#             if key == 'Operating Expense':
#                 g.add((quarterly_figures_uri, FINANCE.hasOperatingExpense, Literal(quarterly_figures_df[time][key], datatype=XSD.float)))
#             if key == 'Pretax Income':
#                 g.add((quarterly_figures_uri, FINANCE.hasPretaxIncome, Literal(quarterly_figures_df[time][key], datatype=XSD.float)))
#             if key == 'Tax Provision':
#                 g.add((quarterly_figures_uri, FINANCE.hasTaxProvision, Literal(quarterly_figures_df[time][key], datatype=XSD.float)))
#             if key == 'Net Income Common Stockholders':
#                 g.add((quarterly_figures_uri, FINANCE.hasNetIncomeAvailableToCommonStockholders, Literal(quarterly_figures_df[time][key], datatype=XSD.float)))
#             if key == 'Diluted EPS':
#                 g.add((quarterly_figures_uri, FINANCE.hasDilutedEPS, Literal(quarterly_figures_df[time][key], datatype=XSD.float)))
#             if key == 'Basic EPS':
#                 g.add((quarterly_figures_uri, FINANCE.hasBasicEPS, Literal(quarterly_figures_df[time][key], datatype=XSD.float)))
#             if key == 'Diluted Average Shares':
#                 g.add((quarterly_figures_uri, FINANCE.hasDilutedAverageShares, Literal(quarterly_figures_df[time][key], datatype=XSD.float)))
#             if key == 'Basic Average Shares':
#                 g.add((quarterly_figures_uri, FINANCE.hasBasicAverageShares, Literal(quarterly_figures_df[time][key], datatype=XSD.float)))
# ## done adding quarterly figures


In [28]:
# read companies in the msci world and in the s&p 500
# Only the 'Ticker' column of each sheet is used further down.
msci_world_companies_df = pd.read_excel('data/msci-world-updated.xlsx')
sp_500_companies_df = pd.read_excel('data/sp-500-updated.xlsx')

In [29]:
def add_etf(ticker: str):
    """
    This function adds an ETF to the knowledge graph.

    The ETF node gets its type, name, ticker symbol and - when available - its currency.
    :param ticker: ticker symbol of the ETF, e.g. 'URTH'
    :return: uri of the ETF node that was added to the graph
    """
    etf = yf.Ticker(ticker)
    info = etf.get_info() or {}
    # guard against a payload without 'longName' / 'symbol' (previously a KeyError):
    # fall back to the short name and finally to the requested ticker itself
    name = info.get('longName') or info.get('shortName') or ticker
    symbol = info.get('symbol') or ticker

    etf_uri = prep_uri(ETF, name)
    g.add((etf_uri, RDF.type, FINANCE.ETF))
    g.add((etf_uri, FINANCE.hasName, Literal(name, datatype=XSD.string)))
    g.add((etf_uri, FINANCE.hasTicker, Literal(symbol, datatype=XSD.string)))
    if info.get('currency') is not None:
        g.add((etf_uri, FINANCE.hasCurrency, Literal(info['currency'], datatype=XSD.string)))
    return etf_uri

In [30]:
# https://www.ishares.com/us/products/239696/
# Add the ETF node for ticker 'URTH', then add every company listed in the MSCI World sheet
# and link it to that ETF. This performs one or more yfinance lookups per ticker.
msci_world_uri = add_etf('URTH')
grep_ticker_data(msci_world_companies_df['Ticker'], msci_world_uri)

skipping earnings_dates for https://finance.yahoo.com/quote/MSFT
skipping earnings_dates for https://finance.yahoo.com/quote/AAPL
skipping earnings_dates for https://finance.yahoo.com/quote/NVDA
skipping earnings_dates for https://finance.yahoo.com/quote/AMZN


KeyboardInterrupt: 

Fetching the tickers from a different API reduced the number of invalid tickers from around 500 to fewer than 10.

In [None]:
# https://www.ishares.com/us/products/239726/  | TICKER IVV
# Add the ETF node for ticker 'IVV', then add every company listed in the S&P 500 sheet
# and link it to that ETF.
sp_500_uri = add_etf('IVV')
grep_ticker_data(sp_500_companies_df['Ticker'], sp_500_uri)

In [None]:
# TODO: write code that issues an API request to get the data

In [None]:
# save kg: write the whole graph (ontology plus inserted data) to a Turtle file
g.serialize(destination='data/finance-kg.ttl', format='turtle')

In [None]:
# Scratch cell used to inspect a single ticker.
test = yf.Ticker('AZNCF')
# handle keyError for test.earnings_dates
# NOTE(review): indexing the info dict with an empty-string key presumably raises KeyError
# unless the payload really contains a '' entry - leftover from exploration, remove or fix.
test.get_info()['']

In [ ]:

test.earnings_dates

In [None]:
test.get_info()

In [None]:
# NOTE(review): incomplete expression left over from exploration - `get_` looks like a
# truncated attribute name; finish or remove this cell so the notebook runs top to bottom.
test.get_