# Task 2b: Extracting data from XBRL 

In [1]:
import os
import random

In [2]:
import pandas as pd

In [8]:
from bs4 import BeautifulSoup

## Fetch a sample of XBRL accounts to check

In [9]:
# Sort every file in the "accounts" folder into one of three buckets based on
# its filename prefix (GB-CHC / GB-COH) and, for GB-COH files, its extension.
# Files that match none of the rules are ignored.
accounts = {"charity": [], "company_ixbrl": [], "company_pdf": []}
for filename in os.listdir("accounts"):
    if filename.startswith("GB-CHC"):
        bucket = "charity"
    elif not filename.startswith("GB-COH"):
        continue
    elif filename.endswith(".html"):
        bucket = "company_ixbrl"
    elif filename.endswith(".pdf"):
        bucket = "company_pdf"
    else:
        continue
    accounts[bucket].append(filename)

In [10]:
# Draw a small sample of iXBRL accounts to inspect by hand.
# - A dedicated, seeded RNG plus a sorted population makes the selection the
#   same on every Restart & Run All (os.listdir order is not guaranteed).
# - The sample size is capped so the cell still runs when fewer than
#   SAMPLE_SIZE iXBRL files are present (random.sample raises ValueError
#   if asked for more items than the population holds).
SAMPLE_SEED = 42
SAMPLE_SIZE = 10
ixbrl_files = sorted(accounts["company_ixbrl"])
to_check = random.Random(SAMPLE_SEED).sample(
    ixbrl_files, min(SAMPLE_SIZE, len(ixbrl_files))
)

In [11]:
# Display the sampled filenames so the selection is recorded in the cell output.
to_check

['GB-COH-10277465-2017-07-31.html',
 'GB-COH-07686682-2017-05-31.html',
 'GB-COH-05047824-2017-03-31.html',
 'GB-COH-08652494-2017-08-31.html',
 'GB-COH-07148854-2018-03-31.html',
 'GB-COH-07577393-2017-03-31.html',
 'GB-COH-09168420-2017-08-31.html',
 'GB-COH-06759419-2017-03-31.html',
 'GB-COH-10689233-2017-12-31.html',
 'GB-COH-09850848-2017-03-31.html']

## Open accounts and parse with BeautifulSoup

[BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) is a Python library for parsing and navigating HTML and XML documents.

In [13]:
# Parse the first sampled account into a BeautifulSoup tree.
# The encoding is given explicitly (the same one the bulk-discovery loop later
# in this notebook uses) so the result does not depend on the platform's
# default text encoding.
with open(os.path.join("accounts", to_check[0]), encoding='utf8') as a:
    soup = BeautifulSoup(a.read(), "html.parser")

## Get XBRL contexts

The context describes what a value applies to - either a time period or a point in time.

In [23]:
def _child_text(parent, tag_name):
    """Return the text of the first `tag_name` element under `parent`, or None if there is none."""
    child = parent.find(tag_name)
    return child.text if child else None


# One row per xbrli:context element, indexed by the context id.
# A context carries either an instant or a startdate/enddate pair; whichever
# elements are absent are recorded as None.
context_rows = []
for ctx in soup.find_all({'xbrli:context'}):
    context_rows.append({
        "id": ctx['id'],
        "entity": _child_text(ctx, 'xbrli:identifier'),
        "instant": _child_text(ctx, 'xbrli:instant'),
        "startdate": _child_text(ctx, 'xbrli:startdate'),
        "enddate": _child_text(ctx, 'xbrli:enddate'),
    })
contexts = pd.DataFrame(context_rows).set_index("id")
contexts

Unnamed: 0_level_0,enddate,entity,instant,startdate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bfwd_31_07_2017,,10277465,2016-07-12,
cfwd_31_07_2017,,10277465,2017-07-31,
FY_31_07_2017,2017-07-31,10277465,,2016-07-13
cfwd_12_07_2016,,10277465,2016-07-12,
CountriesHypercube_FY_31_07_2017_Set1,2017-07-31,10277465,,2016-07-13
CurrenciesHypercube_FY_31_07_2017_Set2,2017-07-31,10277465,,2016-07-13
EntityOfficersHypercube_FY_31_07_2017_Set3,2017-07-31,10277465,,2016-07-13
LegalFormEntityHypercube_FY_31_07_2017_Set4,2017-07-31,10277465,,2016-07-13
AccountingStandardsAppliedHypercube_FY_31_07_2017_Set5,2017-07-31,10277465,,2016-07-13
AccountsStatusHypercube_FY_31_07_2017_Set6,2017-07-31,10277465,,2016-07-13


Do the same for units found in the data.

In [33]:
# One row per xbrli:unit element, indexed by the unit id, holding the text of
# its xbrli:measure child (None when the unit has no measure element).
unit_rows = []
for unit in soup.find_all({'xbrli:unit'}):
    measure = unit.find('xbrli:measure')
    unit_rows.append({
        "id": unit['id'],
        "measure": measure.text if measure else None,
    })
units = pd.DataFrame(unit_rows).set_index("id")
units

Unnamed: 0_level_0,measure
id,Unnamed: 1_level_1
GBP,iso4217:GBP
USD,iso4217:USD
EUR,iso4217:EUR
shares,xbrli:shares
pure,xbrli:pure


## Extract non-financial data

These are values that are generally metadata about the accounts - text, dates, etc. The context is joined in to show what period each value aligns to.

In [30]:
# Collect every ix:nonnumeric fact: its context reference, concept name,
# optional format attribute, and its text with surrounding whitespace and
# embedded newlines removed.
nonnumeric_rows = []
for fact in soup.find_all({'ix:nonnumeric'}):
    nonnumeric_rows.append({
        "context": fact['contextref'],
        "name": fact['name'],
        "format": fact.get('format'),
        "value": fact.text.strip().replace("\n", ""),
    })

# Attach the context details (entity, instant, start/end dates) to each fact.
nonnumeric = pd.DataFrame(nonnumeric_rows).join(contexts, on='context', how='left')
nonnumeric

Unnamed: 0,context,format,name,value,enddate,entity,instant,startdate
0,FY_31_07_2017,,ns10:NameProductionSoftware,IRIS Accounts Production,2017-07-31,10277465,,2016-07-13
1,FY_31_07_2017,,ns10:VersionProductionSoftware,v18.1.0.975,2017-07-31,10277465,,2016-07-13
2,FY_31_07_2017,,ns10:UKCompaniesHouseRegisteredNumber,10277465,2017-07-31,10277465,,2016-07-13
3,CountriesHypercube_FY_31_07_2017_Set1,ixt2:nocontent,ns10:CountryFormationOrIncorporation,,2017-07-31,10277465,,2016-07-13
4,CurrenciesHypercube_FY_31_07_2017_Set2,ixt2:nocontent,ns10:PrincipalCurrencyUsedInBusinessReport,,2017-07-31,10277465,,2016-07-13
5,EntityOfficersHypercube_FY_31_07_2017_Set3,ixt2:nocontent,ns5:DirectorSigningFinancialStatements,,2017-07-31,10277465,,2016-07-13
6,FY_31_07_2017,,ns5:DescriptionBodyAuthorisingFinancialStatements,director,2017-07-31,10277465,,2016-07-13
7,cfwd_31_07_2017,ixt2:datedaymonthyear,ns10:StartDateForPeriodCoveredByReport,13.7.16,,10277465,2017-07-31,
8,cfwd_31_07_2017,ixt2:datedaymonthyear,ns10:EndDateForPeriodCoveredByReport,31.7.17,,10277465,2017-07-31,
9,cfwd_31_07_2017,ixt2:datedaymonthyear,ns10:BalanceSheetDate,31.7.17,,10277465,2017-07-31,


## Extract financial data

Financial data is held in `ix:nonfraction` fields. This is merged with the contexts and units to get the period and unit of measure for each value.

In [53]:
# Collect every ix:nonfraction fact as a row holding its displayed text plus
# all of the attributes set on the element.
fact_rows = []
for fact in soup.find_all({'ix:nonfraction'}):
    row = {"text": fact.text}
    row.update(fact.attrs)
    fact_rows.append(row)

# Look up the context and the unit each fact refers to and add their columns.
df = (
    pd.DataFrame(fact_rows)
    .join(contexts, how='left', on='contextref')
    .join(units, how='left', on='unitref')
)
df

Unnamed: 0,contextref,decimals,format,name,scale,sign,text,unitref,xmlns:ix,enddate,entity,instant,startdate,measure
0,cfwd_31_07_2017,0,ixt2:numdotdecimal,ns5:CashBankOnHand,0,,1636,GBP,http://www.xbrl.org/2013/inlineXBRL,,10277465,2017-07-31,,iso4217:GBP
1,Creditors-SegmentsHypercube_cfwd_31_07_2017_Set1,0,ixt2:numdotdecimal,ns5:Creditors,0,,2971,GBP,http://www.xbrl.org/2013/inlineXBRL,,10277465,2017-07-31,,iso4217:GBP
2,cfwd_31_07_2017,0,ixt2:numdotdecimal,ns5:NetCurrentAssetsLiabilities,0,-,1335,GBP,http://www.xbrl.org/2013/inlineXBRL,,10277465,2017-07-31,,iso4217:GBP
3,cfwd_31_07_2017,0,ixt2:numdotdecimal,ns5:TotalAssetsLessCurrentLiabilities,0,-,1335,GBP,http://www.xbrl.org/2013/inlineXBRL,,10277465,2017-07-31,,iso4217:GBP
4,EquitySOCI-SegmentsHypercube_cfwd_31_07_2017_Set2,0,,ns5:Equity,0,,1,GBP,http://www.xbrl.org/2013/inlineXBRL,,10277465,2017-07-31,,iso4217:GBP
5,EquitySOCI-SegmentsHypercube_cfwd_31_07_2017_Set3,0,ixt2:numdotdecimal,ns5:Equity,0,-,1336,GBP,http://www.xbrl.org/2013/inlineXBRL,,10277465,2017-07-31,,iso4217:GBP
6,cfwd_31_07_2017,0,ixt2:numdotdecimal,ns5:Equity,0,-,1335,GBP,http://www.xbrl.org/2013/inlineXBRL,,10277465,2017-07-31,,iso4217:GBP
7,Creditors-SegmentsHypercube_cfwd_31_07_2017_Set1,0,ixt2:numdotdecimal,ns5:OtherCreditors,0,,2971,GBP,http://www.xbrl.org/2013/inlineXBRL,,10277465,2017-07-31,,iso4217:GBP


In [54]:
# Add a "value" column holding the fact text with thousands separators
# (commas) removed. Skipped when a "value" column is already present.
if 'value' not in df:
    df["value"] = df["text"].str.replace(',', '')

Sort out the values to produce a proper value. This is done by:

- replacing any items that are just a dash with a 0
- replacing any non-numeric characters in the field with nothing
- converting to a float
- multiplying by -1 if the "sign" field has a "-" in it

In [56]:
# Build the cleaned numeric column "value_c" from the text in "value".

# Keep only digits and the decimal point. regex=True is passed explicitly:
# recent pandas versions treat the pattern as a literal string by default,
# which would leave the values untouched.
digits = df["value"].str.replace(r'[^0-9.]', "", regex=True)

# A value that was just a dash (or otherwise contained no digits) is now an
# empty string; treat it as zero instead of letting the float conversion fail.
digits = digits.replace("", "0")

# A "-" in the sign attribute marks a negative value. The column only exists
# when at least one fact carries a sign attribute, so fall back to +1.
if "sign" in df.columns:
    multiplier = df["sign"].eq("-").map({True: -1, False: 1})
else:
    multiplier = 1

df.loc[:, "value_c"] = digits.astype(float) * multiplier
df['value_c']

0    1636.0
1    2971.0
2   -1335.0
3   -1335.0
4       1.0
5   -1336.0
6   -1335.0
7    2971.0
Name: value_c, dtype: float64

In [57]:
# Show only the identifying columns next to the cleaned numeric value.
summary_columns = ["contextref", "format", "name", "value_c"]
df.loc[:, summary_columns]

Unnamed: 0,contextref,format,name,value_c
0,cfwd_31_07_2017,ixt2:numdotdecimal,ns5:CashBankOnHand,1636.0
1,Creditors-SegmentsHypercube_cfwd_31_07_2017_Set1,ixt2:numdotdecimal,ns5:Creditors,2971.0
2,cfwd_31_07_2017,ixt2:numdotdecimal,ns5:NetCurrentAssetsLiabilities,-1335.0
3,cfwd_31_07_2017,ixt2:numdotdecimal,ns5:TotalAssetsLessCurrentLiabilities,-1335.0
4,EquitySOCI-SegmentsHypercube_cfwd_31_07_2017_Set2,,ns5:Equity,1.0
5,EquitySOCI-SegmentsHypercube_cfwd_31_07_2017_Set3,ixt2:numdotdecimal,ns5:Equity,-1336.0
6,cfwd_31_07_2017,ixt2:numdotdecimal,ns5:Equity,-1335.0
7,Creditors-SegmentsHypercube_cfwd_31_07_2017_Set1,ixt2:numdotdecimal,ns5:OtherCreditors,2971.0


## Discovery of contents of XBRL files

This is a quick exercise to discover what is in XBRL files. 

In [58]:
from collections import Counter

Open all our XBRL accounts and look for any tags with a `:` in them, and also the schema used.

In [112]:
# For every iXBRL account, count how often each namespaced tag (a tag whose
# name contains ":") occurs, and record the schema the document references.
# The result maps filename -> {tag name: count, ..., "schema": href}.
counts = {}
for t in accounts["company_ixbrl"]:
    # Only the read needs the file handle; close it before parsing.
    with open(os.path.join("accounts", t), encoding='utf8') as a:
        soup = BeautifulSoup(a.read(), "html.parser")

    tag_counts = dict(Counter(tag.name for tag in soup.find_all() if ":" in tag.name))

    # find() returns None when the document has no link:schemaref element;
    # record an empty schema (the same value used when the href attribute is
    # missing) rather than failing part-way through the loop.
    schemaref = soup.find({'link:schemaref'})
    if schemaref is not None:
        tag_counts['schema'] = schemaref.get('xlink:href', '').strip()
    else:
        tag_counts['schema'] = ''

    counts[t] = tag_counts

Turn this into a dataframe with the tags and a series with the schema. 

In [113]:
# One row per account file, one column per tag name, plus the "schema" column.
counts_raw = pd.DataFrame(counts).T

# Split the schema strings off first so that the zero-fill and the numeric
# cast below only ever touch the tag counts.
schema = counts_raw['schema']

# A tag that never appears in a file is missing (NaN) for that file: count it
# as 0, and store the counts as integers instead of generic Python objects.
counts_df = (
    counts_raw
    .drop(columns='schema')
    .astype(float)
    .fillna(0)
    .astype(int)
)

Show the prevalence of different schemas

In [114]:
# Number of account files that reference each schema.
schema.value_counts()

https://xbrl.frc.org.uk/FRS-102/2014-09-01/FRS-102-2014-09-01.xsd          73
http://www.xbrl.org/uk/gaap/core/2009-09-01/uk-gaap-full-2009-09-01.xsd    36
https://xbrl.frc.org.uk/char/2016-01-01/char-2016-01-01.xsd                 1
https://xbrl.frc.org.uk/IFRS/2014-09-01/IFRS-2014-09-01.xsd                 1
Name: schema, dtype: int64

Work out the number of accounts of each schema type that contain each tag. Divide by the number of accounts of that schema type to show the proportion of accounts of that type with each tag.

In [115]:
# True where an account contains the tag at least once.
has_tag = counts_df.astype(bool)

# Per schema, the number of accounts containing each tag
# (rows: tags, columns: schemas).
totals = has_tag.join(schema).groupby("schema").sum().T

# Divide each schema's column by the number of accounts using that schema and
# express the result as a percentage rounded to one decimal place.
accounts_per_schema = schema.value_counts()
totals.div(accounts_per_schema, axis="columns").mul(100).round(1)

Unnamed: 0,http://www.xbrl.org/uk/gaap/core/2009-09-01/uk-gaap-full-2009-09-01.xsd,https://xbrl.frc.org.uk/FRS-102/2014-09-01/FRS-102-2014-09-01.xsd,https://xbrl.frc.org.uk/IFRS/2014-09-01/IFRS-2014-09-01.xsd,https://xbrl.frc.org.uk/char/2016-01-01/char-2016-01-01.xsd
core:parententityorcontrollingpartyandultimatecontrollingpartygroupingdimension.domain,0.0,1.4,0.0,0.0
core:xeventafterreportingdategroupingdimension.domain,0.0,11.0,0.0,0.0
core:xotherspecificaccountingpolicygroupingdimension.domain,0.0,1.4,0.0,0.0
frs-common:analysisdimension.domain,0.0,5.5,0.0,0.0
frs-core:parententityorcontrollingpartyandultimatecontrollingpartygroupingdimension.domain,0.0,5.5,0.0,0.0
frs-core:xeventafterreportingdategroupingdimension.domain,0.0,5.5,0.0,0.0
ix:continuation,0.0,5.5,0.0,0.0
ix:exclude,0.0,5.5,0.0,0.0
ix:header,100.0,100.0,100.0,100.0
ix:hidden,100.0,100.0,100.0,100.0


In [120]:
# Median number of occurrences of each tag across all accounts, largest first.
median_tag_counts = counts_df.median()
median_tag_counts.sort_values(ascending=False)

ix:nonnumeric                                                                                 26.0
ix:nonfraction                                                                                18.0
xbrli:period                                                                                  12.0
xbrli:identifier                                                                              12.0
xbrli:entity                                                                                  12.0
xbrli:context                                                                                 12.0
xbrldi:explicitmember                                                                         11.0
xbrli:segment                                                                                  9.0
xbrli:enddate                                                                                  7.0
xbrli:startdate                                                                                7.0
xbrli:inst

In [121]:
# Display the schema referenced by each account file (indexed by filename).
schema

GB-COH-00889858-2017-09-30.html    https://xbrl.frc.org.uk/FRS-102/2014-09-01/FRS...
GB-COH-02978957-2017-10-31.html    https://xbrl.frc.org.uk/FRS-102/2014-09-01/FRS...
GB-COH-03054343-2017-03-31.html    https://xbrl.frc.org.uk/FRS-102/2014-09-01/FRS...
GB-COH-03171108-2017-03-31.html    https://xbrl.frc.org.uk/FRS-102/2014-09-01/FRS...
GB-COH-03874497-2017-10-31.html    https://xbrl.frc.org.uk/FRS-102/2014-09-01/FRS...
GB-COH-04001326-2017-05-31.html    https://xbrl.frc.org.uk/FRS-102/2014-09-01/FRS...
GB-COH-04173535-2017-09-30.html    https://xbrl.frc.org.uk/FRS-102/2014-09-01/FRS...
GB-COH-04189619-2017-03-31.html    https://xbrl.frc.org.uk/FRS-102/2014-09-01/FRS...
GB-COH-04274007-2018-03-31.html    https://xbrl.frc.org.uk/FRS-102/2014-09-01/FRS...
GB-COH-04577725-2017-03-31.html    http://www.xbrl.org/uk/gaap/core/2009-09-01/uk...
GB-COH-04964138-2017-03-31.html    http://www.xbrl.org/uk/gaap/core/2009-09-01/uk...
GB-COH-05047824-2017-03-31.html    http://www.xbrl.org/uk/gaap/co