In [151]:
from bs4 import BeautifulSoup
import json

In [136]:
# Shared path prefix of the filing's files. The four paths below differ only in
# their suffix, so pointing the notebook at another filing that uses the same
# suffixes only requires changing this one value.
filing_prefix = "adobe/adbe-20211203"

calculation_file = f"{filing_prefix}_cal.xml"
definition_file = f"{filing_prefix}_def.xml"
label_file = f"{filing_prefix}_lab.xml"
extracted_file = f"{filing_prefix}_htm.xml"

In [137]:
# Parse the calculation file into `soup`.
# The file is opened in binary mode so that Beautiful Soup decodes the document
# itself instead of relying on the platform's default text encoding, which
# differs between systems.
# NOTE: `soup` is rebound further down when the extracted file is loaded, so
# cells that query the calculation links must run before that point.
with open(calculation_file, 'rb') as file:
    soup = BeautifulSoup(file.read(), 'lxml')

In [138]:
# Gather every `link:calculationlink` element and print a numbered list of the
# last path segment of each element's `xlink:role` value.
calculation_links = soup.find_all('link:calculationlink')

for position, link in enumerate(calculation_links, start=1):
    role_name = link['xlink:role'].split('/')[-1]
    print(f"{position:2}.", role_name)

 1. ConsolidatedBalanceSheets
 2. ConsolidatedStatementsofIncome
 3. ConsolidatedStatementsofComprehensiveIncomeStatement
 4. ConsolidatedStatementsofCashFlows
 5. CashCashEquivalentsandShortTermInvestmentsDetails2
 6. PropertyandEquipmentDetails
 7. GoodwillandOtherIntangiblesDetails2
 8. AccruedExpensesDetails
 9. IncomeTaxesDetails1
10. IncomeTaxesDetails2
11. IncomeTaxesDetails3
12. IncomeTaxesDetails4
13. AccumulatedOtherComprehensiveIncomeLossDetails1
14. NetIncomePerShareDetails
15. CommitmentsandContingenciesDetails
16. LeasesDetails2
17. NonOperatingIncomeExpenseDetails


In [139]:
# Accumulator keyed by table name; filled in by the cells that follow.
financial_statements = dict()

In [141]:
# For each calculation link, record the distinct attribute names it references.
# An attribute name is built from the 2nd and 3rd underscore-separated parts of
# each `link:loc` element's `xlink:label`, joined with ':'. First-seen order is
# kept, and a lower-cased copy is stored alongside for later tag lookups.
for link in calculation_links:
    table_name = link['xlink:role'].split('/')[-1]
    labels = (loc['xlink:label'] for loc in link.find_all("link:loc"))
    # dict.fromkeys drops duplicates while preserving insertion order.
    unique_attribs = list(
        dict.fromkeys(':'.join(label.split('_')[1:3]) for label in labels)
    )
    financial_statements[table_name] = {
        'attribs': unique_attribs,
        'attribs_lower': [name.lower() for name in unique_attribs],
    }

In [142]:
# Parse the extracted file; from here on `soup` refers to this document and no
# longer to the calculation file.
# The file is opened in binary mode so that Beautiful Soup decodes the document
# itself instead of relying on the platform's default text encoding, which
# differs between systems.
with open(extracted_file, 'rb') as file:
    soup = BeautifulSoup(file.read(), 'lxml')

In [143]:
# Ids of every `context` element in the extracted document, in document order.
context_list = [context_tag['id'] for context_tag in soup.find_all('context')]

In [144]:
# Display the collected context ids for inspection; the part after the last
# underscore of each id is what the extraction cell below uses as the period key.
context_list

['i53336a4ddbca49e3b5c812f7bc0fad41_D20201128-20211203',
 'i2f87b70069ab448d8d376d7cfd804189_I20210604',
 'ia5c328dfc89f42788dacfb4cb71c9062_I20220114',
 'i9142a2b8eaf8432096db4564419d27c0_I20211203',
 'i96964ec305bb4ab6ad5f584a46f8d789_I20201127',
 'i7aa7327b105d47b2a13086fecce3ac1e_D20191130-20201127',
 'i923c259df8f34ca88434813e8637d9b3_D20181201-20191129',
 'i58133d2a631a4d67aa17fd285ee90639_I20181130',
 'i7f352ddaf165482894186e4fb6370985_I20181130',
 'i8fb3673b7c254bb3be7e6c54b3a8c94c_I20181130',
 'if439772eaed44431b618123cdb195b91_I20181130',
 'i539cc9b83b0a496dae4d947213e71ae0_I20181130',
 'i1024f0832622423aba7e849f9826cc26_I20181130',
 'ib24279fb4bb74f0d94338a82f231399c_D20181201-20191129',
 'i0c150451f16140a7baba1af345114228_D20181201-20191129',
 'id560e73acea74c3fbcf7b3cdd76685af_D20181201-20191129',
 'i04da207696f341c7a00d3d475bea064c_D20181201-20191129',
 'i9501c8ad2f4a4cd291b85d7451e21eaa_I20191129',
 'i0cdb62a3a2e24eefbf90e91a419aea71_I20191129',
 'if4bfdc6da71640ee9e402e

In [147]:
# For every table: find the set of contexts shared by the largest number of its
# attributes, then collect the table's facts reported in each of those contexts,
# grouped by period (the part of the context id after the last underscore).
for table, statement in financial_statements.items():
    attribs_lower = statement['attribs_lower']
    wanted_names = set(attribs_lower)  # O(1) membership test for tag names

    # Count how many attributes are reported in each distinct set of contexts.
    # A set is represented as its sorted context ids joined by spaces.
    contextref_count = {}
    for attrib in attribs_lower:
        context_ids = sorted({
            tag['contextref']
            for tag in soup.find_all(attrib)
            if tag.has_attr('contextref')  # skip tags without a context reference
        })
        if not context_ids:
            # An attribute with no facts says nothing about which contexts the
            # table uses, so it must not vote for the empty context set.
            continue
        context_key = " ".join(context_ids)
        contextref_count[context_key] = contextref_count.get(context_key, 0) + 1

    if not contextref_count:
        # No attribute of this table has any fact in the document (or the table
        # has no attributes at all): nothing to extract, and max() would raise.
        continue

    # Ties are resolved in favour of the context set that was seen first.
    contexts, _ = max(contextref_count.items(), key=lambda k: k[1])
    for context in contexts.split():
        period = context.split('_')[-1]
        # setdefault instead of a fresh dict: several contexts can share the same
        # period suffix, and resetting the dict would discard facts collected
        # from an earlier one. For duplicate fact names the later context wins.
        period_facts = statement.setdefault(period, {})
        for tag in soup.find_all(attrs={'contextref': context}):
            if tag.name in wanted_names:
                period_facts[tag.name] = tag.get_text()

In [148]:
# Remove the helper lists so that only the per-period facts remain, then display
# the result. A default is passed to pop() so that re-running this cell (after
# the keys are already gone) does not raise KeyError.
for i in financial_statements.keys():
    financial_statements[i].pop('attribs', None)
    financial_statements[i].pop('attribs_lower', None)
financial_statements

{'ConsolidatedBalanceSheets': {'I20211203': {'us-gaap:cashandcashequivalentsatcarryingvalue': '3844000000',
   'us-gaap:shortterminvestments': '1954000000',
   'us-gaap:accountsreceivablenetcurrent': '1880000000',
   'us-gaap:prepaidexpenseandotherassetscurrent': '993000000',
   'us-gaap:assetscurrent': '8669000000',
   'us-gaap:propertyplantandequipmentnet': '1673000000',
   'us-gaap:operatingleaserightofuseasset': '443000000',
   'us-gaap:goodwill': '12668000000',
   'us-gaap:finitelivedintangibleassetsnet': '1820000000',
   'us-gaap:deferredincometaxassetsnet': '1085000000',
   'us-gaap:otherassetsnoncurrent': '883000000',
   'us-gaap:assets': '27241000000',
   'us-gaap:accountspayablecurrent': '312000000',
   'us-gaap:accruedliabilitiescurrent': '1736000000',
   'us-gaap:contractwithcustomerliabilitycurrent': '4733000000',
   'us-gaap:accruedincometaxescurrent': '54000000',
   'us-gaap:operatingleaseliabilitycurrent': '97000000',
   'us-gaap:liabilitiescurrent': '6932000000',
   'u

In [157]:
# Serialise the extracted statements and write them next to the calculation
# file, with the 'cal.xml' part of its path replaced by 'statements.json'.
output_path = calculation_file.replace('cal.xml', 'statements.json')
json_file = json.dumps(financial_statements, indent=4)
with open(output_path, 'w') as file:
    file.write(json_file)