# Download account documents from a list

@todo:

- support pagination of accounts to find older documents

## Setup the download

In [2]:
import requests
from requests.auth import HTTPBasicAuth
import csv
from tqdm import tqdm_notebook
import pandas as pd
import requests_cache

Use the `requests_cache` module to prevent multiple requests to the same URL

In [3]:
requests_cache.install_cache('http_cache')

Get a Companies House API key

In [4]:
with open('api_key.txt') as a:
    API_KEY = a.read()

URL of the filing history for a company (designed to use with `.format()`)

In [5]:
filing_history_url = 'https://api.companieshouse.gov.uk/company/{}/filing-history'

## Fetch organisation numbers to download

Company & charity numbers

In [13]:
orgs = []
with open('orgs-to-check/ptc.csv') as c:
    reader = csv.reader(c)
    orgs = [r for r in reader]

In [19]:
companies = [o[0].replace("GB-COH-", "") for o in orgs if o[0].startswith("GB-COH-") and len(o[0])==15]
charities = [o[0].replace("GB-CHC-", "") for o in orgs if o[0].startswith("GB-CHC-") and len(o[0]) in [13,14]]
"{:,.0f} companies and {:,.0f} charities to find".format(len(companies), len(charities))

'563 companies and 444 charities to find'

## Fetch documents for companies

For each registered company fetch the metadata about the first record in their filing history

In [30]:
filing_history = []
for company_number in tqdm_notebook(companies):
    r = requests.get(filing_history_url.format(company_number), auth=HTTPBasicAuth(API_KEY, ''))
    if 'items' in r.json():
        for i in r.json()['items']:
            # make sure we're getting the right kind of records
            if i['category']=='accounts' and i['description'].startswith('accounts-'):
                filing_history.append(i)
                break

HBox(children=(IntProgress(value=0, max=563), HTML(value='')))




In [31]:
filing_history = pd.DataFrame(pd.io.json.json_normalize(filing_history))
"{:,.0f} accounts found".format(len(filing_history))

'363 accounts found'

Show the types of accounts

In [32]:
filing_history['description'].value_counts()

accounts-with-accounts-type-total-exemption-full            198
accounts-with-accounts-type-micro-entity                    110
accounts-with-accounts-type-dormant                          15
accounts-with-accounts-type-small                            12
accounts-with-accounts-type-unaudited-abridged                8
accounts-with-accounts-type-total-exemption-small             7
accounts-amended-with-accounts-type-total-exemption-full      5
accounts-with-accounts-type-full                              4
accounts-with-accounts-type-group                             4
Name: description, dtype: int64

In [44]:
filing_history['paper_filed'].value_counts(dropna=False)

True    250
NaN     113
Name: paper_filed, dtype: int64

In [79]:
results = {}

# Go through each account we're looking for
for k, f in tqdm_notebook(filing_history[filing_history['links.document_metadata'].notnull()].iterrows()):
    
    # fetch the document metadata
    r = requests.get(f['links.document_metadata'], auth=HTTPBasicAuth(API_KEY, ''))
    filetype = 'pdf'
    content_type = 'application/pdf'
    try:
        _ = r.json()
    except:
        r.headers
        continue

    # if it looks like there's XBRL data then request than
    if 'application/xhtml+xml' in r.json().get('resources'):
        filetype = 'html'
        content_type = 'application/xhtml+xml'
        
    # fetch the actual document
    doc = requests.get(
        r.json()['links']['document'],
        auth=HTTPBasicAuth(API_KEY, ''),
        headers={"Accept": content_type}
    )
    
    action_date = f['description_values.made_up_date']
    
    # work out the filename we're going to use
    filename = 'accounts/GB-COH-{}-{}.{}'.format(
        r.json().get('company_number'),
        action_date,
        filetype
    )
        
    # add to our results
    results[k] = {
        "filename": filename,
        "filetype": filetype,
        "company_number": r.json().get('company_number')
    }
    
    # write the file to the file system
    with open(filename, 'wb') as a:
        _ = a.write(doc.content)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [80]:
filing_history = filing_history.drop(columns=['filename', 'filetype', 'company_number'], errors='ignore').join(pd.DataFrame(results).T, how='left')

In [81]:
filing_history['filetype'].value_counts(dropna=False)

pdf     246
html    111
NaN       6
Name: filetype, dtype: int64

In [82]:
filing_history.to_csv('company_accounts.csv')

In [87]:
pd.crosstab(
    filing_history['filetype'],
    filing_history['paper_filed'].fillna(False)
)

paper_filed,False,True
filetype,Unnamed: 1_level_1,Unnamed: 2_level_1
html,111,0
pdf,2,244


## Fetch documents for charities

In [122]:
from bs4 import BeautifulSoup
import datetime

In [104]:
cc_url = 'http://beta.charitycommission.gov.uk/charity-details/?regid={}&subid=0'

In [133]:
accounts = []
for c in tqdm_notebook(charities):
    r = requests.get(cc_url.format(c))
    soup = BeautifulSoup(r.text, 'html.parser')
    for a in soup.select('div.doc-container'):
        link = a.find(class_='doc')
        accounts.append({
            "charity_number": c,
            "accounts_url": link.get('href'),
            "financial_year_end": datetime.datetime.strptime(link.find(class_='doc-name').get_text(), '%d %b %Y')
        })

HBox(children=(IntProgress(value=0, max=444), HTML(value='')))




In [134]:
accounts = pd.DataFrame(accounts)

In [143]:
latest_accounts = accounts.sort_values(["charity_number", "financial_year_end"]).groupby("charity_number").last()
results = {}
for c, a in tqdm_notebook(latest_accounts.iterrows()):
    filename = 'accounts/GB-CHC-{}-{}.{}'.format(
        c,
        a['financial_year_end'].strftime('%Y-%m-%d'),
        'pdf'
    )
    
    doc = requests.get(a['accounts_url'])
    results[a['accounts_url']] = filename
    
    # write the file to the file system
    with open(filename, 'wb') as a:
        _ = a.write(doc.content)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [148]:
accounts = accounts.join(pd.Series(results).rename('filename'), how='left', on='accounts_url')

In [149]:
accounts.to_csv('charity_accounts.csv')

## Generate a merged list with the accounts downloaded and a list of the files

In [151]:
filing_history

Unnamed: 0,action_date,barcode,category,date,description,description_values.made_up_date,links.document_metadata,links.self,pages,paper_filed,transaction_id,type,company_number,filename,filetype
0,2017-09-30,X71YYNOQ,accounts,2018-03-18,accounts-with-accounts-type-micro-entity,2017-09-30,https://frontend-doc-api.companieshouse.gov.uk...,/company/00889858/filing-history/MzIwMDMwODMyN...,6.0,,MzIwMDMwODMyNGFkaXF6a2N4,AA,00889858,accounts/GB-COH-00889858-2017-09-30.html,html
1,2017-03-31,A6LN6R4I,accounts,2017-12-28,accounts-with-accounts-type-full,2017-03-31,https://frontend-doc-api.companieshouse.gov.uk...,/company/01759876/filing-history/MzE5MzU3NDM0O...,16.0,True,MzE5MzU3NDM0OGFkaXF6a2N4,AA,01759876,accounts/GB-COH-01759876-2017-03-31.pdf,pdf
2,2017-03-31,A6XUJXEY,accounts,2018-01-18,accounts-with-accounts-type-total-exemption-full,2017-03-31,https://frontend-doc-api.companieshouse.gov.uk...,/company/02203518/filing-history/MzE5NTU4ODg2N...,22.0,True,MzE5NTU4ODg2NmFkaXF6a2N4,AA,02203518,accounts/GB-COH-02203518-2017-03-31.pdf,pdf
3,2017-07-31,A6KFOPYU,accounts,2017-12-08,accounts-with-accounts-type-small,2017-07-31,https://frontend-doc-api.companieshouse.gov.uk...,/company/02381821/filing-history/MzE5MjIxMTA2O...,21.0,True,MzE5MjIxMTA2OGFkaXF6a2N4,AA,02381821,accounts/GB-COH-02381821-2017-07-31.pdf,pdf
4,2017-06-30,A6LSSZPT,accounts,2018-01-03,accounts-with-accounts-type-full,2017-06-30,https://frontend-doc-api.companieshouse.gov.uk...,/company/02599428/filing-history/MzE5NDM2NjAzN...,43.0,True,MzE5NDM2NjAzNGFkaXF6a2N4,AA,02599428,accounts/GB-COH-02599428-2017-06-30.pdf,pdf
5,2017-03-31,L6M7R6CX,accounts,2018-01-05,accounts-with-accounts-type-total-exemption-full,2017-03-31,https://frontend-doc-api.companieshouse.gov.uk...,/company/02608803/filing-history/MzE5NDU2NTIyN...,4.0,True,MzE5NDU2NTIyNGFkaXF6a2N4,AA,02608803,accounts/GB-COH-02608803-2017-03-31.pdf,pdf
6,2017-10-31,X78DMLU3,accounts,2018-06-18,accounts-with-accounts-type-total-exemption-full,2017-10-31,https://frontend-doc-api.companieshouse.gov.uk...,/company/02978957/filing-history/MzIwNzYxNjAwM...,5.0,,MzIwNzYxNjAwMWFkaXF6a2N4,AA,02978957,accounts/GB-COH-02978957-2017-10-31.html,html
7,2017-03-31,X6K8IG57,accounts,2017-11-29,accounts-with-accounts-type-total-exemption-full,2017-03-31,https://frontend-doc-api.companieshouse.gov.uk...,/company/03054343/filing-history/MzE5MTUwMzgxN...,11.0,,MzE5MTUwMzgxNmFkaXF6a2N4,AA,03054343,accounts/GB-COH-03054343-2017-03-31.html,html
8,2017-03-31,X6LJZAMH,accounts,2017-12-18,accounts-with-accounts-type-total-exemption-full,2017-03-31,https://frontend-doc-api.companieshouse.gov.uk...,/company/03171108/filing-history/MzE5MzExOTM0N...,8.0,,MzE5MzExOTM0NWFkaXF6a2N4,AA,03171108,accounts/GB-COH-03171108-2017-03-31.html,html
9,2017-12-31,A7HP6LXD,accounts,2018-11-07,accounts-with-accounts-type-total-exemption-full,2017-12-31,https://frontend-doc-api.companieshouse.gov.uk...,/company/03296566/filing-history/MzIxODU5NDcwO...,21.0,True,MzIxODU5NDcwOGFkaXF6a2N4,AA,03296566,accounts/GB-COH-03296566-2017-12-31.pdf,pdf


In [189]:
f = filing_history[
        ['company_number', 'filetype', 'filename', 'description_values.made_up_date']
    ].rename(columns={
        'description_values.made_up_date': 'financial_year_end'
    })
f.loc[:, 'charity_number'] = None
f.loc[:, 'identifier'] = 'GB-COH-' + f['company_number']
f = f.set_index('identifier')

In [193]:
g = accounts[['charity_number', 'filename', 'financial_year_end']]
g.loc[:, 'company_number'] = None
g.loc[:, 'identifier'] = 'GB-CHC-' + g['charity_number']
g.loc[g['filename'].notnull(), 'filetype'] = 'pdf_ocr'
g = g.set_index('identifier')

In [196]:
fields = ['charity_number', 'company_number', 'filetype', 'filename', 'financial_year_end']
results = pd.concat([f[fields], g.loc[g['filename'].notnull(), fields]])

In [212]:
orgids = pd.Series([o[0] for o in orgs])
results = results.append(pd.DataFrame(index=orgids[~orgids.isin(results.index)].rename('identifier')), sort=False)

In [213]:
results.to_csv("account_download_results.csv")

In [216]:
results.loc[results.index.str.startswith('GB-COH-').fillna(False), "org_type"] = 'Company'
results.loc[results.index.str.startswith('GB-CHC-').fillna(False), "org_type"] = 'Charity'

In [241]:
results.loc[
    results.index.str.contains(r'(^GB-COH-(IP|RS)|R$)').fillna(False),
    "org_type"
] = 'Registered Society'

  after removing the cwd from sys.path.


In [227]:
results.loc[results.filename.isnull(), 'filetype'] = 'not found'

In [1]:
pd.crosstab(
    results.org_type,
    results.filetype,
    margins=True
).drop("All").sort_values("All", ascending=False)

NameError: name 'pd' is not defined

In [254]:
company_types = {}
for c in tqdm_notebook(results.index):
    try:
        if c.startswith("GB-COH-"):
            r = requests.get('http://data.companieshouse.gov.uk/doc/company/{}.json'.format(c.replace('GB-COH-', '')))
            company_types[c] = r.json().get('primaryTopic', {}).get('CompanyCategory')
    except:
        print("not found for {}".format(c))
        continue

HBox(children=(IntProgress(value=0, max=1023), HTML(value='')))

not found for GB-COH-04554636
not found for nan
not found for nan
not found for nan
not found for nan
not found for nan
not found for nan
not found for GB-COH-00001504
not found for GB-COH-00001589
not found for GB-COH-00007231
not found for GB-COH-00007721
not found for GB-COH-00007779
not found for GB-COH-00007812
not found for GB-COH-0029726R
not found for GB-COH-0030219R
not found for GB-COH-0031409R
not found for GB-COH-0031907R
not found for GB-COH-0032211R
not found for GB-COH-0032339R
not found for GB-COH-01155534
not found for GB-COH-02711204
not found for GB-COH-0319444R
not found for GB-COH-08783360)
not found for GB-COH-0LS-1571
not found for GB-COH-30785791
not found for GB-COH-IP031107R


In [259]:
results.loc[results.org_type=='Company', "org_type"] = None
company_types = pd.Series(company_types)
results.loc[:, "org_type"] = results.org_type.fillna(company_types)

In [265]:
results.loc[results.org_type.str.contains('limited by guarantee', case=False).fillna(False), "org_type"] = "CLG"

In [266]:
results.org_type.value_counts()

Charity                          452
Community Interest Company       222
Registered Society               146
CLG                              124
Private Limited Company           51
Converted/Closed                   7
Limited Partnership                1
Limited Liability Partnership      1
Name: org_type, dtype: int64

In [273]:
results.to_csv("account_check_results.csv")