# Montana banks PDF parser

A basic example of using [`pdfplumber`](https://github.com/jsvine/pdfplumber) to extract data from a PDF. We'll parse [a short PDF of state-chartered banks in Montana](http://banking.mt.gov/Portals/58/Bank_List.pdf) and flatten the data so that one record = one branch, then write out to JSON.

### Import libraries

In [108]:
import json
from datetime import datetime

import requests
import pdfplumber

### Define our key variables

In [109]:
# remote location of the bank list PDF
URL = "http://banking.mt.gov/Portals/58/Bank_List.pdf"

# local filename the downloaded PDF is saved under
PDF = "mtbanks.pdf"

# local filename the flattened records are written to as JSON
JSON = "mtbanks.json"

### Download the PDF

In [110]:
# Stream the response so the PDF is written to disk chunk by chunk instead of
# being buffered whole in memory first. The timeout stops a stalled server
# from hanging the run forever, and the `with` block releases the connection.
with requests.get(URL, stream=True, timeout=30) as r:
    # fail loudly on a 4xx/5xx response before any file is created
    r.raise_for_status()

    with open(PDF, 'wb') as o:
        for block in r.iter_content(1024):
            o.write(block)

### Define a function to extract table data from one page

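Each branch record will have the name of the bank, the city, the state and a boolean showing whether it's the main branch. The "other branches" cell is split first by state (groups outside Montana are prefixed with `State:`; groups without a prefix are treated as Montana) and then by city, so every city becomes its own record.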

In [114]:
def extract_table_data(page):
    """Flatten one page of the bank list PDF into one record per branch.

    Args:
        page: an object whose ``extract_table()`` method returns the page's
            table as a list of three-cell rows
            (bank name, main branch city, other branches), or ``None``.

    Returns:
        A list of dictionaries with the keys ``bank_name``, ``city``,
        ``state`` and ``main_branch``. The list is empty when the page has
        no table.

    Raises:
        ValueError: if a row does not have exactly three cells.
    """

    keys = ['bank_name', 'city', 'state', 'main_branch']
    data = []

    # extract_table() may hand back None when no table is found on the page
    table = page.extract_table() or []

    # the first row is skipped (presumably the column header -- confirm
    # against the PDF)
    for row in table[1:]:
        bankname, main_branch, other_branches = row

        # an empty cell comes back as None; the two checks below already
        # allow for that in the other columns, so do the same for the name
        bankname_clean = (bankname or '').replace(' (continued)', '')

        if main_branch:
            data.append(dict(zip(keys, [bankname_clean, main_branch, 'Montana', True])))

        if not other_branches:
            continue

        # groups of branches for different states are separated by a line
        # holding a single space
        for group in other_branches.split('\n \n'):
            # split on the first colon only, so a stray second colon cannot
            # turn the state label into a bogus Montana city
            state, colon, cities = group.partition(':')
            if not colon:
                # no "State:" prefix means the branches are in Montana
                state, cities = 'Montana', group
            state = state.strip().title()

            for chunk in cities.split(','):
                # drop surrounding whitespace and the line breaks left over
                # from text wrapping inside the cell
                city = chunk.strip().replace('\n', '')
                if city:
                    data.append(dict(zip(keys, [bankname_clean, city, state, False])))

    return data

### Roll through the PDF

In [116]:
# Parse the whole PDF into memory first. The JSON file is only opened once
# parsing has succeeded, so a failed run (e.g. an unparseable date) does not
# truncate or clobber an existing output file.
with pdfplumber.open(PDF) as p:

    # get the updated date from the top matter: crop the strip between
    # y=0 and y=100 across the full width of the first page and join its
    # characters back into text
    firstpage = p.pages[0]
    top_info = (0, 0, firstpage.width, 100)
    chars = firstpage.crop(top_info).chars
    updated = ''.join([x['text'] for x in chars]).split('As of ')[-1].strip()

    # raises ValueError if the text after "As of " is not like "January 1, 2017"
    u_date = datetime.strptime(updated, '%B %d, %Y').date()

    # the dict we're gonna write out to
    banks = {
        'updated': str(u_date),
        'data': []
    }

    # loop over the pages
    for page in p.pages:
        # call the extract function
        data = extract_table_data(page)

        # add to the `data` list inside the `banks` dict
        banks['data'].extend(data)

# write to file
with open(JSON, 'w') as js:
    json.dump(banks, js)