The data comes from [OpenAddresses](https://batch.openaddresses.io/data) and must be downloaded after logging in. The code below assumes all of it is in `./data`. I had to do the following to prepare it:

1. UK - the data is already in the folder as a test set. It is in TSV (tab-separated) format.
1. Manitoba - came as a gzipped geojson. `gunzip ca_mb_province-addresses-state.geojson.gz`
1. US midwest - came as a zip file which unzips to a large folder. `unzip collection-us-midwest.zip`

In [1]:
import json
import os
import random

In [2]:
def shuffle_lines(file):
    """Shuffle the lines of a text file in place.

    The whole file is read into memory, its lines are reordered with
    ``random.shuffle``, and the file is then overwritten with the result.

    Args:
        file: Path of the text file whose lines should be shuffled.
    """
    with open(file, 'r') as fh:
        lines = fh.readlines()

    # A final line without a newline would be glued onto whichever line
    # follows it after shuffling, so terminate it before reordering.
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'

    random.shuffle(lines)

    # The context manager guarantees the rewritten file is flushed and closed
    # before returning, so callers can safely read it straight away.
    with open(file, 'w') as fh:
        fh.writelines(lines)

In [6]:
# UK
# Converts the tagged UK sample (TSV) into three outputs: one JSON object per
# line for training, a plain-text line per address, and a small metadata file
# recording the output file name and the number of lines written.

# Maps the tag names found in the TSV to the field names used in the output.
key_map = {
    'house_number': 'number',
    'road': 'street',
    'postcode': 'postcode',
    'city': 'city',
    'country': 'country'
}

# Create the output folders up front so opening the output files cannot fail
# just because a folder is missing (process_us_county does the same).
for out_dir in ("./data/training", "./data/rawtext"):
    os.makedirs(out_dir, exist_ok=True)

num_lines = 0
# Context managers close every handle (including the input file) even when a
# malformed line aborts the loop.
with open("./data/uk_openaddresses_formatted_addresses_tagged.random.tsv", "r") as fh, \
        open("./data/training/uk_geojson_reduced.geojson", "w") as fh_out_jk, \
        open("./data/rawtext/uk_geojson_reduced.txt", "w") as fh_out_raw, \
        open("./data/training/uk_geojson_reduced.meta.json", "w") as fh_out_meta:
    for line in fh:
        # The tagged address is the third tab-separated column.
        line = line.strip().split('\t')[2]
        line = line.replace('|/FSEP', '')
        d = {}

        # Each space-separated token has the form "<text>/<tag>".
        for part in line.split(' '):
            if not part:
                continue

            if part.startswith('//'):
                # The text itself is a literal slash; the tag follows it.
                data, code = '/', part[2:]
            else:
                try:
                    data, code = part.split('/')
                except ValueError:
                    # Show the offending line before propagating the error.
                    print(line)
                    raise

            # Separator tokens carry no address content.
            if code == 'SEP':
                continue

            # An unknown tag raises KeyError here rather than being dropped.
            d.setdefault(key_map[code], []).append(data)

        num_lines += 1
        # Tokens that share a field are re-joined with single spaces.
        final_d = {k: ' '.join(v) for k, v in d.items()}
        final_d_s = json.dumps(final_d)
        fh_out_jk.write(f"{final_d_s}\n")
        fh_out_raw.write(f"{' '.join(final_d.values())}\n")

    metadata = {'file': 'uk_geojson_reduced.geojson', 'numlines': num_lines}
    fh_out_meta.write(json.dumps(metadata))

# Only the training file is shuffled; the raw-text file keeps the input order.
shuffle_lines('./data/training/uk_geojson_reduced.geojson')

In [4]:
# US
def process_us_county(file_path, out_file_path, out_raw_path, out_meta_path, county, state):
    """Convert one line-delimited county GeoJSON file into training outputs.

    Each non-blank input line is parsed as JSON and its ``properties`` object
    is reduced to the address fields of interest. Missing or empty
    ``country``, ``region`` and ``district`` values are filled with ``'USA'``,
    ``state`` and ``county`` respectively. The reduced record is written as a
    JSON line to ``out_file_path`` and as space-joined text to
    ``out_raw_path``. Afterwards the JSON-lines file is shuffled in place and
    a metadata JSON holding the line count and output file name is written to
    ``out_meta_path``.

    Args:
        file_path: Path of the input file, one JSON feature per line.
        out_file_path: Path of the JSON-lines training file to write.
        out_raw_path: Path of the plain-text file to write.
        out_meta_path: Path of the metadata JSON file to write.
        county: Fallback value for a missing ``district`` field.
        state: Fallback value for a missing ``region`` field.
    """
    for t_path in (out_file_path, out_raw_path, out_meta_path):
        out_dir = os.path.dirname(t_path)
        # dirname is '' for a bare file name, and os.makedirs('') raises, so
        # only create a directory when the path actually names one.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    keys = {'number', 'street', 'unit', 'city', 'district', 'region', 'postcode', 'country'}
    num_lines = 0
    # Context managers close all handles even if a line fails to parse.
    with open(file_path, 'r') as fh, \
            open(out_file_path, 'w') as fh_output, \
            open(out_raw_path, 'w') as fh_output_raw:
        for line in fh:
            # Skip blank lines (e.g. a trailing one); json.loads rejects them.
            if not line.strip():
                continue

            record = json.loads(line)['properties']
            record = {k: v for k, v in record.items() if k in keys}

            # Fill in location fields the source left empty or absent.
            if not record.get('country'):
                record['country'] = 'USA'
            if not record.get('region'):
                record['region'] = state
            if not record.get('district'):
                record['district'] = county

            num_lines += 1
            fh_output.write(f"{json.dumps(record)}\n")
            fh_output_raw.write(f"{' '.join(record.values())}\n")

    # Only the training file is shuffled; the raw-text file keeps input order.
    shuffle_lines(out_file_path)

    metadata = {'numlines': num_lines, 'file': os.path.basename(out_file_path)}
    with open(out_meta_path, 'w') as fh_output_meta:
        fh_output_meta.write(json.dumps(metadata))


In [5]:
# One entry per county to convert, in process_us_county argument order:
# (input file, training output, raw-text output, metadata output, county, state).
us_county_jobs = (
    (
        "data/us/il/cook-addresses-county.geojson",
        'data/training/us/il/cook_county_us.geojson',
        'data/rawtext/us/il/cook_county_us.txt',
        'data/training/us/il/cook_county_us.meta.json',
        'Cook County',
        'Illinois',
    ),
    (
        "data/us/mn/ramsey-addresses-county.geojson",
        'data/training/us/mn/ramsey-addresses-county.geojson',
        'data/rawtext/us/mn/ramsey-addresses-county.txt',
        'data/training/us/mn/ramsey-addresses-county.meta.json',
        'Ramsey County',
        'Minnesota',
    ),
)

# Process the counties in the order listed above.
for us_county_job in us_county_jobs:
    process_us_county(*us_county_job)