In [1]:
import yaml

# Read the fixed-width layout definition. Later cells iterate `col` as
# (column name, width) pairs, in file order.
with open('schema.yaml') as schema_file:
    col = yaml.safe_load(schema_file)

In [2]:
# Build the comma-separated list of awk `substr($0, start, width)` calls that
# slice each kept column out of a fixed-width record.
position = 1  # running 1-based offset of the current column within the record
substr_calls = []
for name, length in col:
    is_filler = name.startswith('FILLER')
    if not is_filler:
        substr_calls += [f'substr($0,{position},{length})']
    # FILLER columns are not emitted, but they still occupy space in the record.
    position = position + length

awk_input = ', '.join(substr_calls)

In [3]:
import glob, re

inputs = glob.glob('data/raw/*.txt')
years = [re.compile(r'Nat([0-9]{4})PublicUS').search(i).group(1) for i in inputs]
col_final = '|'.join([c for c, _ in col if not c.startswith('FILLER')])

for y, i in zip(years, inputs):
    !awk -v OFS='|' '{{ print $awk_input }}' $i > data/processed/births{y}.txt
    !sed -i "1i $col_final" data/processed/births{y}.txt

In [4]:
# Copy every processed .txt file to the gs://mother-goose-data/nvss/ prefix,
# which is the location the BigQuery load job in the next cell reads from.
!gsutil -m cp -r data/processed/*.txt gs://mother-goose-data/nvss/

Copying file://data/processed/births2016.txt [Content-Type=text/plain]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

Copying file://data/processed/births2017.txt [Content-Type=text/plain]...
Copying file://data/processed/births2018.txt [Content-Type=text/plain]...       
Copying file://data/processed/births2019.txt [Content-Type=text/plain]...       
\ [4

In [5]:
from google.cloud import bigquery

client = bigquery.Client()

# Destination table, referenced for both the delete and the create below.
table_id = 'mother-goose-health.nvss.births'

# One column per non-FILLER field. Everything is loaded as STRING except
# DOB_YY, which is typed INTEGER and used as the range-partitioning field.
field_names = [name for name, _ in col if not name.startswith('FILLER')]
schema = [
    bigquery.SchemaField(name, "INTEGER" if name == 'DOB_YY' else "STRING")
    for name in field_names
]

table = bigquery.Table(table_id, schema=schema)
table.range_partitioning = bigquery.RangePartitioning(
    field="DOB_YY",
    range_=bigquery.PartitionRange(start=1900, end=2100, interval=1),
)

# Drop any existing table first so the create below starts from an empty table.
client.delete_table(table_id, not_found_ok=True)
table = client.create_table(table)

# The processed files are pipe-delimited and each begins with a header row.
job_config = bigquery.LoadJobConfig(
    schema=schema,
    skip_leading_rows=1,
    source_format=bigquery.SourceFormat.CSV,
    field_delimiter='|',
)

load_job = client.load_table_from_uri(
    'gs://mother-goose-data/nvss/*.txt',
    table,
    job_config=job_config,
)
# Block until the load job finishes.
load_job.result()

<google.cloud.bigquery.job.load.LoadJob at 0x7fc44c24c100>