In [1]:
import yaml

# Load the column layout. Later cells iterate `col` as (name, width) pairs.
with open('schema.yaml') as schema_file:
    col = yaml.safe_load(schema_file)

In [2]:
import pandas as pd

import os

# Convert the fixed-width raw file into pipe-delimited chunk files, one file
# per 100,000-row chunk, with the FILLER columns removed.

out_dir = 'data/processed'
# to_csv does not create missing directories, so make sure the target exists
# before the (slow) read starts.
os.makedirs(out_dir, exist_ok=True)

# Derive the layout from `col` once instead of rebuilding it for every chunk.
col_names = [c for c, _ in col]
col_widths = [w for _, w in col]
filler_cols = [c for c in col_names if c.startswith('FILLER')]

# dtype=object keeps every field as text, so pandas does no numeric inference.
# NOTE(review): pandas' default NA parsing still applies, so a literal code
# such as 'NA' would be written out as an empty field -- confirm the layout
# has no such codes.
df_iter = pd.read_fwf('data/raw/Nat2019PublicUS.c20200506.r20200915.txt',
                      widths=col_widths,
                      header=None,
                      names=col_names,
                      dtype={c: object for c in col_names},
                      chunksize=100000)

for i, df in enumerate(df_iter):
    # Reassign rather than mutate in place; the written output is the same.
    df = df.drop(columns=filler_cols)
    # Zero-padded chunk index keeps the files in lexical order.
    df.to_csv(f'{out_dir}/births2019_{i:05d}.txt', sep='|', index=False)

In [6]:
# Upload every processed chunk file to the nvss/ prefix of the bucket.
# The top-level -m flag lets gsutil run the copies in parallel.
!gsutil -m cp -r data/processed/*.txt gs://mother-goose-data/nvss/

Copying file://data/processed/births2019_00000.txt [Content-Type=text/plain]...
Copying file://data/processed/births2019_00001.txt [Content-Type=text/plain]... 
Copying file://data/processed/births2019_00002.txt [Content-Type=text/plain]...
Copying file://data/processed/births2019_00003.txt [Content-Type=text/plain]... 
Copying file://data/processed/births2019_00005.txt [Content-Type=text/plain]... 
Copying file://data/processed/births2019_00006.txt [Content-Type=text/plain]... 
Copying file://data/processed/births2019_00004.txt [Content-Type=text/plain]... 
Copying file://data/processed/births2019_00007.txt [Content-Type=text/plain]... 
Copying file://data/processed/births2019_00008.txt [Content-Type=text/plain]... 
Copying file://data/processed/births2019_00009.txt [Content-Type=text/plain]... 
Resuming upload for file://data/processed/births2019_00009.txt                  
Copying file://data/processed/births2019_00010.txt [Content-Type=text/plain]...
Copying file://data/processed/b

In [7]:
from google.cloud import bigquery

# Rebuild the BigQuery table from scratch and load every uploaded chunk file.
TABLE_ID = 'mother-goose-health.nvss.births'
SOURCE_URI = 'gs://mother-goose-data/nvss/*.txt'

client = bigquery.Client()

# One field per non-filler column. Everything is STRING except DOB_YY, which
# is INTEGER and is the field used for range partitioning below.
schema = []
for field_name, _width in col:
    if field_name.startswith('FILLER'):
        continue
    field_type = "INTEGER" if field_name == 'DOB_YY' else "STRING"
    schema.append(bigquery.SchemaField(field_name, field_type))

# Integer-range partitioning on DOB_YY: interval of 1 between 1900 and 2100.
year_range = bigquery.PartitionRange(start=1900, end=2100, interval=1)
table = bigquery.Table(TABLE_ID, schema=schema)
table.range_partitioning = bigquery.RangePartitioning(field="DOB_YY", range_=year_range)

# Drop any existing table first so the load starts from an empty table.
client.delete_table(TABLE_ID, not_found_ok=True)
table = client.create_table(table)

# The chunk files are pipe-delimited and each starts with a header row
# (written by to_csv), hence skip_leading_rows=1.
job_config = bigquery.LoadJobConfig()
job_config.schema = schema
job_config.skip_leading_rows = 1
job_config.source_format = bigquery.SourceFormat.CSV
job_config.field_delimiter = '|'

load_job = client.load_table_from_uri(SOURCE_URI, table, job_config=job_config)
# Block until the load job finishes; result() raises if the job failed.
load_job.result()

<google.cloud.bigquery.job.load.LoadJob at 0x7efcbe2ccbe0>