In [1]:
import os, datetime
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText

In [2]:
class NameFormatFn(beam.DoFn):
    """Standardize the ``Name`` field of a record.

    The name is upper-cased, stripped of double quotes and backslashes, and
    split on whitespace. Only the first two tokens are kept:

    * one token            -> that token
    * first token whose only comma is its last character
                           -> ``"<first> <second>"``
    * anything else        -> ``"<first>, <second>"``

    A trailing comma on the final result is removed.
    """

    def process(self, element):
        """Return a one-element list holding a copy of ``element`` with a standardized ``Name``.

        Records whose ``Name`` is missing/``None``, or contains no tokens once
        quotes and backslashes are removed, are passed through unchanged
        instead of raising.
        """
        # Work on a shallow copy: a DoFn must not modify its input element.
        record = dict(element)

        raw_name = record.get('Name')
        if raw_name is None:
            # Nothing to standardize; previously this raised AttributeError.
            return [record]

        tokens = raw_name.upper().replace("\"", "").replace("\\", "").split()
        if not tokens:
            # Empty / whitespace-only name; previously this raised IndexError.
            return [record]

        if len(tokens) == 1:
            formatted = tokens[0]
        elif tokens[0].find(',') == len(tokens[0]) - 1:
            # First token already carries its separating comma ("SMITH,").
            formatted = tokens[0] + ' ' + tokens[1]
        else:
            formatted = tokens[0] + ', ' + tokens[1]

        # Drop a dangling comma such as the one left by a lone "SMITH,".
        if formatted.endswith(','):
            formatted = formatted[:-1]

        record['Name'] = formatted
        return [record]

In [3]:
# Google Cloud project and Cloud Storage bucket used by the job.
PROJECT_ID = 'sound-cider-252823'
BUCKET = 'gs://gerryandhao'

# Timestamped output folder inside the bucket (one per execution of this cell).
DIR_PATH = ''.join([
    BUCKET,
    '/output/',
    datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S'),
    '/',
])

# Keyword options forwarded to PipelineOptions below.
options = dict(
    runner='DataflowRunner',
    job_name='transform-candidates',
    project=PROJECT_ID,
    temp_location=BUCKET + '/temp',
    staging_location=BUCKET + '/staging',
    # machine types listed here: https://cloud.google.com/compute/docs/machine-types
    machine_type='n1-standard-4',
    num_workers=4,
)
opts = beam.pipeline.PipelineOptions(flags=[], **options)

In [None]:
# Build the pipeline without a `with` block. Pipeline.__exit__ already runs the
# pipeline and waits for it to finish, so combining `with` and an explicit
# p.run() submitted the job twice. The explicit run at the bottom of this cell
# is now the only launch.
p = beam.Pipeline('DataflowRunner', options=opts)

# Read every row of the source table (the query has no LIMIT clause).
query_results = p | 'Read from BigQuery' >> beam.io.Read(
    beam.io.BigQuerySource(query='SELECT * FROM fec_modeled.Candidates_Beam_DF'))

# Standardize the Name field of each record.
new_pcoll = query_results | 'Perform name standardization' >> beam.ParDo(NameFormatFn())

dataset_id = 'fec_modeled'
table_id = 'Candidates_Beam_DF_Jupyter'
schema_id = (
    'ID:STRING, Name:STRING, Party:STRING, Election_Year:INTEGER, '
    'Office_State:STRING, District:INTEGER, Challenge_Status:STRING, '
    'Street1:STRING, Street2:STRING, City:STRING, State:STRING, '
    'ZIP:INTEGER, Label:STRING, Year:INTEGER'
)

# Write the output results as a table in BigQuery, replacing any existing rows.
new_pcoll | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
    dataset=dataset_id,
    table=table_id,
    schema=schema_id,
    project=PROJECT_ID,
    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
    batch_size=100)

# Launch the job once and block until it completes.
result = p.run()
result.wait_until_finish()