In [1]:
import os, datetime
import apache_beam as beam
from apache_beam import pvalue
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText

In [2]:
class NameFormatFn(beam.DoFn):
    """Standardize the ``Name`` field of a record into ``FIRST_TOKEN, SECOND_TOKEN`` form.

    The name is upper-cased, double quotes and backslashes are stripped, and
    the result is split on whitespace. Only the first two tokens are kept;
    they are joined with ``', '`` (or a single space when the first token
    already ends with a comma). A trailing comma on the result is removed.
    """

    def process(self, element):
        """Return a one-element list holding a name-standardized copy of ``element``.

        Args:
            element: a dict-like record; its ``'Name'`` value, when present,
                is expected to be a string.

        Returns:
            A list with a single new dict. The input element is never
            modified. Records whose ``'Name'`` is missing or ``None`` are
            passed through unchanged; names that contain no tokens after
            cleaning are normalized to an empty string.
        """
        # Work on a shallow copy: the same input PCollection may feed other
        # transforms, and Beam requires DoFns to leave input elements unmodified.
        record = dict(element)

        raw_name = record.get('Name')
        if raw_name is None:
            # Nothing to standardize (missing key or NULL value).
            return [record]

        tokens = raw_name.upper().replace("\"", "").replace("\\", "").split()
        if not tokens:
            # Blank name, or one made only of quotes/backslashes/whitespace.
            record['Name'] = ''
            return [record]

        if len(tokens) == 1:
            formatted = tokens[0]
        elif tokens[0].endswith(','):
            # First token already carries the separating comma.
            formatted = tokens[0] + ' ' + tokens[1]
        else:
            formatted = tokens[0] + ', ' + tokens[1]

        # Drop a dangling comma, e.g. "SMITH," -> "SMITH".
        if formatted.endswith(','):
            formatted = formatted[:-1]

        record['Name'] = formatted
        return [record]

In [3]:
# Google Cloud project used by the pipeline.
PROJECT_ID = 'sound-cider-252823'

# The BigQuery data source needs a project ID even when the pipeline
# executes locally, so it is passed through the pipeline options.
options = dict(project=PROJECT_ID)
opts = PipelineOptions(flags=[], **options)

In [4]:
with beam.Pipeline('DirectRunner', options=opts) as p:

    # Destination table and its schema, declared up front so the chain of
    # transforms below reads straight through.
    qualified_table_name = '{}:fec_modeled.Candidates_Beam_Jupyter'.format(PROJECT_ID)
    table_schema = ', '.join([
        'ID:STRING',
        'Name:STRING',
        'Party:STRING',
        'Election_Year:INTEGER',
        'Office_State:STRING',
        'District:INTEGER',
        'Challenge_Status:STRING',
        'Street1:STRING',
        'Street2:STRING',
        'City:STRING',
        'State:STRING',
        'ZIP:INTEGER',
        'Label:STRING',
        'Year:INTEGER',
    ])

    # Only the first 100 rows are read, to keep the run time short.
    # NOTE(review): BigQuerySource / BigQuerySink appear to be deprecated in
    # newer Beam releases (ReadFromBigQuery / WriteToBigQuery) - confirm
    # before upgrading the SDK.
    query_results = p | 'Read from BigQuery for txt' >> beam.io.Read(
        beam.io.BigQuerySource(
            query='SELECT * FROM fec_modeled.Candidates_Beam_DF limit 100'))

    # Dump the rows as read, before any transformation, to a text file.
    query_results | 'Write to input.txt' >> WriteToText('input.txt')

    # Standardize the Name field of every record.
    new_pcoll = query_results | 'Perform name standardization' >> beam.ParDo(NameFormatFn())

    # Dump the transformed rows to a second text file for comparison.
    new_pcoll | 'Write to output.txt' >> WriteToText('output.txt')

    # Write the transformed rows to BigQuery, creating the table when needed
    # and replacing any existing contents.
    new_pcoll | 'Write to BigQuery' >> beam.io.Write(
        beam.io.BigQuerySink(
            qualified_table_name,
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))


  pipeline.replace_all(_get_transform_overrides(pipeline.options))
