### Pipeline que busca arquivos zipados na GCS, concatena e salva novamente na GCS

#### Importa bibliotecas

In [1]:
import zipfile
from zipfile import ZipFile
import google.auth
import apache_beam as beam
from apache_beam.io import fileio
from apache_beam.options import pipeline_options
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.dataframe import convert
import argparse
import os
import json
import pandas as pd

#### Autorização de acesso ao GCS e criação do bucket para arquivos temporários

In [2]:
# google.auth.default() returns (credentials, project_id); only the project id is kept.
project = google.auth.default()[1]
# Pipeline options bound to that project, with streaming mode switched on.
options = PipelineOptions(streaming=True, project=project)

In [3]:
# Name of the bucket used for the pipeline's temporary files.
# Uncomment the next line to create the bucket:
#!gsutil mb  gs://temporariospj
bucket = "temporariospj"

#### Define a função run que executa todos os passos da pipeline
##### Usa `options` para rodar localmente e `argv` para o `DataflowRunner`


In [4]:
def run():
    """Build the zip-to-JSON pipeline and submit it with the Dataflow runner.

    The pipeline matches the ``acidentes20*.zip`` archives under
    ``gs://projetofinalscacademy/dadosbrutos/pessoas/``, parses the first
    member of each archive as UTF-8 JSON, turns it into one dict per
    DataFrame row, serialises each dict with ``json.dumps`` and writes the
    lines to ``.../pessoas/acidente_pessoas*.json``.

    Reads the module-level ``project`` and ``bucket`` to build the runner
    flags. Returns nothing; ``p.run()`` is called without waiting for the
    job to finish.
    """

    def unzip(readable_file):
        """Yield the parsed JSON content of the first member of a zip file."""
        # Imported locally, like pandas in dataframe(), so the names resolve
        # on remote workers where the notebook's globals may not be available
        # (the main session is not saved by the flags below).
        import json
        import zipfile

        # ZipFile does not close a file object it was handed, so the stream
        # and the archive are both closed explicitly before yielding.
        with readable_file.open('r') as raw_stream:
            with zipfile.ZipFile(raw_stream) as archive:
                first_member = archive.namelist()[0]
                payload = archive.read(first_member)
        yield json.loads(payload.decode("utf-8"))

    def dataframe(readable_file):
        """Yield a single DataFrame built from the decoded JSON object."""
        import pandas as pd
        yield pd.DataFrame.from_dict(readable_file)

    # Flags used to run the pipeline on Dataflow.
    argv = [
        '--project={0}'.format(project),
        '--region=us-central1',
        '--staging_location=gs://{0}/staging/'.format(bucket),
        '--temp_location=gs://{0}/staging/'.format(bucket),
        '--runner=DataflowRunner',
        # NOTE(review): this flag uses ':' where every other flag uses '=',
        # so it is presumably not parsed as --template_location. Left as-is:
        # enabling it would likely make the run stage a template instead of
        # executing the job -- confirm the intent before changing it.
        '--template_location:gs://{0}/model/'.format(bucket),
    ]

    p = beam.Pipeline(argv=argv)
    (
        p
        | 'Procura arquivo' >> beam.io.fileio.MatchFiles('gs://projetofinalscacademy/dadosbrutos/pessoas/acidentes20*.zip')
        | 'Encontra os targets' >> beam.io.fileio.ReadMatches()
        | 'Unzipa' >> beam.FlatMap(unzip)
        | 'Dataframe' >> beam.FlatMap(dataframe)
        # One output element per DataFrame row.
        | 'Dicionario' >> beam.FlatMap(lambda df: df.to_dict('records'))
        | 'Json' >> beam.Map(json.dumps)
        | 'Escreve arquivo' >> beam.io.WriteToText('gs://projetofinalscacademy/dadosbrutos/pessoas/acidente_pessoas', file_name_suffix='.json')
    )
    p.run()

In [5]:
# Build the pipeline and submit it using the flags assembled inside run().
run()



In [6]:
from apache_beam.runners.interactive import interactive_runner
import apache_beam.runners.interactive.interactive_beam as ib

# Start over with default options for the interactive pipeline built below
# (replaces the `options` object created earlier in the notebook).
options = PipelineOptions()


def unzip(readable_file):
    """Yield the parsed JSON content of the first member of a zip archive.

    Args:
        readable_file: object whose ``open('r')`` returns a seekable binary
            stream positioned on a zip archive.

    Yields:
        The object produced by ``json.loads`` on the first archive member,
        decoded as ISO-8859-1.

    Raises:
        IndexError: if the archive has no members.
    """
    # ZipFile does not close a file object it was handed, so the stream and
    # the archive are both closed explicitly; the payload is fully read
    # before either is released.
    with readable_file.open('r') as raw_stream:
        with zipfile.ZipFile(raw_stream) as archive:
            first_member = archive.namelist()[0]
            payload = archive.read(first_member)
    yield json.loads(payload.decode("iso-8859-1"))
def dataframe(readable_file):
    """Yield one pandas DataFrame built from ``readable_file``.

    Despite the parameter name, the value is handed straight to
    ``pd.DataFrame.from_dict``, so it must be something that call accepts
    (for example a dict of columns).
    """
    yield pd.DataFrame.from_dict(readable_file)
    
# Same chain of steps as run(), but bound to the interactive runner and
# pointed at a single relative-path archive. This cell only builds the
# pipeline; no p.run() call appears here. The chain is kept as one trailing
# expression so the notebook still shows the resulting PCollection.
p = beam.Pipeline(interactive_runner.InteractiveRunner(), options=options)
(
    p
    | 'Procura arquivo' >> fileio.MatchFiles('DataSet/Pessoas/acidentes2021.zip')
    | 'Encontra os targets' >> fileio.ReadMatches()
    | 'Unzipa' >> beam.FlatMap(unzip)
    | 'Dataframe' >> beam.FlatMap(dataframe)
    # One output element per DataFrame row.
    | 'Dicionario' >> beam.FlatMap(lambda df: df.to_dict('records'))
    | 'Json' >> beam.Map(json.dumps)
    | 'Escreve arquivo' >> beam.io.WriteToText('pessoas', file_name_suffix='.json')
)



<PCollection[[6]: Escreve arquivo/Write/WriteImpl/FinalizeWrite.None] at 0x7f1ff3fc4710>

In [7]:
# Display the graph of the pipeline `p` built in the previous cell.
ib.show_graph(p)