# GroupBy

Single Key, Multiple Keys

#### Create a subset and group by, aggregating the row indexes
We want to maintain the original indices of the data coming in if we can.  

In [2]:
import apache_beam as beam
from apache_beam.dataframe.io import read_csv
from apache_beam.dataframe.convert import to_dataframe
from apache_beam.dataframe.convert import to_pcollection

import os

# Test fixtures: the sample input CSV and the output file, both resolved
# against the current working directory
test_file_path = os.path.join(
    os.getcwd(),
    'assets',
    'data',
    'usbe_students.csv',
)
output_file_path = os.path.join(
    os.getcwd(),
    'output_test_file.csv',
)


# Columns to group on; any number of valid columns in the dataset may be listed
agg_set = [
    'FIRST_NAME',
    'LAST_NAME',
]

## Via Pure CombineFn, Pipeline Operations

In [3]:
from utils.load import (
    make_schema_from_csv,
    make_csv_coder,
)
from assets.mapping import is_mapped

class CompositeKey(beam.DoFn):
    """Attach a composite grouping key to every row.

    For each incoming row, the values of the columns named in ``agg_keys`` are
    joined with ``'-'`` and stored under the ``'agg_key'`` entry of the emitted
    row, so a later step can group on that single value.

    Args:
        agg_keys: Names of the row columns that make up the key, in the order
            they should appear in the joined string. Defaults to no columns,
            which gives every row the empty-string key.
    """

    def __init__(self, agg_keys: "list | None" = None):
        super().__init__()
        # Copy the list (and avoid a mutable default argument) so this DoFn
        # never shares state with its caller or with other instances.
        self.agg_keys = list(agg_keys) if agg_keys is not None else []

    def process(self, row: dict):
        """Yield a copy of ``row`` with the composite ``'agg_key'`` added.

        Raises:
            KeyError: If ``row`` lacks one of the configured key columns.
        """
        # Emit a shallow copy: Beam requires that a DoFn does not modify the
        # element it was handed.
        keyed_row = dict(row)
        # The key is a plain string (not a one-element set), so it is hashable
        # and can be encoded as a grouping key. str() keeps non-string column
        # values from breaking the join.
        keyed_row['agg_key'] = '-'.join(str(row[k]) for k in self.agg_keys)
        yield keyed_row
    

# Translate each grouping column into its mapped name; the first element of
# whatever is_mapped returns is used (presumably the mapped column name --
# confirm against assets.mapping)
mapped_set = [
    is_mapped(raw_name)[0]
    for raw_name in agg_set
]
    

with beam.Pipeline() as p:
    # Derive the schema from the test file, then build the per-line parser from it
    schema = make_schema_from_csv(test_file_path)
    csv_coder = make_csv_coder(schema)

    # Read the file line by line, skipping the header row
    raw_lines = p | 'ReadFromText' >> beam.io.ReadFromText(
        test_file_path,
        skip_header_lines=1,
    )

    # Parse every line with the coder built above
    parsed_rows = raw_lines | 'ParseCSV' >> beam.Map(csv_coder)

    # Add the 'agg_key' entry computed from the mapped grouping columns
    keyed_rows = parsed_rows | 'Create Composite Key' >> beam.ParDo(
        CompositeKey(agg_keys=mapped_set)
    )

    # Group the rows by their composite key.
    # If there was a row-index column, then the columns could be aggregated
    # via CombinePerKey after this step.
    grouped = keyed_rows | 'GroupBy Keys' >> beam.GroupBy(
        lambda keyed_row: keyed_row['agg_key']
    )
    # Debugging aid: append `| 'Print' >> beam.Map(print)` to inspect the groups

## Via DataFrames API
Beam DataFrames offer more table-like behavior. Some pandas functionality hasn't been implemented yet (for example, `apply` does not work).

In [4]:
# Build the pipeline
with beam.Pipeline() as p:
    # Let the runner load the CSV into a deferred DataFrame (there are
    # PCollection-to-frame conversions as well)
    df = p | read_csv(test_file_path)

    # Count the rows in each group. size() is used rather than count(): once
    # only the grouping columns are selected there is no other column left for
    # count() to tally, so it would produce the group keys with no counts.
    # Collecting the row indexes themselves would need
    # .apply(lambda x: x.index.to_list()), which does not work yet.
    aggregated = (
        df[agg_set]
        .groupby(agg_set)
        .size()
    )

    # Output the result directly (can go to csv or BigQuery).
    # Needs more kwargs to be useful, but output is possible already.
    aggregated.to_csv(output_file_path)

    # Converting back to a PCollection for further processing is possible with
    # to_pcollection(aggregated) -- in general, don't do this. Very slow.

  return pcoll | fileio.WriteToFiles(
  p.options.view_as(GoogleCloudOptions).temp_location or
