## Concatenate Parquet Files

This script processes and concatenates Parquet files from a specified directory into consolidated tables and saves them in a new directory.

1. The __get_table_name__ function extracts the base table name from a Parquet file name using a regular expression: it returns everything before the trailing `_<digits>.parquet` suffix and raises a `ValueError` if the name does not match that pattern.
2. The __concat_parquet_files__ function walks a directory tree, groups the Parquet files it finds by base table name, and concatenates each group into a single DataFrame per table.
3. Each concatenated DataFrame is then saved as `<table_name>_concatenated.parquet` in the output directory, so that the questions can be answered from one file per table.


In [1]:
import os
import pandas as pd
import re

def get_table_name(file_name):
    """Return the base table name encoded in a Parquet file name.

    The name must have the form ``<table>_<digits>.parquet``. Because the
    leading group is greedy, only the final ``_<digits>`` part is treated as
    the suffix, so ``a_1_2.parquet`` yields ``a_1``.

    Args:
        file_name: Bare file name (no directory component).

    Returns:
        The text preceding the trailing ``_<digits>.parquet`` suffix.

    Raises:
        ValueError: If ``file_name`` does not follow the expected pattern.
    """
    found = re.match(r"(.+)_\d+\.parquet$", file_name)
    if found is None:
        raise ValueError(f"Filename {file_name} does not match expected pattern")
    return found.group(1)

def concat_parquet_files(base_dir):
    """Concatenate the Parquet files under ``base_dir`` into one DataFrame per table.

    Every file whose name ends in ``.parquet`` anywhere below ``base_dir`` is
    assigned to a table via ``get_table_name``. The files of each table are
    then read and stacked row-wise into a single DataFrame.

    Args:
        base_dir: Root directory that is searched recursively.

    Returns:
        A dict mapping each table name to its concatenated DataFrame, with a
        fresh 0..n-1 index. The dict is empty when no ``.parquet`` files are
        found.

    Raises:
        ValueError: Propagated from ``get_table_name`` when a ``.parquet``
            file name does not follow the ``<table>_<digits>.parquet`` pattern.
    """
    # Group file paths by the table they belong to.
    table_data = {}
    for root, _dirs, files in os.walk(base_dir):
        for file in files:
            if not file.endswith(".parquet"):
                continue
            table_name = get_table_name(file)
            table_data.setdefault(table_name, []).append(os.path.join(root, file))

    # os.walk yields directory entries in arbitrary, filesystem-dependent
    # order; sorting makes the row order of each result reproducible.
    concatenated_tables = {}
    for table_name in sorted(table_data):
        dfs = [pd.read_parquet(fp) for fp in sorted(table_data[table_name])]
        concatenated_tables[table_name] = pd.concat(dfs, ignore_index=True)

    return concatenated_tables

# Concatenate parquet files found under the extraction directory.
extraction_directory = os.path.join("..", "data", "interim", "extracted_data")
concatenated_directory = os.path.join("..", "data", "processed", "concatenated_data")
tables = concat_parquet_files(extraction_directory)

# to_parquet does not create missing parent directories, so make sure the
# output directory exists before writing.
os.makedirs(concatenated_directory, exist_ok=True)

# Save each concatenated table as <table_name>_concatenated.parquet.
for table_name, df in tables.items():
    output_path = os.path.join(
        concatenated_directory, f"{table_name}_concatenated.parquet"
    )
    df.to_parquet(output_path)

print("Concatenation complete.")

  concatenated_tables[table_name] = pd.concat(dfs, ignore_index=True)
  concatenated_tables[table_name] = pd.concat(dfs, ignore_index=True)


Concatenation complete.
