In [1]:
import os
import pandas as pd

In [9]:
def concatenate_tacited_files(directory: str, theme: str, output_file: str,
                              skip_first_row_in_later_files: bool = False):
    """
    Concatenates specified columns from all xlsx files in a directory and its subdirectories.

    Every ``.xlsx`` file whose name contains "Essay1" is read, reduced to the
    "Essay ID", "Annotated Essays" and "Human Annotated Essay" columns, and the
    last of these is renamed to ``theme``. The rows of all such files are then
    written to ``output_file`` as one sheet. Files that lack a required column
    or cannot be read are reported with ``print`` and skipped.

    :param directory: Path to the directory containing xlsx files.
    :param theme: The string to replace the "Human Annotated Essay" column header.
    :param output_file: Path to the output file. Its parent directory is created
        when it does not exist yet.
    :param skip_first_row_in_later_files: When True, drop the first data row of
        every file after the first accepted one (the historical behaviour of this
        function). Defaults to False because ``header=0`` already consumes the
        header row, so dropping another row discards real data.
        NOTE(review): enable only if the input files really carry an extra
        non-data row directly below the header.
    :return: None. Progress and problems are reported on stdout.
    """
    required_columns = ["Essay ID", "Annotated Essays", "Human Annotated Essay"]
    output_abspath = os.path.abspath(output_file)

    # Initialize an empty list to store DataFrames
    all_data = []

    # Traverse the directory and subdirectories to find all xlsx files
    for subdir, dirs, files in os.walk(directory):
        # os.walk yields entries in arbitrary filesystem order; sorting in place
        # (dirs) and per directory (files) makes the output row order reproducible.
        dirs.sort()
        for file in sorted(files):
            # "~$name.xlsx" are lock files Excel creates while a workbook is open;
            # they are not readable workbooks.
            if file.startswith("~$"):
                continue
            if not (file.lower().endswith(".xlsx") and "Essay1" in file):
                continue

            file_path = os.path.join(subdir, file)

            # Never feed a previous run's output back into the concatenation.
            if os.path.abspath(file_path) == output_abspath:
                continue

            try:
                # Row 0 of the sheet becomes the column header and is not part of the data
                df = pd.read_excel(file_path, engine='openpyxl', header=0)

                # Check if required columns exist, otherwise skip this file
                if not all(col in df.columns for col in required_columns):
                    print(f"Skipping file {file_path}: Required columns not found.")
                    continue

                # Select the required columns and rename the annotation column by name
                df = df[required_columns].rename(columns={"Human Annotated Essay": theme})

                # Optional legacy behaviour: drop the first data row of later files
                if skip_first_row_in_later_files and all_data:
                    df = df.iloc[1:]

                all_data.append(df)

            except Exception as e:
                # Best effort: one unreadable workbook must not abort the whole batch
                print(f"Error processing file {file_path}: {e}")

    # Concatenate all dataframes
    if all_data:
        final_df = pd.concat(all_data, ignore_index=True)

        # Make sure the destination folder exists before writing
        os.makedirs(os.path.dirname(output_abspath), exist_ok=True)

        # Save the concatenated DataFrame to a new xlsx file
        final_df.to_excel(output_file, index=False)
        print(f"Concatenation complete. Output saved to {output_file}")
    else:
        print("No valid files found to concatenate.")

In [19]:
# The input folder is named after the bare theme, while the renamed column and
# the output file name use the "<theme> Plus" label, so `theme` is extended
# only after `directory` has been built.
theme = "Navigational"
directory = f"/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/tacited_validated_to_be_processed/{theme}"

theme += " Plus"
base_output_file = "/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/for_training/batch_2_concatenated/"
output_file = f"{base_output_file}batch_2_tacited_{theme}_to_be_processed_for_sentence_level.xlsx"

# Merge every matching workbook under `directory` into `output_file`
concatenate_tacited_files(directory, theme, output_file)

Concatenation complete. Output saved to /Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/for_training/batch_2_concatenated/batch_2_tacited_Navigational Plus_to_be_processed_for_sentence_level.xlsx
