In [1]:
import os
import pandas as pd

def get_unique_columns(root_folder):
    """
    Recursively scan every Excel workbook under ``root_folder`` and collect the
    distinct column labels found in their header rows.

    Only the header row of the sheet that ``pd.read_excel`` opens by default is
    parsed. Workbooks that cannot be read are reported on stdout and skipped,
    so a single bad file never aborts the scan.

    Parameters:
    - root_folder (str): Path to the parent directory containing theme folders.

    Returns:
    - set: The unique column labels found across all readable ``.xlsx`` files
      (empty when no workbook could be read).
    """
    unique_columns = set()

    # Walk through all directories and subdirectories
    for dirpath, _, filenames in os.walk(root_folder):
        for file in filenames:
            # "~$<name>.xlsx" is the lock file Excel leaves next to an open
            # workbook; it is not a real workbook and would only raise an error.
            if file.startswith('~$'):
                continue
            # Case-insensitive so that ".XLSX" is picked up as well.
            if not file.lower().endswith('.xlsx'):  # Extend to '.xls' if needed
                continue

            file_path = os.path.join(dirpath, file)
            try:
                # nrows=0 parses the header row only; no data rows are needed.
                df = pd.read_excel(file_path, engine='openpyxl', nrows=0)
                unique_columns.update(df.columns)
            except Exception as e:
                # Report the full path: the bare file name does not tell which
                # subfolder the unreadable workbook lives in.
                print(f"Error reading {file_path}: {e}")

    return unique_columns

# Example usage: scan the whole reconciliation tree for distinct header labels
root_folder = "/Users/gbaldonado/Library/CloudStorage/Box-Box/ALMA Project/ALMA 2024/Data/gian_reina_to_be_reconciled/"
unique_columns = get_unique_columns(root_folder)

# Print all unique column names
print("Unique columns found in all Excel files:")
# Sort on the string form: header cells holding numbers or dates come back from
# pandas as non-string labels, and sorting a mix of str and non-str labels
# directly raises TypeError. For all-string labels the order is unchanged.
for col in sorted(unique_columns, key=str):
    print(col)

KeyboardInterrupt: 

In [1]:
import os
import pandas as pd

# Root folder under which the theme folders (and their subfolders) live
root_folder = "/Users/gbaldonado/Library/CloudStorage/Box-Box/ALMA Project/ALMA 2024/Data/gian_reina_to_be_reconciled/"

# Theme columns expected in the merged output, in output order
theme_columns = ["Aspirational", "Familial", "Social", "Navigational", "Resistance"]

# Theme name -> text that identifies that theme's folder inside a path
theme_mapping = {
    "Aspirational": "Aspirational",
    "Familial": "Familial",
    "Social": "Social",
    "Navigational": "Navigational",
    "Resistance": "Resistance"
}

# Columns shared by every workbook; "Reconciled Annotations" is read as well
# and renamed to the theme of the folder the workbook was found in.
id_columns = ["Alma ID", "Essay ID", "Annotated Essays"]
annotation_column = "Reconciled Annotations"

# One dataframe per successfully read workbook
df_list = []

# Walk through all theme folders and subfolders
for dirpath, _, filenames in os.walk(root_folder):
    # Identify the theme from the path *below* root_folder only, so that a theme
    # word appearing in a parent directory of the root can never tag every file.
    # The theme depends on the directory alone, so resolve it once per directory.
    relative_dir = os.path.relpath(dirpath, root_folder)
    theme = next(
        (key for key, folder_name in theme_mapping.items() if folder_name in relative_dir),
        None,
    )

    for file in filenames:
        # Skip Excel's "~$<name>.xlsx" lock files (left next to open workbooks)
        # and anything that is not an .xlsx workbook (case-insensitive).
        if file.startswith("~$") or not file.lower().endswith(".xlsx"):
            continue

        file_path = os.path.join(dirpath, file)

        if not theme:
            print(f"Skipping {file_path} (No matching theme found)")
            continue  # Skip files that don't belong to a theme

        try:
            # Read only the relevant columns; read_excel raises if any of them
            # is missing, which is reported below and the workbook is skipped.
            df = pd.read_excel(
                file_path,
                engine="openpyxl",
                usecols=id_columns + [annotation_column],
            )

            # Rename the annotation column to the theme and fix the column order
            df = df.rename(columns={annotation_column: theme})[id_columns + [theme]]

            df_list.append(df)

        except Exception as e:
            # Full path, consistent with the "Skipping" message above
            print(f"Error processing {file_path}: {e}")

# Merge all extracted dataframes on "Annotated Essays"
if df_list:
    # Stack all per-file frames; each frame carries exactly one theme column,
    # so the other theme columns are NaN for its rows.
    merged_df = pd.concat(df_list, ignore_index=True)

    # Collapse to one row per essay: first() keeps the first non-null value of
    # every column within a group, which combines the per-theme rows.
    # NOTE: groupby drops rows whose "Annotated Essays" is missing (pandas default).
    final_df = merged_df.groupby("Annotated Essays").first().reset_index()

    # Ensure all theme columns exist, even for themes with no readable workbook
    for col in theme_columns:
        if col not in final_df.columns:
            final_df[col] = None

    # Reorder columns
    final_df = final_df[id_columns + theme_columns]

    # Save merged output. The file is written at the top of root_folder, which
    # matches no theme, so a re-run skips it instead of merging it back in.
    save_path = os.path.join(root_folder, "merged_reconciled_annotations_complete.xlsx")
    final_df.to_excel(save_path, index=False, engine="openpyxl")

    print(f"Successfully merged {len(df_list)} files! Saved at:\n{save_path}")

else:
    print("No valid Excel files were found.")

Skipping /Users/gbaldonado/Library/CloudStorage/Box-Box/ALMA Project/ALMA 2024/Data/gian_reina_to_be_reconciled/batch_2_all_supercapitals_wout_nav.xlsx (No matching theme found)
Skipping /Users/gbaldonado/Library/CloudStorage/Box-Box/ALMA Project/ALMA 2024/Data/gian_reina_to_be_reconciled/batch_2_merge_reconciliations_wout_nav.xlsx (No matching theme found)
Skipping /Users/gbaldonado/Library/CloudStorage/Box-Box/ALMA Project/ALMA 2024/Data/gian_reina_to_be_reconciled/merged_reconciled_annotations_complete.xlsx (No matching theme found)
Skipping /Users/gbaldonado/Library/CloudStorage/Box-Box/ALMA Project/ALMA 2024/Data/gian_reina_to_be_reconciled/batch1_compiled.xlsx (No matching theme found)
Successfully merged 323 files! Saved at:
/Users/gbaldonado/Library/CloudStorage/Box-Box/ALMA Project/ALMA 2024/Data/gian_reina_to_be_reconciled/merged_reconciled_annotations_complete.xlsx
