In [31]:
import os
import pandas as pd

def combine_essays(directory_path, essay_name, output_file):
    """
    Combines specified essay files into a single spreadsheet with multiple sheets.

    Walks ``directory_path`` recursively and reads every ``.xlsx`` file whose
    name contains ``essay_name``. Each file becomes one sheet named
    ``<semester_year>_<course_name>``, where the two values are the
    third-to-last and second-to-last components of the directory that holds
    the file. Files located in directories with fewer than three path
    components are skipped. Files that cannot be read are reported with
    ``print`` and skipped.

    Sheet names are made Excel-safe: forbidden characters are replaced with
    ``_``, names are cut to 31 characters, and a numeric suffix (``_2``,
    ``_3``, ...) is appended when two files would otherwise share a name, so
    no sheet is written on top of another.

    Parameters:
    - directory_path: str, the root directory containing essays.
    - essay_name: str, the essay name to look for (e.g., "Essay2").
    - output_file: str, the name of the output Excel file.

    Returns:
    - str: The path to the generated spreadsheet.

    Raises:
    - FileNotFoundError: if no matching file could be read. Nothing is
      written in that case.
    """
    # NOTE(review): later cells in this notebook redefine combine_essays with a
    # two-argument signature; whichever definition was executed last is the one
    # that gets called.

    # Characters Excel does not allow in a sheet title.
    forbidden = str.maketrans({ch: "_" for ch in "[]:*?/\\"})
    taken = set()  # case-folded sheet names already handed out

    def _unique_sheet_name(raw_name):
        # Sanitize, truncate to Excel's 31-character limit, then de-duplicate.
        base = raw_name.translate(forbidden)[:31] or "Sheet"
        candidate = base
        counter = 1
        while candidate.casefold() in taken:
            counter += 1
            suffix = f"_{counter}"
            candidate = base[:31 - len(suffix)] + suffix
        taken.add(candidate.casefold())
        return candidate

    # A previous run's output may live inside the tree being scanned; never
    # feed it back in as input.
    output_abs = os.path.abspath(output_file)

    sheets = []  # (sheet_name, DataFrame) pairs, in write order

    for root, dirs, files in os.walk(directory_path):
        dirs.sort()  # deterministic traversal -> reproducible sheet order
        for file in sorted(files):
            if not (file.endswith(".xlsx") and essay_name in file):
                continue

            file_path = os.path.join(root, file)
            if os.path.abspath(file_path) == output_abs:
                continue

            # Extract Semester Year and Course Name from the directory path
            parts = os.path.normpath(root).split(os.sep)
            if len(parts) < 3:  # Not enough path components to name the sheet
                continue
            semester_year = parts[-3]
            course_name = parts[-2]

            try:
                df = pd.read_excel(file_path)
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
                continue

            # Reserve the name only after a successful read so failed files
            # do not consume suffixes.
            sheets.append((_unique_sheet_name(f"{semester_year}_{course_name}"), df))

    if not sheets:
        # Saving a workbook with zero sheets fails with an unhelpful error;
        # report the real problem before touching the output file.
        raise FileNotFoundError(
            f"No readable .xlsx files matching '{essay_name}' found under {directory_path}"
        )

    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        for sheet_name, df in sheets:
            df.to_excel(writer, sheet_name=sheet_name, index=False)

    return output_file

# # Example usage
# if __name__ == "__main__":
#     directory = "path_to_root_directory"  # Replace with the root directory path
#     essay = "Essay2"  # Replace with the desired essay name
#     output = "combined_essays.xlsx"  # Name of the output file
#     result_path = combine_essays(directory, essay, output)
#     print(f"Combined spreadsheet created at: {result_path}")

In [1]:
import os
import pandas as pd

def combine_essays(directory_path, essay_name):
    """
    Combines specified essay files into a single spreadsheet with multiple sheets.

    Walks ``directory_path`` recursively and reads every ``.xlsx`` file whose
    name contains ``essay_name``. Each file becomes one sheet named after the
    part of the file name before ``"_Essay"``. The semester year is taken from
    the fourth-from-last component of the directory holding the file; files in
    directories with fewer than four path components are skipped. Files that
    cannot be read are reported with ``print`` and skipped.

    The output file is named ``<semester years>_<essay_name>_Combined.xlsx``
    (semester years sorted and joined with ``_``) and is written relative to
    the current working directory.

    Sheet names are made Excel-safe: forbidden characters are replaced with
    ``_``, names are cut to 31 characters, and a numeric suffix (``_2``,
    ``_3``, ...) is appended when two files would otherwise share a name
    (e.g. the same course in two semesters), so no sheet is written on top of
    another.

    Parameters:
    - directory_path: str, the root directory containing essays.
    - essay_name: str, the essay name to look for (e.g., "Essay2").

    Returns:
    - str: The path to the generated spreadsheet.

    Raises:
    - FileNotFoundError: if no matching file could be read. Nothing is
      written in that case.
    """
    # NOTE(review): this notebook defines combine_essays more than once;
    # whichever definition was executed last is the one that gets called.

    # Characters Excel does not allow in a sheet title.
    forbidden = str.maketrans({ch: "_" for ch in "[]:*?/\\"})
    taken = set()  # case-folded sheet names already handed out

    def _unique_sheet_name(raw_name):
        # Sanitize, truncate to Excel's 31-character limit, then de-duplicate.
        base = raw_name.translate(forbidden)[:31] or "Sheet"
        candidate = base
        counter = 1
        while candidate.casefold() in taken:
            counter += 1
            suffix = f"_{counter}"
            candidate = base[:31 - len(suffix)] + suffix
        taken.add(candidate.casefold())
        return candidate

    semester_years = set()
    sheets = []  # (sheet_name, DataFrame) pairs, in write order

    for root, dirs, files in os.walk(directory_path):
        dirs.sort()  # deterministic traversal -> reproducible sheet order
        for file in sorted(files):
            if not (file.endswith(".xlsx") and essay_name in file):
                continue

            # Extract Semester Year from the directory path
            parts = os.path.normpath(root).split(os.sep)
            if len(parts) < 4:  # Not enough path components for a semester year
                continue
            semester_years.add(parts[-4])

            # Extract the course name from the file name
            course_name = file.split("_Essay")[0]

            file_path = os.path.join(root, file)
            try:
                df = pd.read_excel(file_path)
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
                continue

            # Reserve the name only after a successful read so failed files
            # do not consume suffixes.
            sheets.append((_unique_sheet_name(course_name), df))

    if not sheets:
        # Saving a workbook with zero sheets fails with an unhelpful error and
        # leaves a stray output file; report the real problem instead.
        raise FileNotFoundError(
            f"No readable .xlsx files matching '{essay_name}' found under {directory_path}"
        )

    # Generate the output file name
    sorted_years = "_".join(sorted(semester_years))
    output_file = f"{sorted_years}_{essay_name}_Combined.xlsx"

    # Write all dataframes to the output Excel file
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        for sheet_name, df in sheets:
            df.to_excel(writer, sheet_name=sheet_name, index=False)

    return output_file

# # Example usage
# if __name__ == "__main__":
#     directory = "/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/tacited_data_fa19_sp20_fa20/Perseverance"
#     essay = "Essay2"  # Replace with the desired essay name
#     result_path = combine_essays(directory, essay)
#     print(f"Combined spreadsheet created at: {result_path}")

In [2]:
import os
import pandas as pd

def combine_essays(directory_path, essay_name):
    """
    Combines specified essay files into a single spreadsheet with multiple sheets.

    Walks ``directory_path`` recursively and reads every ``.xlsx`` or ``.csv``
    file whose name contains ``essay_name``. Each file becomes one sheet named
    after the part of the file name before ``"_Essay"``. The semester year is
    taken from the fourth-from-last component of the directory holding the
    file; files in directories with fewer than four path components are
    skipped. Files that cannot be read are reported with ``print`` and
    skipped.

    The output file is named ``<semester years>_<essay_name>_Combined.xlsx``
    (semester years sorted and joined with ``_``) and is written relative to
    the current working directory.

    Sheet names are made Excel-safe: forbidden characters are replaced with
    ``_``, names are cut to 31 characters, and a numeric suffix (``_2``,
    ``_3``, ...) is appended when two files would otherwise share a name
    (e.g. the same course in two semesters, or an .xlsx and a .csv of the same
    course), so no sheet is written on top of another.

    Parameters:
    - directory_path: str, the root directory containing essays.
    - essay_name: str, the essay name to look for (e.g., "Essay2").

    Returns:
    - str: The path to the generated spreadsheet.

    Raises:
    - FileNotFoundError: if no matching file could be read. Nothing is
      written in that case.
    """
    # Characters Excel does not allow in a sheet title.
    forbidden = str.maketrans({ch: "_" for ch in "[]:*?/\\"})
    taken = set()  # case-folded sheet names already handed out

    def _unique_sheet_name(raw_name):
        # Sanitize, truncate to Excel's 31-character limit, then de-duplicate.
        base = raw_name.translate(forbidden)[:31] or "Sheet"
        candidate = base
        counter = 1
        while candidate.casefold() in taken:
            counter += 1
            suffix = f"_{counter}"
            candidate = base[:31 - len(suffix)] + suffix
        taken.add(candidate.casefold())
        return candidate

    semester_years = set()
    sheets = []  # (sheet_name, DataFrame) pairs, in write order

    for root, dirs, files in os.walk(directory_path):
        dirs.sort()  # deterministic traversal -> reproducible sheet order
        for file in sorted(files):
            if not (file.endswith((".xlsx", ".csv")) and essay_name in file):
                continue

            # Extract Semester Year from the directory path
            parts = os.path.normpath(root).split(os.sep)
            if len(parts) < 4:  # Not enough path components for a semester year
                continue
            semester_years.add(parts[-4])

            # Extract the course name from the file name
            course_name = file.split("_Essay")[0]

            # Read the file based on its extension
            file_path = os.path.join(root, file)
            try:
                if file.endswith(".xlsx"):
                    df = pd.read_excel(file_path)
                else:  # the filter above guarantees this is a .csv
                    df = pd.read_csv(file_path)
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
                continue

            # Reserve the name only after a successful read so failed files
            # do not consume suffixes.
            sheets.append((_unique_sheet_name(course_name), df))

    if not sheets:
        # Saving a workbook with zero sheets fails with an unhelpful error and
        # leaves a stray output file; report the real problem instead.
        raise FileNotFoundError(
            f"No readable .xlsx/.csv files matching '{essay_name}' found under {directory_path}"
        )

    # Generate the output file name
    sorted_years = "_".join(sorted(semester_years))
    output_file = f"{sorted_years}_{essay_name}_Combined.xlsx"

    # Write all dataframes to the output Excel file
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        for sheet_name, df in sheets:
            df.to_excel(writer, sheet_name=sheet_name, index=False)

    return output_file

# # Example usage
# if __name__ == "__main__":
#     directory = "/path/to/your/directory"
#     essay = "Essay2"  # Replace with the desired essay name
#     result_path = combine_essays(directory, essay)
#     print(f"Combined spreadsheet created at: {result_path}")

In [4]:
# Which essay to collect, and the root folder to scan for its files.
# NOTE(review): the folder is a machine-specific absolute path; anyone else
# running this cell has to point it at their own copy of the data.
essay = "Essay5"
directory = "/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/to_be_tacited/batch_2"

# Build the combined workbook and report where it was written.
result_path = combine_essays(directory_path=directory, essay_name=essay)
print(f"Combined spreadsheet created at: {result_path}")

Combined spreadsheet created at: batch_2_Essay5_Combined.xlsx
