In [1]:
import os
import pandas as pd

In [9]:
def concatenate_tacited_files(directory: str, theme: str, output_file: str):
    """
    Concatenates specified columns from all xlsx files in a directory and its subdirectories.
    :param directory: Path to the directory containing xlsx files.
    :param theme: The string to replace the "Human Annotated Essay" column header.
    :param output_file: Path to the output file.
    """
    
    # Initialize an empty list to store DataFrames
    all_data = []
    
    # Traverse the directory and subdirectories to find all xlsx files
    for subdir, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".xlsx") and "Essay1" in file:
                file_path = os.path.join(subdir, file)
                
                try:
                    # Read the xlsx file without specifying columns initially
                    df = pd.read_excel(file_path, engine='openpyxl', header=0)

                    # Check if required columns exist, otherwise skip this file
                    required_columns = ["Essay ID", "Annotated Essays", "Human Annotated Essay"]
                    if not all(col in df.columns for col in required_columns):
                        print(f"Skipping file {file_path}: Required columns not found.")
                        continue

                    # Select the required columns
                    df = df[["Essay ID", "Annotated Essays", "Human Annotated Essay"]]
                    
                    # Rename columns
                    df.columns = ["Essay ID", "Annotated Essays", theme]

                    # Skip row 1 for all files except the first one
                    if len(all_data) > 0:
                        df = df.iloc[1:]

                    # Append the dataframe to the list
                    all_data.append(df)

                except Exception as e:
                    print(f"Error processing file {file_path}: {e}")
    
    # Concatenate all dataframes
    if all_data:
        final_df = pd.concat(all_data, ignore_index=True)
        
        # Save the concatenated DataFrame to a new xlsx file
        final_df.to_excel(output_file, index=False)
        print(f"Concatenation complete. Output saved to {output_file}")
    else:
        print("No valid files found to concatenate.")

In [2]:
import os
import pandas as pd

def process_essay_files_to_dict(directory: str):
    """
    Processes all xlsx files with "Essay1" in the filename, creating a dictionary of DataFrames for each theme except Navigational.
    Keeps 'Essay ID', 'Annotated Essay', and 'Human Annotated Essay' while modifying the 'Human Annotated Essay' column.

    :param directory: Path to the directory containing xlsx files.
    :return: A dictionary where keys are themes (excluding Navigational), and values are DataFrames.
    """
    # Theme definitions: Theme name and corresponding substring to search for
    themes = {
        "Attainment": "att",
        "Aspirational": "asp",
        "Familial": "fam",
        "Filial Piety": "fil",
        # "Navigational": "nav",  # Exclude Navigational
        "Community Consciouss": "com",
        "Social": "soc",
        "Spiritual": "spiri",
        "Resistance": "resist",
        "Perseverance": "pers",
        "First Generation": "first"
    }

    # Initialize a dictionary to store theme-specific DataFrames
    theme_dataframes = {}

    # Traverse the directory and process only valid files
    for subdir, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".xlsx") and "Essay1" in file:
                file_path = os.path.join(subdir, file)

                try:
                    # Load the xlsx file
                    df = pd.read_excel(file_path, engine="openpyxl")
                    
                    # Check for required columns
                    required_columns = ["Essay ID", "Annotated Essays", "Human Annotated Essay", "Specific Theme(s)"]
                    if not all(col in df.columns for col in required_columns):
                        print(f"Skipping {file}: Required columns not found.")
                        continue
                    
                    print(f"Processing {file}...")

                    # Process each theme
                    for theme, substring in themes.items():
                        # Create a copy of the relevant columns
                        theme_df = df[["Essay ID", "Annotated Essays", "Human Annotated Essay", "Specific Theme(s)"]].copy()

                        # Modify the "Human Annotated Essay" column
                        theme_df["Human Annotated Essay"] = theme_df.apply(
                            lambda row: row["Human Annotated Essay"]
                            if pd.notna(row["Specific Theme(s)"]) and substring in str(row["Specific Theme(s)"]).lower()
                            else 0,
                            axis=1
                        )

                        # Drop the "Specific Theme(s)" column as it's no longer needed
                        theme_df = theme_df.drop(columns=["Specific Theme(s)"])

                        # Append to the theme-specific DataFrame in the dictionary
                        if theme not in theme_dataframes:
                            theme_dataframes[theme] = theme_df
                        else:
                            theme_dataframes[theme] = pd.concat([theme_dataframes[theme], theme_df], ignore_index=True)

                except Exception as e:
                    print(f"Error processing file {file_path}: {e}")

    return theme_dataframes

In [19]:
theme = "Navigational"
directory = "/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/tacited_validated_to_be_processed/" + theme

theme =  theme + " Plus"
base_output_file = "/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/for_training/batch_2_concatenated/"
output_file = base_output_file + f"batch_2_tacited_{theme}_to_be_processed_for_sentence_level.xlsx"

concatenate_tacited_files(directory, theme, output_file)

Concatenation complete. Output saved to /Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/for_training/batch_2_concatenated/batch_2_tacited_Navigational Plus_to_be_processed_for_sentence_level.xlsx


In [3]:
directory = "/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/tacited_validated_to_be_processed/"
theme_dataframes = process_essay_files_to_dict(directory)

Processing Fall2020_PHYS0102-02_Essay1_anonymized_Familial_tacited.xlsx...
Processing Fall2020_PHYS0242-04_Essay1_anonymized_Familial_tacited.xlsx...
Processing Fall2020_ASTR0116-02_Essay1_anonymized_Familial_tacited.xlsx...
Processing Fall2020_PHYS0222-02_Essay1_anonymized_Familial_tacited.xlsx...
Processing Fall2020_PHYS0222-12_Essay1_anonymized_Familial_tacited.xlsx...
Processing Fall2020_PHYS0222-06_Essay1_anonymized_Familial_tacited.xlsx...
Processing Fall2020_PHYS0122-04_Essay1_anonymized_Familial_tacited.xlsx...
Processing Fall2020_PHYS0122-08_Essay1_anonymized_Familial_tacited.xlsx...
Processing Fall2020_PHYS0122-01_Essay1_anonymized_Familial_tacited.xlsx...
Processing Fall2020_PHYS0112-10_Essay1_anonymized_Familial_tacited.xlsx...
Processing Fall2020_PHYS0112-05_Essay1_anonymized_Familial_tacited.xlsx...
Processing Fall2020_PHYS0112-02_Essay1_anonymized_Familial_tacited.xlsx...
Processing Fall2020_PHYS0112-03_Essay1_anonymized_Familial_tacited.xlsx...
Processing Fall2020_PHYS0

In [4]:
theme_dataframes["Attainment"]

Unnamed: 0,Essay ID,Annotated Essays,Human Annotated Essay
0,F20.PHYS0102.02.000.001,Reflection #1: Why Am I Here? “Why am I here...,0.0
1,F20.PHYS0102.02.001.002,I am here because my future goal is to heal pe...,0.0
2,F20.PHYS0102.02.002.003,Reflection: At the moment I am feeling very st...,0.0
3,F20.PHYS0102.02.003.004,"Holly Enrile PHYSICS LAB Reﬂection#1 Sept. 10,...",0.0
4,F20.PHYS0102.02.004.005,Karla Martinez Why I am here? One of the mai...,0.0
...,...,...,...
7137,S20.PHYS0112.01.019.312,"I am here to learn how physics works, how moti...","I am here to learn how physics works, how moti..."
7138,S20.PHYS0112.01.020.313,I am here because there are so many things to ...,I am here to be the best or at least a better ...
7139,S20.PHYS0112.01.021.054,I am taking physics and physics lab because I ...,0
7140,S20.PHYS0112.01.022.314,Sometimes when things in life don't go as plan...,0


In [5]:
theme_dataframes["Aspirational"]

Unnamed: 0,Essay ID,Annotated Essays,Human Annotated Essay
0,F20.PHYS0102.02.000.001,Reflection #1: Why Am I Here? “Why am I here...,0.0
1,F20.PHYS0102.02.001.002,I am here because my future goal is to heal pe...,0.0
2,F20.PHYS0102.02.002.003,Reflection: At the moment I am feeling very st...,0.0
3,F20.PHYS0102.02.003.004,"Holly Enrile PHYSICS LAB Reﬂection#1 Sept. 10,...",0.0
4,F20.PHYS0102.02.004.005,Karla Martinez Why I am here? One of the mai...,0.0
...,...,...,...
7137,S20.PHYS0112.01.019.312,"I am here to learn how physics works, how moti...","I am here to learn how physics works, how moti..."
7138,S20.PHYS0112.01.020.313,I am here because there are so many things to ...,I am here to be the best or at least a better ...
7139,S20.PHYS0112.01.021.054,I am taking physics and physics lab because I ...,I am taking physics and physics lab because I ...
7140,S20.PHYS0112.01.022.314,Sometimes when things in life don't go as plan...,0


In [32]:
def concatenate_theme_dataframes(theme_dataframes):
    """
    Efficiently concatenates theme-specific DataFrames into a single DataFrame using pd.concat.
    Appends 'Essay ID' and 'Annotated Essay' as the base and adds theme-specific columns.

    :param theme_dataframes: Dictionary of DataFrames for each theme.
    :return: A single concatenated DataFrame.
    """
    # Extract 'Essay ID' and 'Annotated Essay' from the first DataFrame as the base
    temp_base_df = theme_dataframes["Aspirational"]
    base_df = temp_base_df[["Essay ID", "Annotated Essays"]]


    # Prepare theme-specific DataFrames by renaming 'Human Annotated Essay' to the theme name
    theme_columns = [
        df[["Human Annotated Essay"]].rename(columns={"Human Annotated Essay": theme}).reset_index(drop=True)
        for theme, df in theme_dataframes.items()
    ]

    all_dataframes = []

    # Append base_df as the first element in the list of DataFrames
    all_dataframes.append(base_df)
    for i in theme_columns:
        all_dataframes.append(i)

    # Concatenate all DataFrames horizontally
    final_df = pd.concat(all_dataframes, axis=1)

    return final_df

In [38]:
themes = list(theme_dataframes.keys())
themes

['Attainment',
 'Aspirational',
 'Familial',
 'Filial Piety',
 'Community Consciouss',
 'Social',
 'Spiritual',
 'Resistance',
 'Perseverance',
 'First Generation']

In [33]:
# Assuming `theme_dataframes` is the dictionary of theme-specific DataFrames
final_df = concatenate_theme_dataframes(theme_dataframes)
final_df

Unnamed: 0,Essay ID,Annotated Essays,Attainment,Aspirational,Familial,Filial Piety,Community Consciouss,Social,Spiritual,Resistance,Perseverance,First Generation
0,F20.PHYS0102.02.000.001,Reflection #1: Why Am I Here? “Why am I here...,0.0,0.0,0,0,0.0,0.0,0.0,0,0,0
1,F20.PHYS0102.02.001.002,I am here because my future goal is to heal pe...,0.0,0.0,0,0,0.0,0.0,0.0,0,0,0
2,F20.PHYS0102.02.002.003,Reflection: At the moment I am feeling very st...,0.0,0.0,0,0,0.0,0.0,0.0,0,0,0
3,F20.PHYS0102.02.003.004,"Holly Enrile PHYSICS LAB Reﬂection#1 Sept. 10,...",0.0,0.0,"In general, I’m in school because I want to m...","In general, I’m in school because I want to m...",0.0,0.0,0.0,0,0,"In general, I’m in school because I want to m..."
4,F20.PHYS0102.02.004.005,Karla Martinez Why I am here? One of the mai...,0.0,0.0,One of the main reasons why I am here is becau...,One of the main reasons why I am here is becau...,0.0,0.0,0.0,0,0,One of the main reasons why I am here is becau...
...,...,...,...,...,...,...,...,...,...,...,...,...
7137,S20.PHYS0112.01.019.312,"I am here to learn how physics works, how moti...","I am here to learn how physics works, how moti...","I am here to learn how physics works, how moti...",0,0,0,0,0,0,0,0
7138,S20.PHYS0112.01.020.313,I am here because there are so many things to ...,I am here to be the best or at least a better ...,I am here to be the best or at least a better ...,0,0,0,0,0,0,0,0
7139,S20.PHYS0112.01.021.054,I am taking physics and physics lab because I ...,0,I am taking physics and physics lab because I ...,0,0,0,0,0,0,0,0
7140,S20.PHYS0112.01.022.314,Sometimes when things in life don't go as plan...,0,0,0,0,0,0,0,0,0,0


In [39]:
final_df.to_excel("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/for_training/batch_2_concatenated/batch_2_all_11_themes_to_be_processed_for_sentence_level.xlsx", index=False)