In [10]:
# Example usage: the two annotators' workbooks for the same essay set, plus the
# destination workbook for the reconciled result.
# NOTE(review): these are absolute paths on one machine. The cells below that call
# reconcile_annotations_with_loose_matching(file_gian, file_reina, output_file)
# raise NameError when they are run before this cell (see the recorded outputs).
file_gian = "/Users/gbaldonado/Library/CloudStorage/Box-Box/ALMA Project/ALMA 2024/Data/sample_data_for_reconciliation/gian/Aspirational/Fall 2020/ASTR 0116/Section 02/Fall2020_ASTR0116-02_Essay1_anonymized_Aspirational_tacited.xlsx"
file_reina = "/Users/gbaldonado/Library/CloudStorage/Box-Box/ALMA Project/ALMA 2024/Data/sample_data_for_reconciliation/reina/Aspirational/Fall 2020/ASTR 0116/Section 02/Fall2020_ASTR0116-02_Essay1_anonymized_Aspirational_tacited.xlsx"
output_file = "/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/src/data_preprocessing/reconciliation_jan2025/reconciled.xlsx"

In [1]:
# Display the kernel's current working directory (IPython automagic form of %pwd).
pwd

'/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/src/data_preprocessing/reconciliation_jan2025'

In [3]:
import pandas as pd
from openpyxl import load_workbook
from fuzzywuzzy import fuzz

def reconcile_annotations_with_loose_matching(file_gian, file_reina, output_file, similarity_threshold=80):
    """
    Reconcile annotations between two annotators with loose (fuzzy) matching,
    write the result to an Excel file and return it.

    The two workbooks are joined on the shared essay columns (pd.merge's default
    inner join), so only rows whose shared columns agree in both files are kept.

    Parameters:
    - file_gian (str): Path to Gian's Excel file.
    - file_reina (str): Path to Reina's Excel file.
    - output_file (str): Path to save the prepared file.
    - similarity_threshold (int): Minimum fuzz.ratio score for two values to
      count as matching (default=80).

    Returns:
    - pandas.DataFrame: the reconciled table that was written to output_file.
    """
    # Load data from both files
    df_gian = pd.read_excel(file_gian)
    df_reina = pd.read_excel(file_reina)

    # Columns that identify an essay row and are expected to be equal in both files
    shared_columns = [
        'Essay ID', 'Year', 'Semester', 'Class', 'Type',
        'Section', 'Alma ID', 'Aspirational Present', 'Annotated Essays'
    ]
    # Columns each annotator filled in independently
    annotator_columns = ['Human Annotated Essay', 'Specific Theme(s)']

    def tagged(df, annotator):
        # Keep the shared + annotator columns and suffix the latter with the annotator's name
        return df[shared_columns + annotator_columns].rename(
            columns={col: f"{col}_{annotator}" for col in annotator_columns}
        )

    merged_df = pd.merge(tagged(df_gian, 'Gian'), tagged(df_reina, 'Reina'), on=shared_columns)

    # Create a new DataFrame for the output
    output_df = merged_df[shared_columns].copy()

    # Normalized copies of each annotator's values for side-by-side review.
    # astype(str) renders missing cells as the text "nan".
    output_df['Gian Annotations'] = merged_df['Human Annotated Essay_Gian'].astype(str).str.strip().str.lower()
    output_df['Reina Annotations'] = merged_df['Human Annotated Essay_Reina'].astype(str).str.strip().str.lower()
    output_df['Gian Specific Themes'] = merged_df['Specific Theme(s)_Gian'].astype(str).str.strip().str.lower()
    output_df['Reina Specific Themes'] = merged_df['Specific Theme(s)_Reina'].astype(str).str.strip().str.lower()

    # Empty free-text columns for the annotators to fill in by hand
    output_df["Gian Notes"] = ""
    output_df["Reina Notes"] = ""

    def loose_match(gian_annotation, reina_annotation):
        # Test for missing values BEFORE converting to str: str(nan) is "nan",
        # which pd.isna no longer recognizes, so the guard would never fire and
        # two missing cells would compare as the identical text "nan".
        if pd.isna(gian_annotation) or pd.isna(reina_annotation):
            return False
        return fuzz.ratio(str(gian_annotation), str(reina_annotation)) >= similarity_threshold

    gian_annotations = merged_df['Human Annotated Essay_Gian']
    reina_annotations = merged_df['Human Annotated Essay_Reina']
    gian_themes = merged_df['Specific Theme(s)_Gian']
    reina_themes = merged_df['Specific Theme(s)_Reina']

    # Evaluate each comparison once per row; plain lists also work when the merge is empty
    annotation_matches = [loose_match(g, r) for g, r in zip(gian_annotations, reina_annotations)]
    theme_matches = [loose_match(g, r) for g, r in zip(gian_themes, reina_themes)]

    # Matching rows keep Gian's original value; "X" marks rows needing manual reconciliation
    output_df['Reconciled Yes or No'] = ['Yes' if matched else 'No' for matched in annotation_matches]
    output_df['Reconciled Annotations'] = [
        value if matched else "X" for value, matched in zip(gian_annotations, annotation_matches)
    ]
    output_df['Reconciled Specific Themes'] = [
        value if matched else "X" for value, matched in zip(gian_themes, theme_matches)
    ]

    # Save before returning (the save used to sit after the return and never ran)
    output_df.to_excel(output_file, index=False)
    print(f"Reconciled file saved at: {output_file}")

    return output_df

# Run the reconciliation on the example files.
# NOTE: file_gian, file_reina and output_file are not assigned in this cell; the
# NameError recorded below shows it was executed before they were defined.
reconcile_annotations_with_loose_matching(file_gian, file_reina, output_file)

NameError: name 'file_gian' is not defined

In [2]:
import pandas as pd
from fuzzywuzzy import fuzz
from openpyxl import Workbook
from openpyxl.styles import PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows

def reconcile_annotations_with_loose_matching(file_gian, file_reina, output_file, similarity_threshold=80):
    """
    Reconcile annotations between two annotators with loose matching and shade
    the rows that did not match.

    The two workbooks are joined on the shared essay columns (pd.merge's default
    inner join). For each joined row, Gian's annotation / theme is copied into
    the reconciled columns when it fuzzily matches Reina's; otherwise the
    reconciled cells are left empty and the whole row is filled red.

    Parameters:
    - file_gian (str): Path to Gian's Excel file.
    - file_reina (str): Path to Reina's Excel file.
    - output_file (str): Path to save the prepared file.
    - similarity_threshold (int): Minimum fuzz.ratio score for two values to
      count as matching (default=80).

    Returns:
    - None. The workbook is written to output_file and a confirmation is printed.
    """
    # Load data from both files
    df_gian = pd.read_excel(file_gian)
    df_reina = pd.read_excel(file_reina)

    # Common shared columns
    shared_columns = [
        'Essay ID', 'Year', 'Semester', 'Class', 'Type',
        'Section', 'Alma ID', 'Aspirational Present', 'Annotated Essays'
    ]

    # Merge files on shared columns, suffixing each annotator's own columns
    merged_df = pd.merge(
        df_gian[shared_columns + ['Human Annotated Essay', 'Specific Theme(s)']].rename(
            columns={
                'Human Annotated Essay': 'Human Annotated Essay_Gian',
                'Specific Theme(s)': 'Specific Theme(s)_Gian'
            }
        ),
        df_reina[shared_columns + ['Human Annotated Essay', 'Specific Theme(s)']].rename(
            columns={
                'Human Annotated Essay': 'Human Annotated Essay_Reina',
                'Specific Theme(s)': 'Specific Theme(s)_Reina'
            }
        ),
        on=shared_columns
    )

    # Create a new DataFrame for the output
    output_df = merged_df[shared_columns].copy()

    # Normalized copies of each annotator's values (astype(str) renders missing cells as "nan")
    output_df['Gian Annotations'] = merged_df['Human Annotated Essay_Gian'].astype(str).str.strip().str.lower()
    output_df['Reina Annotations'] = merged_df['Human Annotated Essay_Reina'].astype(str).str.strip().str.lower()
    output_df['Gian Specific Themes'] = merged_df['Specific Theme(s)_Gian'].astype(str).str.strip().str.lower()
    output_df['Reina Specific Themes'] = merged_df['Specific Theme(s)_Reina'].astype(str).str.strip().str.lower()

    # Initialize additional reconciliation columns with empty strings
    output_df['Reconciled Yes or No'] = ""
    output_df['Reconciled Annotations'] = ""
    output_df['Reconciled Specific Themes'] = ""

    def loose_match(annotation1, annotation2):
        # Missing values are compared as empty strings
        annotation1 = str(annotation1) if not pd.isna(annotation1) else ""
        annotation2 = str(annotation2) if not pd.isna(annotation2) else ""
        return fuzz.ratio(annotation1, annotation2) >= similarity_threshold

    # Perform reconciliation (output_df shares merged_df's index, so labels line up)
    for index, row in merged_df.iterrows():
        gian_annotation = row['Human Annotated Essay_Gian']
        reina_annotation = row['Human Annotated Essay_Reina']
        specific_theme_gian = row['Specific Theme(s)_Gian']
        specific_theme_reina = row['Specific Theme(s)_Reina']

        is_match = loose_match(gian_annotation, reina_annotation)
        is_specific_theme_match = loose_match(specific_theme_gian, specific_theme_reina)
        output_df.loc[index, 'Reconciled Yes or No'] = 'Yes' if is_match else ""
        output_df.loc[index, 'Reconciled Annotations'] = gian_annotation if is_match else ""
        output_df.loc[index, 'Reconciled Specific Themes'] = specific_theme_gian if is_specific_theme_match else ""

    # Save the reconciled data to an Excel file and apply conditional formatting
    wb = Workbook()
    ws = wb.active
    ws.title = "Reconciled Data"

    # Decide once per row (from the DataFrame, in row order) whether it is unmatched.
    # Previously this was re-derived for every cell by reading the status cell back
    # from the worksheet, which only worked once that column had been written and
    # refilled the row several times.
    unmatched_fill = PatternFill(start_color="FFCCCC", end_color="FFCCCC", fill_type="solid")
    row_is_unmatched = (output_df['Reconciled Yes or No'] == "").tolist()

    # Write the DataFrame to the worksheet; sheet row 1 is the header, so data row i is sheet row i + 2
    for r_idx, row in enumerate(dataframe_to_rows(output_df, index=False, header=True), start=1):
        shade_row = r_idx > 1 and row_is_unmatched[r_idx - 2]
        for c_idx, value in enumerate(row, start=1):
            cell = ws.cell(row=r_idx, column=c_idx, value=value)
            if shade_row:
                cell.fill = unmatched_fill

    # Save the workbook
    wb.save(output_file)
    print(f"Reconciled file with conditional formatting saved at: {output_file}")

# Run the reconciliation on the example files.
# NOTE: file_gian, file_reina and output_file are not assigned in this cell; the
# NameError recorded below shows it was executed before they were defined.
reconcile_annotations_with_loose_matching(file_gian, file_reina, output_file)

NameError: name 'file_gian' is not defined

In [12]:
import pandas as pd
from fuzzywuzzy import fuzz
from openpyxl import Workbook
from openpyxl.styles import PatternFill, Alignment, Border, Side, Font
from openpyxl.utils.dataframe import dataframe_to_rows

def reconcile_annotations_with_loose_matching(file_gian, file_reina, output_file, similarity_threshold=80):
    """
    Build a formatted reconciliation workbook from two annotators' Excel files.

    Rows are paired by joining on the shared essay columns. A row is marked
    "Yes" when the two annotations fuzzily match (and are not both missing), in
    which case Gian's annotation is kept and Gian's theme is kept if the themes
    also match (otherwise 0). Every other row is marked "No" with 0 in the
    reconciled cells and is shaded red in the output.

    Parameters:
    - file_gian (str): Path to Gian's Excel file.
    - file_reina (str): Path to Reina's Excel file.
    - output_file (str): Path to save the prepared file.
    - similarity_threshold (int): Minimum fuzz.ratio score for a match (default=80).
    """
    shared_columns = [
        'Essay ID', 'Year', 'Semester', 'Class', 'Type',
        'Section', 'Alma ID', 'Aspirational Present', 'Annotated Essays'
    ]
    per_annotator = ['Human Annotated Essay', 'Specific Theme(s)']

    def annotator_view(path, who):
        # Read one annotator's workbook and suffix their own columns with their name
        frame = pd.read_excel(path)
        renames = {column: f"{column}_{who}" for column in per_annotator}
        return frame[shared_columns + per_annotator].rename(columns=renames)

    gian_view = annotator_view(file_gian, 'Gian')
    reina_view = annotator_view(file_reina, 'Reina')
    merged_df = pd.merge(gian_view, reina_view, on=shared_columns)

    output_df = merged_df[shared_columns].copy()

    # Lower-cased, stripped text copies of each annotator's cells (missing cells become "nan")
    normalized_sources = {
        'Gian Annotations': 'Human Annotated Essay_Gian',
        'Reina Annotations': 'Human Annotated Essay_Reina',
        'Gian Specific Themes': 'Specific Theme(s)_Gian',
        'Reina Specific Themes': 'Specific Theme(s)_Reina',
    }
    for target, source in normalized_sources.items():
        output_df[target] = merged_df[source].astype(str).str.strip().str.lower()

    # Reconciliation columns start out as empty strings and are filled row by row
    for blank_column in ('Reconciled Yes or No', 'Reconciled Annotations', 'Reconciled Specific Themes'):
        output_df[blank_column] = ""

    def loose_match(annotation1, annotation2):
        # Missing values take part in the comparison as empty strings
        left = "" if pd.isna(annotation1) else str(annotation1)
        right = "" if pd.isna(annotation2) else str(annotation2)
        return fuzz.ratio(left, right) >= similarity_threshold

    for row_label, record in merged_df.iterrows():
        gian_text = record['Human Annotated Essay_Gian']
        reina_text = record['Human Annotated Essay_Reina']
        gian_theme = record['Specific Theme(s)_Gian']
        reina_theme = record['Specific Theme(s)_Reina']

        neither_annotated = pd.isna(gian_text) and pd.isna(reina_text)
        agreed = (not neither_annotated) and loose_match(gian_text, reina_text)

        if agreed:
            verdict = "Yes"
            kept_annotation = gian_text
            kept_theme = gian_theme if loose_match(gian_theme, reina_theme) else 0
        else:
            # Covers both "nobody annotated" and "annotations differ"
            verdict, kept_annotation, kept_theme = "No", 0, 0

        output_df.loc[row_label, 'Reconciled Yes or No'] = verdict
        output_df.loc[row_label, 'Reconciled Annotations'] = kept_annotation
        output_df.loc[row_label, 'Reconciled Specific Themes'] = kept_theme

    wb = Workbook()
    ws = wb.active
    ws.title = "Reconciled Data"

    # Sheet row 1 is the header; data row i (label i from the merge) lands on sheet row i + 2
    sheet_rows = dataframe_to_rows(output_df, index=False, header=True)
    for r_idx, values in enumerate(sheet_rows, start=1):
        shade = r_idx > 1 and output_df.loc[r_idx - 2, 'Reconciled Yes or No'] == "No"
        for c_idx, value in enumerate(values, start=1):
            cell = ws.cell(row=r_idx, column=c_idx, value=value)
            cell.alignment = Alignment(wrap_text=True)
            if shade:
                cell.fill = PatternFill(start_color="FFCCCC", end_color="FFCCCC", fill_type="solid")

    # Thin border on every cell; the header row is additionally bold
    edge = Side(style="thin")
    thin_border = Border(left=edge, right=edge, top=edge, bottom=edge)
    column_count = len(output_df.columns)
    for sheet_row in ws.iter_rows(min_row=1, max_row=ws.max_row, max_col=column_count):
        for cell in sheet_row:
            if cell.row == 1:
                cell.font = Font(bold=True)
            cell.border = thin_border

    wb.save(output_file)
    print(f"Reconciled file with formatting saved at: {output_file}")

# Run the reconciliation on the example files; requires file_gian, file_reina and
# output_file to have been assigned by the "Example usage" cell beforehand.
reconcile_annotations_with_loose_matching(file_gian, file_reina, output_file)

Reconciled file with formatting saved at: /Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/src/data_preprocessing/reconciliation_jan2025/reconciled.xlsx


In [60]:
import pandas as pd
from fuzzywuzzy import fuzz
from openpyxl import Workbook
from openpyxl.styles import PatternFill, Alignment, Border, Side, Font
from openpyxl.utils.dataframe import dataframe_to_rows

def reconcile_annotations_with_loose_matching(file_gian, file_reina, output_file, similarity_threshold=80):
    """
    Build a formatted reconciliation workbook (frozen header row) from two
    annotators' Excel files.

    Rows are paired by joining on the shared essay columns. A row is marked
    "Yes" when the two annotations fuzzily match and Reina's value is not equal
    to 0; Gian's annotation is then kept, and Gian's theme is kept if the themes
    also match (otherwise 0). Every other row is marked "No" with 0 in the
    reconciled cells and is shaded red. Gian's 'Human Theme Presence (Yes or No)'
    column is carried over unchanged.

    Parameters:
    - file_gian (str): Path to Gian's Excel file.
    - file_reina (str): Path to Reina's Excel file.
    - output_file (str): Path to save the prepared file.
    - similarity_threshold (int): Minimum fuzz.ratio score for a match (default=80).
    """
    shared_columns = [
        'Essay ID', 'Year', 'Semester', 'Class', 'Type',
        'Section', 'Alma ID', 'Aspirational Present', 'Annotated Essays'
    ]
    presence_column = 'Human Theme Presence (Yes or No)'

    def annotator_view(path, who, own_columns):
        # Read one annotator's workbook and suffix their own columns with their name
        frame = pd.read_excel(path)
        renames = {column: f"{column}_{who}" for column in own_columns}
        return frame[shared_columns + own_columns].rename(columns=renames)

    # Only Gian's theme-presence column is pulled in; Reina's file contributes annotation + theme
    gian_view = annotator_view(
        file_gian, 'Gian', ['Human Annotated Essay', 'Specific Theme(s)', presence_column]
    )
    reina_view = annotator_view(file_reina, 'Reina', ['Human Annotated Essay', 'Specific Theme(s)'])
    merged_df = pd.merge(gian_view, reina_view, on=shared_columns)

    output_df = merged_df[shared_columns].copy()

    # Lower-cased, stripped text copies of each annotator's cells (missing cells become "nan")
    normalized_sources = {
        'Gian Annotations': 'Human Annotated Essay_Gian',
        'Reina Annotations': 'Human Annotated Essay_Reina',
        'Gian Specific Themes': 'Specific Theme(s)_Gian',
        'Reina Specific Themes': 'Specific Theme(s)_Reina',
    }
    for target, source in normalized_sources.items():
        output_df[target] = merged_df[source].astype(str).str.strip().str.lower()

    # Reconciliation columns start out as empty strings and are filled row by row
    for blank_column in ('Reconciled Yes or No', 'Reconciled Annotations', 'Reconciled Specific Themes'):
        output_df[blank_column] = ""

    output_df[presence_column] = merged_df[presence_column + '_Gian']

    def loose_match(annotation1, annotation2):
        # Missing values take part in the comparison as empty strings
        left = "" if pd.isna(annotation1) else str(annotation1)
        right = "" if pd.isna(annotation2) else str(annotation2)
        return fuzz.ratio(left, right) >= similarity_threshold

    for row_label, record in merged_df.iterrows():
        gian_text = record['Human Annotated Essay_Gian']
        reina_text = record['Human Annotated Essay_Reina']
        gian_theme = record['Specific Theme(s)_Gian']
        reina_theme = record['Specific Theme(s)_Reina']

        # A matching pair whose Reina value equals 0 is not reconciled
        # (presumably 0 is the annotators' "nothing annotated" marker — TODO confirm)
        agreed = loose_match(gian_text, reina_text) and not (reina_text == 0)

        if agreed:
            verdict = "Yes"
            kept_annotation = gian_text
            kept_theme = gian_theme if loose_match(gian_theme, reina_theme) else 0
        else:
            verdict, kept_annotation, kept_theme = "No", 0, 0

        output_df.loc[row_label, 'Reconciled Yes or No'] = verdict
        output_df.loc[row_label, 'Reconciled Annotations'] = kept_annotation
        output_df.loc[row_label, 'Reconciled Specific Themes'] = kept_theme

    wb = Workbook()
    ws = wb.active
    ws.title = "Reconciled Data"

    # Sheet row 1 is the header; data row i (label i from the merge) lands on sheet row i + 2
    sheet_rows = dataframe_to_rows(output_df, index=False, header=True)
    for r_idx, values in enumerate(sheet_rows, start=1):
        shade = r_idx > 1 and output_df.loc[r_idx - 2, 'Reconciled Yes or No'] == "No"
        for c_idx, value in enumerate(values, start=1):
            cell = ws.cell(row=r_idx, column=c_idx, value=value)
            cell.alignment = Alignment(wrap_text=True)
            if shade:
                cell.fill = PatternFill(start_color="FFCCCC", end_color="FFCCCC", fill_type="solid")

    # Thin border on every cell; the header row is additionally bold
    edge = Side(style="thin")
    thin_border = Border(left=edge, right=edge, top=edge, bottom=edge)
    column_count = len(output_df.columns)
    for sheet_row in ws.iter_rows(min_row=1, max_row=ws.max_row, max_col=column_count):
        for cell in sheet_row:
            if cell.row == 1:
                cell.font = Font(bold=True)
            cell.border = thin_border

    # Keep the header visible while scrolling
    ws.freeze_panes = ws["A2"]

    wb.save(output_file)
    print(f"Reconciled file saved with formatting and frozen top row at: {output_file}")


# Run the reconciliation on the example files; requires file_gian, file_reina and
# output_file to have been assigned by the "Example usage" cell beforehand.
reconcile_annotations_with_loose_matching(file_gian, file_reina, output_file)

Reconciled file saved with formatting and frozen top row at: /Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/src/data_preprocessing/reconciliation_jan2025/reconciled.xlsx


In [4]:
import pandas as pd
from fuzzywuzzy import fuzz
from openpyxl import Workbook
from openpyxl.styles import PatternFill, Alignment, Border, Side, Font
from openpyxl.utils.dataframe import dataframe_to_rows

def reconcile_annotations_with_loose_matching(file_gian, file_reina, output_file, similarity_threshold=80,
                                              theme_present_column='Navigational Present'):
    """
    Reconcile annotations between two annotators with loose matching and write a
    formatted workbook in which unreconciled rows are flagged.

    Rows are paired by joining the two workbooks on the shared essay columns
    (pd.merge's default inner join). Per joined row:
    - annotations match and Reina's value equals 0 -> "No", with 0 in the
      reconciled annotation/theme cells;
    - annotations match otherwise -> "Yes", Gian's annotation is kept, and
      Gian's theme is kept if the themes also match (else 0);
    - annotations do not match -> "X" in all three reconciled cells, which are
      shaded red for manual review.
    Gian's 'Human Theme Presence (Yes or No)' column is carried over unchanged.

    Parameters:
    - file_gian (str): Path to Gian's Excel file.
    - file_reina (str): Path to Reina's Excel file.
    - output_file (str): Path to save the prepared file.
    - similarity_threshold (int): Minimum fuzz.ratio score for a match (default=80).
    - theme_present_column (str): Name of the theme-presence column shared by both
      workbooks (default='Navigational Present'; e.g. 'Aspirational Present' for
      the Aspirational files).

    Returns:
    - None. The workbook is written to output_file and a confirmation is printed.
    """
    # Local import so the column letters can be derived from column names
    from openpyxl.utils import get_column_letter

    # Load data from both files
    df_gian = pd.read_excel(file_gian)
    df_reina = pd.read_excel(file_reina)

    # Common shared columns
    shared_columns = [
        'Essay ID', 'Year', 'Semester', 'Class', 'Type',
        'Section', 'Alma ID', theme_present_column, 'Annotated Essays'
    ]

    # Merge files on shared columns; only Gian's theme-presence column is pulled in
    merged_df = pd.merge(
        df_gian[shared_columns + ['Human Annotated Essay', 'Specific Theme(s)', 'Human Theme Presence (Yes or No)']].rename(
            columns={
                'Human Annotated Essay': 'Human Annotated Essay_Gian',
                'Specific Theme(s)': 'Specific Theme(s)_Gian',
                'Human Theme Presence (Yes or No)': 'Human Theme Presence (Yes or No)_Gian'
            }
        ),
        df_reina[shared_columns + ['Human Annotated Essay', 'Specific Theme(s)']].rename(
            columns={
                'Human Annotated Essay': 'Human Annotated Essay_Reina',
                'Specific Theme(s)': 'Specific Theme(s)_Reina'
            }
        ),
        on=shared_columns
    )

    # Create a new DataFrame for the output
    output_df = merged_df[shared_columns].copy()

    # Normalized copies of each annotator's values (astype(str) renders missing cells as "nan")
    output_df['Gian Annotations'] = merged_df['Human Annotated Essay_Gian'].astype(str).str.strip().str.lower()
    output_df['Reina Annotations'] = merged_df['Human Annotated Essay_Reina'].astype(str).str.strip().str.lower()
    output_df['Gian Specific Themes'] = merged_df['Specific Theme(s)_Gian'].astype(str).str.strip().str.lower()
    output_df['Reina Specific Themes'] = merged_df['Specific Theme(s)_Reina'].astype(str).str.strip().str.lower()

    # Initialize additional reconciliation columns with empty strings
    reconciled_columns = ['Reconciled Yes or No', 'Reconciled Annotations', 'Reconciled Specific Themes']
    for column in reconciled_columns:
        output_df[column] = ""

    # Copy "Human Theme Presence (Yes or No)" from Gian's annotations
    output_df['Human Theme Presence (Yes or No)'] = merged_df['Human Theme Presence (Yes or No)_Gian']

    def loose_match(annotation1, annotation2):
        # Missing values are compared as empty strings
        annotation1 = str(annotation1) if not pd.isna(annotation1) else ""
        annotation2 = str(annotation2) if not pd.isna(annotation2) else ""
        return fuzz.ratio(annotation1, annotation2) >= similarity_threshold

    # Perform reconciliation (output_df shares merged_df's index, so labels line up)
    for index, row in merged_df.iterrows():
        gian_annotation = row['Human Annotated Essay_Gian']
        reina_annotation = row['Human Annotated Essay_Reina']
        specific_theme_gian = row['Specific Theme(s)_Gian']
        specific_theme_reina = row['Specific Theme(s)_Reina']

        both_match = loose_match(gian_annotation, reina_annotation)
        # A matching pair whose Reina value equals 0 is treated as "nothing to reconcile"
        # (presumably 0 is the annotators' "no annotation" marker — TODO confirm)
        both_empty = both_match and reina_annotation == 0

        if both_empty:
            output_df.loc[index, 'Reconciled Yes or No'] = "No"
            output_df.loc[index, 'Reconciled Annotations'] = 0
            output_df.loc[index, 'Reconciled Specific Themes'] = 0
        elif both_match:
            output_df.loc[index, 'Reconciled Yes or No'] = "Yes"
            output_df.loc[index, 'Reconciled Annotations'] = gian_annotation
            output_df.loc[index, 'Reconciled Specific Themes'] = specific_theme_gian if loose_match(specific_theme_gian, specific_theme_reina) else 0
        else:
            output_df.loc[index, 'Reconciled Yes or No'] = "X"
            output_df.loc[index, 'Reconciled Annotations'] = "X"
            output_df.loc[index, 'Reconciled Specific Themes'] = "X"

    # Save to Excel with formatting and conditional red highlight
    wb = Workbook()
    ws = wb.active
    ws.title = "Reconciled Data"

    # Widen the long-text columns. The letters are derived from the column names
    # (for the layout built above: I, J, K at width 40 and N, O, P at width 30),
    # so they stay correct if the column order changes.
    column_widths = {'Annotated Essays': 40, 'Gian Annotations': 40, 'Reina Annotations': 40}
    column_widths.update({column: 30 for column in reconciled_columns})
    for column, width in column_widths.items():
        letter = get_column_letter(output_df.columns.get_loc(column) + 1)
        ws.column_dimensions[letter].width = width

    # Loop-invariant formatting inputs, computed once instead of once per cell
    highlight_fill = PatternFill(start_color="FFCCCC", end_color="FFCCCC", fill_type="solid")
    highlight_positions = {output_df.columns.get_loc(column) + 1 for column in reconciled_columns}
    # True for rows whose three reconciled cells all contain "X"
    needs_review = (output_df[reconciled_columns] == "X").all(axis=1).tolist()

    # Write the DataFrame to the worksheet; sheet row 1 is the header, so data row i is sheet row i + 2
    for r_idx, row in enumerate(dataframe_to_rows(output_df, index=False, header=True), start=1):
        flag_row = r_idx > 1 and needs_review[r_idx - 2]
        for c_idx, value in enumerate(row, start=1):
            cell = ws.cell(row=r_idx, column=c_idx, value=value)
            cell.alignment = Alignment(wrap_text=True)  # Enable text wrapping
            # Only the three reconciled cells of a flagged row are shaded; cells of a
            # new workbook start without a fill, so nothing has to be cleared elsewhere.
            if flag_row and c_idx in highlight_positions:
                cell.fill = highlight_fill

    # Add borders and bold header row
    thin_border = Border(left=Side(style="thin"), right=Side(style="thin"), top=Side(style="thin"), bottom=Side(style="thin"))
    for row in ws.iter_rows(min_row=1, max_row=1, max_col=len(output_df.columns)):
        for cell in row:
            cell.font = Font(bold=True)
            cell.border = thin_border

    for row in ws.iter_rows(min_row=2, max_row=ws.max_row, max_col=len(output_df.columns)):
        for cell in row:
            cell.border = thin_border

    # Freeze the top row
    ws.freeze_panes = ws["A2"]

    # Save the workbook
    wb.save(output_file)
    print(f"Reconciled file saved with formatting and red highlights at: {output_file}")


# reconcile_annotations_with_loose_matching(file_gian, file_reina, output_file)


In [5]:
import os
import pandas as pd

def reconcile_annotations_in_folder(gian_root, reina_root, output_root, similarity_threshold=80):
    """
    Walk Gian's annotation tree and reconcile every "Essay1" workbook that has a
    counterpart at the same relative path under Reina's tree.

    For each pair, the result is written under output_root at the same relative
    path, with "_reconciled.xlsx" replacing the original extension. Progress,
    missing counterparts and per-file errors are reported with print(); an error
    in one pair does not stop the remaining pairs.

    Parameters:
    - gian_root (str): Root directory for Gian's annotations.
    - reina_root (str): Root directory for Reina's annotations.
    - output_root (str): Root directory for reconciled output files.
    - similarity_threshold (int): Minimum similarity score passed on to
      reconcile_annotations_with_loose_matching (default=80).
    """
    for current_dir, _subdirs, filenames in os.walk(gian_root):
        for filename in filenames:
            # Only Excel workbooks whose name mentions Essay1 are reconciled
            is_candidate = filename.endswith(".xlsx") and "Essay1" in filename
            if not is_candidate:
                continue

            gian_path = os.path.join(current_dir, filename)
            relative_path = os.path.relpath(gian_path, gian_root)
            reina_path = os.path.join(reina_root, relative_path)

            if not os.path.exists(reina_path):
                print(f"No matching file for: {gian_path} in {reina_root}")
                continue

            # Mirror the relative location under output_root and swap the extension
            stem, _extension = os.path.splitext(os.path.join(output_root, relative_path))
            reconciled_path = stem + "_reconciled.xlsx"
            os.makedirs(os.path.dirname(reconciled_path), exist_ok=True)

            try:
                reconcile_annotations_with_loose_matching(
                    file_gian=gian_path,
                    file_reina=reina_path,
                    output_file=reconciled_path,
                    similarity_threshold=similarity_threshold,
                )
                print(f"Reconciled: {reconciled_path}")
            except Exception as e:
                # Report and move on so one bad workbook does not abort the batch
                print(f"Error reconciling {gian_path} and {reina_path}: {e}")

# Example usage
# Example usage: mirror-image directory trees, one per annotator, plus the root
# under which the "_reconciled.xlsx" workbooks are written.
# NOTE(review): absolute, machine-specific paths; output_root contains "Data/Data" —
# confirm the doubled folder is intended.
gian_root = '/Users/gbaldonado/Library/CloudStorage/Box-Box/ALMA Project/ALMA 2024/Data/gian_to_be_validated/Navigational/Spring 2020'
reina_root = '/Users/gbaldonado/Library/CloudStorage/Box-Box/ALMA Project/ALMA 2024/Data/reina_to_be_validated/Navigational/Spring 2020'
output_root = "/Users/gbaldonado/Library/CloudStorage/Box-Box/ALMA Project/ALMA 2024/Data/Data/reconciled/Navigational/Spring 2020"

# Uses whichever reconcile_annotations_with_loose_matching definition was executed
# most recently in the kernel, with the default similarity threshold.
reconcile_annotations_in_folder(gian_root, reina_root, output_root)

Reconciled file saved with formatting and red highlights at: /Users/gbaldonado/Library/CloudStorage/Box-Box/ALMA Project/ALMA 2024/Data/Data/reconciled/Navigational/Spring 2020/SCI 0227/Section 02/Spring2020_SCI0227-02_Essay1_anonymized_Navigational_tacited_reconciled.xlsx
Reconciled: /Users/gbaldonado/Library/CloudStorage/Box-Box/ALMA Project/ALMA 2024/Data/Data/reconciled/Navigational/Spring 2020/SCI 0227/Section 02/Spring2020_SCI0227-02_Essay1_anonymized_Navigational_tacited_reconciled.xlsx
Reconciled file saved with formatting and red highlights at: /Users/gbaldonado/Library/CloudStorage/Box-Box/ALMA Project/ALMA 2024/Data/Data/reconciled/Navigational/Spring 2020/SCI 0227/Section 01/Spring2020_SCI0227-01_Essay1_anonymized_Navigational_tacited_reconciled.xlsx
Reconciled: /Users/gbaldonado/Library/CloudStorage/Box-Box/ALMA Project/ALMA 2024/Data/Data/reconciled/Navigational/Spring 2020/SCI 0227/Section 01/Spring2020_SCI0227-01_Essay1_anonymized_Navigational_tacited_reconciled.xlsx
Re