# Preparing Reconciled Files Into Single Dataframe for Reina

*Author: Gian Carlo L. Baldonado*

In [23]:
import xlwings as xw
import os
import glob
from collections import defaultdict

In [87]:
def extract_semester_and_course_from_path(file_paths):
    "Extract semester and course info for sheets"
    semester_course_list = []
    for path in file_paths:
        parts = path.split('/')
        # Extracting "Fall 2020" and the course info like "ASTR 0116"
        semester = parts[-4]  # "Fall 2020"
        course = parts[-3]    # "ASTR 0116" or "PHYS 0112"
        semester_course_list.append((semester, course))
    return semester_course_list

In [285]:
def get_files_by_semester_and_course(base_path):
    files_by_semester_and_course = defaultdict(list)
    
    # Construct the pattern to match all Excel files in the folder hierarchy
    pattern = os.path.join(base_path, '**', '*.xlsx')
    pattern_m = os.path.join(base_path, '**', '*.xlsm')
    
    # Get all matching file paths
    file_paths = glob.glob(pattern, recursive=True) + glob.glob(pattern_m, recursive=True)
    
    # Filter out temporary files
    file_paths = [fp for fp in file_paths if not os.path.basename(fp).startswith('~$')]
    
    # Extract semester and course info
    semester_course_list = extract_semester_and_course_from_path(file_paths)
    
    for (semester, course), file_path in zip(semester_course_list, file_paths):
        files_by_semester_and_course[(semester, course)].append(file_path)
    
    return files_by_semester_and_course


files_grouped = get_files_by_semester_and_course(dummy_input)

for key, file_list in files_grouped.items():
    semester, course = key
    print(f"Semester: {semester}, Course: {course}")
    for file_path in file_list:
        print(f"  - {file_path}")

Semester: Fall 2020, Course: ASTR 0116
  - /Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input_with_folder_hierarchy/Fall 2020/ASTR 0116/Section 02/Fall2020_ASTR0116-02_Essay1_anonymized_tacited_reconciled.xlsm
Semester: Fall 2020, Course: PHYS 0112
  - /Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input_with_folder_hierarchy/Fall 2020/PHYS 0112/Section 02/Fall2020_PHYS0112-02_Essay1_anonymized_tacited_reconciled.xlsm
  - /Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input_with_folder_hierarchy/Fall 2020/PHYS 0112/Section 01/Fall2020_PHYS0112-01_Essay1_anonymized_tacited_reconciled.xlsm
Semester: Fall 2021, Course: PHYS 0102
  - /Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input_with_folder_hierarchy/Fall 2021/PHYS 0102/Section 02/Fall2021_PHYS0102-02_Essay1_anonymized_tacited_reconciled.xlsm
  - /Users/gbaldonado/Library/Containers/co

In [286]:
files_grouped

defaultdict(list,
            {('Fall 2020',
              'ASTR 0116'): ['/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input_with_folder_hierarchy/Fall 2020/ASTR 0116/Section 02/Fall2020_ASTR0116-02_Essay1_anonymized_tacited_reconciled.xlsm'],
             ('Fall 2020',
              'PHYS 0112'): ['/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input_with_folder_hierarchy/Fall 2020/PHYS 0112/Section 02/Fall2020_PHYS0112-02_Essay1_anonymized_tacited_reconciled.xlsm', '/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input_with_folder_hierarchy/Fall 2020/PHYS 0112/Section 01/Fall2020_PHYS0112-01_Essay1_anonymized_tacited_reconciled.xlsm'],
             ('Fall 2021',
              'PHYS 0102'): ['/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input_with_folder_hierarchy/Fall 2021/PHYS 0102/Section 02/Fall2021_PHYS0102-02_Essay1_anonymized_ta

In [287]:
for i in list(files_grouped.values()):
    print(i)

['/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input_with_folder_hierarchy/Fall 2020/ASTR 0116/Section 02/Fall2020_ASTR0116-02_Essay1_anonymized_tacited_reconciled.xlsm']
['/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input_with_folder_hierarchy/Fall 2020/PHYS 0112/Section 02/Fall2020_PHYS0112-02_Essay1_anonymized_tacited_reconciled.xlsm', '/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input_with_folder_hierarchy/Fall 2020/PHYS 0112/Section 01/Fall2020_PHYS0112-01_Essay1_anonymized_tacited_reconciled.xlsm']
['/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input_with_folder_hierarchy/Fall 2021/PHYS 0102/Section 02/Fall2021_PHYS0102-02_Essay1_anonymized_tacited_reconciled.xlsm', '/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input_with_folder_hierarchy/Fall 2021/PHYS 0102/Section 01/Fall2021_PHYS0102

In [288]:
len(list(files_grouped.values()))

3

In [487]:
def process_reconciled_files(reconciled_files_folder_path):
    print("Starting process")
    print("Loading workbooks...")

    input_dir_grouped_by_semester_course_dict = get_files_by_semester_and_course(reconciled_files_folder_path)

    # Instantiate the output workbook
    output_wb = xw.Book()
    for key, file_list in input_dir_grouped_by_semester_course_dict.items():
            semester, course = key
            sheet_name = f"{semester} - {course}"
            print("Sheet Name: ", sheet_name)
            output_sheet = output_wb.sheets.add(sheet_name)

            output_sheet.range("A1").value = ["Essay ID", "TACIT Theme Present", "TACITED Essay", "Reina Theme Present", "Lokesh Theme Present", 
                                              "Reconciled Theme Present", "Reina Phrase Only", "Lokesh Phrase Only", "Reconciled Phrase"]

            for reconciled_file in file_list:
                print(f"Processing {reconciled_file}")
                wb = xw.Book(reconciled_file)
                sheet = wb.sheets.active

                # Retrieving data from the current sheet
                essay_id = sheet.range('B2').expand("down").value
                tacited_theme_present = sheet.range('I2').expand("down").value
                sentence = sheet.range("J2").expand("down").value
                reina_theme_present = sheet.range("K2").expand("down").value
                lokesh_theme_present = sheet.range("L2").expand("down").value
                label = sheet.range("M2").expand("down").value
                reina_phrase = sheet.range("P2").expand("down").value
                lokesh_phrase = sheet.range("Q2").expand("down").value
                phrase = sheet.range("R2").expand("down").value 


                      # Ensure data is lists
                if not isinstance(sentence, list):
                    sentence = [sentence]
                if not isinstance(label, list):
                    label = [label]
                if not isinstance(phrase, list):
                    phrase = [phrase]


            #    # Log data lengths
            #     print(f"Data lengths - Sentences: {len(sentence)}, Labels: {len(label)}, Phrases: {len(phrase)}")

                # Find the next available row in the output sheet
                last_row_A = output_sheet.range('A' + str(output_sheet.cells.last_cell.row)).end('up').row
                last_row_B = output_sheet.range('B' + str(output_sheet.cells.last_cell.row)).end('up').row
                last_row_C = output_sheet.range('C' + str(output_sheet.cells.last_cell.row)).end('up').row
                last_row_D = output_sheet.range('D' + str(output_sheet.cells.last_cell.row)).end('up').row
                last_row_E = output_sheet.range('E' + str(output_sheet.cells.last_cell.row)).end('up').row
                last_row_F = output_sheet.range('F' + str(output_sheet.cells.last_cell.row)).end('up').row
                last_row_G = output_sheet.range('G' + str(output_sheet.cells.last_cell.row)).end('up').row
                last_row_H = output_sheet.range('H' + str(output_sheet.cells.last_cell.row)).end('up').row
                last_row_I = output_sheet.range('I' + str(output_sheet.cells.last_cell.row)).end('up').row

                next_row = max(last_row_A, last_row_B, last_row_C, last_row_D, last_row_E, last_row_F, last_row_G, last_row_H, last_row_I) + 1

                # print(f"Last rows - A: {last_row_A}, B: {last_row_B}, C: {last_row_C}")
                print(f"Next row to start data append: {next_row}")

                # Append the data to the output sheet vertically
                output_sheet.range(f'A{next_row}').options(transpose=True).value = essay_id
                output_sheet.range(f'B{next_row}').options(transpose=True).value = tacited_theme_present
                output_sheet.range(f'C{next_row}').options(transpose=True).value = sentence
                output_sheet.range(f'D{next_row}').options(transpose=True).value = reina_theme_present
                output_sheet.range(f'E{next_row}').options(transpose=True).value = lokesh_theme_present
                output_sheet.range(f'F{next_row}').options(transpose=True).value = label
                output_sheet.range(f'G{next_row}').options(transpose=True).value = reina_phrase
                output_sheet.range(f'H{next_row}').options(transpose=True).value = lokesh_phrase
                output_sheet.range(f'I{next_row}').options(transpose=True).value = phrase
                

                wb.close()


    # Save the output workbook (you can choose a specific filename)
    output_wb.save("reconciled_data.xlsx")
    print("Process completed!")

In [488]:
input = "/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/reconciled/attainment/batch_2_reconciled"

In [489]:
dummy_input = "/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input_with_folder_hierarchy"

In [491]:
process_reconciled_files(dummy_input)

Starting process
Loading workbooks...
Sheet Name:  Fall 2020 - ASTR 0116
Processing /Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input_with_folder_hierarchy/Fall 2020/ASTR 0116/Section 02/Fall2020_ASTR0116-02_Essay1_anonymized_tacited_reconciled.xlsm
Next row to start data append: 2
Sheet Name:  Fall 2020 - PHYS 0112
Processing /Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input_with_folder_hierarchy/Fall 2020/PHYS 0112/Section 02/Fall2020_PHYS0112-02_Essay1_anonymized_tacited_reconciled.xlsm
Next row to start data append: 2
Processing /Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input_with_folder_hierarchy/Fall 2020/PHYS 0112/Section 01/Fall2020_PHYS0112-01_Essay1_anonymized_tacited_reconciled.xlsm
Next row to start data append: 34
Sheet Name:  Fall 2021 - PHYS 0102
Processing /Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input_with_

In [None]:
def process_reconciled_files(reconciled_files_folder_path):
    # Load both workbooks
    print("Starting process")
    print("Loading workbooks...")

    input_dir_grouped_by_semester_course_dict = get_files_by_semester_and_course(reconciled_files_folder_path)

    # Instantiate the output workbook
    output_wb = xw.Book()

    for key,file_list in input_dir_grouped_by_semester_course_dict.items():
        semester, course = key
        sheet_name = f"{semester} - {course}"
        print("Sheet Name ", sheet_name)
        output_sheet = output_wb.sheets.add(sheet_name)
      
        # output_sheet = output_wb.sheets[sheet_name] # create a sheet with the semester and course name

        # Set the column names starting from A1
        output_sheet.range('A1').value = "Essay ID"
        output_sheet.range("B1").value = "Theme Present"
        # output_sheet.range("C1").value = "Tacit-Annotated Essay"
        # output_sheet.range("D1").value = "Reina Theme Present"
        # output_sheet.range("E1").value = "Lokesh Theme Present"
        # output_sheet.range("F1").value = "Label"
        # output_sheet.range("G1").value = "Reina Red-only Annotation"
        # output_sheet.range("H1").value = "Lokesh Red-only Annotation"
        # output_sheet.range("I1").value = "Reconciled Red-only Annotation"
        # Initialize xlwings app

        for input_path in file_list:
            print(f"Processing {input_path}")
            wb = xw.Book(input_path)
            sheet = wb.sheets.active

            # Retrieving data from the current sheet
            essay_id = sheet.range("B2").expand("down").value
            theme_presence = sheet.range("I2").expand("down").value
            # Retrieve other data if needed

            # Find the last row with data in the output sheet
            last_row = output_sheet.range('A' + str(output_sheet.cells.last_cell.row)).end('up').row

            # Append the data to the output sheet
            output_sheet.range(f'A{last_row + 1}').value = essay_id
            output_sheet.range(f'B{last_row + 1}').value = theme_presence
        # Save and close the output workbook
        output_wb.save("./final_processed_file_version_2.xlsm")
        output_sheet.book.close()

        print("Finished processing reconciled files.")
            
    return output_wb
    

