# Preparing Reconciled Files Into a Single Dataframe for the Model

*Author: Gian Carlo L. Baldonado*

In [1]:
import xlwings as xw
import os
import glob

In [2]:
# Input file paths
# Path to one reconciled workbook (macro-enabled .xlsm) inside the dummy_input folder.
# NOTE(review): no module-level code in the visible cells reads this variable;
# confirm it is still needed.
reconciled_file = "/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input/Fall2020_PHYS0102-02_Essay1_anonymized_tacited_reconciled.xlsm"

### Process reconciled file into a dataframe for the model

cols = "sentence", "label", "phrase"

In [3]:
def process_reconciled_labels(k_col, l_col, m_col):
    """
    Fill the reconciled-label column from the two annotators' label columns.

    input:
    * reina's labels -> k_col (its first cell is skipped)
    * lokesh's labels -> l_col (its first cell is skipped)
    * reconciled column range -> m_col (used from its first cell)

    Rows are paired positionally and pairing stops at the shortest input.
    When both labels are equal the shared value is written to the
    reconciled cell. Otherwise the cell is set to "X" with black text on a
    red fill so the disagreement stands out.
    """
    print("Processing labels...")

    paired_rows = zip(k_col[1:], l_col[1:], m_col)
    for first_label, second_label, reconciled in paired_rows:
        agreed_value = first_label.value

        # Agreement: carry the common label over and move on.
        if agreed_value == second_label.value:
            reconciled.value = agreed_value
            continue

        # Disagreement: mark the row as undetermined.
        reconciled.value = "X"
        reconciled.font.color = (0, 0, 0)  # black text
        reconciled.color = (255, 0, 0)  # red cell fill

    print("Done!")

def merge_strings(a, b):
    """
    Helper function to merge two similar strings (used for Case 4).

    Both strings are split on whitespace. The leading words they share are
    kept, followed by every remaining word of ``b``, and the result is
    joined with single spaces.

    NOTE(review): because the shared prefix is also the start of ``b``, the
    result is always the words of ``b`` re-joined with single spaces; words
    of ``a`` after the first difference are never included. Confirm this is
    the intended merge rule.
    """
    tokens_a = a.split()
    tokens_b = b.split()

    # Count how many leading words are identical in both strings.
    shared = 0
    limit = min(len(tokens_a), len(tokens_b))
    while shared < limit and tokens_a[shared] == tokens_b[shared]:
        shared += 1

    # Shared prefix first, then whatever is left of b.
    return " ".join(tokens_a[:shared] + tokens_b[shared:])


# merged_string = merge_strings(cell_n15, cell_o15)

# #Return the string to P
# print(merged_string)

def process_reconciled_annotations(n_col, o_col, p_col, q_col, r_col, combined_sheet):
    """
    Reconcile the two annotators' red-text annotations row by row.

    input:
    * reina's annotations -> n_col (its first cell is skipped)
    * lokesh's annotations -> o_col (its first cell is skipped)
    * reina's red-only annotations -> p_col
    * lokesh's red-only annotations -> q_col
    * reconciled column r range -> r_col
    * sheet on which the P/Q formulas are written and read back -> combined_sheet

    For each row a ``GetColorText`` formula is written to the P and Q cells
    (addressed on ``combined_sheet``) and the resulting values are read back.
    The formula yields 0 when ``GetColorText`` returns an empty string.
    NOTE(review): ``GetColorText`` is not defined in this file; presumably it
    is a workbook macro that extracts the red text - confirm.

    The reconciled cell is then filled as follows:
    * Case 2 and 3 - both red-only values are equal: that value is used.
    * Case 4 - both are different strings: ``merge_strings`` combines them.
    * Case 1 - anything else: the full annotation of whichever annotator has
      a string in the red-only column is used (reina first, then lokesh,
      else 0) and the cell is highlighted yellow for manual review.
    """
    print("Processing annotations...")

    for n_cell, o_cell, p_cell, q_cell, r_cell in zip(n_col[1:], o_col[1:], p_col, q_col, r_col):

        n_cell_address = n_cell.address  # Reina's annotation cell, e.g. N1
        o_cell_address = o_cell.address  # Lokesh's annotation cell, e.g. O1
        p_cell_address = p_cell.address  # receives Reina's red-only text
        q_cell_address = q_cell.address  # receives Lokesh's red-only text

        # Formulas that keep only the coloured text, or 0 when there is none.
        n_cell_color_only = f"=IF(GetColorText({n_cell_address})=\"\",0,GetColorText({n_cell_address}))"
        o_cell_color_only = f"=IF(GetColorText({o_cell_address})=\"\",0,GetColorText({o_cell_address}))"

        combined_sheet.range(p_cell_address).value = n_cell_color_only
        combined_sheet.range(q_cell_address).value = o_cell_color_only

        # Read back what the formulas evaluated to.
        p_cell_value = combined_sheet.range(p_cell_address).value
        q_cell_value = combined_sheet.range(q_cell_address).value

        # Case 2 and 3: identical results (both 0 or the same red-only text).
        if p_cell_value == q_cell_value:
            r_cell.value = p_cell_value
        # Case 4: two different pieces of text. Only real strings are merged;
        # any other value (e.g. None) would make merge_strings fail on
        # .split(), so it is routed to the review branch below instead.
        elif isinstance(p_cell_value, str) and isinstance(q_cell_value, str):
            r_cell.value = merge_strings(p_cell_value, q_cell_value)
        # Case 1: only one side (or neither) produced text.
        else:
            if isinstance(p_cell.value, str):
                fallback = n_cell.value
            elif isinstance(q_cell.value, str):
                fallback = o_cell.value
            else:
                fallback = 0
            r_cell.value = fallback
            r_cell.font.color = (0, 0, 0)  # black text
            r_cell.color = (255, 255, 0)  # yellow cell fill marks it for review

    print("Done!")

In [4]:
def process_reconciled_files(reconciled_files_folder_path,
                             output_path="./final_processed_file_version_2.xlsm"):
    """
    Combine every reconciled workbook under a folder into one workbook.

    All ``*.xlsm`` files below ``reconciled_files_folder_path`` are found
    recursively and processed in sorted path order. From the active sheet of
    each one, the contiguous runs starting at J2, M2 and R2 are copied into
    columns A ("essay"), B ("label") and C ("phrase") of a new workbook,
    below the rows already copied.

    input:
    * folder searched recursively for .xlsm files -> reconciled_files_folder_path
    * where the combined workbook is saved -> output_path

    Returns the combined xlwings Book, which is left open after saving.
    """
    print("Starting process")
    print("Loading workbooks...")

    # create a list of inputs
    input_dir = sorted(glob.glob(f"{reconciled_files_folder_path}/**/*.xlsm", recursive=True))

    print(input_dir)

    # Combined sheet, with the header in row 1.
    output_wb = xw.Book()
    output_sheet = output_wb.sheets.active

    output_sheet.range("A1").value = "essay"
    output_sheet.range("B1").value = "label"
    output_sheet.range("C1").value = "phrase"

    # Next free output row, tracked explicitly. Looking it up with
    # range('A2').end('down') starts from an empty cell on the first file and
    # is evaluated per column, so the three columns could land on different
    # rows.
    next_row = 2

    for reconciled_file in input_dir:
        print(f"Processing {reconciled_file}")
        wb = xw.Book(reconciled_file)
        try:
            sheet = wb.sheets.active

            # expand("down") stops at the first empty cell of each column.
            sentence = sheet.range("J2").expand("down")
            label = sheet.range("M2").expand("down")
            phrase = sheet.range("R2").expand("down")

            sentence.copy(output_sheet.range(f"A{next_row}"))
            label.copy(output_sheet.range(f"B{next_row}"))
            phrase.copy(output_sheet.range(f"C{next_row}"))

            # Advance past the longest column so the next file starts on a
            # row that is free in all three columns.
            next_row += max(sentence.rows.count, label.rows.count, phrase.rows.count)
        finally:
            # Always release the source workbook, even if a copy failed.
            wb.close()

    output_wb.save(output_path)

    print("Finished processing reconciled files.")

    return output_wb
    



In [16]:
# Folder holding the batch of reconciled workbooks under review.
# NOTE(review): this name shadows Python's built-in input() for the rest of
# the session; consider renaming it together with the call that passes it.
input = "/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/reconciled/attainment/batch_being_reviewed"

In [14]:
# Folder of dummy input workbooks.
# NOTE(review): presumably an alternative argument for trial runs - confirm.
dummy_input = "/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/finalizing_data/dummy_input"

In [22]:
# Combine every reconciled workbook found under the batch folder; the returned
# Book is shown as the cell output.
process_reconciled_files(input)

Starting process
Loading workbooks...
['/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/reconciled/attainment/batch_being_reviewed/Fall 2020/ASTR 0116/Section 02/Fall2020_ASTR0116-02_Essay1_anonymized_tacited_reconciled.xlsm', '/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/reconciled/attainment/batch_being_reviewed/Fall 2020/PHYS 0112/Section 01/Fall2020_PHYS0112-01_Essay1_anonymized_tacited_reconciled.xlsm', '/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/reconciled/attainment/batch_being_reviewed/Fall 2020/PHYS 0112/Section 02/Fall2020_PHYS0112-02_Essay1_anonymized_tacited_reconciled.xlsm', '/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/reconciled/attainment/batch_being_reviewed/Fall 2020/PHYS 0112/Section 03/Fall2020_PHYS0112-03_Essay1_anonymized_tacited_reconciled.xlsm', '/Users/gbaldonado/Library/Containers/com.microsoft.Excel/Data/reconciled/attainment/batch_being_reviewed/Fall 2020/PHYS 0112/Section 04/Fall2020_PHYS011

<Book [final_processed_file_version_2.xlsm]>

: 