In [221]:
import os
import pandas as pd
import numpy as np

### LOADING & STORING DATA

In [222]:
# load the aggregated "all data" tables for each set, before and after
# NOTE(review): these four reads are relative to the working directory, while the
# per-file folders below are absolute paths - confirm both point at the same data.
set1_all_before = pd.read_csv('set_1/AllDataBefore.csv')
set1_all_after = pd.read_csv('set_1/AllDataAfter.csv')
set2_all_before = pd.read_csv('set_2/AllDataBefore.csv')
set2_all_after = pd.read_csv('set_2/AllDataAfter.csv')

# folders holding one CSV per file, for each set and each of before/after
# NOTE(review): machine-specific absolute paths; the notebook will not run elsewhere
# until these are made relative or configurable.
set1_before_folder_path = '/Users/andrewcarranti/CODE/AI_CODE/SPRING/set_1/before/'
set1_after_folder_path = '/Users/andrewcarranti/CODE/AI_CODE/SPRING/set_1/after/'
set2_before_folder_path = '/Users/andrewcarranti/CODE/AI_CODE/SPRING/set_2/before/'
set2_after_folder_path = '/Users/andrewcarranti/CODE/AI_CODE/SPRING/set_2/after/'

# one dictionary per folder, mapping CSV file name -> data frame read from that file
set1_before_data_dict = {}
set1_after_data_dict = {}
set2_before_data_dict = {}
set2_after_data_dict = {}
folder_paths_dicts = [(set1_before_folder_path, set1_before_data_dict), (set1_after_folder_path, set1_after_data_dict),
                      (set2_before_folder_path, set2_before_data_dict), (set2_after_folder_path, set2_after_data_dict)]

# read every CSV in each folder into that folder's dictionary
for folder_path, data_dict in folder_paths_dicts:
    # sorted() makes the load order (and so the dictionary order) the same on every run
    for filename in sorted(os.listdir(folder_path)):
        # Only real CSV files, and not those prefixed with "_".
        # The previous test, `endswith(...) & startswith(...) == False`, parsed as
        # `(endswith & startswith) == False` and therefore also accepted non-CSV files.
        if filename.endswith('.csv') and not filename.startswith('_'):
            # store the data frame in the dictionary with the filename as the key
            data_dict[filename] = pd.read_csv(os.path.join(folder_path, filename))

### CHECK

In [223]:
# sanity check: each set should have loaded as many "before" files as "after" files
for label, loaded_before, loaded_after in (
    ("set1 before & after:", set1_before_data_dict, set1_after_data_dict),
    ("set2 before & after:", set2_before_data_dict, set2_after_data_dict),
):
    print(label, len(loaded_before), "&", len(loaded_after))

set1 before & after: 192 & 192
set2 before & after: 236 & 236


### SETUP

In [224]:
# empty result containers: one per combination of set, trend (decreased / increased)
# and side (before / after)
set1_decreased_before_dict, set1_decreased_after_dict = {}, {}
set1_increased_before_dict, set1_increased_after_dict = {}, {}

set2_decreased_before_dict, set2_decreased_after_dict = {}, {}
set2_increased_before_dict, set2_increased_after_dict = {}, {}

# one tuple per set, in this order:
# (label, decreased-before, decreased-after, increased-before, increased-after,
#  loaded "before" data, loaded "after" data)
metrics_dicts = [
    (
        "set1",
        set1_decreased_before_dict, set1_decreased_after_dict,
        set1_increased_before_dict, set1_increased_after_dict,
        set1_before_data_dict, set1_after_data_dict,
    ),
    (
        "set2",
        set2_decreased_before_dict, set2_decreased_after_dict,
        set2_increased_before_dict, set2_increased_after_dict,
        set2_before_data_dict, set2_after_data_dict,
    ),
]

### IDENTIFY DECREASES (INCREASES) IN METRICS : IMPROVEMENT (DEGRADATION)

In [225]:
def _strip_suffix(column, suffix):
    """Return `column` without its trailing `suffix`; unchanged if it does not end with it.

    `str.rstrip` is deliberately not used: it strips a *set of characters*, so
    "Complexity_y".rstrip("_y") would yield "Complexit".
    """
    return column[:-len(suffix)] if column.endswith(suffix) else column


def _classes_by_metric_trend(before_df, after_df):
    """Split the classes common to two metric tables by how their metrics moved.

    Rows whose "Kind" contains "Class" are kept from each table and inner-joined on
    "Name". A class counts as "decreased" when more than half of its metric columns
    are lower in `after_df` than in `before_df`, and as "increased" when more than
    half are higher.

    Assumes every shared column other than "Kind" and "Name" is a comparable
    (numeric) metric - TODO confirm against the CSV schema.

    Returns:
        None when the two tables share no class, otherwise the tuple
        (decreased_before, decreased_after, increased_before, increased_after).
    """
    # keep only the class rows; na=False treats a missing "Kind" as "not a class"
    before_class_df = before_df[before_df['Kind'].str.contains("Class", na=False)]
    after_class_df = after_df[after_df['Kind'].str.contains("Class", na=False)]

    # skip when either side has no classes at all
    if before_class_df.empty or after_class_df.empty:
        return None

    # inner merge based on name; shared columns come back suffixed "_x" (before) / "_y" (after)
    merged_df = pd.merge(before_class_df, after_class_df, on='Name', how='inner')

    # skip when the two sides have no class in common
    if merged_df.empty:
        return None

    # separate the merged frame back into its two sides, each led by the "Name" column
    before_columns = ["Name"] + [col for col in merged_df.columns if col.endswith("_x")]
    after_columns = ["Name"] + [col for col in merged_df.columns if col.endswith("_y")]
    before_class_df = merged_df[before_columns].rename(columns=lambda col: _strip_suffix(col, "_x"))
    after_class_df = merged_df[after_columns].rename(columns=lambda col: _strip_suffix(col, "_y"))

    # drop the "Kind" and "Name" columns so the remaining columns can be compared
    before_metrics_df = before_class_df.drop(columns=['Kind', 'Name'])
    # put the "after" metrics in the same column order so the comparison lines up
    after_metrics_df = after_class_df.drop(columns=['Kind', 'Name'])[before_metrics_df.columns]

    # a class qualifies when a strict majority of its metrics moved in that direction
    majority = before_metrics_df.shape[1] / 2
    decreased_mask = (before_metrics_df > after_metrics_df).sum(axis=1) > majority
    increased_mask = (before_metrics_df < after_metrics_df).sum(axis=1) > majority

    return (before_class_df[decreased_mask], after_class_df[decreased_mask],
            before_class_df[increased_mask], after_class_df[increased_mask])


# iterate over the two sets
# (the label is named `set_name` so the builtin `set` is not shadowed)
for set_name, decreased_before_metrics_dict, decreased_after_metrics_dict, increased_before_metrics_dict, increased_after_metrics_dict, before_data_dict, after_data_dict in metrics_dicts:

    # iterate over the "before" data frames
    for before_key, before_df in before_data_dict.items():

        # get the corresponding "after" data frame
        after_key = before_key.replace("_before.csv", "_after.csv")
        after_df = after_data_dict.get(after_key)

        # a "before" file without a matching "after" file cannot be compared
        if after_df is None:
            print(set_name, "- no 'after' file for", before_key, "- skipped")
            continue

        trend_frames = _classes_by_metric_trend(before_df, after_df)

        # skip the file if the two versions have no common classes
        if trend_frames is None:
            continue

        decreased_before_df, decreased_after_df, increased_before_df, increased_after_df = trend_frames

        # Record this file's results before moving on to the next file.
        # The before/after frames of a trend hold the same rows, so one emptiness
        # check covers both. The dictionaries are the ones listed in `metrics_dicts`
        # for the current set.
        if not decreased_before_df.empty:
            decreased_before_metrics_dict[before_key] = decreased_before_df
            decreased_after_metrics_dict[after_key] = decreased_after_df

        if not increased_before_df.empty:
            increased_before_metrics_dict[before_key] = increased_before_df
            increased_after_metrics_dict[after_key] = increased_after_df

### EXPORTING

In [226]:
# each result dictionary paired with the name of the folder its files are written to
dictionaries = [
    (set1_decreased_before_dict, "set1_decreased_before_dict"),
    (set1_decreased_after_dict, "set1_decreased_after_dict"),
    (set1_increased_before_dict, "set1_increased_before_dict"),
    (set1_increased_after_dict, "set1_increased_after_dict"),
    (set2_decreased_before_dict, "set2_decreased_before_dict"),
    (set2_decreased_after_dict, "set2_decreased_after_dict"),
    (set2_increased_before_dict, "set2_increased_before_dict"),
    (set2_increased_after_dict, "set2_increased_after_dict"),
]

for frames_by_file, folder_name in dictionaries:

    # nothing to write for this category, so its folder is not created either
    if not frames_by_file:
        continue

    # make sure the output folder is there before writing into it
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    # write one CSV per entry, named after the dictionary key, without the index column
    for csv_name, frame in frames_by_file.items():
        frame.to_csv(os.path.join(folder_name, csv_name), index=False)

print("Export complete!")

Export complete!
