In [None]:
import sys
from glob import glob
from collections import defaultdict

sys.path.append("/scratch/group/csce435-f23/python-3.8.17/lib/python3.8/site-packages")
sys.path.append("/scratch/group/csce435-f23/thicket")

import pandas as pd

import thicket as th

In [None]:
# Remove pandas' row and column display limits so the frames shown later in
# this notebook are rendered in full instead of being truncated.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

# POINT THIS AT YOUR DATA

In [None]:
# Point this at the top directory of all your cali files for all of the
# implementations. Give the directory itself, NOT a glob pattern: every check
# below appends "/**/*.cali" to this value and searches it recursively, so a
# pattern such as "cali_files/*.cali" would only match files nested inside
# directories whose own names end in ".cali".
# Other files can be in the directory too, that is ok.
FILES_LOCATION = "cali_files"

# Reader check
Can each file be read in on its own (one-by-one)?

In [None]:
# Try to load every .cali file on its own, so that one unreadable file cannot
# hide the state of the others.
#   working_files -- paths that Thicket could read
#   error_files   -- paths that raised while being read
working_files = []
error_files = []
team_files = glob(f"{FILES_LOCATION}/**/*.cali", recursive=True)
for file in team_files:
    try:
        th.Thicket.from_caliperreader(file)
    except Exception as err:
        # Best-effort check: keep going, but record the file and say why it
        # failed instead of discarding the reason.
        error_files.append(file)
        print(f"Could not read '{file}': {type(err).__name__}: {err}")
    else:
        working_files.append(file)

In [None]:
# Summarise the one-by-one reader check as "readable/total (percent%)".
print("Files that could be read in individually (one-by-one):")
if team_files:
    print(f"{len(working_files)}/{len(team_files)} ({len(working_files)/len(team_files)*100}%)")
else:
    # With no matching files the percentage is undefined (it would divide by
    # zero), so report the empty search instead.
    print(f"0/0 (no .cali files were found under '{FILES_LOCATION}')")

# Check for Metadata columns

Check for the necessary metadata columns from the [report](https://github.com/TAMU-CSCE435-Pearce/Project/blob/master/Report.md#3b-collect-metadata)

In [None]:
# For every .cali file, work out which required metadata columns are absent.
#   team_metadata_valid -- files that have every required column
#   metadata_col_dict   -- profile name -> list of missing column names
team_metadata_valid = []
# Columns required of every profile, whatever its programming model.
metadata_columns = ['cali.caliper.version', 'spot.options', 'spot.channels', 'cali.channel',
    'launchdate',
    'libraries', 'cmdline', 'cluster', 'Algorithm', 'ProgrammingModel',
    'Datatype', 'SizeOfDatatype', 'InputSize', 'InputType',
    'group_num', 'implementation_source']
# Extra columns required when ProgrammingModel does not contain "CUDA".
mpi_cols = ['num_procs']
# Extra columns required when ProgrammingModel contains "CUDA".
cuda_cols = ['num_threads', 'num_blocks']
metadata_col_dict = defaultdict(list)

team_files = glob(f"{FILES_LOCATION}/**/*.cali", recursive=True)
for file in team_files:
    try:
        tk = th.Thicket.from_caliperreader(file)
    except Exception as err:
        # A file that cannot be read has no metadata to inspect; say so and
        # move on rather than aborting the whole check.
        print(f"Skipping '{file}' (could not be read): {type(err).__name__}: {err}")
        continue

    cols = tk.metadata.columns
    if "ProgrammingModel" not in cols:
        # Without the model we cannot tell whether the MPI or the CUDA extras
        # apply, so only the shared columns are checked. "ProgrammingModel"
        # is one of them, so the file is still reported as incomplete.
        model_to_check = metadata_columns
    elif "CUDA" in tk.metadata["ProgrammingModel"].to_list()[0].upper():
        model_to_check = metadata_columns + cuda_cols
    else:
        model_to_check = metadata_columns + mpi_cols

    missing = [col for col in model_to_check if col not in cols]
    if missing:
        # Keyed by the first profile name recorded in the Thicket.
        metadata_col_dict[list(tk.profile_mapping.values())[0]].extend(missing)
    else:
        team_metadata_valid.append(file)

for file, cols in metadata_col_dict.items():
    print(f"File '{file}' missing metadata columns:\n\t{cols}")

# Check for DataFrame columns

Check for the necessary DataFrame columns from the [report](https://github.com/TAMU-CSCE435-Pearce/Project/blob/master/Report.md#4c-you-should-measure-the-following-performance-metrics). For the GPU columns, you need one or the other column in the tuple, not both.

In [None]:
# Files whose performance DataFrame passed the column check.
team_dataframe_valid = []

# Timing metrics required of every profile, whatever its programming model.
necessary_columns = [
    "Min time/rank",
    "Max time/rank",
    "Avg time/rank",
    "Total time",
]
# Required only when the programming model is not CUDA.
not_gpu_columns = ["Variance time/rank"]
# Required only for CUDA profiles. Each tuple holds two accepted spellings of
# one metric; having either of them satisfies the requirement.
gpu_columns = [
    ("Avg GPU time/rank", "Avg GPU Time/rank"),
    ("Min GPU time/rank", "Min GPU Time/rank"),
    ("Max GPU time/rank", "Max GPU Time/rank"),
    ("Total GPU time", "Total GPU Time"),
]


def check_df_cols(tk, dict):
    """Report which required performance columns a Thicket is missing.

    Args:
        tk: object exposing ``dataframe.columns``, ``metadata`` and
            ``profile_mapping`` (a Thicket in this notebook).
        dict: mapping of profile name -> list; every missing entry is
            appended under the first value of ``tk.profile_mapping``. A
            missing GPU metric is recorded as its whole tuple of spellings.

    Returns:
        True when nothing was missing, False otherwise. The model-specific
        columns are only checked when the metadata has a "ProgrammingModel"
        column.
    """
    present = tk.dataframe.columns
    flagged = []

    def flag(entry):
        # The profile name is looked up only once something is really missing.
        dict[list(tk.profile_mapping.values())[0]].append(entry)
        flagged.append(entry)

    for required in necessary_columns:
        if required not in present:
            flag(required)

    if "ProgrammingModel" not in tk.metadata.columns:
        return not flagged

    model_name = tk.metadata["ProgrammingModel"].to_list()[0]
    if "CUDA" in model_name.upper():
        for spellings in gpu_columns:
            if not any(spelling in present for spelling in spellings):
                flag(spellings)
    else:
        for required in not_gpu_columns:
            if required not in present:
                flag(required)
    return not flagged

# Run check_df_cols over every .cali file.
#   dataframe_col_dict   -- profile name -> list of missing DataFrame columns
#   team_dataframe_valid -- files for which nothing was missing
dataframe_col_dict = defaultdict(list)
team_files = glob(f"{FILES_LOCATION}/**/*.cali", recursive=True)
for file in team_files:
    try:
        tk = th.Thicket.from_caliperreader(file)
    except Exception as err:
        # A file that cannot be read has no DataFrame to inspect; say so and
        # move on rather than aborting the whole check.
        print(f"Skipping '{file}' (could not be read): {type(err).__name__}: {err}")
        continue
    if check_df_cols(tk, dataframe_col_dict):
        team_dataframe_valid.append(file)

for file, cols in dataframe_col_dict.items():
    print(f"File '{file}' missing dataframe columns: {cols}")

# Try all files together

In [None]:
# Load the whole list of discovered .cali files in a single call, rather than
# one file at a time as the checks above do. The resulting `tk` is what the
# tree and groupby cells below operate on.
tk = th.Thicket.from_caliperreader(team_files)

# Check tree

The call tree printed below should be no different from the one in the [report](https://github.com/TAMU-CSCE435-Pearce/Project/blob/master/Report.md#3a-caliper-instrumentation), spelling and all.

In [None]:
# Write a constant "time" column into the aggregated statistics frame, then
# print the call tree.
# NOTE(review): presumably tree() needs a "time" metric column to render and
# the value 1 is only a placeholder — confirm against the Thicket docs.
tk.statsframe.dataframe["time"] = 1
print(tk.tree())

In [None]:
# Group by programming model. Should result in 2 thickets, MPI and CUDA.
# The next cell looks the groups up under exactly the keys "MPI" and "CUDA",
# so the ProgrammingModel metadata values must be spelled that way.
gb_pmodel = tk.groupby("ProgrammingModel")

In [None]:
# Split each programming model further by the parameters we ran with. CUDA
# profiles are additionally keyed by num_threads and MPI profiles by
# num_procs. After this operation, each Thicket in gb_total should contain
# profiles with unique InputSizes (there should be no duplicate input sizes).
shared_group_keys = ["ProgrammingModel", "Algorithm", "InputType"]
gb_cuda = gb_pmodel["CUDA"].groupby(shared_group_keys + ["num_threads"])
gb_mpi = gb_pmodel["MPI"].groupby(shared_group_keys + ["num_procs"])

# Merge both groupings into one plain dict; on a key clash the MPI entry wins.
gb_total = dict(gb_cuda)
gb_total.update(gb_mpi)

In [None]:
# Compose all of the data back together, one column group per entry of
# gb_total, matched up on the InputSize metadata value. If this step errors,
# you probably have duplicate input sizes. Run section 1A to check for this.
column_headers = list(gb_total)
column_thickets = [gb_total[header] for header in column_headers]
ctk = th.Thicket.concat_thickets(
    thickets=column_thickets,
    axis="columns",
    headers=column_headers,
    metadata_key="InputSize",
)

In [None]:
# Preview the first 50 rows of the composed performance data.
ctk.dataframe.head(50)

# 1A

Check for duplicate input sizes

In [None]:
# For every group in gb_total, print its position, its profiles and their
# InputSize metadata, then explicitly flag any InputSize value that occurs
# more than once within the group.
for i, key in enumerate(gb_total):
    group = gb_total[key]
    input_sizes = group.metadata["InputSize"]
    print(i)
    print(group.profile_mapping)
    print(input_sizes)
    # keep=False marks every occurrence of a repeated value, not just the later ones.
    repeated = input_sizes[input_sizes.duplicated(keep=False)]
    if not repeated.empty:
        print(f"Duplicate InputSize values in group {key}: {repeated.unique().tolist()}")