In [1]:
import os
import re
import numpy as np
import pandas as pd

from plotnine import *
from plotnine import options

In [2]:
# Column names accepted as `main_key` by consolidate_dataframes.
MAIN_KEY_SIZE = 'size'
MAIN_KEY_APPROACH = 'approach'

# Display/sort order for the PHASE column: entries earlier in this list sort
# first when PHASE is turned into an ordered categorical.
phase_order = [
    # Individual product entries.
    'RADIANCE', 'REFLECTANCE', 'ALBEDO', 'NDVI', 'PAI', 'LAI',
    'ENB_EMISSIVITY', 'EO_EMISSIVITY', 'EA_EMISSIVITY',
    'SURFACE_TEMPERATURE',
    'SHORT_WAVE_RADIATION',
    'LARGE_WAVE_RADIATION_SURFACE', 'LARGE_WAVE_RADIATION_ATMOSPHERE',
    'NET_RADIATION', 'SOIL_HEAT_FLUX',
    'PIXEL_FILTER',
    'D0', 'ZOM', 'USTAR', 'KB1',
    'RAH_INI', 'RAH_CYCLE',
    'SENSIBLE_HEAT_FLUX', 'LATENT_HEAT_FLUX',
    'NET_RADIATION_24H', 'EVAPOTRANSPIRATION_24H',
] + [
    # "P"-prefixed entries, ending with the overall total.
    'P0_READ_INPUT', 'P1_INITIAL_PROD', 'P2_PIXEL_SEL', 'P3_RAH',
    'P4_FINAL_PROD', 'P5_COPY_HOST', 'P6_SAVE_PRODS',
    'P_TOTAL',
]

# Auxiliary functions

In [3]:
def rename_phases(df, column_name='PHASE'):
    """Return a copy of ``df`` whose phase labels are replaced by short names.

    Args:
        df: DataFrame holding the phase labels.
        column_name: Name of the column to relabel (default ``'PHASE'``).

    Returns:
        A new DataFrame; ``df`` itself is left untouched. Labels that have no
        entry in the mapping are kept unchanged.
    """
    short_names = {
        'ENB_EMISSIVITY': 'ENB',
        'EO_EMISSIVITY': 'EO',
        'EA_EMISSIVITY': 'EA',
        'SURFACE_TEMPERATURE': 'ST',
        'SHORT_WAVE_RADIATION': 'SR',
        'LARGE_WAVE_RADIATION_SURFACE': 'LR_SHORT',
        'LARGE_WAVE_RADIATION_ATMOSPHERE': 'LR_LONG',
        'NET_RADIATION': 'Rn',
        'SOIL_HEAT_FLUX': 'G',
        'SENSIBLE_HEAT_FLUX': 'H',
        'LATENT_HEAT_FLUX': 'LE',
        'NET_RADIATION_24H': 'Rn_24H',
        'EVAPOTRANSPIRATION_24H': 'EVAPO_24H',
        'P0_READ_INPUT': 'READ_INPUT',
        'P1_INITIAL_PROD': 'INITIAL_PROD',
        'P2_PIXEL_SEL': 'PIXEL_SEL',
        'P3_RAH': 'RAH',
        'P4_FINAL_PROD': 'FINAL_PROD',
        'P5_COPY_HOST': 'COPY_HOST',
        'P6_SAVE_PRODS': 'SAVE_PRODS',
        'P_TOTAL': 'TOTAL',
    }

    # Whole-value replacement: only cells equal to a key are rewritten.
    relabelled = df[column_name].replace(short_names)

    # Work on a copy so the caller's frame is not modified.
    result = df.copy()
    result[column_name] = relabelled
    return result

In [4]:
def get_mode(series):
    """Return the mode of ``series``, or ``None`` when it has no mode.

    When several values tie, the first one listed by ``Series.mode()`` is
    returned.
    """
    modes = series.mode()
    if modes.empty:
        return None
    return modes.iloc[0]

def combine_dfs(base_dir):
    """Collect the PHASE/TIMESTAMP columns of every ``time.csv`` under ``base_dir``.

    The directory tree is walked recursively; sub-directories are visited in
    sorted order so the row order of the result does not depend on the
    filesystem's listing order.

    Args:
        base_dir: Root directory to search.

    Returns:
        A DataFrame with columns ``PHASE`` and ``TIMESTAMP`` and a fresh
        0..n-1 index. When no ``time.csv`` is found (including when
        ``base_dir`` does not exist, which ``os.walk`` silently ignores) the
        frame is empty but still has both columns, so callers can group or
        select on them without a ``KeyError``.

    Raises:
        KeyError: If a ``time.csv`` lacks the ``PHASE`` or ``TIMESTAMP`` column.
    """
    wanted_columns = ['PHASE', 'TIMESTAMP']
    dataframes = []

    for root, dirs, files in os.walk(base_dir):
        # Sorting in place steers os.walk's (top-down) traversal order.
        dirs.sort()
        if 'time.csv' in files:
            df = pd.read_csv(os.path.join(root, 'time.csv'))
            dataframes.append(df[wanted_columns])

    if not dataframes:
        # Keep the schema even when there is nothing to read.
        return pd.DataFrame({
            'PHASE': pd.Series(dtype='object'),
            'TIMESTAMP': pd.Series(dtype='float64'),
        })

    return pd.concat(dataframes, ignore_index=True)

In [5]:
def extract_size(key):
    """Return the run of digits that ends ``key`` as an int, or ``None`` if there is none."""
    trailing_digits = re.search(r'(\d+)$', key)
    if trailing_digits is None:
        return None
    return int(trailing_digits.group(1))

def consolidate_dataframes(dic_mean_times, main_key=MAIN_KEY_SIZE, size=7295):
    """Pivot per-experiment timing frames into one table with a column per experiment.

    Args:
        dic_mean_times: Mapping of experiment key -> DataFrame with at least
            the columns ``PHASE`` and ``TIMESTAMP``. The frames are not
            modified.
        main_key: ``MAIN_KEY_SIZE`` to index the result by a ``size`` column,
            or ``MAIN_KEY_APPROACH`` to index it by an ``approach`` column
            holding the part of the experiment key before the first ``'-'``.
        size: Value written to the ``size`` column when ``main_key`` is
            ``MAIN_KEY_SIZE``. Defaults to 7295, the value previously
            hard-coded here.

    Returns:
        A DataFrame with columns ``[main_key, 'PHASE', <one per experiment>]``
        whose cells are the mean ``TIMESTAMP`` for that phase and experiment.

    Raises:
        ValueError: If ``main_key`` is not one of the two supported keys, or
            (from ``pd.concat``) if ``dic_mean_times`` is empty.
    """
    if main_key not in (MAIN_KEY_SIZE, MAIN_KEY_APPROACH):
        raise ValueError(
            f"main_key must be {MAIN_KEY_SIZE!r} or {MAIN_KEY_APPROACH!r}, got {main_key!r}"
        )

    consolidated_data = []
    for key, df in dic_mean_times.items():
        if main_key == MAIN_KEY_SIZE:
            main_value = size
        else:
            main_value = key.split('-')[0]

        # assign() returns a new frame, so the caller's DataFrame keeps its
        # original columns.
        tagged = df[['PHASE', 'TIMESTAMP']].assign(
            **{main_key: main_value, 'experiment': key}
        )
        consolidated_data.append(tagged[[main_key, 'PHASE', 'TIMESTAMP', 'experiment']])

    combined_df = pd.concat(consolidated_data, ignore_index=True)

    # One row per (main_key, PHASE), one column per experiment.
    pivot_df = combined_df.pivot_table(
        index=[main_key, 'PHASE'],
        columns='experiment',
        values='TIMESTAMP',
        aggfunc='mean'
    ).reset_index()

    # Drop the 'experiment' label that pivot_table leaves on the column axis.
    pivot_df.columns.name = None

    return pivot_df

# Summarize

In [6]:
# Base output directory -> experiment sub-directories named "<implementation>-<n>".
directories = {
    "./output": ["serial-0", "serial-1", "kernels-0", "kernels-1", "cutensor-0", "cutensor-1", "hybrid-0", "hybrid-1"]
}

# Numeric directory suffix -> method label used in the experiment keys.
method_by_suffix = {'0': 'steep', '1': 'sebal'}

# Raw timings per experiment, keyed "<implementation>-<method>".
dic_combined = dict()
for base_dir, subdirs in directories.items():
    for subdir in subdirs:
        # Only the part after the last '-' is translated, so digits elsewhere
        # in the directory name are left alone.
        prefix, dash, suffix = subdir.rpartition('-')
        experiment = f"{prefix}{dash}{method_by_suffix.get(suffix, suffix)}"
        dic_combined[experiment] = combine_dfs(os.path.join(base_dir, subdir))

# Mean TIMESTAMP per phase for each experiment, rows ordered as in phase_order.
dic_summarized = dict()
for key, combined_df in dic_combined.items():
    summarized = combined_df.groupby('PHASE')['TIMESTAMP'].mean().reset_index()

    # Ordered categorical so sort_values follows phase_order rather than the
    # alphabet. Phases missing from phase_order become NaN here.
    summarized['PHASE'] = pd.Categorical(
        summarized['PHASE'],
        categories=phase_order,
        ordered=True
    )

    dic_summarized[key] = summarized.sort_values('PHASE').reset_index(drop=True)


# Individual products

In [15]:
# Tag each raw frame (in place) with the problem size and with the two parts
# of its "<LANG>-<METHOD>" experiment key.
for key, df in dic_combined.items():
    df['SIZE'] = 7295
    df['LANG'] = key.split('-')[0]
    df['METHOD'] = key.split('-')[1]

# Stack all experiments, shorten the phase names, then order the rows.
combined_dff = (
    rename_phases(pd.concat(dic_combined.values(), ignore_index=True))
    .sort_values(['LANG', 'PHASE', 'METHOD'])
)

# One frame per METHOD, without rows containing missing values.
sebal = combined_dff.loc[combined_dff['METHOD'].eq('sebal')].dropna()
steep = combined_dff.loc[combined_dff['METHOD'].eq('steep')].dropna()

# Per-LANG subsets of the sebal rows.
cpp_sebal = sebal.loc[sebal['LANG'].eq('serial')]
kernels_sebal = sebal.loc[sebal['LANG'].eq('kernels')]
hybrid_sebal = sebal.loc[sebal['LANG'].eq('hybrid')]
cutensor_sebal = sebal.loc[sebal['LANG'].eq('cutensor')]

# Per-LANG subsets of the steep rows.
cpp_steep = steep.loc[steep['LANG'].eq('serial')]
kernels_steep = steep.loc[steep['LANG'].eq('kernels')]
hybrid_steep = steep.loc[steep['LANG'].eq('hybrid')]
cutensor_steep = steep.loc[steep['LANG'].eq('cutensor')]

In [18]:
# Display the combined, renamed and sorted timing frame (pandas abbreviates
# long frames in the rendered output).
combined_dff

Unnamed: 0,PHASE,TIMESTAMP,SIZE,LANG,METHOD
155842,ALBEDO,5.809152,7295,cutensor,sebal
155873,ALBEDO,5.784576,7295,cutensor,sebal
155904,ALBEDO,5.788672,7295,cutensor,sebal
155935,ALBEDO,5.776384,7295,cutensor,sebal
155966,ALBEDO,5.775360,7295,cutensor,sebal
...,...,...,...,...,...
31535,ZOM,557.034912,7295,serial,steep
31568,ZOM,561.514038,7295,serial,steep
31601,ZOM,558.232483,7295,serial,steep
31634,ZOM,560.279724,7295,serial,steep


# Phases prods

In [19]:
# Tag each raw frame (in place) with the problem size and with the two parts
# of its "<LANG>-<METHOD>" experiment key.
for key, df in dic_combined.items():
    df['SIZE'] = 7295
    df['LANG'] = key.split('-')[0]
    df['METHOD'] = key.split('-')[1]

p_combined_dff = pd.concat(dic_combined.values(), ignore_index=True)
# Keep only phases named "P<char>_...".
# NOTE(review): this pattern needs one character between "P" and "_", so it
# does not match 'P_TOTAL' — confirm whether totals are meant to be kept here.
p_combined_dff = p_combined_dff[p_combined_dff['PHASE'].str.match(r'^P.\_')]

# Non-serial implementations: fold the P5_COPY_HOST time into P6_SAVE_PRODS and
# drop the P5 rows (presumably so the save stage is comparable with the serial
# runs — confirm). The .copy() makes this an independent frame, so the in-place
# update below does not write through a slice of p_combined_dff.
p_combined_dff_gpu = p_combined_dff[p_combined_dff['LANG'] != 'serial'].copy()
is_save = p_combined_dff_gpu['PHASE'] == 'P6_SAVE_PRODS'
is_copy = p_combined_dff_gpu['PHASE'] == 'P5_COPY_HOST'
# The rows are paired purely by position, so both groups must be the same
# length; otherwise NumPy would either raise or silently broadcast one value.
if is_save.sum() != is_copy.sum():
    raise ValueError(
        f"Cannot pair P5_COPY_HOST with P6_SAVE_PRODS rows: "
        f"{int(is_copy.sum())} vs {int(is_save.sum())}"
    )
p_combined_dff_gpu.loc[is_save, 'TIMESTAMP'] += p_combined_dff_gpu.loc[is_copy, 'TIMESTAMP'].values
p_combined_dff_gpu = p_combined_dff_gpu[~is_copy]

p_combined_dff_cpu = p_combined_dff[p_combined_dff['LANG'] == 'serial']

p_combined_dff = pd.concat([p_combined_dff_gpu, p_combined_dff_cpu], ignore_index=True)
p_combined_dff = rename_phases(p_combined_dff)

p_combined_dff = p_combined_dff.sort_values(['LANG', 'PHASE', 'METHOD'])

# Select by METHOD. The METHOD column only ever holds 'sebal' or 'steep'
# (taken from the experiment key), so the filter value must be 'sebal'.
p_sebal = p_combined_dff[p_combined_dff['METHOD'] == 'sebal'].dropna()
p_steep = p_combined_dff[p_combined_dff['METHOD'] == 'steep'].dropna()

# Select by LANG
cpp_sebal = p_sebal[p_sebal['LANG'] == 'serial']
kernels_sebal = p_sebal[p_sebal['LANG'] == 'kernels']
hybrid_sebal = p_sebal[p_sebal['LANG'] == 'hybrid']
cutensor_sebal = p_sebal[p_sebal['LANG'] == 'cutensor']

cpp_steep = p_steep[p_steep['LANG'] == 'serial']
kernels_steep = p_steep[p_steep['LANG'] == 'kernels']
hybrid_steep = p_steep[p_steep['LANG'] == 'hybrid']
cutensor_steep = p_steep[p_steep['LANG'] == 'cutensor']

In [20]:
# List the distinct PHASE labels present in p_combined_dff.
p_combined_dff['PHASE'].unique()

array(['FINAL_PROD', 'INITIAL_PROD', 'PIXEL_SEL', 'RAH', 'READ_INPUT',
       'SAVE_PRODS', 'TOTAL'], dtype=object)

## Total 

In [24]:
# Tag each raw frame (in place) with the problem size and with the two parts
# of its "<LANG>-<METHOD>" experiment key.
for key, df in dic_combined.items():
    df['SIZE'] = 7295
    df['LANG'] = key.split('-')[0]
    df['METHOD'] = key.split('-')[1]

# Stack every experiment, keep only the P_TOTAL rows, then shorten the label.
t_combined_dff = rename_phases(
    pd.concat(dic_combined.values(), ignore_index=True)
    .loc[lambda frame: frame['PHASE'] == 'P_TOTAL']
)

In [25]:
# Display the P_TOTAL rows (relabelled 'TOTAL') of every experiment.
t_combined_dff

Unnamed: 0,PHASE,TIMESTAMP,SIZE,LANG,METHOD
32,TOTAL,29861.90,7295,serial,steep
65,TOTAL,29828.10,7295,serial,steep
98,TOTAL,29796.10,7295,serial,steep
131,TOTAL,29825.10,7295,serial,steep
164,TOTAL,29893.30,7295,serial,steep
...,...,...,...,...,...
248558,TOTAL,3029.19,7295,hybrid,sebal
248589,TOTAL,3040.36,7295,hybrid,sebal
248620,TOTAL,3025.49,7295,hybrid,sebal
248651,TOTAL,3027.68,7295,hybrid,sebal


# Speedup

In [12]:
# Median TIMESTAMP per (PHASE, METHOD) pair, computed separately for each LANG.
serial, cutensor, kernels, hybrid = (
    p_combined_dff[p_combined_dff['LANG'] == lang]
    .groupby(['PHASE', 'METHOD'])['TIMESTAMP']
    .median()
    for lang in ('serial', 'cutensor', 'kernels', 'hybrid')
)

# Side-by-side medians, aligned on the (PHASE, METHOD) index.
gpu_speedup = pd.DataFrame({
    'serial': serial,
    'cutensor': cutensor,
    'kernels': kernels,
    'hybrid': hybrid
})

# Overwrite each non-serial column with serial / column, rounded to 2 decimals.
# The 'serial' column keeps its median times.
gpu_speedup['cutensor'] = (gpu_speedup['serial'] / gpu_speedup['cutensor']).round(2)
gpu_speedup['kernels'] = (gpu_speedup['serial'] / gpu_speedup['kernels']).round(2)
gpu_speedup['hybrid'] = (gpu_speedup['serial'] / gpu_speedup['hybrid']).round(2)

# Turn the PHASE / METHOD index levels back into ordinary columns.
gpu_speedup_reset = gpu_speedup.reset_index()

# Long format: one row per (PHASE, METHOD, LANG) holding its SPEEDUP.
gpu_speedup_long = (
    gpu_speedup_reset
    .melt(
        id_vars=['PHASE', 'METHOD'],
        value_vars=['cutensor', 'kernels', 'hybrid'],
        var_name='LANG',
        value_name='SPEEDUP'
    )
    .sort_values(['LANG', 'PHASE', 'METHOD'])
)
gpu_speedup_long

Unnamed: 0,PHASE,METHOD,LANG,SPEEDUP
0,FINAL_PROD,sebal,cutensor,106.04
1,FINAL_PROD,steep,cutensor,106.01
2,INITIAL_PROD,sebal,cutensor,96.46
3,INITIAL_PROD,steep,cutensor,96.07
4,PIXEL_SEL,sebal,cutensor,108.18
5,PIXEL_SEL,steep,cutensor,849.71
6,RAH,sebal,cutensor,39.64
7,RAH,steep,cutensor,47.17
8,READ_INPUT,sebal,cutensor,0.84
9,READ_INPUT,steep,cutensor,0.84


# Save dataframes

In [13]:
# Per-phase means of every experiment, pivoted to one column per experiment.
mean_df = consolidate_dataframes(dic_summarized, main_key=MAIN_KEY_SIZE)

# combined_dff followed by p_combined_dff, re-indexed from zero.
all_dff = pd.concat([combined_dff, p_combined_dff], ignore_index=True)

# Write each table to its own workbook in the current working directory.
for workbook, frame in (
    ("landsat-mean.xlsx", mean_df),
    ("landsat-prods.xlsx", combined_dff),
    ("landsat-phases.xlsx", p_combined_dff),
    ("landsat-total.xlsx", t_combined_dff),
    ("landsat-speedup.xlsx", gpu_speedup_long),
    ("landsat-all.xlsx", all_dff),
):
    frame.to_excel(workbook)

