# Report Aggregation

In [1]:
import os
import pandas as pd
import altair as alt

from dotenv import load_dotenv
from pymongo import MongoClient

In [2]:
def connect_to_mongodb(collection_name, secrets_path: str = os.path.join('.', 'secrets.env')):
    """Open a MongoDB connection and return it together with one collection.

    The connection settings are read from the environment after loading the
    dotenv file at ``secrets_path``:

    * ``CONNECTION_STRING`` - MongoDB URI passed to ``MongoClient``.
    * ``DB_NAME`` - name of the database that holds the collection.

    Args:
        collection_name: Name of the collection to look up in the database.
        secrets_path: Path of the dotenv file with the settings above.

    Returns:
        A ``(client, collection)`` tuple. The caller owns ``client`` and is
        responsible for closing it.

    Raises:
        ValueError: If ``CONNECTION_STRING`` or ``DB_NAME`` is missing or empty.
    """
    load_dotenv(secrets_path)  # Take environment variables from the dotenv file

    connection_string = os.getenv("CONNECTION_STRING")
    db_name = os.getenv("DB_NAME")

    # Fail early with an actionable message: without this check a missing
    # connection string makes MongoClient fall back to its default host, and a
    # missing database name surfaces as an unrelated-looking pymongo error.
    missing_settings = [
        setting_name
        for setting_name, setting_value in (
            ("CONNECTION_STRING", connection_string),
            ("DB_NAME", db_name),
        )
        if not setting_value
    ]
    if missing_settings:
        raise ValueError(
            f"Missing MongoDB setting(s) {', '.join(missing_settings)}; "
            f"define them in the environment or in {secrets_path!r}"
        )

    client = MongoClient(connection_string)
    collection = client[db_name][collection_name]

    return client, collection

In [3]:
def get_experiment_results(experiment_name, save_locally: bool = True):
    """Return the results of one experiment, read from its local CSV cache.

    Args:
        experiment_name: Name of the MongoDB collection that stores the
            results; also used as the sub-folder of ``../results`` in which
            ``exp_results.csv`` is kept.
        save_locally: When True, every document of the collection is first
            downloaded from MongoDB and written to ``exp_results.csv``
            (overwriting an existing file). When False, MongoDB is not
            contacted and only the existing CSV is read.

    Returns:
        pandas.DataFrame parsed from ``exp_results.csv``. The frame is always
        read back from disk, so both modes yield identically typed columns.

    Raises:
        FileNotFoundError: If ``save_locally`` is False and the CSV is absent.
    """
    results_path = os.path.join('..', 'results', experiment_name)
    csv_path = os.path.join(results_path, 'exp_results.csv')

    if save_locally:
        client, collection = connect_to_mongodb(experiment_name)
        try:
            # Drop MongoDB's internal '_id' field; it is not part of the results.
            records = [
                {key: value for key, value in record.items() if key != '_id'}
                for record in collection.find({})
            ]
        finally:
            # Release the connection even when the download fails.
            client.close()

        os.makedirs(results_path, exist_ok=True)
        pd.DataFrame(records).to_csv(csv_path, index=False)

    return pd.read_csv(csv_path)

## Read experiment results from MongoDB

In [4]:
# Experiment to analyse; passed to get_experiment_results() as the experiment name.
EXPERIMENT_FOLDER_NAME = "experiment_1"

In [5]:
# Read the locally cached CSV so the reported numbers can be reproduced without
# database access. Pass save_locally=True instead to download the results from
# MongoDB again and overwrite the cache.
exp_results_df = get_experiment_results(
    experiment_name=EXPERIMENT_FOLDER_NAME,
    save_locally=False,
)

In [6]:
# The CSV stores both timestamps as text such as "19-08-2023, 15:38:30.472318";
# parse them into datetimes so they can be subtracted and aggregated.
exp_results_df = exp_results_df.assign(**{
    timestamp_column: pd.to_datetime(
        exp_results_df[timestamp_column],
        format="%d-%m-%Y, %H:%M:%S.%f",
    )
    for timestamp_column in ('sending_datetime', 'processing_datetime')
})

In [7]:
# Per-message latency in seconds: time elapsed between sending a message and
# finishing its processing.
exp_results_df['processing_time'] = (
    exp_results_df['processing_datetime']
    .sub(exp_results_df['sending_datetime'])
    .dt.total_seconds()
)
exp_results_df.head(n=10)

Unnamed: 0,reddit_id,sending_datetime,processing_datetime,size_in_mb,configuration,consumer_uuid,processing_time
0,2832,2023-08-19 15:38:30.472318,2023-08-19 15:38:31.512748,0.000343,prod1_part1_cons1,02e5cf2b-e4b7-40e8-9d67-3daf1fb96c9b,1.04043
1,2833,2023-08-19 15:38:30.508985,2023-08-19 15:38:32.577785,0.000343,prod1_part1_cons1,02e5cf2b-e4b7-40e8-9d67-3daf1fb96c9b,2.0688
2,2832,2023-08-19 15:38:30.512244,2023-08-19 15:38:33.585153,0.000343,prod1_part1_cons1,02e5cf2b-e4b7-40e8-9d67-3daf1fb96c9b,3.072909
3,2832,2023-08-19 15:38:30.514797,2023-08-19 15:38:34.599989,0.000343,prod1_part1_cons1,02e5cf2b-e4b7-40e8-9d67-3daf1fb96c9b,4.085192
4,2831,2023-08-19 15:38:30.517096,2023-08-19 15:38:35.617617,0.000343,prod1_part1_cons1,02e5cf2b-e4b7-40e8-9d67-3daf1fb96c9b,5.100521
5,2830,2023-08-19 15:38:30.519552,2023-08-19 15:38:36.631708,0.000343,prod1_part1_cons1,02e5cf2b-e4b7-40e8-9d67-3daf1fb96c9b,6.112156
6,2830,2023-08-19 15:38:30.522444,2023-08-19 15:38:37.645585,0.000343,prod1_part1_cons1,02e5cf2b-e4b7-40e8-9d67-3daf1fb96c9b,7.123141
7,2829,2023-08-19 15:38:30.525101,2023-08-19 15:38:38.660138,0.000343,prod1_part1_cons1,02e5cf2b-e4b7-40e8-9d67-3daf1fb96c9b,8.135037
8,2829,2023-08-19 15:38:30.527539,2023-08-19 15:38:39.673661,0.000343,prod1_part1_cons1,02e5cf2b-e4b7-40e8-9d67-3daf1fb96c9b,9.146122
9,2828,2023-08-19 15:38:30.529833,2023-08-19 15:38:40.688414,0.000343,prod1_part1_cons1,02e5cf2b-e4b7-40e8-9d67-3daf1fb96c9b,10.158581


In [8]:
# Summarise every configuration in a single grouped pass instead of filtering
# the full frame several times per configuration and appending rows one by one.
#   max_latency        - largest per-message processing_time
#   throughput_per_sec - total size_in_mb divided by the wall-clock seconds
#                        between the first send and the last processed message
# sort=False keeps the configurations in order of first appearance, and the
# resulting metric columns are numeric rather than object dtype.
data_for_plots_df = (
    exp_results_df
    .groupby('configuration', sort=False)
    .agg(
        total_size_in_mb=('size_in_mb', 'sum'),
        first_sent=('sending_datetime', 'min'),
        last_processed=('processing_datetime', 'max'),
        max_latency=('processing_time', 'max'),
    )
    .assign(
        throughput_per_sec=lambda per_config: (
            per_config['total_size_in_mb']
            / (per_config['last_processed'] - per_config['first_sent']).dt.total_seconds()
        )
    )
    .reset_index()
    .loc[:, ['configuration', 'throughput_per_sec', 'max_latency']]
)

In [9]:
# Display the per-configuration summary (one row per configuration).
data_for_plots_df

Unnamed: 0,configuration,throughput_per_sec,max_latency
0,prod1_part1_cons1,0.000339,202.02852
1,prod1_part1_cons2,0.00034,201.352701
2,prod1_part2_cons2,0.000648,105.521583
3,prod1_part5_cons5,0.001384,49.110928
4,prod1_part10_cons1,0.00034,201.434826
5,prod1_part10_cons5,0.001619,41.959496
6,prod1_part10_cons10,0.002507,26.949177
7,prod2_part10_cons10,0.002609,25.609825


## Visualize the average throughput of the system in Mbps

In [10]:
# Horizontal bar chart of throughput per configuration, highest throughput on top.
# NOTE(review): the plotted value is size_in_mb per second; confirm that the
# "Mbps" axis label matches the unit of size_in_mb.
alt.Chart(data_for_plots_df).mark_bar().encode(
    x=alt.X("throughput_per_sec:Q", title='Throughput in Mbps'),
    y=alt.Y(
        "configuration:N",
        title='Configuration',
        sort=alt.Sort(field="throughput_per_sec", order="descending"),
    ),
).properties(
    height=600,
    width=650,
).configure_axis(
    titleFontSize=14,
    labelFontSize=14,
)

## Visualize the maximum latency of message processing

In [11]:
# Horizontal bar chart of the worst-case message latency per configuration,
# lowest latency on top.
alt.Chart(data_for_plots_df).mark_bar().encode(
    x=alt.X("max_latency:Q", title='Latency in Seconds'),
    y=alt.Y(
        "configuration:N",
        title='Configuration',
        sort=alt.Sort(field="max_latency", order="ascending"),
    ),
).properties(
    height=600,
    width=650,
).configure_axis(
    titleFontSize=14,
    labelFontSize=14,
)