In [1]:
from pyspark.sql import SparkSession
import os
import json
import re
import pandas as pd

spark = SparkSession.builder\
    .config("spark.driver.memory", "16g")\
    .getOrCreate()

spark

In [32]:
def throughput(root, mr, hot):
    directory = os.path.join(root, f"hot{hot}mr{mr}")
    throughputs = []
    for dirpath, _dirs, files in os.walk(directory):
        for file in files:
            if file.endswith("summary.json"):
                # Use regex to parse the pattern /region(\d+)/ from the dirpath
                # and store the result in a variable called region
                # res = re.search(r"region(\d+)", dirpath)
                res = re.search(r"(\d+)-us-.*$", dirpath)

                region = None
                if res:
                    region = res.group(1)

                if region is None:
                    continue

                rec = {
                    "region": region,
                    "mr": mr,
                    "hot": hot
                }
                
                with open(os.path.join(dirpath, file), 'r') as f:
                    data = json.load(f)
                    rec["throughput"] = data["Goodput (requests/second)"]
                
                throughputs.append(rec)
    return throughputs

DIR = "3-region"

res = []
for mr in [0, 5, 10, 25, 50, 75, 100]:
    for hot in [0, 1000, 500, 100]:
        res.extend(throughput(DIR, mr, hot))
        # print(f"hot: {hot}, mr: {mr}: {res}")

df = pd.DataFrame(res)

df.pivot(columns=["hot", "region"], index=["mr"], values="throughput")


hot,0,0,0,1000,1000,1000,500,500,500,100,100,100
region,1,3,2,1,3,2,1,3,2,1,3,2
mr,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
0,5389.596921,5740.852357,5797.902425,4602.255965,5438.51694,5680.835035,4600.725233,5376.556001,5052.040942,1040.276307,1274.657169,547.495586
5,989.856629,940.367769,984.932447,885.098687,765.011337,1062.367819,1160.03696,1023.879214,1114.817837,1015.700446,854.550354,1208.210353
10,1795.812967,1420.563209,2259.864786,445.33037,369.799639,454.624814,511.952902,507.384356,558.398697,764.407154,577.630914,767.109079
25,885.619562,644.564603,754.910324,771.241197,590.126041,1018.716398,162.696809,160.296679,173.896129,313.857679,275.731224,354.624277
50,458.295217,300.43275,532.899091,454.461822,294.658238,536.287864,62.599097,76.632742,85.764198,356.254431,231.525912,370.521559
75,299.290983,203.460376,347.089515,318.923105,194.028393,365.897925,294.05703,199.163185,362.693024,262.096095,166.595543,282.165423
100,239.82957,157.765169,276.966537,221.028623,142.465973,275.432225,244.59447,151.52814,264.897813,204.992886,125.365944,221.428324


In [35]:
df.pivot_table(columns=["hot"], index=["mr"], values="throughput", aggfunc="sum")


hot,0,100,500,1000
mr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,16928.351704,2862.429062,15029.322176,15721.60794
5,2915.156845,3078.461153,3298.73401,2712.477843
10,5476.240962,2109.147147,1577.735956,1269.754823
25,2285.094489,944.21318,496.889617,2380.083636
50,1291.627058,958.301902,224.996037,1285.407924
75,849.840875,710.857062,855.913239,878.849423
100,674.561277,551.787154,661.020423,638.926821


In [5]:
def sum_histograms(directory):
    sum = {}
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if file.endswith("histograms.json"):
                with open(os.path.join(root, file), 'r') as f:
                    data = json.load(f)
                    for key in data:
                        if key not in sum:
                            sum[key] = 0
                        sum[key] += data[key]["NUM_SAMPLES"]
    return sum

for mr in [0, 5, 10, 25, 50, 75, 100]:
    for hot in [0, 1000, 500, 100]:
        histogram = sum_histograms(os.path.join(DIR, f"hot{mr}mr{mr}"))
        total = 0
        for v in histogram.values():
            total += v
        print(f"mr: {mr}, hot: {hot}: {histogram}, total = {total}")


mr: 0, hot: 0: {'rejected': 10383, 'aborted': 0, 'unexpected': 0, 'completed': 507859}, total = 518242
mr: 0, hot: 1000: {'rejected': 10383, 'aborted': 0, 'unexpected': 0, 'completed': 507859}, total = 518242
mr: 0, hot: 500: {'rejected': 10383, 'aborted': 0, 'unexpected': 0, 'completed': 507859}, total = 518242
mr: 0, hot: 100: {'rejected': 10383, 'aborted': 0, 'unexpected': 0, 'completed': 507859}, total = 518242
mr: 5, hot: 0: {}, total = 0
mr: 5, hot: 1000: {}, total = 0
mr: 5, hot: 500: {}, total = 0
mr: 5, hot: 100: {}, total = 0
mr: 10, hot: 0: {}, total = 0
mr: 10, hot: 1000: {}, total = 0
mr: 10, hot: 500: {}, total = 0
mr: 10, hot: 100: {}, total = 0
mr: 25, hot: 0: {}, total = 0
mr: 25, hot: 1000: {}, total = 0
mr: 25, hot: 500: {}, total = 0
mr: 25, hot: 100: {}, total = 0
mr: 50, hot: 0: {}, total = 0
mr: 50, hot: 1000: {}, total = 0
mr: 50, hot: 500: {}, total = 0
mr: 50, hot: 100: {}, total = 0
mr: 75, hot: 0: {}, total = 0
mr: 75, hot: 1000: {}, total = 0
mr: 75, hot: 5

In [4]:
def count_deadlocks(directory):
    local = remote = 0
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if file == "log.txt":
                with open(os.path.join(root, file), 'r') as f:
                    block = []
                    for line in f:
                        if "distributed deadlock" in line:
                            remote += 1
                        if "ERROR: deadlock detected" in line: 
                            block.append(line)
                        elif len(block) > 0:
                            block.append(line)
                            if "Hint: See server log for query details." in line:
                                local += 1
                                block.clear()


    return local, remote

for mr in [0, 5, 10, 25, 50, 75, 100]:
    for hot in [0, 1000, 500, 100, 10, 1]:
        local, remote = count_deadlocks(os.path.join(DIR, "mr{}hot{}".format(mr, hot)))
        print(f"mr: {mr}, hot: {hot}: {local} local, {remote} remote")


mr: 0, hot: 0: 0 local, 0 remote
mr: 0, hot: 1000: 5 local, 4 remote
mr: 0, hot: 500: 12 local, 8 remote
mr: 0, hot: 100: 56 local, 102 remote
mr: 0, hot: 10: 192 local, 766 remote
mr: 0, hot: 1: 0 local, 0 remote
mr: 5, hot: 0: 2 local, 1 remote
mr: 5, hot: 1000: 4 local, 2 remote
mr: 5, hot: 500: 7 local, 8 remote
mr: 5, hot: 100: 47 local, 33 remote
mr: 5, hot: 10: 231 local, 252 remote
mr: 5, hot: 1: 0 local, 358 remote
mr: 10, hot: 0: 1 local, 0 remote
mr: 10, hot: 1000: 1 local, 5 remote
mr: 10, hot: 500: 11 local, 7 remote
mr: 10, hot: 100: 41 local, 37 remote
mr: 10, hot: 10: 145 local, 189 remote
mr: 10, hot: 1: 0 local, 432 remote
mr: 25, hot: 0: 1 local, 0 remote
mr: 25, hot: 1000: 0 local, 2 remote
mr: 25, hot: 500: 2 local, 4 remote
mr: 25, hot: 100: 23 local, 38 remote
mr: 25, hot: 10: 90 local, 247 remote
mr: 25, hot: 1: 0 local, 483 remote
mr: 50, hot: 0: 0 local, 0 remote
mr: 50, hot: 1000: 0 local, 2 remote
mr: 50, hot: 500: 1 local, 4 remote
mr: 50, hot: 100: 6 local

In [38]:
def collect_summary(root, mr, hot):
    directory = os.path.join(root, f"hot{hot}mr{mr}")
    records = []
    for dirpath, _, files in os.walk(directory):
        for file in files:
            if file.endswith("summary.json"):
                # Use regex to parse the pattern /region(\d+)/ from the dirpath
                # and store the result in a variable called region
                # res = re.search(r"region(\d+)", dirpath)
                res = re.search(r"(\d+)-us-.*$", dirpath)

                region = None
                if res:
                    region = res.group(1)

                if region is None:
                    continue

                rec = {
                    "region": region,
                    "mr": mr,
                    "hot": hot,
                }

                with open(os.path.join(dirpath, file), 'r') as f:
                    data = json.load(f)
                    rec["throughput"] = data["Goodput (requests/second)"]
                    rec["p50"] = data["Latency Distribution"]["Median Latency (microseconds)"]
                    rec["p90"] = data["Latency Distribution"]["90th Percentile Latency (microseconds)"]
                    rec["p99"] = data["Latency Distribution"]["99th Percentile Latency (microseconds)"]
                    rec["p0"] = data["Latency Distribution"]["Minimum Latency (microseconds)"]
                    rec["p100"] = data["Latency Distribution"]["Maximum Latency (microseconds)"]

                records.append(rec)
    
    return records
                

summary = []

for mr in [0, 5, 10, 25, 50, 75, 100]:
    for hot in [0, 1000, 500, 100]:
        summary += collect_summary("3-region", mr, hot)

df = pd.DataFrame(summary)

In [53]:
df.loc[df.hot == 1000, ["mr", "hot", "region", "p99"]]\
    .sort_values(by=["mr", "region"]).pivot_table(index="mr", columns=["hot","region"], values="p99") / 1000

hot,1000,1000,1000
region,1,2,3
mr,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,6.794,6.015,5.938
5,328.36,269.673,383.296
10,467.56,380.167,514.002
25,172.241,116.105,148.685
50,160.355,135.19,179.265
75,161.616,141.846,188.057
100,178.126,147.182,194.498
