# Data refactoring

This notebook gathers the raw per-node results of each experiment and compiles them into a single CSV file per experiment.

In [None]:
# Experiment label -> folder holding the raw results of that run.
# Labels follow "<noise name><level>_<static|dynamic>"; the label is later used
# as the name of the generated CSV file.
target_dir = {
    # Runs without noise
    "No_noise_static": "my_results/194202_mia_nonoise_static_niidattacks/",
    "No_noise_dynamic": "my_results/1942107_mia_nonoise_dynamic_niidattacks/",

    # Gaussian noise
    "Gaussian64_static": "my_results/1943115_static_gaussiannoise64th_niidattacks/",
    "Gaussian64_dynamic": "my_results/1943116_dynamic_gaussiannoise64th_niidattacks/",
    "Gaussian32_static": "my_results/1942103_mia_gaussian32th_static_niidattacks/",
    "Gaussian32_dynamic": "my_results/1942104_mia_gaussian32th_dynamic_niidattacks/",
    "Gaussian16_static": "my_results/1943162_static_gaussiannoise16th_niidattacks/",
    "Gaussian16_dynamic": "my_results/1943163_dynamic_gaussiannoise16th_niidattacks/",
    "Gaussian8_static": "my_results/1942108_mia_gaussian8th_static_niidattacks/",
    "Gaussian8_dynamic": "my_results/1942109_mia_gaussian8th_dynamic_niidattacks/",
    "Gaussian4_static": "my_results/1942701_static_gaussiannoise4th_niidattacks/",
    "Gaussian4_dynamic": "my_results/1942702_dynamic_gaussiannoise4th_niidattacks/",
    "Gaussian2_static": "my_results/1942845_static_gaussiannoise2th_niidattacks/",
    "Gaussian2_dynamic": "my_results/1942843_dynamic_gaussiannoise2th_niidattacks/",

    # ZeroSum noise
    "ZeroSum64_static": "my_results/1943117_static_zerosum64th_niidattacks/",
    "ZeroSum64_dynamic": "my_results/1943118_dynamic_zerosum64th_niidattacks/",
    "ZeroSum32_static": "my_results/1942105_mia_zerosum32th_static_niidattacks/",
    "ZeroSum32_dynamic": "my_results/1942106_mia_zerosum32th_dynamic_niidattacks/",
    "ZeroSum16_static": "my_results/1943164_static_zerosum16th_niidattacks/",
    "ZeroSum16_dynamic": "my_results/1943165_dynamic_zerosum16th_niidattacks/",
    "ZeroSum8_static": "my_results/1942112_mia_zerosum8th_static_niidattacks/",
    "ZeroSum8_dynamic": "my_results/1942113_mia_zerosum8th_dynamic_niidattacks/",
    "ZeroSum4_static": "my_results/1942703_static_zerosum4th_niidattacks/",
    "ZeroSum4_dynamic": "my_results/1942704_dynamic_zerosum4th_niidattacks/",
    "ZeroSum2_static": "my_results/1942844_static_zerosum2th_niidattacks/",
    "ZeroSum2_dynamic": "my_results/1942846_dynamic_zerosum2th_niidattacks/",

    # Muffliato noise
    "Muffliato64_static": "my_results/1954498_static_muffliato_niidattacks_64th/",
    "Muffliato64_dynamic": "my_results/1954499_dynamic_muffliato_niidattacks_64th/",
    "Muffliato32_static": "my_results/1954496_static_muffliato_niidattacks_32th/",
    "Muffliato32_dynamic": "my_results/1954497_dynamic_muffliato_niidattacks_32th/",
    "Muffliato16_static": "my_results/1954494_static_muffliato_niidattacks_16th/",
    "Muffliato16_dynamic": "my_results/1954495_dynamic_muffliato_niidattacks_16th/",
    "Muffliato8_static": "my_results/1954501_static_muffliato_niidattacks_8th/",
    "Muffliato8_dynamic": "my_results/1954500_dynamic_muffliato_niidattacks_8th/",
    "Muffliato4_static": "my_results/1954503_static_muffliato_niidattacks_4th/",
    "Muffliato4_dynamic": "my_results/1954502_dynamic_muffliato_niidattacks_4th/",
    "Muffliato2_static": "my_results/1954505_static_muffliato_niidattacks_2th/",
    "Muffliato2_dynamic": "my_results/1954504_dynamic_muffliato_niidattacks_2th/",
}

# Folder that receives one formatted CSV per experiment.
folder_name = "formatted_results/36nodes"

# Layout of a run: processes are split evenly across the machines.
TOTAL_PROCESSES = 36
MAX_MACHINES = 3

# Inclusive range of iterations kept when loading results.
STARTING_ITERATION = 0
MAX_ITERATIONS = 4000


In [None]:
import pandas as pd
import numpy as np
from numpy import nansum
from numpy import nanmean

import os
import threading
import concurrent.futures
import matplotlib.pyplot as plt

# Every machine folder is read for the same range of ranks, so the processes
# must split evenly across the machines. An explicit error is raised instead
# of an assert so the check survives `python -O` and explains the problem.
if MAX_MACHINES <= 0 or TOTAL_PROCESSES % MAX_MACHINES != 0:
    raise ValueError(
        f"TOTAL_PROCESSES ({TOTAL_PROCESSES}) must be divisible by "
        f"MAX_MACHINES ({MAX_MACHINES}), and MAX_MACHINES must be positive"
    )
# Number of processes (ranks) hosted on each machine.
MAX_PROCESSES = TOTAL_PROCESSES // MAX_MACHINES

# Name templates: per-machine folder (formatted with the machine index) and
# per-process result file (formatted with the rank local to the machine).
machine_folder = 'machine{}'
result_file = '{}_results.json'



def load_data(dir):
    data = pd.DataFrame({})
    for machine in range(MAX_MACHINES):
        for rank in range(MAX_PROCESSES):
            print(f"Loading results for machine {machine} and rank {rank}.  ",end = "\r")
            uid = rank + machine * MAX_PROCESSES

            file = os.path.join(dir, machine_folder.format(machine), result_file.format(rank))
            tmp_df = pd.read_json(file)
            tmp_df["uid"] = uid # Manually add the uid for further processing                                                                   
            tmp_df["iteration"] = tmp_df.index
            # print(tmp_df)
            tmp_df = tmp_df[tmp_df["iteration"]>=STARTING_ITERATION]
            tmp_df = tmp_df[tmp_df["iteration"]<=MAX_ITERATIONS]
            data = pd.concat([data,tmp_df])
    return data
    
# Load the data
# data_dict = {}
# for (key, dir) in target_dir.items():
#     print(f"Loading results for run {key} at folder {dir}.  ")
#     data_dict[key] = load_data(dir)
# print("Loading finished!" + " "*40)

            

# Load privacy data as well

In [None]:
# Attack locations to load; each one names a "privacy-summary-<location>.json"
# file stored for every process.
LOCATIONS_OF_ATTACKS = ["PRE-STEP","PRE-STEP-niid"]


# The processes must split evenly across the machines. An explicit error is
# raised instead of an assert so the check survives `python -O` and explains
# the problem.
if MAX_MACHINES <= 0 or TOTAL_PROCESSES % MAX_MACHINES != 0:
    raise ValueError(
        f"TOTAL_PROCESSES ({TOTAL_PROCESSES}) must be divisible by "
        f"MAX_MACHINES ({MAX_MACHINES}), and MAX_MACHINES must be positive"
    )
# Number of processes (ranks) hosted on each machine.
MAX_PROCESSES = TOTAL_PROCESSES // MAX_MACHINES

# Path components of a privacy summary:
# <run>/machine<m>/privacy/summary/<global process id>/<summary file>
machine_folder = 'machine{}'
privacy_folder = 'privacy'
summary_folder = 'summary'
process_folder = '{}'

def load_privacy_data(path_dir):
    """Load the privacy summaries of one run, grouped by attack location.

    For every location in ``LOCATIONS_OF_ATTACKS`` the file
    ``<path_dir>/machine<m>/privacy/summary/<process id>/privacy-summary-<location>.json``
    is read for each machine and rank, where the process id is
    ``machine * MAX_PROCESSES + rank``. Only rows whose ``iteration`` lies in
    ``[STARTING_ITERATION, MAX_ITERATIONS]`` (both inclusive) are kept.

    Args:
        path_dir: Root folder of the run.

    Returns:
        dict: maps each attack location to a pandas.DataFrame holding the
        concatenated rows of all processes in (machine, rank) order. The frame
        is empty when there is no file to read.
    """
    data = {}

    for loc in LOCATIONS_OF_ATTACKS:
        location = f"privacy-summary-{loc}.json"
        frames = []
        for machine in range(MAX_MACHINES):
            for rank in range(MAX_PROCESSES):
                print(f"Loading {location} for machine {machine} and rank {rank}  ",end = "\r")
                file = os.path.join(
                    path_dir,
                    machine_folder.format(machine),
                    privacy_folder,
                    summary_folder,
                    process_folder.format(machine * MAX_PROCESSES + rank),
                    location,
                )
                tmp_df = pd.read_json(file)
                in_window = tmp_df.iteration.between(STARTING_ITERATION, MAX_ITERATIONS)
                frames.append(tmp_df[in_window])
        # Concatenate once per location: growing a DataFrame inside the loop
        # copies all accumulated rows at every step, and seeding it with an
        # empty frame relies on concat behaviour that pandas has deprecated.
        data[loc] = pd.concat(frames) if frames else pd.DataFrame({})
    return data

# privacy_data_dict = {}
# for key,dir in target_dir.items():
#     print(f"Loading privacy data for {key} at \"{dir}\"")
#     privacy_data_dict[key] = load_privacy_data(dir)
# print("Loading finished!" + " "*40)

# data_dict

In [None]:
# Columns of the per-node results kept for the accuracy summary.
columns = [
    "iteration",
    "test_acc",
    "test_niid_acc",
]
# Aggregations computed for each iteration over the kept columns.
agg_methods = [
    "mean",
    "min",
    "max",
]
# Columns of the privacy summaries kept before averaging per iteration.
privacy_columns = [
    "iteration",
    "Attacker advantage",
]


def format_data(key,dir):
    """Build and save the per-iteration summary CSV of one experiment.

    The columns listed in ``columns`` are aggregated with ``agg_methods`` for
    every iteration, then the mean attacker advantage of each attack location
    in ``LOCATIONS_OF_ATTACKS`` (restricted to the "Entire dataset" slice) is
    joined on the iteration. The result is written to
    ``<folder_name>/<key>.csv``. Nothing is recomputed when that file already
    exists.

    Args:
        key: Experiment label; used as the CSV file name and stored in the
            ``experience_name`` column.
        dir: Folder holding the raw results of the experiment. The name
            shadows the ``dir`` builtin but is kept for existing callers.

    Returns:
        None.

    Raises:
        KeyError: if a privacy summary does not provide the
            "Attacker advantage" column.
    """
    filename = f"{key}.csv"
    output_path = f"{folder_name}/{filename}"

    # Make sure the output folder exists, so neither the check below nor the
    # final write fails on a fresh checkout.
    os.makedirs(folder_name, exist_ok=True)
    if os.path.exists(output_path):
        print(f"Data for {key} already formatted!")
        return
    print(f"Started formatting data for {key}" + " " *40)

    # Accuracy summary: one row per iteration, one column per (metric, aggregation).
    usable_data = load_data(dir)[columns].dropna()
    usable_data = usable_data.groupby('iteration').agg(agg_methods)
    usable_data.reset_index(inplace=True)
    usable_data.insert(1,"experience_name",key)

    # Flatten the two-level column labels: "test_acc mean", ...; labels whose
    # second level is empty ("iteration", "experience_name") keep their name.
    usable_data.columns = [' '.join(e) if len(e[-1])>0 else e[0] for e in usable_data.columns]
    usable_data.set_index("iteration",inplace=True)

    # Privacy summary: mean attacker advantage per iteration, one column per
    # attack location.
    privacy_data = load_privacy_data(dir)
    for loc in LOCATIONS_OF_ATTACKS:
        privacy_data_loc = privacy_data[loc]
        privacy_data_loc = privacy_data_loc[privacy_data_loc["slice feature"] == "Entire dataset"]

        averaged = privacy_data_loc[privacy_columns].groupby('iteration').agg(["mean"])
        averaged.columns = list(map(' '.join, averaged.columns.values))
        averaged.reset_index(inplace=True)
        # Suffix the column with the location so the locations do not collide
        # once joined; errors="raise" makes a missing column fail loudly.
        averaged.rename(columns = {('Attacker advantage mean'): f"Attacker advantage mean {loc}"}, errors="raise", inplace=True)
        averaged.set_index("iteration",inplace=True)

        usable_data = usable_data.join(averaged, on = 'iteration')

    # Save the formatted result
    usable_data.to_csv(output_path)
    print(f"Formatted data for {key} at {output_path}" + " " *40)
    return


# Format the experiments concurrently. The futures are kept and inspected:
# an exception raised inside a submitted task is stored on its future and would
# otherwise disappear silently.
failed_runs = []
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    pending = {
        executor.submit(format_data, run_key, run_dir): run_key
        for run_key, run_dir in target_dir.items()
    }
    for finished in concurrent.futures.as_completed(pending):
        error = finished.exception()
        if error is not None:
            # Report the failure right away but let the other runs finish.
            print(f"Formatting failed for {pending[finished]}: {error!r}" + " " * 40)
            failed_runs.append(pending[finished])

if failed_runs:
    raise RuntimeError(f"Formatting failed for: {', '.join(sorted(failed_runs))}")