I have modified the scripts so that each clustering method and each embedding method is added independently and can be called individually, based on what we specify. This notebook tests that setup and gets it ready for quick, repeatable experiments.

What to change in this code?  
- Create more runs
- Parallelise the runs too
- ~~Save the files in a separate folder experiment_mu_change_N/Run1/~~
- Create suitable output pipeline
- Next experiment: do the same to compare different embedding methods against the proposed one, using the same format as above

This version is rewritten without in-notebook parallelisation, so that it can be run with Snakemake.

In [2]:
import time
import pandas as pd
import numpy as np
from copy import deepcopy
import sys
import os
import csv
from itertools import cycle
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")
# warnings.resetwarnings() # To change it back (optional)

sys.path.append("/nobackup/gogandhi/alt_means_sans_k/")

from scripts.similarity_scores import get_scores



def process_and_save_result(run_no, mu, path_name, score_keys, device_name, emb_params, params, csv_file_path):
    """Score one (run, mu) combination and append the scores to the results CSV.

    Parameters
    ----------
    run_no
        Run identifier; names the ``Run_{run_no}`` sub-folder and is written
        as the first CSV field.
    mu
        Value stored under ``"mu"`` in the parameters handed to ``get_scores``
        and written as the second CSV field.
    path_name
        Experiment root folder; ``{path_name}/Run_{run_no}/`` is created if it
        does not exist yet (including missing parent folders).
    score_keys
        Keys looked up in the ``get_scores`` result, in CSV column order.
    device_name
        Forwarded unchanged to ``get_scores``.
    emb_params
        Forwarded unchanged to ``get_scores``.
    params
        Base parameters. A shallow copy is taken, so the caller's dict is
        not modified.
    csv_file_path
        CSV file that receives one appended row per call. When the file is
        missing or empty, a ``run_no, mu, <score_keys...>`` header row is
        written first so the file can be read back by column name.

    Returns
    -------
    tuple
        ``(run_no, mu, result_run_mu)`` where ``result_run_mu`` is whatever
        ``get_scores`` returned.

    Raises
    ------
    KeyError
        If the ``get_scores`` result lacks one of ``score_keys``.
    """
    start_time = time.perf_counter()

    # makedirs(exist_ok=True) creates missing parents and does not fail when
    # another job created the folder between a check and the creation.
    run_dir = f"{path_name}/Run_{run_no}/"
    os.makedirs(run_dir, exist_ok=True)

    # Work on a copy so the caller's parameter dict keeps its own "mu".
    run_params = dict(params)
    run_params['mu'] = mu
    result_run_mu = get_scores(run_params, emb_params, score_keys, run_dir, device_name)

    # A brand-new (or empty) stream file needs a header, otherwise the first
    # data row would be interpreted as column names when the file is loaded.
    needs_header = (not os.path.exists(csv_file_path)) or os.path.getsize(csv_file_path) == 0
    with open(csv_file_path, 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        if needs_header:
            csv_writer.writerow(['run_no', 'mu'] + list(score_keys))
        csv_writer.writerow([run_no, mu] + [result_run_mu[key] for key in score_keys])

    elapsed_time = time.perf_counter() - start_time

    print(run_no,mu,elapsed_time)
    return run_no, mu, result_run_mu

def save_accumulated_results(results, pathname, score_keys):
    """Write one CSV per accumulated ``(run_no, mu, result_run_mu)`` entry.

    Each ``result_run_mu`` dict becomes a table with one row per key, saved
    to ``{pathname}/Run_{run_no}/mu_{mu:.2f}_change.csv``. The ``Run_*``
    folder must already exist. ``score_keys`` is accepted but not used.

    NOTE(review): the first output column is labelled ``mu`` although it
    holds the keys of ``result_run_mu`` -- confirm this label is intended.
    """
    for run_no, mu, result_run_mu in results:
        # Dict keys become the index, then move into the first column.
        table = pd.DataFrame.from_dict(result_run_mu, orient='index').reset_index()
        table.columns = ['mu', *table.columns[1:]]

        target = f"{pathname}/Run_{run_no}/mu_{mu:.2f}_change.csv"
        table.to_csv(target, index=False)

# Placeholder list for per-(run, mu) results; nothing in this cell fills it.
accumulator = list()

# Base parameters handed (as a deep copy) to process_and_save_result;
# "mu" is replaced per call.
N = 10000
params = dict(
    N=N,
    k=50,
    maxk=int(np.sqrt(10 * N)),
    minc=50,
    maxc=int(np.ceil(np.sqrt(N * 10))),
    tau=3.0,
    tau2=1.0,
    mu=0.2,
)

# Embedding settings, forwarded unchanged alongside `params`.
emb_params = dict(
    method="node2vec",
    window_length=10,
    walk_length=80,
    num_walks=10,
    dim=64,
)

#If you want to test faster for results, remove belief_prop, then optics and dbscan for now and add them separately.
score_keys = ['kmeans', 'dbscan', 'optics', 'xmeans', 'infomap', 'flatsbm', 'proposed']

num_cores = 10

# The full sweep is np.arange(1, 11); the active setting below starts at run 5.
runs = np.arange(5, 11)

mu_values = np.arange(0.05, 1.05, 0.05).round(2)

test_run = False

device_names = [f"cuda:{gpu_index}" for gpu_index in range(4)]  # ['cuda:0', 'cuda:1', 'cuda:2', 'cuda:3']

# Experiment folder derived from the parameters; test runs get their own suffix.
path_name = (
    f"/nobackup/gogandhi/alt_means_sans_k/data/experiment_mu_change_{params['N']}_{params['k']}_{params['tau']}"
    + ("_test_run" if test_run else "")
)

#################### End of Params #################

#if not os.path.isdir(path_name):
#    os.mkdir(path_name)

def create_unique_folder(base_folder):
    """Create a folder that does not exist yet and return its path.

    ``base_folder`` itself is used when it is free; otherwise the first free
    name among ``{base_folder}_1``, ``{base_folder}_2``, ... is used.

    The folder is claimed by attempting the creation and retrying on
    ``FileExistsError``, so two processes calling this at the same time can
    never both receive the same folder. Missing parent folders are created.

    Parameters
    ----------
    base_folder : str
        Preferred folder path.

    Returns
    -------
    str
        Path of the folder that was created by this call.
    """
    candidate = base_folder
    index = 0
    while True:
        try:
            # No exist_ok here: an existing path must raise so we move on.
            os.makedirs(candidate)
        except FileExistsError:
            index += 1
            candidate = f"{base_folder}_{index}"
        else:
            return candidate
    
#path_name = create_unique_folder(path_name)
path_name =  "/nobackup/gogandhi/alt_means_sans_k/data/experiment_mu_change_10000_50_3.0_1"
# The folder is hard-coded rather than created by create_unique_folder, so make
# sure it exists before anything is appended to the results file inside it.
os.makedirs(path_name, exist_ok=True)

csv_file_path = path_name + "/result_stream.csv"

print("Hello, you can find results at:\n",path_name)

# Uncomment to start a fresh results file with a header row
# (mode 'w' overwrites any existing file):
# with open(csv_file_path, 'w', newline='') as csv_file:
#     csv_writer = csv.writer(csv_file)
#     csv_writer.writerow(['run_no', 'mu'] + score_keys)



for run_no in runs:
    start_time = time.perf_counter()

    # Devices are assigned round-robin over the mu values.
    for mu, device_name in zip(mu_values, cycle(device_names)):
        # Each call gets its own deep copy of the base parameters; the
        # returned tuple is not needed here because rows are streamed to CSV.
        process_and_save_result(run_no, mu, path_name, score_keys, device_name, emb_params, deepcopy(params), csv_file_path)

    # Measure once so the total and the per-mu average describe the same interval.
    run_elapsed = time.perf_counter() - start_time
    print(f"Run took: {run_elapsed}, avg time per mu_val: {run_elapsed/len(mu_values)}")


Hello, you can find results at:
 /nobackup/gogandhi/alt_means_sans_k/data/experiment_mu_change_10000_50_3.0_1
5 0.05 164.7669743821025
5 0.1 118.78009455651045
5 0.15 118.82609823718667
5 0.2 126.00001654028893
5 0.25 157.67332595586777
5 0.3 138.677269205451
5 0.35 152.14769494906068
5 0.4 154.9739441163838
5 0.45 170.70759696140885
5 0.5 172.76596990600228
5 0.55 189.36443885415792
5 0.6 213.36360359936953
5 0.65 197.61871061101556
5 0.7 213.1211816035211
5 0.75 226.37256037443876
5 0.8 236.4940980784595
5 0.85 274.2598997503519
5 0.9 272.9711070545018
5 0.95 273.2656289972365
5 1.0 282.8761934041977
Run took: 3855.030106525868, avg time per mu_val: 192.75150549467654
6 0.05 127.62678641825914
6 0.1 136.26744224876165
6 0.15 194.93861585482955
6 0.2 121.92184786498547
6 0.25 123.48779284209013
6 0.3 131.66876627504826
6 0.35 147.63208881393075
6 0.4 160.40194790810347
6 0.45 161.38421788066626
6 0.5 174.2490831427276
6 0.55 556.738564401865
6 0.6 226.2406939715147
6 0.65 196.58965718

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle

# path_name = f"/nobackup/gogandhi/alt_means_sans_k/data/experiment_mu_change_{params['N']}_{params['k']}_{params['tau']}_final"
path_name_temp ="/nobackup/gogandhi/alt_means_sans_k/data/experiment_mu_change_10000_50_3.0_old"
csv_file_path = path_name_temp + "/result_stream.csv"

result_df = pd.read_csv(csv_file_path)

# Pick the algorithm columns by name instead of by position in the grouped
# column index: this keeps identifier columns ('run_no', a saved index such as
# 'Unnamed: 0') out of the plot and never drops a real algorithm.
id_columns = {'run_no', 'mu'}
score_columns = [
    col for col in result_df.columns
    if col not in id_columns and not str(col).startswith('Unnamed')
]
df_grouped = result_df.groupby('mu')[score_columns].agg(['mean', 'std'])

# Count the runs present in the loaded file; the `runs` variable of the
# experiment cell may describe a different (e.g. resumed) subset.
n_runs = result_df['run_no'].nunique()

fig, ax = plt.subplots(figsize=(7,4))

for column in score_columns:

    mean_values = df_grouped[column]['mean']
    std_values = df_grouped[column]['std']

    # Mean curve with a +/- one standard deviation band across runs.
    ax.plot(mean_values.index, mean_values, '-o',label=column)
    ax.fill_between(mean_values.index, mean_values - std_values, mean_values + std_values, alpha=0.2)

ax.set_xlabel(r'Mixing Parameter: $\mu$')
ax.set_ylabel('Element Centric Similarity')
ax.legend(title="Algorithm", loc='upper right', bbox_to_anchor=(1.3, 0.8))
ax.grid(True)

# NOTE: `params` comes from the experiment configuration cell, which must have been run.
ax.set_title(rf'Runs: {n_runs} | Nodes: {params["N"]} | $\tau$: {params["tau"]} | $<k>$: {params["k"]}')
fig.tight_layout()


# Save the figure
#plt.savefig(f"{path_name}/experiment_plot.png",bbox_inches='tight')

# Show the plot
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: '/nobackup/gogandhi/alt_means_sans_k/data/experiment_mu_change_10000_50_3.0_old/result_stream.csv'

In [11]:
# Display the results frame read with pd.read_csv in the plotting cell
# (that cell must have run successfully first, otherwise result_df is undefined or stale).
result_df

Unnamed: 0,run_no,mu,kmeans,dbscan,optics,xmeans,infomap,flatsbm,proposed
0,1,0.05,0.955903,0.303812,0.018488,0.971185,1.000000,0.963294,0.821190
1,1,0.10,0.917544,0.026454,0.001451,0.824360,1.000000,0.956679,0.670608
2,1,0.15,0.836946,0.029834,0.000622,0.825488,1.000000,1.000000,0.553389
3,1,0.20,0.865986,0.041503,0.000214,0.802344,0.097955,1.000000,0.575628
4,1,0.25,0.934307,0.010645,0.000077,0.881747,0.881565,0.755116,0.442675
...,...,...,...,...,...,...,...,...,...
195,9,0.10,0.899374,0.146000,0.000429,0.729446,1.000000,0.935574,0.629615
196,9,0.15,0.871431,0.014925,0.000710,0.816328,0.999389,0.706087,0.702929
197,10,0.05,0.862873,0.274585,0.048256,0.859978,1.000000,0.912298,0.789600
198,10,0.10,0.999689,0.023493,0.000091,0.897715,1.000000,0.991292,0.727602


In [None]:
# Modify this so that Snakemake can be used to parallelize over mu and runs.
# your_script.py
import argparse

def main(argv=None):
    """Parse the command-line options of the 'changing mu' experiment script.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse. ``None`` (the default) makes argparse read
        ``sys.argv[1:]``, which is the previous behaviour.

    Returns
    -------
    argparse.Namespace
        Parsed options with attributes ``output`` (str), ``mu`` (float) and
        ``runs`` (int).

    Raises
    ------
    SystemExit
        Raised by argparse when a required option is missing or a value
        cannot be converted to the declared type.
    """
    parser = argparse.ArgumentParser(description='Experiment: Changing Mu')
    #parser.add_argument('--input', required=True, help='Input file path')
    parser.add_argument('--output', required=True, help='Output file path')
    parser.add_argument('--mu', type=float, required=True, help='Value of mu')
    # Runs are counted in whole numbers, so parse as int (was float, with the
    # help text copied from --mu).
    parser.add_argument('--runs', type=int, required=True, help='Run number')

    args = parser.parse_args(argv)

    # Your script logic using args.input, args.output, and args.mu
    return args

if __name__ == "__main__":
    main()