In [1]:
import time
import pandas as pd
import numpy as np
from copy import deepcopy
import sys
import os
import csv
from itertools import cycle
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")
# warnings.resetwarnings() # To change it back (optional)

sys.path.append("/nobackup/gogandhi/alt_means_sans_k/")

from scripts.similarity_scores import get_scores
from scripts.nets_and_embeddings import *

In [2]:
# ---- Experiment configuration ---------------------------------------------
# Number of nodes; also drives the degree / community-size upper bounds below.
N = 10000

# sqrt(10 * N) is shared by both upper bounds (floored for maxk, ceiled for maxc).
_sqrt_10n = np.sqrt(10 * N)

# Network-generation parameters handed to create_and_save_network_and_embedding
# and get_scores.
# NOTE(review): the key names look like LFR-benchmark parameters — confirm
# against scripts.nets_and_embeddings.
params = dict(
    N=N,
    k=50,
    maxk=int(_sqrt_10n),
    minc=50,
    maxc=int(np.ceil(_sqrt_10n)),
    tau=3.0,
    tau2=1.0,
    mu=0.2,
)

# Embedding settings; "dim" is overridden per experiment by the sweep cells.
emb_params = dict(
    method="node2vec",
    window_length=10,
    walk_length=80,
    num_walks=10,
    dim=512,
)

num_cores = 10

# Run identifiers 1..10 (inclusive).
runs = np.arange(1, 11)

# When True, results go to a separate "_test_run" folder.
test_run = False

# ['cuda:0', 'cuda:1', 'cuda:2', 'cuda:3']
device_names = [f"cuda:{gpu_index}" for gpu_index in range(4)]

# Output root; encodes N, average degree k and tau in the folder name.
path_name = f"/nobackup/gogandhi/alt_means_sans_k/data/experiment_dim_change_{params['N']}_{params['k']}_{params['tau']}"
path_name = path_name + ("_test_run" if test_run else "")


#################### End of Params #################

#if not os.path.isdir(path_name):
#    os.mkdir(path_name)

def create_unique_folder(base_folder):
    """Create a new, previously non-existent directory and return its path.

    The first candidate is ``base_folder`` itself; if that path is already
    taken, ``base_folder_1``, ``base_folder_2``, ... are tried in order until
    one can be created.

    Creation is attempted directly instead of checking ``os.path.exists``
    first, so two processes racing for the same name cannot both "win": the
    loser gets ``FileExistsError`` and simply moves on to the next suffix.
    Missing parent directories are created as well.

    Parameters
    ----------
    base_folder : str
        Preferred path of the directory to create.

    Returns
    -------
    str
        Path of the directory that was actually created.

    Raises
    ------
    OSError
        Any failure other than "already exists" (e.g. permission denied)
        is propagated unchanged.
    """
    candidate = base_folder
    index = 0
    while True:
        try:
            # No exist_ok: an existing path must raise so we pick a new name.
            os.makedirs(candidate)
        except FileExistsError:
            index += 1
            candidate = f"{base_folder}_{index}"
        else:
            return candidate
    
# Reserve a fresh output directory and point path_name at it.
# NOTE: every execution of this line creates a new directory on disk, so
# re-running the cell yields a different (suffixed) path_name each time.
path_name = create_unique_folder(path_name)


In [None]:
# Stage 1: create and save one network + embedding for every
# (dimension, run, mu) combination. Clustering/scoring runs in a later cell.
runs = np.arange(1, 11)
# linspace avoids the length ambiguity of a float-step arange; after rounding
# it yields the same 21 values 0.0, 0.05, ..., 1.0.
mu_values = np.round(np.linspace(0, 1, 21), decimals=2)
dimensions = [512, 256, 128, 64, 32]

# NOTE(review): every dimension writes into the same Run_<n>/ folder. This
# assumes create_and_save_network_and_embedding keeps the files of different
# dimensions apart (and does not overwrite the network itself) — confirm in
# scripts.nets_and_embeddings.
for dimension in dimensions:

    start_dim_time = time.perf_counter()
    # Work on a copy so the shared emb_params keeps its configured dimension.
    temp_emb_params = deepcopy(emb_params)
    temp_emb_params['dim'] = dimension

    for run_no in runs:
        start_time = time.perf_counter()

        # exist_ok makes this safe to re-run and free of a check-then-create race.
        run_folder = f"{path_name}/Run_{run_no}/"
        os.makedirs(run_folder, exist_ok=True)

        for mu in mu_values:

            # Fresh copy per mu so the callee can never leak state between iterations.
            temp_params = deepcopy(params)
            temp_params['mu'] = mu

            net, comm, emb = create_and_save_network_and_embedding(temp_params, temp_emb_params, run_folder)
            print(dimension, run_no, mu)

        # Measure once so the reported total and the per-mu average agree.
        run_elapsed = time.perf_counter() - start_time
        print(f"Run took: {run_elapsed}, avg time per mu_val: {run_elapsed/len(mu_values)}")
    print(f"Dim {dimension} took:{time.perf_counter() - start_dim_time} ")

512 1 0.0
512 1 0.05
512 1 0.1
512 1 0.15
512 1 0.2
512 1 0.25
512 1 0.3
512 1 0.35
512 1 0.4
512 1 0.45
512 1 0.5
512 1 0.55
512 1 0.6
512 1 0.65
512 1 0.7
512 1 0.75
512 1 0.8
512 1 0.85
512 1 0.9
512 1 0.95
512 1 1.0
Run took: 2076.2177998740226, avg time per mu_val: 98.86751478954795
512 2 0.0
512 2 0.05
512 2 0.1
512 2 0.15
512 2 0.2
512 2 0.25
512 2 0.3
512 2 0.35
512 2 0.4
512 2 0.45
512 2 0.5
512 2 0.55
512 2 0.6
512 2 0.65
512 2 0.7
512 2 0.75
512 2 0.8
512 2 0.85
512 2 0.9
512 2 0.95
512 2 1.0
Run took: 2109.115478331223, avg time per mu_val: 100.43407094780179
512 3 0.0
512 3 0.05
512 3 0.1
512 3 0.15
512 3 0.2
512 3 0.25
512 3 0.3
512 3 0.35
512 3 0.4
512 3 0.45
512 3 0.5
512 3 0.55
512 3 0.6
512 3 0.65
512 3 0.7
512 3 0.75
512 3 0.8
512 3 0.85
512 3 0.9
512 3 0.95
512 3 1.0
Run took: 2067.2258409876376, avg time per mu_val: 98.43932634033263
512 4 0.0
512 4 0.05
512 4 0.1
512 4 0.15
512 4 0.2
512 4 0.25
512 4 0.3
512 4 0.35
512 4 0.4
512 4 0.45


In [None]:
# Once all networks and embeddings have been generated, compute the similarity
# scores for each (run, mu, dimension) combination and stream them to a CSV.

def process_and_save_result(run_no, mu, dim, path_name, score_keys, device_name, emb_params, params, csv_file_path):
    """Score one (run, mu, dim) combination and append the result to a CSV file.

    Parameters
    ----------
    run_no : int
        Run identifier; selects the ``Run_<run_no>/`` sub-folder of ``path_name``.
    mu : float
        Value stored under ``'mu'`` in the parameters passed to ``get_scores``.
    dim : int
        Value stored under ``'dim'`` in the embedding parameters passed to
        ``get_scores``.
    path_name : str
        Root folder of the experiment.
    score_keys : list of str
        Keys looked up in the mapping returned by ``get_scores``; they define
        the order of the score columns in the CSV row.
    device_name : str
        Forwarded unchanged to ``get_scores``.
    emb_params, params : dict
        Base parameter dictionaries. They are NOT modified: the function
        works on shallow copies with ``'dim'`` / ``'mu'`` overridden.
    csv_file_path : str
        File the row ``[run_no, mu, dim, *scores]`` is appended to.

    Returns
    -------
    tuple
        ``(run_no, mu, result_run_mu)`` where ``result_run_mu`` is whatever
        ``get_scores`` returned.

    Raises
    ------
    KeyError
        If the ``get_scores`` result lacks one of ``score_keys``.
    """
    start_time = time.perf_counter()

    # exist_ok avoids the check-then-create race of exists()+mkdir().
    run_folder = f"{path_name}/Run_{run_no}/"
    os.makedirs(run_folder, exist_ok=True)

    # Copy instead of mutating the caller's dictionaries in place.
    run_params = dict(params)
    run_params['mu'] = mu
    run_emb_params = dict(emb_params)
    run_emb_params['dim'] = dim

    result_run_mu = get_scores(run_params, run_emb_params, score_keys, run_folder, device_name)

    # Append mode: each call adds exactly one row, so partial results survive a crash.
    with open(csv_file_path, 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow([run_no, mu, dim] + [result_run_mu[key] for key in score_keys])

    elapsed_time = time.perf_counter() - start_time

    print(run_no, mu, dim, elapsed_time)
    return run_no, mu, result_run_mu

In [None]:
# Stage 2: score every (run, dimension, mu) combination and stream rows to CSV.
# score_keys lists the methods requested from get_scores.
# NOTE(review): an earlier note here suggested leaving out belief_prop, optics
# and dbscan for faster test runs and adding them separately — presumably those
# are further valid score keys; confirm against scripts.similarity_scores.
score_keys = ['kmeans', 'proposed']
runs = np.arange(1, 11)
# linspace avoids the length ambiguity of a float-step arange; after rounding
# it yields the same 21 values 0.0, 0.05, ..., 1.0.
mu_values = np.round(np.linspace(0, 1, 21), decimals=2)
dimensions = [512, 256, 128, 64, 32]

csv_file_path = path_name + "/result_stream.csv"

print("Hello, you can find results at:\n",path_name)

# Start a fresh results file containing only the header row
# ('w' truncates any existing file at this path).
with open(csv_file_path, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['run_no', 'mu','dim'] + score_keys)

for run_no in runs:
    start_run_time = time.perf_counter()

    for dimension in dimensions:

        start_dim_time = time.perf_counter()

        # The calls below run one after another; cycling only rotates which
        # device each successive mu value is sent to.
        for mu, device_name in zip(mu_values, cycle(device_names)):

            # Pass copies so the shared configuration dicts are never modified,
            # and keep only the scores: rebinding run_no/mu from the return
            # value would shadow the loop variables.
            result_run_mu = process_and_save_result(
                run_no, mu, dimension, path_name, score_keys, device_name,
                deepcopy(emb_params), deepcopy(params), csv_file_path,
            )[2]

        # Measure once so the reported total and the per-mu average agree.
        dim_elapsed = time.perf_counter() - start_dim_time
        print(f"Dim {dimension} took: {dim_elapsed}, avg time per mu_val: {dim_elapsed/len(mu_values)}")

    print(f"Run {run_no} took: {time.perf_counter() - start_run_time}")

In [35]:
#path_name = path_name + "immutable"

In [None]:
import matplotlib.pyplot as plt

# Summary figure: for every score column in the results CSV, plot the mean
# score against mu (± one standard deviation across runs), one line per
# embedding dimension, and save the figure next to the CSV.
csv_file_path = path_name + "/result_stream.csv"

result_df = pd.read_csv(csv_file_path)
runs = result_df['run_no'].unique()

# Everything that is not an identifier column is a score column. Reading the
# names from the CSV header keeps this cell independent of the scoring cell
# and of the number of scoring methods.
id_columns = ['run_no', 'mu', 'dim']
score_columns = [column for column in result_df.columns if column not in id_columns]
if not score_columns:
    raise ValueError(f"No score columns found in {csv_file_path}")

# run_no is only an identifier; drop it so it is not averaged as if it were a score.
df_grouped = (
    result_df
    .drop(columns='run_no')
    .groupby(['mu', 'dim'])
    .agg(['mean', 'std'])
)

# One panel per score column (7.5 x 5 inches each).
n_panels = len(score_columns)
fig, axes = plt.subplots(1, n_panels, figsize=(7.5 * n_panels, 5))
# subplots returns a bare Axes for a single panel; normalise to a 1-D array.
axes = np.atleast_1d(axes)

for ax, score_name in zip(axes, score_columns):
    for dim_value in df_grouped.index.levels[1]:
        # Rows of this dimension, indexed by mu only.
        per_dim = df_grouped.xs(dim_value, level='dim')[score_name]
        mu_axis = per_dim.index
        mean_values = per_dim['mean']
        std_values = per_dim['std']

        ax.plot(mu_axis, mean_values, '-o', label=f'dim={dim_value}')
        ax.fill_between(mu_axis, mean_values - std_values, mean_values + std_values, alpha=0.2)

    ax.set_xlabel(r'Mixing Parameter: $\mu$')
    ax.set_ylabel("Element Centric Similarity")
    ax.legend(title="Dimension", loc='upper right', bbox_to_anchor=(1.3, 0.8))
    ax.grid(True)
    ax.set_title(f"Method: {score_name}")

plt.suptitle(rf'Runs: {len(runs)} | Nodes: {params["N"]} | $\tau$: {params["tau"]} | $<k>$: {params["k"]}')

# Adjust layout
plt.tight_layout()

# Save the figure
plt.savefig(f"{path_name}/changing_dimensions_plot.png", bbox_inches='tight')