# N-Scaling Comparison

Compares genetic-algorithm classification accuracy as a function of the number
of sensor nodes for network sizes N=100, N=1000, and N=5000. All three sizes
share the same average connectivity $\langle K \rangle$
(fixed by gamma=1.8, with N=5000 as the reference).

## Config

In [None]:
import pathlib

# Location of the genetic-algorithm output for every network size N.
# A ga-results/ directory holds one CSV per network (0.csv, 1.csv, ..., 49.csv)
# with the columns: original_network_idx, max_num_features, accuracy, features.
GA_RESULTS = {
  n_nodes: pathlib.Path(results_dir)
  for n_nodes, results_dir in (
    (100, '../../data/drug-n-scaling/N100/ga-results'),
    (1000, '../../data/drug-n-scaling/N1000/ga-results'),
    (5000, '../../data/drug-v0127d-N5k-gamma1.8-10drugs/ga-results'),
  )
}

# Destination file for the accuracy-vs-features figure.
PLOT_OUT = pathlib.Path('../../plots/n-scaling-ga-accuracy-vs-features.png')

## Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Keep external LaTeX rendering switched off; the $...$ fragments in the
# labels are then typeset by matplotlib's built-in mathtext.
plt.rcParams.update({'text.usetex': False})

## Load GA results

In [None]:
# Build one long-format table holding the GA results of every network size.
dfs = []
for N, results_dir in GA_RESULTS.items():
  csvs = sorted(results_dir.glob('*.csv'))
  if not csvs:
    # A missing size is only a warning so the remaining sizes still load.
    print(f'WARNING: no CSVs found in {results_dir}')
    continue
  # Stack the per-network files and tag every row with its network size.
  df = pd.concat([pd.read_csv(f) for f in csvs], ignore_index=True)
  df['N'] = N
  dfs.append(df)
  print(f'N={N}: {len(csvs)} network files, {len(df)} rows')

# pd.concat([]) fails with the generic "No objects to concatenate"; raise the
# same exception type with a message that names the directories searched.
if not dfs:
  raise ValueError(
    'no GA result CSVs found in any of: '
    + ', '.join(str(searched) for searched in GA_RESULTS.values())
  )

all_df = pd.concat(dfs, ignore_index=True)
all_df['N'] = all_df['N'].astype(str)   # categorical for seaborn hue
all_df.head()

## Plot: accuracy vs. number of sensor nodes

In [None]:
# Accuracy as a function of the sensor budget, one line per network size N.
with sns.plotting_context('notebook', font_scale=2.0):
  fig, ax = plt.subplots(figsize=(12, 8))
  g = sns.lineplot(
    data=all_df,
    x='max_num_features',
    y='accuracy',
    hue='N',
    lw=4,
    marker='o',
    markersize=10,
    palette='tab10',
    ax=ax,
  )
  # Powers of two on the x axis, labelled with plain integers.
  ax.set_xscale('log', base=2)
  ax.set_xticks([1, 2, 4, 8, 16, 32, 64, 128])
  ax.set_xticklabels([1, 2, 4, 8, 16, 32, 64, 128])
  # Reference lines: perfect accuracy and the labelled random baseline.
  ax.axhline(1.0, color='gray', lw=1, ls='--', alpha=0.5)
  ax.axhline(0.1, color='gray', lw=1, ls=':', alpha=0.5, label='random (10 drugs)')
  # seaborn built the legend inside lineplot(), before the baseline line
  # existed, so its label was never shown. Rebuild the legend from the current
  # labelled artists and keep whatever title seaborn had given it.
  hue_legend = ax.get_legend()
  legend_title = hue_legend.get_title().get_text() if hue_legend is not None else None
  ax.legend(title=legend_title)
  g.set(
    xlabel='Number of sensor nodes $k$',
    ylabel='Drug classification accuracy',
    title='Genetic algorithm sensor selection vs. network size $N$\n(same $\\langle K \\rangle$, 10 drugs, power-law)',
  )
  plt.tight_layout()

In [None]:
# Create the output folder if needed, then write the figure at 300 dpi with
# the bounding box trimmed to the drawn content.
PLOT_OUT.parent.mkdir(exist_ok=True, parents=True)
fig.savefig(PLOT_OUT, dpi=300, bbox_inches='tight')
print(f'saved to {PLOT_OUT}')

## Summary statistics

In [None]:
# Accuracy statistics for every (N, max_num_features) combination.
# NOTE(review): n_networks is the count of non-null accuracy rows per group;
# it equals the number of networks only if each network contributes one row
# per max_num_features value. Confirm against the CSV layout.
summary = (
  all_df
  .groupby(['N', 'max_num_features'])['accuracy']
  .agg(mean_accuracy='mean', std_accuracy='std', n_networks='count')
  .reset_index()
)
# Standard error of the mean: sample std divided by sqrt(group size).
summary['stderr'] = summary['std_accuracy'].div(np.sqrt(summary['n_networks']))
summary