In [26]:
# NOTE(review): absolute, machine-specific working directory. The relative
# 'data/...' paths used by the loaders further down resolve against it.
%cd /scratch/bruingjde/SNAM2021-code/

# NOTE(review): the star import presumably supplies np, pd, sklearn, typing,
# tqdm and the heuristics / network_indices / hypergraph_indices /
# time_strategies constants used below -- confirm against constants.py.
from constants import *

/scratch/bruingjde/SNAM2021-code


# Edge temporal features

In [2]:
def logistic_regression(X, y, random_state: typing.Optional[int] = None):
  """Return the held-out ROC AUC of a logistic regression fitted on X, y.

  The samples are split into a train and a test part with scikit-learn's
  default split sizes. A pipeline of standardisation followed by logistic
  regression is fitted on the train part and scored on the test part.

  Args:
    X: Feature matrix with one row per sample (2-D array or DataFrame).
    y: Target label per sample.
    random_state: Seed for the train/test split. The default (None) keeps
      the original unseeded behaviour, in which the returned AUC differs
      from run to run; pass an int to make the result reproducible.

  Returns:
    The ROC AUC on the test part, computed from the predicted probability
    of the second class.
  """
  X_train, X_test, y_train, y_test = (
    sklearn.model_selection.train_test_split(
      X, y, random_state=random_state))
  pipe = sklearn.pipeline.make_pipeline(
    sklearn.preprocessing.StandardScaler(),
    sklearn.linear_model.LogisticRegression(max_iter=10000, n_jobs=-1))
  pipe.fit(X_train, y_train)

  # Column 1 of predict_proba is the probability of the second class.
  auc = sklearn.metrics.roc_auc_score(
    y_true=y_test, y_score=pipe.predict_proba(X_test)[:,1])

  return auc

class FeatureSet(typing.NamedTuple):
  """Key identifying one feature set in the dict built by get_features()."""
  # Time treatment: 'static', one of the time strategies, or 'combined'.
  time: str
  # Heuristic name, or 'combined' when all heuristics are used together.
  heuristic: str
  # Index of the network; it selects the zero-padded data/<NN>/ directory.
  network: int

def get_features() -> dict[FeatureSet, typing.Union[np.ndarray, pd.DataFrame]]:
  """Load every feature set that is used in the logistic regression.

  For each network the returned dict holds, keyed by FeatureSet:

  * 'static' features per heuristic (single-column array) and for all
    heuristics combined (DataFrame, one column per heuristic);
  * temporal-edge features per time strategy, per heuristic and for all
    heuristics combined;
  * temporal-edge features with all time strategies combined, per heuristic
    and for all heuristics combined.

  For networks in hypergraph_indices the temporal features are read from one
  file per aggregation strategy; for the other networks from a single file
  per (heuristic, time strategy).

  Columns of the combined frames are keyed by every dimension that varies
  inside the frame (time strategy, heuristic, aggregation strategy). Keying
  them by fewer dimensions makes the columns overwrite each other, leaving
  only the last time strategy in the "combined" frames.

  Returns:
    Dict mapping FeatureSet to a (n_samples, 1) ndarray for single-column
    feature sets or a DataFrame for multi-column ones.
  """
  aggregation_strategies = ['m0', 'm1', 'q0', 'q25', 'q50', 'q75', 'q100']

  def load_static(network_index, heuristic):
    """Load the time-agnostic scores of one heuristic."""
    return np.load(
      f'data/{network_index:02}/features/time_agnostic/{heuristic}.npy')

  def load_temporal(network_index, heuristic, time_strategy,
                    aggregation_strategy=None):
    """Load the temporal-edge scores of one heuristic.

    The aggregation strategy is appended to the file name only when given
    (i.e. for hypergraph networks).
    """
    stem = f'{heuristic}_{time_strategy}'
    if aggregation_strategy is not None:
      stem += f'_{aggregation_strategy}'
    return np.load(f'data/{network_index:02}/features/time_edge/{stem}.npy')

  # Start with the static features, one single-column array per heuristic.
  features = {
    FeatureSet('static', heuristic, network_index):
      load_static(network_index, heuristic).reshape(-1,1)
    for heuristic in heuristics for network_index in network_indices
  }

  for network_index in network_indices:
    is_hypergraph = network_index in hypergraph_indices

    # Still static, but all heuristics combined.
    features[FeatureSet('static', 'combined', network_index)] = pd.DataFrame(
      {
        heuristic: load_static(network_index, heuristic)
        for heuristic in heuristics
      }
    )

    # Temporal edge, for each time_strategy.
    for time_strategy in time_strategies:
      # ... for each of the heuristics separately.
      for heuristic in heuristics:
        key = FeatureSet(time_strategy, heuristic, network_index)
        if is_hypergraph:
          features[key] = pd.DataFrame(
            {
              aggregation_strategy: load_temporal(
                network_index, heuristic, time_strategy,
                aggregation_strategy)
              for aggregation_strategy in aggregation_strategies
            }
          )
        else: # No hypergraph
          features[key] = load_temporal(
            network_index, heuristic, time_strategy).reshape(-1,1)

      # ... and all heuristics combined. Built once per time_strategy: the
      # result does not depend on the heuristic loop above.
      key = FeatureSet(time_strategy, 'combined', network_index)
      if is_hypergraph:
        features[key] = pd.DataFrame(
          {
            (heuristic, aggregation_strategy): load_temporal(
              network_index, heuristic, time_strategy, aggregation_strategy)
            for aggregation_strategy in aggregation_strategies
            for heuristic in heuristics
          }
        )
      else: # No hypergraph
        features[key] = pd.DataFrame(
          {
            heuristic: load_temporal(network_index, heuristic, time_strategy)
            for heuristic in heuristics
          }
        )

    # Temporal edge, all time_strategies combined, but per heuristic. The
    # time_strategy is part of the column key so that no column is lost.
    for heuristic in heuristics:
      key = FeatureSet('combined', heuristic, network_index)
      if is_hypergraph:
        features[key] = pd.DataFrame(
          {
            (time_strategy, aggregation_strategy): load_temporal(
              network_index, heuristic, time_strategy, aggregation_strategy)
            for aggregation_strategy in aggregation_strategies
            for time_strategy in time_strategies
          }
        )
      else: # No hypergraph
        features[key] = pd.DataFrame(
          {
            time_strategy: load_temporal(
              network_index, heuristic, time_strategy)
            for time_strategy in time_strategies
          }
        )

    # Temporal edge, all time_strategies and all heuristics combined.
    key = FeatureSet('combined', 'combined', network_index)
    if is_hypergraph:
      features[key] = pd.DataFrame(
        {
          (heuristic, time_strategy, aggregation_strategy): load_temporal(
            network_index, heuristic, time_strategy, aggregation_strategy)
          for aggregation_strategy in aggregation_strategies
          for time_strategy in time_strategies
          for heuristic in heuristics
        }
      )
    else: # No hypergraph
      features[key] = pd.DataFrame(
        {
          (heuristic, time_strategy): load_temporal(
            network_index, heuristic, time_strategy)
          for heuristic in heuristics
          for time_strategy in time_strategies
        }
      )
  return features

In [40]:
def get_agg_performance():
  """Load the 'exp' temporal-edge features of the hypergraph networks.

  Returns:
    Dict keyed by (aggregation_strategy, heuristic, network_index) whose
    values are the loaded feature vectors reshaped to a single column.
  """
  loaded = {}
  for aggregation_strategy in ['m0', 'm1', 'q0', 'q25', 'q50', 'q75', 'q100']:
    for heuristic in heuristics:
      for network_index in hypergraph_indices:
        path = (
          f'data/{network_index:02}/features/time_edge/'
          f'{heuristic}_exp_{aggregation_strategy}.npy')
        # Reshape to a single column so it can be used directly as X.
        column = np.load(path).reshape(-1, 1)
        loaded[aggregation_strategy, heuristic, network_index] = column
  return loaded

In [41]:
# One single-column feature matrix per (aggregation strategy, heuristic,
# hypergraph network) combination; the dict keys are those 3-tuples.
featuresets = get_agg_performance()

In [42]:
# Fit and score one logistic regression per feature set. Each record keeps
# the AUC together with the three parts of the feature set's key; the targets
# are read from the directory of the network the feature set belongs to.
auc_featuresets = [
  dict(
    auc=logistic_regression(
      X=featureset,
      y=np.load(f'data/{network:02}/targets_sampled.npy')),
    aggregation=aggregation,
    heuristic=heuristic,
    network=network,
  )
  for (aggregation, heuristic, network), featureset
  in tqdm(featuresets.items())
]

  0%|          | 0/448 [00:00<?, ?it/s]

In [50]:
# One row per evaluated feature set, with the columns 'auc', 'aggregation',
# 'heuristic' and 'network'.
df = pd.DataFrame(auc_featuresets)

In [58]:
# Mean AUC over the heuristics: one row per network, one column per
# aggregation strategy, rounded to two decimals for display.
(
  df
  .groupby(['network', 'aggregation'])['auc']
  .mean()
  .unstack()
  .round(2)
)

aggregation,m0,m1,q0,q100,q25,q50,q75
network,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.7,0.67,0.65,0.69,0.66,0.67,0.68
2,0.75,0.76,0.75,0.76,0.76,0.75,0.76
3,0.72,0.75,0.74,0.75,0.75,0.75,0.75
5,0.65,0.62,0.6,0.63,0.61,0.62,0.63
6,0.73,0.72,0.72,0.73,0.72,0.73,0.73
7,0.62,0.7,0.68,0.7,0.69,0.69,0.7
12,0.8,0.82,0.82,0.83,0.82,0.82,0.83
13,0.63,0.65,0.63,0.64,0.64,0.64,0.64
14,0.81,0.8,0.8,0.8,0.8,0.8,0.8
19,0.81,0.84,0.83,0.84,0.84,0.84,0.84
