 # EMD

In [None]:
%cd /content/BCOT
%load_ext autoreload
%autoreload 2

import sys

import scipy.sparse as sp
from bcot.utils import read_dataset
from bcot.bcot import BCOT
from sklearn.metrics import davies_bouldin_score
from bcot.metrics import average_pmi_per_cluster
import numpy as np
from time import time 

# Sweep: run BCOT with algorithm='emd' on each dataset for several scalings
# of the input matrix M = -scaler * features, and print the mean and standard
# deviation (over n_runs runs) of the Davies-Bouldin score and of
# average_pmi_per_cluster.
n_runs = 2
plot_block_structure = True

for dataset, n_clusters in [
  ('wiki', 23),
  ('pubmed', 3),
  ('acm', 18),
  ('dblp', 2),
  ('ohscal', 100),
  ('ng20', 140),
]:
  features, labels, _ = read_dataset(dataset, sparse=True)
  n, d = features.shape
  k = n_clusters

  # Scaling factors tried: none, number of rows, number of columns and
  # number of clusters.
  for scaler in [1, n, d, k]:
    metrics = {'db': [], 'pmi': []}

    for _ in range(n_runs):
      # Rebuilt on every run.
      # NOTE(review): could be hoisted out of the run loop if BCOT is
      # confirmed not to modify its input.
      M = -features * scaler

      Z_p, W_p = BCOT(M, n_clusters, algorithm='emd', reg=1, n_iter=100)

      # Hard assignments: index of the largest entry along the last axis.
      # NOTE(review): Z is scored against the rows of `features` and W
      # against the rows of `features.T`, so they presumably hold row and
      # column clusters respectively -- confirm against BCOT.
      Z = Z_p.argmax(-1)
      W = W_p.argmax(-1)

      # Best effort: a failing score is recorded as NaN instead of aborting
      # the sweep. `except Exception` (not a bare `except`) keeps
      # KeyboardInterrupt / SystemExit working so the cell can be stopped.
      try:
        db_score = davies_bouldin_score(features.toarray(), Z)
      except Exception:
        db_score = np.nan
      metrics['db'].append(db_score)
      metrics['pmi'].append(average_pmi_per_cluster(features.T, W))

    # `name` rather than `k`, to avoid visually shadowing the cluster count.
    results = {
      'mean': {name: np.mean(vals).round(4) for name, vals in metrics.items()},
      'std': {name: np.std(vals).round(2) for name, vals in metrics.items()},
    }

    means = results['mean']
    std = results['std']
    print(f'### Results on {dataset}')
    print(f'L(X)=-{scaler}X')
    print(f"pmi:{means['pmi']}±{std['pmi']}")
    print(f"db-index: {means['db']}±{std['db']}")
    print()

/content/BCOT
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
### Results on wiki
L(X)=-1X
pmi:0.6215±0.01
db-index: 5.8533±0.16

### Results on wiki
L(X)=-2405X
pmi:0.6441±0.0
db-index: 5.8628±0.04

### Results on wiki
L(X)=-4973X
pmi:0.6407±0.02
db-index: 5.7067±0.14

### Results on wiki
L(X)=-23X
pmi:0.6226±0.01
db-index: 5.8±0.09

### Results on pubmed
L(X)=-1X
pmi:0.4943±0.06
db-index: 7.3612±0.03

### Results on pubmed
L(X)=-19717X
pmi:0.555±0.0
db-index: 7.2279±0.11

### Results on pubmed
L(X)=-500X
pmi:0.5623±0.0
db-index: 7.1131±0.04

### Results on pubmed
L(X)=-3X
pmi:0.564±0.0
db-index: 6.8047±0.06

### Results on acm
L(X)=-1X
pmi:0.271±0.01
db-index: 10.309±0.06

### Results on acm
L(X)=-3025X
pmi:0.2761±0.0
db-index: 10.1265±0.24

### Results on acm
L(X)=-1870X
pmi:0.2733±0.0
db-index: 10.2889±0.03

### Results on acm
L(X)=-18X
pmi:0.2779±0.01
db-index: 10.3369±0.05

### Results on dblp
L(X)=-1X
pmi:0.2048±0.04
db-index: 7.1357±0.73


# Sinkhorn 

In [None]:
%cd /content/BCOT
%load_ext autoreload
%autoreload 2

import sys

import scipy.sparse as sp
from bcot.utils import read_dataset
from bcot.bcot import BCOT
from sklearn.metrics import davies_bouldin_score
from bcot.metrics import average_pmi_per_cluster
import numpy as np
from time import time 

# Sweep: run BCOT with algorithm='sinkhorn' on each dataset for every
# combination of input scaling (M = -scaler * features) and regularisation
# value `reg`, and print the mean and standard deviation (over n_runs runs)
# of the Davies-Bouldin score and of average_pmi_per_cluster.
n_runs = 2
plot_block_structure = True

for dataset, n_clusters in [
  ('wiki', 23),
  ('pubmed', 3),
  ('acm', 18),
  ('dblp', 2),
  ('ohscal', 100),
  ('ng20', 140),
]:
  features, labels, _ = read_dataset(dataset, sparse=True)
  n, d = features.shape
  k = n_clusters

  # Scaling factors tried: none, number of rows, number of columns and
  # number of clusters.
  for scaler in [1, n, d, k]:
    # The grid previously listed 1 twice, which only repeated an identical
    # configuration.
    # NOTE(review): the duplicate may have been a typo for 10 -- confirm
    # and extend the grid if so.
    for reg in [.0001, .001, .01, .1, 1]:
      metrics = {'db': [], 'pmi': []}

      for _ in range(n_runs):
        # Rebuilt on every run.
        # NOTE(review): could be hoisted out of the run loop if BCOT is
        # confirmed not to modify its input.
        M = -features * scaler

        Z_p, W_p = BCOT(M, n_clusters, algorithm='sinkhorn', reg=reg, n_iter=100)

        # Hard assignments: index of the largest entry along the last axis.
        # NOTE(review): Z is scored against the rows of `features` and W
        # against the rows of `features.T`, so they presumably hold row and
        # column clusters respectively -- confirm against BCOT.
        Z = Z_p.argmax(-1)
        W = W_p.argmax(-1)

        # Best effort: a failing score is recorded as NaN instead of
        # aborting the sweep. `except Exception` (not a bare `except`) keeps
        # KeyboardInterrupt / SystemExit working so the cell can be stopped.
        try:
          db_score = davies_bouldin_score(features.toarray(), Z)
        except Exception:
          db_score = np.nan
        metrics['db'].append(db_score)
        metrics['pmi'].append(average_pmi_per_cluster(features.T, W))

      # `name` rather than `k`, to avoid visually shadowing the cluster count.
      results = {
        'mean': {name: np.mean(vals).round(4) for name, vals in metrics.items()},
        'std': {name: np.std(vals).round(2) for name, vals in metrics.items()},
      }

      means = results['mean']
      std = results['std']
      print(f'### Results on {dataset}')
      print(f'L(X)=-{scaler}X')
      # Report reg as well; without it the result blocks for one
      # (dataset, scaler) pair are indistinguishable.
      print(f'reg={reg}')
      print(f"pmi:{means['pmi']}±{std['pmi']}")
      print(f"db-index: {means['db']}±{std['db']}")
      print()