In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
from pandas import merge, concat, read_csv, DataFrame, Series, isnull, date_range, read_excel, MultiIndex, cut
from scipy.cluster import hierarchy
from scipy.spatial import distance
from sklearn import cluster,metrics
from sklearn.metrics.pairwise import pairwise_distances

In [None]:
# distance metrics

def fS1(x):
    """Return ``-sum(x_i * ln(x_i))`` taken over the strictly positive entries of ``x``.

    When ``x`` is a probability vector this is its Shannon entropy in nats.
    Entries that are not strictly positive are left out of the sum, so
    ``log`` is never evaluated at zero.

    Parameters
    ----------
    x : numpy array
        Must support elementwise comparison and boolean-mask indexing.

    Returns
    -------
    Scalar value of the sum described above.
    """
    positive = x[x > 0.]
    return -np.sum(positive * np.log(positive))

def fS2(x, y):
    """Return ``fS1`` of the midpoint of ``x`` and ``y`` minus the mean of their ``fS1`` values.

    With ``fS1`` as an entropy, this has the form of the Jensen-Shannon
    divergence between ``x`` and ``y``. The expression is symmetric in its
    two arguments.

    Parameters
    ----------
    x, y : numpy arrays of equal shape

    Returns
    -------
    Scalar value of ``fS1(0.5*(x+y)) - 0.5*(fS1(x) + fS1(y))``.
    """
    mixture_entropy = fS1( 0.5*(x+y) )
    mean_entropy = 0.5*( fS1(x) + fS1(y) )
    return mixture_entropy - mean_entropy

In [None]:
# Load the example exposure table; the first CSV column is used as the index.

fname_in = './data/example_exposure_input.csv'
df = read_csv(
    fname_in,
    index_col=0,
)
# Preview the first rows.
df.head()

In [None]:
# Parse and process the data into an incidence matrix: one row per target,
# one column per source. Each row is that target's exposure vector,
# normalised so that its entries sum to 1.

# Sorted list of all sources: fixes the column order of every exposure vector.
ls = sorted( df['source'].unique() )
ns = len(ls)
print ('sources', ns)

D = {}
for x in df['target'].unique():

    # Rows of this target, indexed by source. verify_integrity=True raises if
    # the same source appears more than once for a single target.
    temp = df[df['target'] == x].set_index('source', verify_integrity=True)
    # Align to the full source list; sources missing for this target become 0.
    temp = temp['value'].reindex(ls).fillna(0.).values

    # NOTE(review): if all values of a target sum to 0 this division yields a
    # NaN row -- confirm the input never contains such targets.
    D[x] = temp/temp.sum()

lt = sorted(D)
nt = len(lt)
print ('targets', nt)

# np.vstack replaces np.row_stack, a deprecated alias of the same function.
data = np.vstack([D[_] for _ in lt])
data = DataFrame(data, index=lt, columns=ls)
print (data.shape)
data.head()

In [None]:
# Distance between every pair of exposure vectors (rows of `data`),
# evaluated with the fS2 metric defined earlier.

MD = pairwise_distances(X=data, metric=fS2)
print(MD.shape)

In [None]:
# Agglomerative (hierarchical) clustering with complete linkage.

# linkage() is given the condensed form of the distance matrix:
# the entries strictly above the diagonal of MD, taken row by row.
iu = np.triu_indices_from(MD, k=1)
MDc = MD[iu]
Z = hierarchy.linkage(MDc, method='complete')

In [None]:
# Draw the dendrogram and mark the cut height with a dashed horizontal line.

AGGLO_CUT = 0.5  # cut height, as a fraction of the largest value in Z[:,2]

# Absolute cut height, shared by the colouring threshold and the dashed line.
cut_height = AGGLO_CUT*max(Z[:,2])

fig, ax = plt.subplots(figsize=(13, 10))
labels = data.index
res = hierarchy.dendrogram(
    Z,
    labels=labels,
    leaf_rotation=90,
    ax=ax,
    color_threshold=cut_height,
)

ax.axhline(cut_height, linestyle='--', color='k')

# Make the leaf labels bold.
for tick_label in ax.get_xticklabels():
    tick_label.set_weight('bold')


In [None]:
# Assign each target to a cluster by cutting the tree at the height set
# previously (AGGLO_CUT times the largest value in Z[:,2]).

# fcluster numbers clusters from 1; subtracting 1 makes the ids start at 0.
data_cl = DataFrame(hierarchy.fcluster(Z, AGGLO_CUT*max(Z[:,2]), criterion='distance')-1,
                    index=data.index,columns=['agglomerative'])
N_cluster = len( data_cl['agglomerative'].unique() )
print ('number of clusters: {}'.format( N_cluster ) )
# Number of targets per cluster, as a Series named 'COUNT' indexed by cluster id.
# This replaces a call to `mydf.countcol`: no module `mydf` is imported in this
# notebook, so that call raised NameError on a fresh kernel.
# NOTE(review): assumes the helper returned one row per cluster id with its
# count in a 'COUNT' column -- confirm against the original helper if available.
clusizes = data_cl.groupby('agglomerative').size().rename('COUNT')
# Keep only the assignment column, as a Series indexed by target.
data_cl = data_cl['agglomerative']
data_cl.head()

In [None]:
# Save the cluster assignment, the cluster sizes and the pairwise distance
# matrix as CSV files.

OUT_DIR = './data/exposure_out'
# to_csv does not create missing directories; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs(OUT_DIR, exist_ok=True)

DataFrame({'cluster': data_cl}).to_csv(os.path.join(OUT_DIR, 'cluster_assignment.csv'))
DataFrame({'size': clusizes}).to_csv(os.path.join(OUT_DIR, 'cluster_size.csv'))
DataFrame(MD, index=data.index, columns=data.index).to_csv(os.path.join(OUT_DIR, 'pairwise_distance.csv'))