In [14]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import pdist
from functools import partial

In [189]:
data_path = '/pscratch/sd/a/azaidi/new-dt/nsys/spec_freq_df.pkl'
df = pd.read_pickle(data_path)
df = df.sort_values('length_key').reset_index(drop=True)
df['num_unique'] = df.unique_species.map(len)
df.shape

(295617, 3)

In [171]:
df.head(2)

Unnamed: 0,length_key,unique_species,num_unique
0,20,[30420],1
1,23,[29255],1


In [168]:
spec_lbls_stacked = np.hstack(df.unique_species.values)
spec_lbls_stacked.shape, spec_lbls_stacked[:4]

((8240475,), array([30420, 29255, 29255, 33254]))

In [91]:
#plt.hist(df.length_key, bins=100);
#plt.hist(np.log10(df.length_key), bins=100);
#np.log10(df.length_key)[:10];

In [172]:
#this will bin our labels - since it's sorted we can easily extract indexes
bin_sizes , _ = np.histogram(np.log10(df.length_key), bins=100)
len(bin_sizes), bin_sizes[:4]

(100, array([1, 3, 2, 2]))

In [173]:
#this will get the start and stop index values for spec_lbls_stacked
idx_holder = [0,]
for count in bin_sizes:
    holder_len = len(idx_holder)
    idx_holder.append(idx_holder[holder_len - 1] + count)
idx_holder[:3], idx_holder[-3:]

([0, 1, 4], [295604, 295614, 295617])

In [174]:
def get_intersection_card(x,y):
    return len(np.intersect1d(x,y))

In [175]:
one_bin = spec_lbls_stacked[idx_holder[0]]
two_bin = spec_lbls_stacked[idx_holder[1]: idx_holder[2]]
one_bin, two_bin, get_intersection_card(one_bin, two_bin)

(30420, array([29255, 29255, 33254]), 0)

In [166]:
def get_binned_spec(idx_holder):
    bins, prev_idx = [], 0
    for idx in range(len(idx_holder[1:])):
        bins.append(spec_lbls_stacked[idx_holder[prev_idx]: idx_holder[idx]])
        prev_idx = idx
    bins.append(spec_lbls_stacked[idx_holder[-2]: idx_holder[-1]])
    return bins[1:]

In [176]:
#spec_lbls_stacked[idx_holder[-2]: idx_holder[-1]]

In [148]:
binned_species = get_binned_spec(idx_holder)
bin_lens = np.asarray([len(x) for x in binned_species])
len(binned_species), len(bin_lens) == len(bin_sizes)

(100, True)

In [55]:
#bin_func = partial(np.histogram, minlength = 65703)
#bin_matrix = np.stack(df.unique_species.map(bin_func).map(np.asarray))
#bin_matrix.shape

In [66]:
#def get_intersection_card(x,y):
    #return len(set(x).intersection(set(y)))
    #return len(np.intersect1d(x,y))
    #return len(x.intersection(y))

def get_union_card(x,y):
    return len(set(x).union(set(y)))

def get_jabccard_sim_score(x,y):
    intersect_card = get_intersection_card(x,y)
    union_card = get_union_card(x,y)
    return intersect_card/union_card

In [177]:
card_holder = np.zeros((len(binned_species), len(binned_species)))
card_holder.shape

for x in range(len(binned_species)):
    f = partial(get_intersection_card, binned_species[x])
    for y in range(x+1, len(binned_species)):
        card_val = f(binned_species[y])
        card_holder[x, y] = card_val

In [179]:
np.set_printoptions(precision=1, linewidth=140)
card_holder.astype(int)

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [190]:
#lets just check that there's some union between bins
card_holder.astype(int).sum(0)

array([    0,     0,     1,     2,     4,     4,     1,     7,     9,     6,    17,    19,    29,    28,    13,    13,    27,    53,    85,
          92,   128,   165,   179,   244,   268,   245,   270,   369,   450,   502,   555,   689,   748,   776,  1004,  1126,  1274,  1440,
        1510,  1301,   649,  1046,  1374,  1835,  2273,  2938,  3494,  4219,  5070,  6006,  7307,  7972,  9488, 10642, 12676, 14255, 16283,
       17405, 19964, 22670, 25447, 26680, 30005, 33431, 29989, 34117, 38234, 41765, 44535, 47192, 48894, 48959, 48357, 46775, 44045, 39953,
       22156, 19604, 17979, 15294, 12265, 10382,  9029,  8439,  7966,  5913,  6827,  8113,  9278,  8511,  9450,  7105,  5430,  4305,  2830,
        2252,  1213,   398,   174,    74])

In [193]:
#plot the same
#plt.hist(pd.DataFrame(card_holder).max(0));