## Analysis: unsupervised agglomerative Euclidean-distance based clustering of Oxbench and Homstrad, compared to all baselines.
This notebook was used to create the clustering results files used for plotting. By default, the results are not saved (which would overwrite the pre-packaged results) unless you uncomment the save lines.

This is computationally intensive and takes tens of minutes to hours to run on a 16 GB laptop.

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder
import scipy.cluster.hierarchy as hier
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import sys
import random
import os
import Levenshtein
import fastcluster as fc
# To allow imports from common directory
sys.path.append('../../')
from common.embedding_tools import lev_dist_matrix, lev_sim_matrix

%matplotlib inline
# Seed both Python's and NumPy's global random number generators with a fixed value.
random.seed(42)
np.random.seed(42)


In [3]:
# Load the Homstrad frame: sequences, phenotype labels, and the representation / baseline columns.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling; only load trusted files.
homstrad = pd.read_pickle("../../data/homstrad_w_baseline.pkl")
homstrad

Unnamed: 0,sequence,phenotype,is_train,is_test,phenotype_name,dataset,RGN,64_avg_hidden,64_final_hidden,64_final_cell,...,all_avg_hidden,all_final_cell,RGN_avg_hidden,RGN_final_cell,simple_freq_plus,simple_freq_and_len,2grams,3grams,tfidf_2grams,tfidf_3grams
20668,TKIVKVTGDYALLEFKDDLTGKGSICAETTAILKYLSEKGIKTHLV...,SAICAR_synt,True,True,family_name,homstrad,"[-0.025834012776613232, -0.05399760976433755, ...","[-0.11867085099220274, 0.11209627985954283, -0...","[-0.11023015528917313, 0.061099682003259666, -...","[-0.16718728840351105, 0.4113061130046845, -0....",...,"[-0.11867085099220274, 0.11209627985954283, -0...","[-0.16718728840351105, 0.4113061130046845, -0....","[-0.025834012776613232, -0.05399760976433755, ...","[-0.025834012776613232, -0.05399760976433755, ...","[0.05263157894736842, 0.014354066985645933, 0....","[0.05263157894736842, 0.014354066985645933, 0....","[1, 0, 0, 3, 0, 1, 0, 1, 0, 2, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.04026548332025606, 0.0, 0.0, 0.126771468660...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20669,SITKTELDGILPLVARGKVRDIYEVDAGTLLFVATDRISAYDVIME...,SAICAR_synt,True,True,family_name,homstrad,"[0.13893805444240567, -0.00019246361625846478,...","[-0.0845603421330452, 0.1088557317852974, -0.1...","[-0.048800613731145866, 0.09936266392469406, -...","[-0.08486960828304291, 0.5393546223640442, -0....",...,"[-0.0845603421330452, 0.1088557317852974, -0.1...","[-0.08486960828304291, 0.5393546223640442, -0....","[0.13893805444240567, -0.00019246361625846478,...","[0.13893805444240567, -0.00019246361625846478,...","[0.06040268456375839, 0.003355704697986577, 0....","[0.06040268456375839, 0.003355704697986577, 0....","[1, 0, 1, 2, 0, 1, 0, 0, 3, 0, 0, 1, 1, 1, 1, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.03242841044691856, 0.0, 0.03571512882338177...","[0.0, 0.0, 0.0, 0.04342385073884401, 0.0, 0.0,..."
20670,GVTVTSHREYLTQVNNSSGFVVNGGIVGNSLQLNPSNGTLFSWLPA...,bv,True,True,family_name,homstrad,"[-0.2107011675834656, 0.02743645198643208, -0....","[0.0143950954079628, 0.14230819046497345, -0.1...","[0.06821956485509872, 0.18811896443367004, -0....","[0.13774429261684418, 0.8134044408798218, -0.1...",...,"[0.0143950954079628, 0.14230819046497345, -0.1...","[0.13774429261684418, 0.8134044408798218, -0.1...","[-0.2107011675834656, 0.02743645198643208, -0....","[-0.2107011675834656, 0.02743645198643208, -0....","[0.08139534883720931, 0.011627906976744186, 0....","[0.08139534883720931, 0.011627906976744186, 0....","[0, 0, 2, 1, 0, 1, 0, 0, 0, 2, 1, 1, 1, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.10974000969089705, 0.052284840727...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20671,SSMDVTILSHCELSTELAVTVTIVVTSELVMPFTVGTWLRGVAQNW...,bv,True,True,family_name,homstrad,"[-0.1806732267141342, 0.039818041026592255, -0...","[0.011167618446052073, 0.1276816874742508, -0....","[0.102280355989933, 0.08621235936880113, -0.15...","[0.1770242303609848, 0.5491780638694763, -0.18...",...,"[0.011167618446052073, 0.1276816874742508, -0....","[0.1770242303609848, 0.5491780638694763, -0.18...","[-0.1806732267141342, 0.039818041026592255, -0...","[-0.1806732267141342, 0.039818041026592255, -0...","[0.09547738693467336, 0.020100502512562814, 0....","[0.09547738693467336, 0.020100502512562814, 0....","[2, 0, 1, 0, 0, 0, 0, 3, 0, 2, 1, 1, 0, 1, 1, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0.08210906485157748, 0.0, 0.04521553459337313...","[0.04794958851519534, 0.0, 0.0, 0.0, 0.0, 0.0,..."
20672,GAITVLHCELTAEIGVTDSIVVSSELVMPYTVGTWLRGVADNWSKY...,bv,True,True,family_name,homstrad,"[-0.1597319394350052, 0.06173299625515938, -0....","[0.0020451024174690247, 0.1371651589870453, -0...","[0.11017193645238876, 0.1414894163608551, -0.1...","[0.1897858828306198, 0.6649361252784729, -0.17...",...,"[0.0020451024174690247, 0.1371651589870453, -0...","[0.1897858828306198, 0.6649361252784729, -0.17...","[-0.1597319394350052, 0.06173299625515938, -0....","[-0.1597319394350052, 0.06173299625515938, -0....","[0.09183673469387756, 0.02040816326530612, 0.0...","[0.09183673469387756, 0.02040816326530612, 0.0...","[1, 0, 3, 1, 0, 3, 0, 2, 0, 2, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.04108234370949983, 0.0, 0.13573849390672096...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.048856888589355524..."
20673,AAATSLVYDTCYVTLTERATTSFQRQSFPTLKGMGDRAFQVVAFTI...,bv,True,True,family_name,homstrad,"[-0.053042154759168625, -0.06021036952733993, ...","[0.049401644617319114, 0.14267222583293915, -0...","[0.12890928983688354, 0.1764889657497406, -0.0...","[0.22785061597824094, 0.6632839441299438, -0.0...",...,"[0.049401644617319114, 0.14267222583293915, -0...","[0.22785061597824094, 0.6632839441299438, -0.0...","[-0.053042154759168625, -0.06021036952733993, ...","[-0.053042154759168625, -0.06021036952733993, ...","[0.10638297872340426, 0.014184397163120567, 0....","[0.10638297872340426, 0.014184397163120567, 0....","[3, 0, 0, 1, 2, 0, 0, 1, 0, 0, 0, 1, 3, 0, 1, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0.15765389725018375, 0.0, 0.0, 0.055150671786...","[0.05773102713212317, 0.0, 0.0, 0.0, 0.0, 0.0,..."
20674,SISQQTVWNQMATVRTPLNFDSSKQSFCQFSVDLLGGGISVDKTGD...,bv,True,True,family_name,homstrad,"[-0.1995099782943725, 0.05840854719281197, -0....","[0.014398585073649885, 0.09075652807950974, -0...","[0.07805189490318297, 0.061298828572034836, -0...","[0.1526552140712738, 0.4351421296596527, -0.15...",...,"[0.014398585073649885, 0.09075652807950974, -0...","[0.1526552140712738, 0.4351421296596527, -0.15...","[-0.1995099782943725, 0.05840854719281197, -0....","[-0.1995099782943725, 0.05840854719281197, -0....","[0.05405405405405406, 0.021621621621621623, 0....","[0.05405405405405406, 0.021621621621621623, 0....","[2, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.08568200526443001, 0.0746405865590282, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20675,KAIKAWTGYSVSKWTASCAAAEAKVTSAITISLPNELSSERNKQLK...,bv,True,True,family_name,homstrad,"[-0.06763646006584167, 0.10793761163949966, 0....","[-0.036410845816135406, 0.10667729377746582, -...","[0.018149545416235924, 0.20563025772571564, -0...","[0.047857824712991714, 0.8198820352554321, -0....",...,"[-0.036410845816135406, 0.10667729377746582, -...","[0.047857824712991714, 0.8198820352554321, -0....","[-0.06763646006584167, 0.10793761163949966, 0....","[-0.06763646006584167, 0.10793761163949966, 0....","[0.14093959731543623, 0.013422818791946308, 0....","[0.14093959731543623, 0.013422818791946308, 0....","[7, 0, 2, 1, 1, 0, 0, 2, 1, 2, 1, 0, 0, 0, 0, ...","[2, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[0.3652303532054341, 0.0, 0.1149278755014895, ...","[0.1163332517506968, 0.0, 0.06655393273580669,..."
20676,TMRAVKRMINTHLEHKRFALINSGNTNATAGTVQNLSNGIIQGDDI...,bv,True,True,family_name,homstrad,"[-0.1897951513528824, -0.028467625379562374, -...","[-0.0044975951313972464, 0.1004331111907959, -...","[0.01018504239618778, 0.1202806606888771, -0.1...","[0.016981407999992367, 0.625932514667511, -0.1...",...,"[-0.0044975951313972464, 0.1004331111907959, -...","[0.016981407999992367, 0.625932514667511, -0.1...","[-0.1897951513528824, -0.028467625379562374, -...","[-0.1897951513528824, -0.028467625379562374, -...","[0.07065217391304347, 0.005434782608695652, 0....","[0.07065217391304347, 0.005434782608695652, 0....","[1, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.043856354023303804, 0.0, 0.0, 0.0, 0.0, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20677,KPKLLYCSNGGYFLRILPDGTVDGTKDRSDQHIQLQLAAESIGEVY...,intb,True,True,family_name,homstrad,"[0.18745867908000946, -0.2971647083759308, -0....","[-0.049511555582284934, 0.10908841341733932, -...","[-0.07744679600000381, 0.06857240200042725, -0...","[-0.21872881054878235, 0.2799902558326721, -0....",...,"[-0.049511555582284934, 0.10908841341733932, -...","[-0.21872881054878235, 0.2799902558326721, -0....","[0.18745867908000946, -0.2971647083759308, -0....","[0.18745867908000946, -0.2971647083759308, -0....","[0.03937007874015748, 0.015748031496062992, 0....","[0.03937007874015748, 0.015748031496062992, 0....","[1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.05639104338020663, 0.0, 0.0, 0.118360686773...","[0.0, 0.0, 0.0, 0.06495775619934394, 0.0, 0.0,..."


In [4]:
# Number of sequences per phenotype in Homstrad.
homstrad.phenotype.value_counts()

glob                    41
az                      29
sermam                  27
igvar-l                 26
alpha-amylase           23
alpha-amylase_NC        23
alpha-amylase_C         21
igvar-h                 21
rrm                     20
sh3                     20
toxin                   20
toxin_2                 18
phoslip                 18
fabp                    17
proteasome              17
lipocalin               15
zf-CCHH                 15
kinase                  15
ldh                     14
fer4                    14
fn3                     14
gluts                   14
MHC_II_N                13
PH                      13
fer2                    13
response_reg            13
uce                     13
cys                     13
sdr                     13
asp                     13
                        ..
arrestin_C               2
Ribosomal_L2             2
SRP54M                   2
SLT_beta                 2
PBP                      2
ghf2                     2
R

In [4]:
# List every column available in the Homstrad frame.
homstrad.columns

Index(['sequence', 'phenotype', 'is_train', 'is_test', 'phenotype_name',
       'dataset', 'RGN', '64_avg_hidden', '64_final_hidden', '64_final_cell',
       '256_avg_hidden', '256_final_cell', 'avg_hidden', 'final_hidden',
       'final_cell', 'arnold_original_3_7', 'arnold_scrambled_3_5',
       'arnold_random_3_7', 'arnold_uniform_4_1', 'all_64', 'all_256',
       'all_1900', 'all_avg_hidden', 'all_final_cell', 'RGN_avg_hidden',
       'RGN_final_cell', 'simple_freq_plus', 'simple_freq_and_len', '2grams',
       '3grams', 'tfidf_2grams', 'tfidf_3grams'],
      dtype='object')

In [2]:
# Load the Oxbench frame: sequences, phenotype labels, and the representation / baseline columns.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling; only load trusted files.
oxbench = pd.read_pickle("../../data/oxbench_w_baseline.pkl")
oxbench

Unnamed: 0,sequence,phenotype,is_train,is_test,phenotype_name,dataset,RGN,64_avg_hidden,64_final_hidden,64_final_cell,...,all_avg_hidden,all_final_cell,RGN_avg_hidden,RGN_final_cell,simple_freq_plus,simple_freq_and_len,2grams,3grams,tfidf_2grams,tfidf_3grams
27393,MDLWQLLLTLALAGSSDAFSGSEATAAILSRAPWSLQSVNPGLKTN...,22,True,True,family_name,oxbench,"[0.003600145224481821, 0.025242097675800323, -...","[-0.00834772549569607, 0.0889512151479721, -0....","[-0.04558498039841652, 0.03235473856329918, -0...","[-0.09217455983161926, 0.12753142416477206, -0...",...,"[-0.00834772549569607, 0.0889512151479721, -0....","[-0.09217455983161926, 0.12753142416477206, -0...","[0.003600145224481821, 0.025242097675800323, -...","[0.003600145224481821, 0.025242097675800323, -...","[0.05238095238095238, 0.025396825396825397, 0....","[0.05238095238095238, 0.025396825396825397, 0....","[3, 1, 4, 2, 1, 5, 1, 2, 1, 3, 0, 1, 3, 2, 0, ...","[0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0.05100042974133817, 0.028602491303131788, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.026203018941676013..."
27394,ASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGAL...,22,True,True,family_name,oxbench,"[0.25183904170989985, -0.052813358604907996, 0...","[-0.0014204996405169368, 0.08998129516839981, ...","[0.027559755370020863, 0.08061934262514114, -0...","[0.05026482045650482, 0.4808682799339294, -0.2...",...,"[-0.0014204996405169368, 0.08998129516839981, ...","[0.05026482045650482, 0.4808682799339294, -0.2...","[0.25183904170989985, -0.052813358604907996, 0...","[0.25183904170989985, -0.052813358604907996, 0...","[0.03939393939393939, 0.02727272727272727, 0.0...","[0.03939393939393939, 0.02727272727272727, 0.0...","[1, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 3, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0.026867117701836703, 0.0, 0.0, 0.0, 0.0, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
27395,MVCLKLPGGSCMTALTVTLMVLSSPLALAGDTRPRFLWQLKFECHF...,22,True,True,family_name,oxbench,"[-0.043975099921226495, -0.005007729399949311,...","[0.02654711529612541, 0.17966338992118835, -0....","[-0.028605015948414806, 0.09775915741920473, 0...","[-0.07925857603549957, 0.34497514367103577, 0....",...,"[0.02654711529612541, 0.17966338992118835, -0....","[-0.07925857603549957, 0.34497514367103577, 0....","[-0.043975099921226495, -0.005007729399949311,...","[-0.043975099921226495, -0.005007729399949311,...","[0.041353383458646614, 0.02631578947368421, 0....","[0.041353383458646614, 0.02631578947368421, 0....","[1, 0, 0, 1, 0, 3, 0, 0, 0, 2, 0, 0, 0, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.033501708829874785, 0.0, 0.0, 0.03691750482...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
27396,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,22,True,True,family_name,oxbench,"[0.14047604799270627, 0.008294813334941864, 0....","[-0.03485559672117233, 0.1692134439945221, -0....","[-0.11898050457239152, 0.09711188822984697, -0...","[-0.19364985823631287, 0.5102627277374268, -0....",...,"[-0.03485559672117233, 0.1692134439945221, -0....","[-0.19364985823631287, 0.5102627277374268, -0....","[0.14047604799270627, 0.008294813334941864, 0....","[0.14047604799270627, 0.008294813334941864, 0....","[0.06299212598425197, 0.011811023622047244, 0....","[0.06299212598425197, 0.011811023622047244, 0....","[1, 0, 0, 2, 0, 0, 0, 2, 1, 2, 0, 2, 1, 2, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0342783545244663, 0.0, 0.0, 0.0755466728631...","[0.0, 0.0, 0.0, 0.044676579619048744, 0.0, 0.0..."
27397,MARFVALVLLGLLSLSGLDAIQRPPKIQVYSRHPPEDGKPNYLNCY...,22,True,True,family_name,oxbench,"[0.018975054845213887, 0.10635514557361601, 0....","[0.006666326429694891, 0.17566773295402527, -0...","[0.06674619764089584, 0.07585495710372925, -0....","[0.12799696624279022, 0.394978791475296, -0.28...",...,"[0.006666326429694891, 0.17566773295402527, -0...","[0.12799696624279022, 0.394978791475296, -0.28...","[0.018975054845213887, 0.10635514557361601, 0....","[0.018975054845213887, 0.10635514557361601, 0....","[0.03389830508474576, 0.01694915254237288, 0.0...","[0.03389830508474576, 0.01694915254237288, 0.0...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.06161104452264824, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
27398,MARSVTLVFLVLVSLTGLYAIQKTPQIQVYSRHPPENGKPNILNCY...,22,True,True,family_name,oxbench,"[0.002841918263584376, 0.050634853541851044, 0...","[0.0014422480016946793, 0.15699413418769834, -...","[0.009508081711828709, 0.06540079414844513, -0...","[0.01557605527341366, 0.5317347645759583, -0.2...",...,"[0.0014422480016946793, 0.15699413418769834, -...","[0.01557605527341366, 0.5317347645759583, -0.2...","[0.002841918263584376, 0.050634853541851044, 0...","[0.002841918263584376, 0.050634853541851044, 0...","[0.04201680672268908, 0.01680672268907563, 0.0...","[0.04201680672268908, 0.01680672268907563, 0.0...","[0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.09264823152933792, 0.0, 0.060680790967...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
27399,MSRSVALAVLALLSLSGLEAIQRTPKIQVYSRHPAENGKSNFLNCY...,22,True,True,family_name,oxbench,"[-0.0021714118774980307, 0.055213350802659995,...","[0.004032405093312264, 0.20427122712135315, -0...","[0.06158576533198356, 0.10102695226669313, -0....","[0.0948815494775772, 0.6061667203903198, -0.25...",...,"[0.004032405093312264, 0.20427122712135315, -0...","[0.0948815494775772, 0.6061667203903198, -0.25...","[-0.0021714118774980307, 0.055213350802659995,...","[-0.0021714118774980307, 0.055213350802659995,...","[0.05042016806722689, 0.01680672268907563, 0.0...","[0.05042016806722689, 0.01680672268907563, 0.0...","[0, 1, 0, 1, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.09790075337444082, 0.0, 0.064120977303...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
27400,MARSVTVIFLVLVSLAVVLAIQKTPQIQVYSRHPPENGKPNFLNCY...,22,True,True,family_name,oxbench,"[-0.005820003803819418, 0.06477358937263489, 0...","[-0.00240542134270072, 0.14602795243263245, -0...","[0.0586380772292614, 0.08074440807104111, -0.1...","[0.09595677256584167, 0.5423543453216553, -0.2...",...,"[-0.00240542134270072, 0.14602795243263245, -0...","[0.09595677256584167, 0.5423543453216553, -0.2...","[-0.005820003803819418, 0.06477358937263489, 0...","[-0.005820003803819418, 0.06477358937263489, 0...","[0.04201680672268908, 0.01680672268907563, 0.0...","[0.04201680672268908, 0.01680672268907563, 0.0...","[0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.09362422835490287, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
27401,ASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGAL...,22,True,True,family_name,oxbench,"[0.24240769445896146, -0.046102594584226615, 0...","[-0.01980241201817989, 0.08724614977836609, -0...","[-0.0008938438841141759, 0.07596740871667862, ...","[-0.0016647623851895332, 0.4509478211402893, -...",...,"[-0.01980241201817989, 0.08724614977836609, -0...","[-0.0016647623851895332, 0.4509478211402893, -...","[0.24240769445896146, -0.046102594584226615, 0...","[0.24240769445896146, -0.046102594584226615, 0...","[0.03939393939393939, 0.02727272727272727, 0.0...","[0.03939393939393939, 0.02727272727272727, 0.0...","[1, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 3, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0.02701248717806643, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
27402,METPAWPRVPRPETAVARTLLLGWVFAQVAGASGTTNTVAAYNLTW...,22,True,True,family_name,oxbench,"[0.11495273560285567, -0.001244348124600947, 0...","[0.030945295467972755, 0.19996538758277893, -0...","[-0.08462587743997574, 0.07004468888044357, -0...","[-0.15981990098953247, 0.28453075885772705, -0...",...,"[0.030945295467972755, 0.19996538758277893, -0...","[-0.15981990098953247, 0.28453075885772705, -0...","[0.11495273560285567, -0.001244348124600947, 0...","[0.11495273560285567, -0.001244348124600947, 0...","[0.05460750853242321, 0.017064846416382253, 0....","[0.05460750853242321, 0.017064846416382253, 0....","[1, 0, 0, 0, 0, 4, 0, 1, 1, 0, 0, 0, 0, 1, 2, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.030361207746987518, 0.0, 0.0, 0.0, 0.0, 0.1...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [3]:
# Number of sequences per phenotype in Oxbench.
oxbench.phenotype.value_counts()

12     83
136    34
588    30
581    16
620    15
22     15
86     12
10     12
429    11
24     11
43     11
140    10
57     10
437    10
139     9
78      9
307     9
553     9
104     8
341     7
93      7
490     7
61      6
197     6
92      6
74      6
66      6
414     6
34      6
30      6
       ..
123     2
268     2
232     2
439     2
604     2
265     2
181     2
63      2
676     2
649     2
7       2
409     2
146     2
83      2
117     2
4       1
452     1
370     1
361     1
259     1
231     1
13      1
542     1
275     1
641     1
77      1
134     1
25      1
84      1
137     1
Name: phenotype, Length: 180, dtype: int64

The pseudocode for the analysis is as follows:

Oxbench

1. Let the phenotype (family name) be the class label
2. Altogether (as a single clustering):
    - Perform Agglomerative clustering on reps, baselines, and on Levenshtein
    - Compute clustering accuracy using sklearn metrics (various)

Homstrad

1. Assign each phenotype name a unique class index integer
2. Altogether (as a single clustering):
    - Perform Agglomerative clustering on reps, baselines, and on Levenshtein
    - Compute clustering accuracy using sklearn metrics (various)

The resulting metrics should be saved to a .csv file for making figures later.

In [5]:
#  Write some helpers I can use for all of these

def cluster(df, labels, n_clusters=2):
    """Agglomeratively cluster each column of ``df`` and score it against ``labels``.

    Every column is clustered on its own with average linkage:

    * a column of strings is converted to a pairwise distance matrix with
      ``lev_dist_matrix`` (presumably Levenshtein distances -- see
      ``common.embedding_tools``) and clustered as a precomputed metric;
    * a column whose cells are sequences of numbers is stacked into a 2-D
      array and clustered with Euclidean distance.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per representation / baseline to evaluate.
    labels : array-like
        Ground-truth class label for each row of ``df``.
    n_clusters : int, default 2
        Number of clusters requested from the agglomerative clustering.

    Returns
    -------
    pandas.DataFrame
        Rows ``ag_ari`` (adjusted Rand), ``ag_fmi`` (Fowlkes-Mallows) and
        ``ag_ami`` (adjusted mutual information); one column per column of ``df``.

    Raises
    ------
    TypeError
        If a column holds neither strings nor sequences of numbers. Previously
        this case only printed a message and then clustered stale data from the
        preceding column (or failed with a NameError on the first column).
    """
    numeric_types = (float, int, np.floating, np.integer)
    col_names = []
    ag_ari = []
    ag_fmi = []
    ag_ami = []

    for name in df:
        col = df[name]
        first = col.iloc[0]

        if isinstance(first, str):
            # Raw sequences: cluster on a precomputed pairwise distance matrix.
            dist_or_rep = lev_dist_matrix(col.values)
            metric = "precomputed"
        elif isinstance(first[0], numeric_types):
            # Fixed-length vectors: stack the per-row lists into an (n_rows, dim) array.
            dist_or_rep = np.array(col.values.tolist())
            metric = "euclidean"
        else:
            raise TypeError(
                "Column {!r} holds neither strings nor numeric vectors "
                "(first cell has type {})".format(name, type(first).__name__)
            )

        # Newer scikit-learn releases take the distance via `metric=`; older ones
        # (like the one this notebook was written against) only know `affinity=`.
        try:
            model = AgglomerativeClustering(
                n_clusters=n_clusters, metric=metric, linkage="average"
            )
        except TypeError:
            model = AgglomerativeClustering(
                n_clusters=n_clusters, affinity=metric, linkage="average"
            )
        ag_labels = model.fit_predict(dist_or_rep)

        col_names.append(col.name)
        ag_ari.append(metrics.adjusted_rand_score(labels, ag_labels))
        ag_fmi.append(metrics.fowlkes_mallows_score(labels, ag_labels))
        ag_ami.append(metrics.adjusted_mutual_info_score(labels, ag_labels))

    results = pd.DataFrame(
        [ag_ari, ag_fmi, ag_ami],
        index=["ag_ari", "ag_fmi", "ag_ami"],
        columns=col_names,
    )
    return results
    

In [6]:
# Positional indices of every column; positions 0-5 are the sequence plus metadata.
column_idxs = list(range(oxbench.shape[1]))
# Keep only the rows whose RGN representation is free of NaNs.
filtered = oxbench.loc[~np.isnan(np.array(oxbench["RGN"].tolist())).any(axis=1)]
# Features to cluster: the raw sequence (position 0) and every representation column (6 onward).
X = filtered.iloc[:, [0] + column_idxs[6:]]
labels = filtered["phenotype"]
display(X)
display(labels)
# Request one cluster per distinct family label.
ox_results = cluster(X, labels, n_clusters=labels.nunique(dropna=False))
display(ox_results)

Unnamed: 0,sequence,RGN,64_avg_hidden,64_final_hidden,64_final_cell,256_avg_hidden,256_final_cell,avg_hidden,final_hidden,final_cell,...,all_avg_hidden,all_final_cell,RGN_avg_hidden,RGN_final_cell,simple_freq_plus,simple_freq_and_len,2grams,3grams,tfidf_2grams,tfidf_3grams
27393,MDLWQLLLTLALAGSSDAFSGSEATAAILSRAPWSLQSVNPGLKTN...,"[0.003600145224481821, 0.025242097675800323, -...","[-0.00834772549569607, 0.0889512151479721, -0....","[-0.04558498039841652, 0.03235473856329918, -0...","[-0.09217455983161926, 0.12753142416477206, -0...","[0.06582998484373093, -0.010285131633281708, 0...","[-0.22898444533348086, 0.2156061679124832, 0.6...","[0.005700650159269571, -0.07126913219690323, 0...","[0.006025296170264482, -0.016268398612737656, ...","[3.2503571510314937, -3.0733962059021, 2.65597...",...,"[-0.00834772549569607, 0.0889512151479721, -0....","[-0.09217455983161926, 0.12753142416477206, -0...","[0.003600145224481821, 0.025242097675800323, -...","[0.003600145224481821, 0.025242097675800323, -...","[0.05238095238095238, 0.025396825396825397, 0....","[0.05238095238095238, 0.025396825396825397, 0....","[3, 1, 4, 2, 1, 5, 1, 2, 1, 3, 0, 1, 3, 2, 0, ...","[0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0.05100042974133817, 0.028602491303131788, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.026203018941676013..."
27394,ASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGAL...,"[0.25183904170989985, -0.052813358604907996, 0...","[-0.0014204996405169368, 0.08998129516839981, ...","[0.027559755370020863, 0.08061934262514114, -0...","[0.05026482045650482, 0.4808682799339294, -0.2...","[0.003477461636066437, -0.003463610773906112, ...","[-0.7014604806900024, 0.06881146878004074, 0.7...","[0.005707088392227888, -0.21211794018745425, 0...","[-0.00029585888842120767, -0.2588478326797485,...","[-0.05024346709251404, -0.3519186973571777, 8....",...,"[-0.0014204996405169368, 0.08998129516839981, ...","[0.05026482045650482, 0.4808682799339294, -0.2...","[0.25183904170989985, -0.052813358604907996, 0...","[0.25183904170989985, -0.052813358604907996, 0...","[0.03939393939393939, 0.02727272727272727, 0.0...","[0.03939393939393939, 0.02727272727272727, 0.0...","[1, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 3, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0.026867117701836703, 0.0, 0.0, 0.0, 0.0, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
27395,MVCLKLPGGSCMTALTVTLMVLSSPLALAGDTRPRFLWQLKFECHF...,"[-0.043975099921226495, -0.005007729399949311,...","[0.02654711529612541, 0.17966338992118835, -0....","[-0.028605015948414806, 0.09775915741920473, 0...","[-0.07925857603549957, 0.34497514367103577, 0....","[0.14094708859920502, -0.018250830471515656, 0...","[-0.14241290092468262, -0.07946618646383287, 0...","[0.0059544784016907215, -0.07907302677631378, ...","[0.003999337088316679, -0.027300657704472545, ...","[8.110466003417969, -8.007341384887695, 5.3289...",...,"[0.02654711529612541, 0.17966338992118835, -0....","[-0.07925857603549957, 0.34497514367103577, 0....","[-0.043975099921226495, -0.005007729399949311,...","[-0.043975099921226495, -0.005007729399949311,...","[0.041353383458646614, 0.02631578947368421, 0....","[0.041353383458646614, 0.02631578947368421, 0....","[1, 0, 0, 1, 0, 3, 0, 0, 0, 2, 0, 0, 0, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.033501708829874785, 0.0, 0.0, 0.03691750482...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
27396,MAISGVPVLGFFIIAVLMSAQESWAIKEEHVIIQAEFYLNPDQSGE...,"[0.14047604799270627, 0.008294813334941864, 0....","[-0.03485559672117233, 0.1692134439945221, -0....","[-0.11898050457239152, 0.09711188822984697, -0...","[-0.19364985823631287, 0.5102627277374268, -0....","[-0.11201996356248856, -0.011148834601044657, ...","[0.048699859529733665, 0.06631530821323395, 0....","[0.005423860624432564, -0.0654156431555748, 0....","[0.0024862033315002923, -0.07649094611406326, ...","[2.6525259017944336, -6.285686016082764, 3.079...",...,"[-0.03485559672117233, 0.1692134439945221, -0....","[-0.19364985823631287, 0.5102627277374268, -0....","[0.14047604799270627, 0.008294813334941864, 0....","[0.14047604799270627, 0.008294813334941864, 0....","[0.06299212598425197, 0.011811023622047244, 0....","[0.06299212598425197, 0.011811023622047244, 0....","[1, 0, 0, 2, 0, 0, 0, 2, 1, 2, 0, 2, 1, 2, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0342783545244663, 0.0, 0.0, 0.0755466728631...","[0.0, 0.0, 0.0, 0.044676579619048744, 0.0, 0.0..."
27397,MARFVALVLLGLLSLSGLDAIQRPPKIQVYSRHPPEDGKPNYLNCY...,"[0.018975054845213887, 0.10635514557361601, 0....","[0.006666326429694891, 0.17566773295402527, -0...","[0.06674619764089584, 0.07585495710372925, -0....","[0.12799696624279022, 0.394978791475296, -0.28...","[0.1055820882320404, -0.03934027627110481, 0.1...","[-0.10029956698417664, 0.1059226542711258, 0.7...","[0.009937506169080734, -0.13556525111198425, 0...","[0.0003730765311047435, -0.04427051916718483, ...","[2.1458282470703125, -0.07997488975524902, 7.3...",...,"[0.006666326429694891, 0.17566773295402527, -0...","[0.12799696624279022, 0.394978791475296, -0.28...","[0.018975054845213887, 0.10635514557361601, 0....","[0.018975054845213887, 0.10635514557361601, 0....","[0.03389830508474576, 0.01694915254237288, 0.0...","[0.03389830508474576, 0.01694915254237288, 0.0...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.06161104452264824, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
27398,MARSVTLVFLVLVSLTGLYAIQKTPQIQVYSRHPPENGKPNILNCY...,"[0.002841918263584376, 0.050634853541851044, 0...","[0.0014422480016946793, 0.15699413418769834, -...","[0.009508081711828709, 0.06540079414844513, -0...","[0.01557605527341366, 0.5317347645759583, -0.2...","[0.03838978335261345, -0.034490175545215614, 0...","[-0.04301981255412102, 0.009146473370492458, 0...","[0.0098629891872406, -0.12381474673748015, 0.1...","[0.0007589790038764478, -0.034913647919893265,...","[2.0451877117156982, -0.06663747131824492, 9.1...",...,"[0.0014422480016946793, 0.15699413418769834, -...","[0.01557605527341366, 0.5317347645759583, -0.2...","[0.002841918263584376, 0.050634853541851044, 0...","[0.002841918263584376, 0.050634853541851044, 0...","[0.04201680672268908, 0.01680672268907563, 0.0...","[0.04201680672268908, 0.01680672268907563, 0.0...","[0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.09264823152933792, 0.0, 0.060680790967...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
27399,MSRSVALAVLALLSLSGLEAIQRTPKIQVYSRHPAENGKSNFLNCY...,"[-0.0021714118774980307, 0.055213350802659995,...","[0.004032405093312264, 0.20427122712135315, -0...","[0.06158576533198356, 0.10102695226669313, -0....","[0.0948815494775772, 0.6061667203903198, -0.25...","[0.0900874212384224, -0.04993068054318428, 0.1...","[-0.10339823365211487, 0.10406378656625748, 0....","[0.009906668215990068, -0.08453499525785446, 0...","[0.0005560183199122548, -0.018690047785639763,...","[2.0751473903656006, -0.03439563512802124, 7.6...",...,"[0.004032405093312264, 0.20427122712135315, -0...","[0.0948815494775772, 0.6061667203903198, -0.25...","[-0.0021714118774980307, 0.055213350802659995,...","[-0.0021714118774980307, 0.055213350802659995,...","[0.05042016806722689, 0.01680672268907563, 0.0...","[0.05042016806722689, 0.01680672268907563, 0.0...","[0, 1, 0, 1, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.09790075337444082, 0.0, 0.064120977303...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
27400,MARSVTVIFLVLVSLAVVLAIQKTPQIQVYSRHPPENGKPNFLNCY...,"[-0.005820003803819418, 0.06477358937263489, 0...","[-0.00240542134270072, 0.14602795243263245, -0...","[0.0586380772292614, 0.08074440807104111, -0.1...","[0.09595677256584167, 0.5423543453216553, -0.2...","[0.0832686498761177, -0.030621949583292007, 0....","[-0.051673129200935364, 0.07451008260250093, 0...","[0.009634437970817087, -0.08685997873544693, 0...","[0.0005554933450184761, -0.021484104916453358,...","[1.7872135639190674, -0.04302289336919785, 9.1...",...,"[-0.00240542134270072, 0.14602795243263245, -0...","[0.09595677256584167, 0.5423543453216553, -0.2...","[-0.005820003803819418, 0.06477358937263489, 0...","[-0.005820003803819418, 0.06477358937263489, 0...","[0.04201680672268908, 0.01680672268907563, 0.0...","[0.04201680672268908, 0.01680672268907563, 0.0...","[0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.09362422835490287, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
27401,ASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGAL...,"[0.24240769445896146, -0.046102594584226615, 0...","[-0.01980241201817989, 0.08724614977836609, -0...","[-0.0008938438841141759, 0.07596740871667862, ...","[-0.0016647623851895332, 0.4509478211402893, -...","[0.007965680211782455, -0.003561092540621757, ...","[-0.6581687927246094, 0.08237023651599884, 0.7...","[0.005678036250174046, -0.21439264714717865, 0...","[-0.0003516568976920098, -0.2602613866329193, ...","[-0.059789001941680915, -0.3547399938106537, 8...",...,"[-0.01980241201817989, 0.08724614977836609, -0...","[-0.0016647623851895332, 0.4509478211402893, -...","[0.24240769445896146, -0.046102594584226615, 0...","[0.24240769445896146, -0.046102594584226615, 0...","[0.03939393939393939, 0.02727272727272727, 0.0...","[0.03939393939393939, 0.02727272727272727, 0.0...","[1, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 3, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0.02701248717806643, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
27402,METPAWPRVPRPETAVARTLLLGWVFAQVAGASGTTNTVAAYNLTW...,"[0.11495273560285567, -0.001244348124600947, 0...","[0.030945295467972755, 0.19996538758277893, -0...","[-0.08462587743997574, 0.07004468888044357, -0...","[-0.15981990098953247, 0.28453075885772705, -0...","[0.09553498029708862, -0.022166106849908832, 0...","[0.29506951570510864, 0.04478519037365914, 0.2...","[0.0042785643599927425, -0.02433863840997219, ...","[0.007458804175257683, -0.08045482635498047, 0...","[5.051926612854004, -6.714621067047119, 4.0199...",...,"[0.030945295467972755, 0.19996538758277893, -0...","[-0.15981990098953247, 0.28453075885772705, -0...","[0.11495273560285567, -0.001244348124600947, 0...","[0.11495273560285567, -0.001244348124600947, 0...","[0.05460750853242321, 0.017064846416382253, 0....","[0.05460750853242321, 0.017064846416382253, 0....","[1, 0, 0, 0, 0, 4, 0, 1, 1, 0, 0, 0, 0, 1, 2, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.030361207746987518, 0.0, 0.0, 0.0, 0.0, 0.1...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


27393     22
27394     22
27395     22
27396     22
27397     22
27398     22
27399     22
27400     22
27401     22
27402     22
27403     22
27404     22
27405     22
27406     22
27407     22
27408    150
27409    150
27410    150
27411    150
27412     57
27413     57
27414     57
27415     57
27416     57
27417     57
27418     57
27419     57
27420     57
27421     57
27422    547
        ... 
28174    136
28175    136
28176    136
28177    136
28178    136
28179    136
28180    136
28181    136
28182    136
28183    442
28184    442
28185    442
28186    620
28187    620
28188    620
28189    620
28190    620
28191    620
28192    620
28193    620
28194    620
28195    620
28196    620
28197    620
28198    620
28199    620
28200    620
28201    108
28202    108
28203    108
Name: phenotype, Length: 811, dtype: object

Unnamed: 0,sequence,RGN,64_avg_hidden,64_final_hidden,64_final_cell,256_avg_hidden,256_final_cell,avg_hidden,final_hidden,final_cell,...,all_avg_hidden,all_final_cell,RGN_avg_hidden,RGN_final_cell,simple_freq_plus,simple_freq_and_len,2grams,3grams,tfidf_2grams,tfidf_3grams
ag_ari,0.04486,0.429791,0.340007,0.218473,0.244331,0.51164,0.223316,0.628294,0.511814,0.221697,...,0.605307,0.238444,0.436609,0.204775,0.159567,0.451528,0.043336,0.035045,0.081889,0.299176
ag_fmi,0.189096,0.455848,0.353363,0.251591,0.25778,0.532204,0.305762,0.643243,0.543605,0.311505,...,0.61943,0.3223,0.461414,0.301568,0.176961,0.462657,0.186897,0.176453,0.172462,0.336265
ag_ami,0.182135,0.512596,0.415784,0.322424,0.345796,0.471641,0.378968,0.579816,0.540769,0.435509,...,0.569639,0.436607,0.519116,0.424276,0.270995,0.486758,0.170448,0.136768,0.347421,0.55746


In [7]:
# Show the Oxbench score table again (the same frame displayed at the end of the previous cell).
ox_results

Unnamed: 0,sequence,RGN,64_avg_hidden,64_final_hidden,64_final_cell,256_avg_hidden,256_final_cell,avg_hidden,final_hidden,final_cell,...,all_avg_hidden,all_final_cell,RGN_avg_hidden,RGN_final_cell,simple_freq_plus,simple_freq_and_len,2grams,3grams,tfidf_2grams,tfidf_3grams
ag_ari,0.04486,0.429791,0.340007,0.218473,0.244331,0.51164,0.223316,0.628294,0.511814,0.221697,...,0.605307,0.238444,0.436609,0.204775,0.159567,0.451528,0.043336,0.035045,0.081889,0.299176
ag_fmi,0.189096,0.455848,0.353363,0.251591,0.25778,0.532204,0.305762,0.643243,0.543605,0.311505,...,0.61943,0.3223,0.461414,0.301568,0.176961,0.462657,0.186897,0.176453,0.172462,0.336265
ag_ami,0.182135,0.512596,0.415784,0.322424,0.345796,0.471641,0.378968,0.579816,0.540769,0.435509,...,0.569639,0.436607,0.519116,0.424276,0.270995,0.486758,0.170448,0.136768,0.347421,0.55746


In [8]:
# Rank every representation/baseline by Adjusted Rand Index on Oxbench, best first.
# Must run while `ox_results` still has the metric codes ("ag_ari", ...) as its index,
# i.e. before the later cell that transposes and renames it.
to_plot = ox_results.loc["ag_ari"]
print("Oxbench Performance, Adjusted Rand Index")
to_plot.sort_values(ascending=False)

Oxbench Performance, Adjusted Rand Index


avg_hidden              0.628294
all_avg_hidden          0.605307
arnold_original_3_7     0.599779
arnold_scrambled_3_5    0.528969
final_hidden            0.511814
256_avg_hidden          0.511640
simple_freq_and_len     0.451528
RGN_avg_hidden          0.436609
RGN                     0.429791
64_avg_hidden           0.340007
tfidf_3grams            0.299176
arnold_uniform_4_1      0.290734
64_final_cell           0.244331
all_64                  0.239477
all_final_cell          0.238444
all_256                 0.223316
256_final_cell          0.223316
all_1900                0.221976
final_cell              0.221697
64_final_hidden         0.218473
arnold_random_3_7       0.212805
RGN_final_cell          0.204775
simple_freq_plus        0.159567
tfidf_2grams            0.081889
sequence                0.044860
2grams                  0.043336
3grams                  0.035045
Name: ag_ari, dtype: float64

In [9]:
# Rank every representation/baseline by Fowlkes-Mallows index on Oxbench, best first.
# Must run while `ox_results` still has the metric codes ("ag_fmi", ...) as its index,
# i.e. before the later cell that transposes and renames it.
to_plot = ox_results.loc["ag_fmi"]
print("Oxbench Performance, Fowlkes Mallows Index")
to_plot.sort_values(ascending=False)

Oxbench Performance, Fowlkes Mallows Index


avg_hidden              0.643243
all_avg_hidden          0.619430
arnold_original_3_7     0.613676
arnold_scrambled_3_5    0.551390
final_hidden            0.543605
256_avg_hidden          0.532204
simple_freq_and_len     0.462657
RGN_avg_hidden          0.461414
RGN                     0.455848
64_avg_hidden           0.353363
tfidf_3grams            0.336265
all_final_cell          0.322300
arnold_uniform_4_1      0.321805
all_1900                0.311777
final_cell              0.311505
all_256                 0.305762
256_final_cell          0.305762
RGN_final_cell          0.301568
64_final_cell           0.257780
all_64                  0.253090
64_final_hidden         0.251591
arnold_random_3_7       0.231308
sequence                0.189096
2grams                  0.186897
simple_freq_plus        0.176961
3grams                  0.176453
tfidf_2grams            0.172462
Name: ag_fmi, dtype: float64

In [10]:
# Rank every representation/baseline by Adjusted Mutual Information on Oxbench, best first.
# Must run while `ox_results` still has the metric codes ("ag_ami", ...) as its index,
# i.e. before the later cell that transposes and renames it.
to_plot = ox_results.loc["ag_ami"]
print("Oxbench Performance, Adjusted Mutual-Information Score")
to_plot.sort_values(ascending=False)

Oxbench Performance, Adjusted Mutual-Information Score


avg_hidden              0.579816
arnold_original_3_7     0.575188
all_avg_hidden          0.569639
tfidf_3grams            0.557460
final_hidden            0.540769
arnold_scrambled_3_5    0.527545
RGN_avg_hidden          0.519116
RGN                     0.512596
simple_freq_and_len     0.486758
256_avg_hidden          0.471641
all_1900                0.437468
all_final_cell          0.436607
final_cell              0.435509
RGN_final_cell          0.424276
64_avg_hidden           0.415784
arnold_random_3_7       0.389157
all_256                 0.378968
256_final_cell          0.378968
tfidf_2grams            0.347421
64_final_cell           0.345796
all_64                  0.342001
arnold_uniform_4_1      0.338337
64_final_hidden         0.322424
simple_freq_plus        0.270995
sequence                0.182135
2grams                  0.170448
3grams                  0.136768
Name: ag_ami, dtype: float64

In [54]:
# Reshape to one row per representation and give the metrics human-readable names.
# The guard makes the cell idempotent: once the frame has been transposed its index
# no longer contains "ag_ari", so re-running the cell leaves it unchanged instead of
# transposing it back and failing on a column-count mismatch.
if "ag_ari" in ox_results.index:
    ox_results = ox_results.T.rename(
        columns={
            "ag_ari": "Adjusted Rand Index",
            "ag_fmi": "Fowlkes Mallows Index",
            "ag_ami": "Adjusted Mutual Information",
        }
    )

In [60]:
# Save results for later (uncomment the line below to overwrite the pre-packaged results file)
# ox_results.to_csv("../../data/oxbench_agglom_results.csv")

In [63]:
# Oxbench results table, ordered from best to worst Fowlkes-Mallows index.
display(ox_results.sort_values("Fowlkes Mallows Index", ascending=False))

Unnamed: 0,Adjusted Rand Index,Fowlkes Mallows Index,Adjusted Mutual Information
avg_hidden,0.628294,0.643243,0.579816
all_avg_hidden,0.605307,0.61943,0.569639
arnold_original_3_7,0.599779,0.613676,0.575188
arnold_scrambled_3_5,0.528969,0.55139,0.527545
final_hidden,0.511814,0.543605,0.540769
256_avg_hidden,0.51164,0.532204,0.471641
simple_freq_and_len,0.451528,0.462657,0.486758
RGN_avg_hidden,0.436609,0.461414,0.519116
RGN,0.429791,0.455848,0.512596
64_avg_hidden,0.340007,0.353363,0.415784


## Cluster Homstrad but include a hierarchical clustering figure with leaf labels

In [39]:
# Inspect the Homstrad frame first; its family names (`phenotype`) are mapped to
# unique integer class indices in the next cell.
display(homstrad)

Unnamed: 0,sequence,phenotype,is_train,is_test,phenotype_name,dataset,RGN,64_avg_hidden,64_final_hidden,64_final_cell,...,all_avg_hidden,all_final_cell,RGN_avg_hidden,RGN_final_cell,simple_freq_plus,simple_freq_and_len,2grams,3grams,tfidf_2grams,tfidf_3grams
20668,TKIVKVTGDYALLEFKDDLTGKGSICAETTAILKYLSEKGIKTHLV...,SAICAR_synt,True,True,family_name,homstrad,"[-0.025834012776613232, -0.05399760976433755, ...","[-0.11867085099220274, 0.11209627985954283, -0...","[-0.11023015528917313, 0.061099682003259666, -...","[-0.16718728840351105, 0.4113061130046845, -0....",...,"[-0.11867085099220274, 0.11209627985954283, -0...","[-0.16718728840351105, 0.4113061130046845, -0....","[-0.025834012776613232, -0.05399760976433755, ...","[-0.025834012776613232, -0.05399760976433755, ...","[0.05263157894736842, 0.014354066985645933, 0....","[0.05263157894736842, 0.014354066985645933, 0....","[1, 0, 0, 3, 0, 1, 0, 1, 0, 2, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.04026548332025606, 0.0, 0.0, 0.126771468660...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20669,SITKTELDGILPLVARGKVRDIYEVDAGTLLFVATDRISAYDVIME...,SAICAR_synt,True,True,family_name,homstrad,"[0.13893805444240567, -0.00019246361625846478,...","[-0.0845603421330452, 0.1088557317852974, -0.1...","[-0.048800613731145866, 0.09936266392469406, -...","[-0.08486960828304291, 0.5393546223640442, -0....",...,"[-0.0845603421330452, 0.1088557317852974, -0.1...","[-0.08486960828304291, 0.5393546223640442, -0....","[0.13893805444240567, -0.00019246361625846478,...","[0.13893805444240567, -0.00019246361625846478,...","[0.06040268456375839, 0.003355704697986577, 0....","[0.06040268456375839, 0.003355704697986577, 0....","[1, 0, 1, 2, 0, 1, 0, 0, 3, 0, 0, 1, 1, 1, 1, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.03242841044691856, 0.0, 0.03571512882338177...","[0.0, 0.0, 0.0, 0.04342385073884401, 0.0, 0.0,..."
20670,GVTVTSHREYLTQVNNSSGFVVNGGIVGNSLQLNPSNGTLFSWLPA...,bv,True,True,family_name,homstrad,"[-0.2107011675834656, 0.02743645198643208, -0....","[0.0143950954079628, 0.14230819046497345, -0.1...","[0.06821956485509872, 0.18811896443367004, -0....","[0.13774429261684418, 0.8134044408798218, -0.1...",...,"[0.0143950954079628, 0.14230819046497345, -0.1...","[0.13774429261684418, 0.8134044408798218, -0.1...","[-0.2107011675834656, 0.02743645198643208, -0....","[-0.2107011675834656, 0.02743645198643208, -0....","[0.08139534883720931, 0.011627906976744186, 0....","[0.08139534883720931, 0.011627906976744186, 0....","[0, 0, 2, 1, 0, 1, 0, 0, 0, 2, 1, 1, 1, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.10974000969089705, 0.052284840727...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20671,SSMDVTILSHCELSTELAVTVTIVVTSELVMPFTVGTWLRGVAQNW...,bv,True,True,family_name,homstrad,"[-0.1806732267141342, 0.039818041026592255, -0...","[0.011167618446052073, 0.1276816874742508, -0....","[0.102280355989933, 0.08621235936880113, -0.15...","[0.1770242303609848, 0.5491780638694763, -0.18...",...,"[0.011167618446052073, 0.1276816874742508, -0....","[0.1770242303609848, 0.5491780638694763, -0.18...","[-0.1806732267141342, 0.039818041026592255, -0...","[-0.1806732267141342, 0.039818041026592255, -0...","[0.09547738693467336, 0.020100502512562814, 0....","[0.09547738693467336, 0.020100502512562814, 0....","[2, 0, 1, 0, 0, 0, 0, 3, 0, 2, 1, 1, 0, 1, 1, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0.08210906485157748, 0.0, 0.04521553459337313...","[0.04794958851519534, 0.0, 0.0, 0.0, 0.0, 0.0,..."
20672,GAITVLHCELTAEIGVTDSIVVSSELVMPYTVGTWLRGVADNWSKY...,bv,True,True,family_name,homstrad,"[-0.1597319394350052, 0.06173299625515938, -0....","[0.0020451024174690247, 0.1371651589870453, -0...","[0.11017193645238876, 0.1414894163608551, -0.1...","[0.1897858828306198, 0.6649361252784729, -0.17...",...,"[0.0020451024174690247, 0.1371651589870453, -0...","[0.1897858828306198, 0.6649361252784729, -0.17...","[-0.1597319394350052, 0.06173299625515938, -0....","[-0.1597319394350052, 0.06173299625515938, -0....","[0.09183673469387756, 0.02040816326530612, 0.0...","[0.09183673469387756, 0.02040816326530612, 0.0...","[1, 0, 3, 1, 0, 3, 0, 2, 0, 2, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.04108234370949983, 0.0, 0.13573849390672096...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.048856888589355524..."
20673,AAATSLVYDTCYVTLTERATTSFQRQSFPTLKGMGDRAFQVVAFTI...,bv,True,True,family_name,homstrad,"[-0.053042154759168625, -0.06021036952733993, ...","[0.049401644617319114, 0.14267222583293915, -0...","[0.12890928983688354, 0.1764889657497406, -0.0...","[0.22785061597824094, 0.6632839441299438, -0.0...",...,"[0.049401644617319114, 0.14267222583293915, -0...","[0.22785061597824094, 0.6632839441299438, -0.0...","[-0.053042154759168625, -0.06021036952733993, ...","[-0.053042154759168625, -0.06021036952733993, ...","[0.10638297872340426, 0.014184397163120567, 0....","[0.10638297872340426, 0.014184397163120567, 0....","[3, 0, 0, 1, 2, 0, 0, 1, 0, 0, 0, 1, 3, 0, 1, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0.15765389725018375, 0.0, 0.0, 0.055150671786...","[0.05773102713212317, 0.0, 0.0, 0.0, 0.0, 0.0,..."
20674,SISQQTVWNQMATVRTPLNFDSSKQSFCQFSVDLLGGGISVDKTGD...,bv,True,True,family_name,homstrad,"[-0.1995099782943725, 0.05840854719281197, -0....","[0.014398585073649885, 0.09075652807950974, -0...","[0.07805189490318297, 0.061298828572034836, -0...","[0.1526552140712738, 0.4351421296596527, -0.15...",...,"[0.014398585073649885, 0.09075652807950974, -0...","[0.1526552140712738, 0.4351421296596527, -0.15...","[-0.1995099782943725, 0.05840854719281197, -0....","[-0.1995099782943725, 0.05840854719281197, -0....","[0.05405405405405406, 0.021621621621621623, 0....","[0.05405405405405406, 0.021621621621621623, 0....","[2, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.08568200526443001, 0.0746405865590282, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20675,KAIKAWTGYSVSKWTASCAAAEAKVTSAITISLPNELSSERNKQLK...,bv,True,True,family_name,homstrad,"[-0.06763646006584167, 0.10793761163949966, 0....","[-0.036410845816135406, 0.10667729377746582, -...","[0.018149545416235924, 0.20563025772571564, -0...","[0.047857824712991714, 0.8198820352554321, -0....",...,"[-0.036410845816135406, 0.10667729377746582, -...","[0.047857824712991714, 0.8198820352554321, -0....","[-0.06763646006584167, 0.10793761163949966, 0....","[-0.06763646006584167, 0.10793761163949966, 0....","[0.14093959731543623, 0.013422818791946308, 0....","[0.14093959731543623, 0.013422818791946308, 0....","[7, 0, 2, 1, 1, 0, 0, 2, 1, 2, 1, 0, 0, 0, 0, ...","[2, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[0.3652303532054341, 0.0, 0.1149278755014895, ...","[0.1163332517506968, 0.0, 0.06655393273580669,..."
20676,TMRAVKRMINTHLEHKRFALINSGNTNATAGTVQNLSNGIIQGDDI...,bv,True,True,family_name,homstrad,"[-0.1897951513528824, -0.028467625379562374, -...","[-0.0044975951313972464, 0.1004331111907959, -...","[0.01018504239618778, 0.1202806606888771, -0.1...","[0.016981407999992367, 0.625932514667511, -0.1...",...,"[-0.0044975951313972464, 0.1004331111907959, -...","[0.016981407999992367, 0.625932514667511, -0.1...","[-0.1897951513528824, -0.028467625379562374, -...","[-0.1897951513528824, -0.028467625379562374, -...","[0.07065217391304347, 0.005434782608695652, 0....","[0.07065217391304347, 0.005434782608695652, 0....","[1, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.043856354023303804, 0.0, 0.0, 0.0, 0.0, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20677,KPKLLYCSNGGYFLRILPDGTVDGTKDRSDQHIQLQLAAESIGEVY...,intb,True,True,family_name,homstrad,"[0.18745867908000946, -0.2971647083759308, -0....","[-0.049511555582284934, 0.10908841341733932, -...","[-0.07744679600000381, 0.06857240200042725, -0...","[-0.21872881054878235, 0.2799902558326721, -0....",...,"[-0.049511555582284934, 0.10908841341733932, -...","[-0.21872881054878235, 0.2799902558326721, -0....","[0.18745867908000946, -0.2971647083759308, -0....","[0.18745867908000946, -0.2971647083759308, -0....","[0.03937007874015748, 0.015748031496062992, 0....","[0.03937007874015748, 0.015748031496062992, 0....","[1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.05639104338020663, 0.0, 0.0, 0.118360686773...","[0.0, 0.0, 0.0, 0.06495775619934394, 0.0, 0.0,..."


In [40]:
# Discard any Homstrad row whose RGN representation contains a NaN.
filtered_homstrad = homstrad.loc[~np.isnan(np.array(homstrad["RGN"].tolist())).any(axis=1)]

# Shape after filtering, then before; equal shapes mean no row was dropped.
print(filtered_homstrad.shape)
print(homstrad.shape)

# Map each family name to an integer class index, then round-trip as a sanity check.
encoder = LabelEncoder().fit(filtered_homstrad["phenotype"])
labels = encoder.transform(filtered_homstrad["phenotype"])
print(labels)
print(encoder.inverse_transform(labels))

(3450, 32)
(3450, 32)
[566 566 726 ...,  76  76  76]
['SAICAR_synt' 'SAICAR_synt' 'bv' ..., 'Band_41_N' 'Band_41_N' 'Band_41_N']


In [41]:
# Score every Homstrad representation by clustering (the dendrogram-based
# agglomerative clustering comes later).
column_idxs = list(range(filtered_homstrad.shape[1]))
# Raw sequence (position 0) plus all representation columns (position 6 onward).
X = filtered_homstrad.iloc[:, [0] + column_idxs[6:]]
display(X)
display(labels)  # integer family ids from the LabelEncoder cell above
# One cluster per distinct family id.
hom_results = cluster(X, labels, n_clusters=np.unique(labels).size)

Unnamed: 0,sequence,RGN,64_avg_hidden,64_final_hidden,64_final_cell,256_avg_hidden,256_final_cell,avg_hidden,final_hidden,final_cell,...,all_avg_hidden,all_final_cell,RGN_avg_hidden,RGN_final_cell,simple_freq_plus,simple_freq_and_len,2grams,3grams,tfidf_2grams,tfidf_3grams
20668,TKIVKVTGDYALLEFKDDLTGKGSICAETTAILKYLSEKGIKTHLV...,"[-0.025834012776613232, -0.05399760976433755, ...","[-0.11867085099220274, 0.11209627985954283, -0...","[-0.11023015528917313, 0.061099682003259666, -...","[-0.16718728840351105, 0.4113061130046845, -0....","[-0.03689621016383171, 0.005488051101565361, 0...","[-0.3796911239624024, 0.3665743768215179, 0.75...","[0.0016311679501086473, -0.003366485703736544,...","[0.0008543662261217833, -0.02976409159600735, ...","[8.34224796295166, -1.7624926567077637, 22.235...",...,"[-0.11867085099220274, 0.11209627985954283, -0...","[-0.16718728840351105, 0.4113061130046845, -0....","[-0.025834012776613232, -0.05399760976433755, ...","[-0.025834012776613232, -0.05399760976433755, ...","[0.05263157894736842, 0.014354066985645933, 0....","[0.05263157894736842, 0.014354066985645933, 0....","[1, 0, 0, 3, 0, 1, 0, 1, 0, 2, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.04026548332025606, 0.0, 0.0, 0.126771468660...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20669,SITKTELDGILPLVARGKVRDIYEVDAGTLLFVATDRISAYDVIME...,"[0.13893805444240567, -0.00019246361625846478,...","[-0.0845603421330452, 0.1088557317852974, -0.1...","[-0.048800613731145866, 0.09936266392469406, -...","[-0.08486960828304291, 0.5393546223640442, -0....","[-0.0632651224732399, 0.0060843233950436115, 0...","[-0.62327641248703, 0.22831948101520536, 0.795...","[0.004859112203121185, -0.030498003587126725, ...","[0.007539444137364627, 0.11841168999671935, 0....","[2.0707240104675293, 0.7589344978332521, 17.26...",...,"[-0.0845603421330452, 0.1088557317852974, -0.1...","[-0.08486960828304291, 0.5393546223640442, -0....","[0.13893805444240567, -0.00019246361625846478,...","[0.13893805444240567, -0.00019246361625846478,...","[0.06040268456375839, 0.003355704697986577, 0....","[0.06040268456375839, 0.003355704697986577, 0....","[1, 0, 1, 2, 0, 1, 0, 0, 3, 0, 0, 1, 1, 1, 1, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.03242841044691856, 0.0, 0.03571512882338177...","[0.0, 0.0, 0.0, 0.04342385073884401, 0.0, 0.0,..."
20670,GVTVTSHREYLTQVNNSSGFVVNGGIVGNSLQLNPSNGTLFSWLPA...,"[-0.2107011675834656, 0.02743645198643208, -0....","[0.0143950954079628, 0.14230819046497345, -0.1...","[0.06821956485509872, 0.18811896443367004, -0....","[0.13774429261684418, 0.8134044408798218, -0.1...","[-0.010897865518927574, -0.0025827903300523762...","[0.03779356926679611, -0.25836318731307983, 0....","[0.006230797152966261, 0.07900305837392807, 0....","[0.002685799030587077, 0.11052990704774857, 0....","[11.949387550354006, 2.0751357078552246, 8.230...",...,"[0.0143950954079628, 0.14230819046497345, -0.1...","[0.13774429261684418, 0.8134044408798218, -0.1...","[-0.2107011675834656, 0.02743645198643208, -0....","[-0.2107011675834656, 0.02743645198643208, -0....","[0.08139534883720931, 0.011627906976744186, 0....","[0.08139534883720931, 0.011627906976744186, 0....","[0, 0, 2, 1, 0, 1, 0, 0, 0, 2, 1, 1, 1, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.10974000969089705, 0.052284840727...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20671,SSMDVTILSHCELSTELAVTVTIVVTSELVMPFTVGTWLRGVAQNW...,"[-0.1806732267141342, 0.039818041026592255, -0...","[0.011167618446052073, 0.1276816874742508, -0....","[0.102280355989933, 0.08621235936880113, -0.15...","[0.1770242303609848, 0.5491780638694763, -0.18...","[0.06157157942652702, -0.007944758981466293, 0...","[-0.3768338561058045, -0.2964747250080109, 0.6...","[0.004863474518060684, -0.037296663969755166, ...","[2.1357433070079423e-05, 0.00845964904874563, ...","[13.65040111541748, 1.1373584270477295, 9.2782...",...,"[0.011167618446052073, 0.1276816874742508, -0....","[0.1770242303609848, 0.5491780638694763, -0.18...","[-0.1806732267141342, 0.039818041026592255, -0...","[-0.1806732267141342, 0.039818041026592255, -0...","[0.09547738693467336, 0.020100502512562814, 0....","[0.09547738693467336, 0.020100502512562814, 0....","[2, 0, 1, 0, 0, 0, 0, 3, 0, 2, 1, 1, 0, 1, 1, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0.08210906485157748, 0.0, 0.04521553459337313...","[0.04794958851519534, 0.0, 0.0, 0.0, 0.0, 0.0,..."
20672,GAITVLHCELTAEIGVTDSIVVSSELVMPYTVGTWLRGVADNWSKY...,"[-0.1597319394350052, 0.06173299625515938, -0....","[0.0020451024174690247, 0.1371651589870453, -0...","[0.11017193645238876, 0.1414894163608551, -0.1...","[0.1897858828306198, 0.6649361252784729, -0.17...","[0.023780900985002518, -0.008387896232306956, ...","[-0.1416822373867035, -0.5437777042388916, 0.8...","[0.005245750304311514, -0.0275524128228426, 0....","[6.858454435132444e-05, -0.010838385671377182,...","[5.885132789611816, -0.10741066932678224, 8.72...",...,"[0.0020451024174690247, 0.1371651589870453, -0...","[0.1897858828306198, 0.6649361252784729, -0.17...","[-0.1597319394350052, 0.06173299625515938, -0....","[-0.1597319394350052, 0.06173299625515938, -0....","[0.09183673469387756, 0.02040816326530612, 0.0...","[0.09183673469387756, 0.02040816326530612, 0.0...","[1, 0, 3, 1, 0, 3, 0, 2, 0, 2, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.04108234370949983, 0.0, 0.13573849390672096...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.048856888589355524..."
20673,AAATSLVYDTCYVTLTERATTSFQRQSFPTLKGMGDRAFQVVAFTI...,"[-0.053042154759168625, -0.06021036952733993, ...","[0.049401644617319114, 0.14267222583293915, -0...","[0.12890928983688354, 0.1764889657497406, -0.0...","[0.22785061597824094, 0.6632839441299438, -0.0...","[-0.030133364722132683, -0.0032529854215681553...","[0.29939159750938416, -0.03477983921766281, 0....","[0.007402515970170499, 0.0006386897293850781, ...","[0.0009415321401320399, 0.07091871649026871, 0...","[9.869279861450195, 0.9700164794921876, 10.263...",...,"[0.049401644617319114, 0.14267222583293915, -0...","[0.22785061597824094, 0.6632839441299438, -0.0...","[-0.053042154759168625, -0.06021036952733993, ...","[-0.053042154759168625, -0.06021036952733993, ...","[0.10638297872340426, 0.014184397163120567, 0....","[0.10638297872340426, 0.014184397163120567, 0....","[3, 0, 0, 1, 2, 0, 0, 1, 0, 0, 0, 1, 3, 0, 1, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0.15765389725018375, 0.0, 0.0, 0.055150671786...","[0.05773102713212317, 0.0, 0.0, 0.0, 0.0, 0.0,..."
20674,SISQQTVWNQMATVRTPLNFDSSKQSFCQFSVDLLGGGISVDKTGD...,"[-0.1995099782943725, 0.05840854719281197, -0....","[0.014398585073649885, 0.09075652807950974, -0...","[0.07805189490318297, 0.061298828572034836, -0...","[0.1526552140712738, 0.4351421296596527, -0.15...","[-0.008131944574415684, 0.002119104377925396, ...","[0.013085480779409409, 0.012237253598868849, 0...","[0.005622528493404388, -0.008238513953983784, ...","[0.00010069691052194683, -0.000950562360230833...","[14.609713554382324, -0.03701789304614067, 8.7...",...,"[0.014398585073649885, 0.09075652807950974, -0...","[0.1526552140712738, 0.4351421296596527, -0.15...","[-0.1995099782943725, 0.05840854719281197, -0....","[-0.1995099782943725, 0.05840854719281197, -0....","[0.05405405405405406, 0.021621621621621623, 0....","[0.05405405405405406, 0.021621621621621623, 0....","[2, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.08568200526443001, 0.0746405865590282, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20675,KAIKAWTGYSVSKWTASCAAAEAKVTSAITISLPNELSSERNKQLK...,"[-0.06763646006584167, 0.10793761163949966, 0....","[-0.036410845816135406, 0.10667729377746582, -...","[0.018149545416235924, 0.20563025772571564, -0...","[0.047857824712991714, 0.8198820352554321, -0....","[-0.025972777977585796, -0.005790079943835735,...","[-0.4857699275016785, 0.046344801783561713, 0....","[0.007855103351175785, -0.009758483618497849, ...","[0.000491130689624697, -0.020186007022857663, ...","[9.214288711547852, -0.4732394814491272, 4.776...",...,"[-0.036410845816135406, 0.10667729377746582, -...","[0.047857824712991714, 0.8198820352554321, -0....","[-0.06763646006584167, 0.10793761163949966, 0....","[-0.06763646006584167, 0.10793761163949966, 0....","[0.14093959731543623, 0.013422818791946308, 0....","[0.14093959731543623, 0.013422818791946308, 0....","[7, 0, 2, 1, 1, 0, 0, 2, 1, 2, 1, 0, 0, 0, 0, ...","[2, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[0.3652303532054341, 0.0, 0.1149278755014895, ...","[0.1163332517506968, 0.0, 0.06655393273580669,..."
20676,TMRAVKRMINTHLEHKRFALINSGNTNATAGTVQNLSNGIIQGDDI...,"[-0.1897951513528824, -0.028467625379562374, -...","[-0.0044975951313972464, 0.1004331111907959, -...","[0.01018504239618778, 0.1202806606888771, -0.1...","[0.016981407999992367, 0.625932514667511, -0.1...","[-0.08871971070766449, 0.0008602088200859724, ...","[-0.22457635402679446, -0.03100283071398735, 0...","[0.006967336870729922, -0.04187467321753502, 0...","[0.0009202194050885736, 0.017549389973282814, ...","[11.948099136352539, 0.1619352102279663, 6.552...",...,"[-0.0044975951313972464, 0.1004331111907959, -...","[0.016981407999992367, 0.625932514667511, -0.1...","[-0.1897951513528824, -0.028467625379562374, -...","[-0.1897951513528824, -0.028467625379562374, -...","[0.07065217391304347, 0.005434782608695652, 0....","[0.07065217391304347, 0.005434782608695652, 0....","[1, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.043856354023303804, 0.0, 0.0, 0.0, 0.0, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20677,KPKLLYCSNGGYFLRILPDGTVDGTKDRSDQHIQLQLAAESIGEVY...,"[0.18745867908000946, -0.2971647083759308, -0....","[-0.049511555582284934, 0.10908841341733932, -...","[-0.07744679600000381, 0.06857240200042725, -0...","[-0.21872881054878235, 0.2799902558326721, -0....","[-0.002752342959865928, 0.004171363078057766, ...","[-0.4120322167873383, 0.19324880838394165, 0.4...","[0.005846815183758736, 0.028275568038225174, 0...","[0.00012055341358063745, -0.025633059442043304...","[5.7194252014160165, -0.6270264983177185, 12.9...",...,"[-0.049511555582284934, 0.10908841341733932, -...","[-0.21872881054878235, 0.2799902558326721, -0....","[0.18745867908000946, -0.2971647083759308, -0....","[0.18745867908000946, -0.2971647083759308, -0....","[0.03937007874015748, 0.015748031496062992, 0....","[0.03937007874015748, 0.015748031496062992, 0....","[1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.05639104338020663, 0.0, 0.0, 0.118360686773...","[0.0, 0.0, 0.0, 0.06495775619934394, 0.0, 0.0,..."


array([566, 566, 726, ...,  76,  76,  76])

In [42]:
# Show the clustering scores: one column per sequence representation,
# one row per metric (ag_ari, ag_fmi, ag_ami).
hom_results

Unnamed: 0,sequence,RGN,64_avg_hidden,64_final_hidden,64_final_cell,256_avg_hidden,256_final_cell,avg_hidden,final_hidden,final_cell,...,all_avg_hidden,all_final_cell,RGN_avg_hidden,RGN_final_cell,simple_freq_plus,simple_freq_and_len,2grams,3grams,tfidf_2grams,tfidf_3grams
ag_ari,0.003666,0.254722,0.105519,0.053002,0.059149,0.120556,0.044765,0.289274,0.073021,0.044791,...,0.264451,0.03709,0.258734,0.068085,0.025595,0.148812,0.002919,0.002239,0.008546,0.115079
ag_fmi,0.050649,0.31096,0.111209,0.066489,0.063566,0.131132,0.111343,0.318563,0.121485,0.126025,...,0.294128,0.112537,0.314435,0.157624,0.027115,0.172625,0.046382,0.04413,0.056543,0.180524
ag_ami,0.109055,0.461505,0.199143,0.128225,0.121975,0.226122,0.242811,0.438337,0.293113,0.30672,...,0.437373,0.28075,0.463626,0.344164,0.052753,0.290168,0.085823,0.06197,0.199435,0.461597


In [43]:
# Rank every sequence representation by its Adjusted Rand Index on Homstrad.
# The "ag_ari" row holds one score per representation (one per column).
to_plot = hom_results.loc["ag_ari", :]
print("Homstrad Performance, Adjusted Rand Index")
# Last expression of the cell, so the sorted Series is what gets displayed.
to_plot.sort_values(ascending=False)

Homstrad Performance, Adjusted Rand Index


avg_hidden              0.289274
all_avg_hidden          0.264451
RGN_avg_hidden          0.258734
RGN                     0.254722
arnold_original_3_7     0.155078
simple_freq_and_len     0.148812
256_avg_hidden          0.120556
tfidf_3grams            0.115079
arnold_scrambled_3_5    0.108750
64_avg_hidden           0.105519
final_hidden            0.073021
RGN_final_cell          0.068085
all_64                  0.060362
64_final_cell           0.059149
arnold_uniform_4_1      0.056007
64_final_hidden         0.053002
all_1900                0.050733
final_cell              0.044791
all_256                 0.044765
256_final_cell          0.044765
arnold_random_3_7       0.041358
all_final_cell          0.037090
simple_freq_plus        0.025595
tfidf_2grams            0.008546
sequence                0.003666
2grams                  0.002919
3grams                  0.002239
Name: ag_ari, dtype: float64

In [44]:
# Rank every sequence representation by its Fowlkes Mallows Index on Homstrad.
# The "ag_fmi" row holds one score per representation (one per column).
to_plot = hom_results.loc["ag_fmi", :]
print("Homstrad Performance, Fowlkes Mallows Index")
# Last expression of the cell, so the sorted Series is what gets displayed.
to_plot.sort_values(ascending=False)

Homstrad Performance, Fowlkes Mallows Index


avg_hidden              0.318563
RGN_avg_hidden          0.314435
RGN                     0.310960
all_avg_hidden          0.294128
arnold_original_3_7     0.197837
tfidf_3grams            0.180524
simple_freq_and_len     0.172625
RGN_final_cell          0.157624
arnold_scrambled_3_5    0.149252
all_1900                0.133753
256_avg_hidden          0.131132
final_cell              0.126025
final_hidden            0.121485
all_final_cell          0.112537
all_256                 0.111343
256_final_cell          0.111343
64_avg_hidden           0.111209
64_final_hidden         0.066489
all_64                  0.064168
64_final_cell           0.063566
arnold_random_3_7       0.063450
arnold_uniform_4_1      0.060210
tfidf_2grams            0.056543
sequence                0.050649
2grams                  0.046382
3grams                  0.044130
simple_freq_plus        0.027115
Name: ag_fmi, dtype: float64

In [45]:
# Rank every sequence representation by its Adjusted Mutual Information score
# on Homstrad. The "ag_ami" row holds one score per representation.
to_plot = hom_results.loc["ag_ami", :]
print("Homstrad Performance, Adjusted Mutual Information Score")
# Last expression of the cell, so the sorted Series is what gets displayed.
to_plot.sort_values(ascending=False)

Homstrad Performance, Adjusted Mutual Information Score


RGN_avg_hidden          0.463626
tfidf_3grams            0.461597
RGN                     0.461505
avg_hidden              0.438337
all_avg_hidden          0.437373
arnold_original_3_7     0.354528
RGN_final_cell          0.344164
all_1900                0.312909
arnold_scrambled_3_5    0.309349
final_cell              0.306720
final_hidden            0.293113
simple_freq_and_len     0.290168
all_final_cell          0.280750
all_256                 0.242811
256_final_cell          0.242811
256_avg_hidden          0.226122
tfidf_2grams            0.199435
64_avg_hidden           0.199143
arnold_random_3_7       0.178645
64_final_hidden         0.128225
64_final_cell           0.121975
all_64                  0.121943
sequence                0.109055
arnold_uniform_4_1      0.088395
2grams                  0.085823
3grams                  0.061970
simple_freq_plus        0.052753
Name: ag_ami, dtype: float64

In [52]:
# Reshape to one row per representation and give the metric columns readable
# names. The labels are attached by metric key rather than by position, so they
# cannot be mismatched if the row order ever changes.
METRIC_LABELS = {
    "ag_ari": "Adjusted Rand Index",
    "ag_fmi": "Fowlkes Mallows Index",
    "ag_ami": "Adjusted Mutual Information",
}
# Guard makes the cell safe to re-run: once transposed, the metric keys are no
# longer in the index, so a second execution leaves the frame untouched instead
# of transposing it back and failing on the column relabelling.
if set(METRIC_LABELS).issubset(hom_results.index):
    hom_results = hom_results.T.rename(columns=METRIC_LABELS)

In [59]:
# Save results for later. Deliberately left commented out: per the notebook's
# introduction, results are not saved unless this line is uncommented, since
# saving overwrites the pre-packaged result.
# hom_results.to_csv("../../data/homstrad_agglom_results.csv")

In [64]:
# Show all three metrics per representation, highest Fowlkes Mallows Index first.
hom_results.sort_values(
    "Fowlkes Mallows Index",
    ascending=False,
)

Unnamed: 0,Adjusted Rand Index,Fowlkes Mallows Index,Adjusted Mutual Information
avg_hidden,0.289274,0.318563,0.438337
RGN_avg_hidden,0.258734,0.314435,0.463626
RGN,0.254722,0.31096,0.461505
all_avg_hidden,0.264451,0.294128,0.437373
arnold_original_3_7,0.155078,0.197837,0.354528
tfidf_3grams,0.115079,0.180524,0.461597
simple_freq_and_len,0.148812,0.172625,0.290168
RGN_final_cell,0.068085,0.157624,0.344164
arnold_scrambled_3_5,0.10875,0.149252,0.309349
all_1900,0.050733,0.133753,0.312909
