In [93]:
from sourmash import signature
import glob
import os
from collections import Counter
import pandas as pd

In [30]:
# glob returns paths in filesystem-dependent order; sorting keeps the genome
# order (and everything derived from it downstream) stable across runs
files = sorted(glob.glob("gather_genome_129_sigs/*"))

In [31]:
# read in 129 genome sig minhashes as a dictionary
# keys are the signature file paths, values are the hash values of one sketch
genome_dict = {}
for file in files:
    # zero-byte signature files are skipped instead of being parsed
    if os.path.getsize(file) > 0:
        # parse inside the context manager so the handle is closed per file;
        # list() forces the loader to finish reading before the file closes
        with open(file, 'rt') as sigfp:
            siglist = list(signature.load_signatures(sigfp))
        # assumes index 1 is the k31 sketch (file holds k21, k31, k51) — TODO confirm
        loaded_sig = siglist[1]
        mins = loaded_sig.minhash.get_mins() # Get the minhashes
        genome_dict[file] = mins

In [46]:
# read in vita variables
# parse inside the context manager so the file handle is closed afterwards
with open("vita_vars.sig", 'rt') as sigfp:
    siglist = list(signature.load_signatures(sigfp))
# take the first signature in the file and keep only its hash values
vita_vars = siglist[0].minhash.get_mins()

In [55]:
# number of hash values loaded from vita_vars.sig
len(vita_vars)

14480

In [58]:
# generate a list of all minhashes
# genome_dict already holds the hash values of every non-empty signature file,
# in file order, so flatten it instead of opening and parsing each file again
all_mins = [hash_value for mins in genome_dict.values() for hash_value in mins]

In [60]:
# number of distinct hashes found both in vita and
# in the genomes
len(set(vita_vars) & set(all_mins))

9289

In [61]:
# number of genome signature files that were loaded
len(genome_dict)

128

In [69]:
# show the vita hashes that also occur in the genomes
shared_hashes = set(vita_vars) & set(all_mins)
list(shared_hashes)

[3316713268445188,
 6734077454581764,
 6082520873828358,
 7024875571216392,
 4444753960730636,
 5525448790278156,
 6940234475274260,
 6632317837049878,
 7548582939918359,
 8019402136846362,
 2405315581575195,
 139712954892316,
 7252673601863714,
 6289379367059494,
 1313724492513328,
 7388450051555385,
 4032473246564410,
 2686312140800064,
 5690608482320458,
 53670885195851,
 2711787525079128,
 6216862491181151,
 3783942704496740,
 3741157294014571,
 2035332620812399,
 516166688735346,
 539971830579315,
 555103634948210,
 8289102343831667,
 7926211603366017,
 113839954854019,
 8318296508661891,
 3658623152357509,
 4412549910986891,
 7979817877045396,
 4076971827429526,
 3129734693847191,
 3787146443260056,
 2855822125564061,
 8638117119426720,
 5288516795760810,
 1812844153143468,
 7426845867278512,
 1584355457040564,
 4701839541174457,
 5660339421085883,
 2032250414858428,
 252876727255234,
 8554338927313097,
 8947025062363338,
 6769785775718605,
 8669002711335122,
 429061405442259,
 6

In [79]:
# spot-check: does any genome's hash list contain this hash?
# any() stops at the first match, like the loop-and-break form
if any(3316713268445188 in genome_mins for genome_mins in genome_dict.values()):
    print("value in dictionary")


value in dictionary


In [87]:
def get_all_keys_if_value(dictionary, hash_query):
    """Return the keys of `dictionary` whose value contains `hash_query`.

    Each value is tested with the `in` operator. Keys are returned as a new
    list in the dictionary's iteration order; the list is empty when no value
    contains the query.
    """
    return [key for key, members in dictionary.items() if hash_query in members]

In [88]:
# list the genome signature files whose hash list contains this hash
get_all_keys_if_value(genome_dict, 3316713268445188)

['gather_genome_129_sigs/GCA_000153905.1_ASM15390v1_genomic.fna.sig']

In [91]:
# create a dictionary where each vita_vars hash is a key,
# and values are the genome signatures in which that hash
# is contained
#
# The hash -> genomes lookup is built in a single pass over the genomes,
# rather than scanning every genome's hash list once per vita hash.
vita_set = set(vita_vars)
genomes_by_hash = {}
for genome, mins in genome_dict.items():
    # the set intersection keeps only vita hashes and records each genome at
    # most once per hash, even if a hash list repeats a value
    for hashy in vita_set.intersection(mins):
        genomes_by_hash.setdefault(hashy, []).append(genome)

# every vita hash gets an entry: genomes are listed in genome_dict order, and
# a hash found in no genome maps to an empty list
vita_hash_dict = {hashy: genomes_by_hash.get(hashy, []) for hashy in vita_vars}

In [94]:
# two-column frame: one row per vita hash, with its list of genomes in one cell
hash_pairs = list(vita_hash_dict.items())
hash_df = pd.DataFrame(hash_pairs, columns=['hash', 'genome'])

In [120]:
# wide layout: one row per vita hash, one positional column per genome hit;
# reset_index() then moves the hashes out of the index into an 'index' column
genome_lists = list(vita_hash_dict.values())
df = pd.DataFrame(genome_lists, index = vita_hash_dict.keys()).reset_index()

In [121]:
# wide -> long: one (hash, genome) row per cell of the wide frame; the former
# column labels land in a throwaway 'drop' column
df = df.melt(id_vars=['index'], var_name="drop", value_name='genome')

In [122]:
# remove the throwaway column left over from melt; name the axis via the
# `columns` keyword because newer pandas rejects a positional axis argument
df = df.drop(columns='drop')

In [123]:
# display the long-format hash/genome table
df

Unnamed: 0,index,genome
0,461695923869,gather_genome_129_sigs/ERS473343_4.fna.sig
1,1281144948155,gather_genome_129_sigs/SRS475589_63.fna.sig
2,1907985797945,gather_genome_129_sigs/SRS142599_2.fna.sig
3,3480483852161,gather_genome_129_sigs/GCA_900036035.1_RGNV359...
4,4233332743542,gather_genome_129_sigs/KarlssonFH_2013__S434__...
...,...,...
434395,9219848034543360,
434396,9221521875269736,
434397,9221590808054388,
434398,9221638331043008,


In [125]:
# collapse repeated (hash, genome) rows
# NOTE(review): the frame displayed above contains rows with a blank genome;
# if those are padding from the wide layout, one blank row per hash survives
# this step — confirm that is wanted in the exported mapping
df = df.drop_duplicates()

In [128]:
# write the hash/genome table to CSV without the row index
df.to_csv("vita_hash_to_129_genome_mapping.csv", index = False)