In [3]:
# 1. load relevant packages

from collections import defaultdict
import pandas as pd
import json
import gzip
import os
import numpy
from time import time

In [4]:
model_names_list = [
    fp
    for fp in os.listdir(".")
    if fp.endswith(".json.gz")
]
model_names_list=model_names_list
print("There are {:,} models in this folder".format(len(model_names_list)))

There are 200 models in this folder


In [5]:
# !ls

In [6]:
#  model_names_list = ["groupA.50.0.1.CAGs.json.gz", "groupA.50.0.2.CAGs.json.gz",
#                     "groupA.50.0.3.CAGs.json.gz","groupA.50.0.4.CAGs.json.gz",
#                     "groupA.60.0.1.CAGs.json.gz","groupA.60.0.2.CAGs.json.gz","groupJ.90.0.4.CAGs.json.gz"]

In [7]:
# 2. read in file and convert into sets of CAGs

def read_json_gz(fp):
    return json.load(gzip.open(fp, "rt"))

def make_list_genes(fp, min_group_size=2):
    model = read_json_gz(fp)

    list_of_sets = []
    for list_of_genes in model.values():
        if len(list_of_genes) >= min_group_size:
            list_of_sets.append(set(list_of_genes))

    return list_of_sets

def make_indexed_list_of_genes(fp, min_group_size=2):
    model = read_json_gz(fp)

    # Store as a dictionary. Keys are gene names, values are sets of all genes in the CAG
    indexed_list_of_sets = {}

    for list_of_genes in model.values():
        if len(list_of_genes) >= min_group_size:
        
            for gene_name in list_of_genes:
             
                indexed_list_of_sets[gene_name] = set(list_of_genes)

    return indexed_list_of_sets




In [10]:
# 4. function to compare set A to set B

def one_to_one(set_A,set_B):
    
    if set_B is None:
        return 0

    ov = set(set_A) & set(set_B)
    length=len(ov)
#     print("This is new one_to_one")
    return length / len(set_A)

# 5. function to compare set A to list of set B, returns max # of gene overlap

def one_to_many(set_A, indexed_list_of_sets_B):
    max_score=0
    
    for gene_name in set_A:
        score=one_to_one(set_A,indexed_list_of_sets_B.get(gene_name))

        if score>max_score:
            max_score=score

    return max_score * len(set_A)

# 6. function to compare list of sets A to list of sets B

def many_to_many(index_of_sets_A, index_of_sets_B):
    
    scores=[]
    total_number_of_genes = []
    
    # Keep track of what genes have been already queried
    already_queried_genes = set([])
    
    # Iterate over the list of genes in each CAG
    for gene_list in index_of_sets_A.values():
        if list(gene_list)[0] in already_queried_genes:
            continue

        scores.append(one_to_many(gene_list, index_of_sets_B))

        total_number_of_genes.append(len(gene_list))

        already_queried_genes |= gene_list

    return sum(scores)/sum(total_number_of_genes)

# 7. function to find number of genes in model 1
def number_genes(model1):
    total_genes=0
    for cag in model1.values():
        total_genes += len(cag)

    return total_genes

many_to_many(
    make_indexed_list_of_genes("groupA.50.0.1.CAGs.json.gz"),
    make_indexed_list_of_genes("groupA.50.0.2.CAGs.json.gz")
)

0.947529077416978

In [None]:
# 8. function to create dataframe

def comparison_df(fp_list):
    comparison_dataframe = []
    
    for fp_A in fp_list:
        
        modelA_name = fp_A.replace(".CAGs.json.gz", "")
        
        for fp_B in fp_list:
            
            start_time = time()

            modelB_name = fp_B.replace(".CAGs.json.gz", "")

            if modelA_name == modelB_name:
                continue
                
            cache_fp = "{}.{}.cache.json".format(modelA_name, modelB_name)
            
            if os.path.exists(cache_fp):
                print("Retriving results for {} and {} from the cache".format(modelA_name, modelB_name))
                res = json.load(open(cache_fp, "rt"))
                
            else:
                modelA = make_indexed_list_of_genes(fp_A)
                modelB = make_indexed_list_of_genes(fp_B)

                res = {
                    "Model A": modelA_name,
                    "Model B": modelB_name,
                    "Score": many_to_many(modelA,modelB),
                    "Number of CAGs in Model A":(len(modelA)),
                    "Number of Genes in Model A": number_genes(modelA)                    
                }
                print("Writing results for {} and {} to the cache".format(modelA_name, modelB_name))

                json.dump(res, open(cache_fp, "wt"))

            comparison_dataframe.append(res)
            print("It took {:,} seconds to compare {} to {}".format(
                round(time() - start_time, 2),
                modelA_name,
                modelB_name
            ))
        
            
    comparison_dataframe = pd.DataFrame(comparison_dataframe)
    return comparison_dataframe

df = comparison_df(model_names_list[:3])

Retriving results for groupB.70.0.2 and groupC.60.0.3 from the cache
It took 0.0 seconds to compare groupB.70.0.2 to groupC.60.0.3


In [None]:
# 10. creating dataframe of models
df = comparison_df(models)

In [None]:
df