In [1]:
%load_ext autoreload
%autoreload 2

import pickle
import ujson
import sys
import os

import pandas as pd
import numpy as np

from tqdm import tqdm
from collections import defaultdict

from bigbio.dataloader import BigBioConfigHelpers
from bigbio.utils.constants import Tasks


sys.path.append('..')
from bigbio_utils import CUIS_TO_REMAP, CUIS_TO_EXCLUDE, DATASET_NAMES
from bigbio_utils import dataset_to_documents, dataset_to_df, resolve_abbreviation

conhelps = BigBioConfigHelpers()

pd.set_option('display.max_rows', 200)


In [6]:
def get_overlap(df, dataset_name=None, plot=False):
    """
    Summarise how much of the test split is already covered by the train split.

    Two overlap figures are reported, because an entity can be mentioned many times:
    the fraction of unique test entities that also occur in train, and the fraction
    of test mentions whose entity occurs in train.

    Parameters
    ----------
    df : pd.DataFrame
        One row per mention. The columns read here are ``split``, ``db_id``,
        ``type`` and ``pmid`` (plus ``mention_id`` when ``plot=True``).
    dataset_name : str, optional
        Used only as the plot title when ``plot=True``.
    plot : bool, default False
        If True, draw the distribution of test mentions per entity, coloured by
        whether the entity occurs in train.
        NOTE(review): this branch uses the globals ``sns`` and ``plt``; the import
        cell shown at the top of this notebook does not import seaborn or
        matplotlib, so they must be imported before calling with ``plot=True``.

    Returns
    -------
    dict
        Overlap ratios plus document/mention counts. ``ent_overlap`` and
        ``mention_overlap`` are NaN when ``df`` has no rows with ``split == "test"``.
    """
    # Split-level views and the set of entity ids seen in each of them
    train_df = df.query('split == "train"')
    test_df = df.query('split == "test"')
    unique_train = set(train_df["db_id"].unique())
    unique_test = set(test_df["db_id"].unique())

    # Get distribution of entities
    if plot:
        # .assign() builds a new frame, so the query() result is never written to
        # (the original column assignment on the slice triggers SettingWithCopyWarning).
        curie_counts = (
            test_df.assign(in_train=test_df["db_id"].isin(unique_train))
            .groupby("db_id")
            .agg({"mention_id": "count", "in_train": "first"})
            .rename({"mention_id": "cui_num_test_mentions"}, axis=1)
        )
        sns.displot(
            data=curie_counts, x="cui_num_test_mentions", hue="in_train", log_scale=True
        )
        if dataset_name is not None:
            plt.title(dataset_name)
        plt.show()

    # Entity-level overlap; guarded so a frame without test rows yields NaN
    # (matching what the empty-Series mean below returns) instead of ZeroDivisionError.
    if unique_test:
        train_test_ent_overlap = len(unique_test & unique_train) / len(unique_test)
    else:
        train_test_ent_overlap = float("nan")

    # Mention-level overlap: share of test rows whose entity id occurs in train
    mention_overlap = test_df["db_id"].isin(unique_train).mean()

    return {
        "unique_ents": len(df["db_id"].unique()),
        "ent_overlap": train_test_ent_overlap,
        "mention_overlap": mention_overlap,
        'unique_types': len(df["type"].unique()),
        "total_documents": len(df["pmid"].unique()),
        "train_documents": len(train_df["pmid"].unique()),
        "test_documents": len(test_df["pmid"].unique()),
        "total_mentions": df.shape[0],
        "train_mentions": train_df.shape[0],
        "test_mentions": test_df.shape[0],
        "has_validation_set": ("validation" in df["split"].unique()),
    }


def dataset_to_df(dataset):
    '''
    Flatten a BigBio dataset into a pandas DataFrame with one row per linked entity.

    `dataset` maps split names to iterables of documents. Each document supplies
    "document_id" and "entities"; each entity supplies "id", "text" (a list of
    strings, joined with spaces), "type" and "normalized". Entities whose
    "normalized" list is empty are skipped, and only the first normalization of
    each remaining entity is used.

    Note: this definition replaces the `dataset_to_df` imported from
    `bigbio_utils` in the notebook's first cell.
    '''
    header = ["pmid", "mention_id", "text", "type", "db_name", "db_id", "split"]

    def _rows():
        # Walk splits -> documents -> entities in their original order
        for split_name in dataset.keys():
            for document in dataset[split_name]:
                document_id = document["document_id"]
                for entity in document["entities"]:
                    links = entity["normalized"]
                    if len(links) > 0:
                        first_link = links[0]
                        yield [
                            document_id,
                            entity["id"],
                            " ".join(entity["text"]),
                            entity["type"],
                            first_link["db_name"],
                            first_link["db_id"],
                            split_name,
                        ]

    return pd.DataFrame(list(_rows()), columns=header)



# Basic tests to see if functions work
# bc5cdr_df = dataset_to_df(bc5cdr_bigbio)
# get_overlap(bc5cdr_df)


In [9]:
# Config-name prefixes of the datasets to summarise; each is loaded through
# its "<name>_bigbio_kb" config.
overlap_dataset_names = [
    'medmentions_full',
    'medmentions_st21pv',
    'bc5cdr',
    'gnormplus',
    'ncbi_disease',
    'nlmchem',
    'nlm_gene',
]

all_stats = []
for dataset in tqdm(overlap_dataset_names):
    data = conhelps.for_config_name(f"{dataset}_bigbio_kb").load_dataset()
    df = dataset_to_df(data)
    # Tag the statistics with the dataset they came from (key goes last)
    stats = {**get_overlap(df, dataset), 'dataset': dataset}
    all_stats.append(stats)

  0%|          | 0/7 [00:00<?, ?it/s]Reusing dataset med_mentions_dataset (/Users/david/.cache/huggingface/datasets/med_mentions_dataset/medmentions_full_bigbio_kb/1.0.0/b5c8691186d4701f9b18eddbe36d178ccf7e55761dcc6140c57f4410754511ac)
100%|██████████| 3/3 [00:00<00:00, 389.67it/s]
 14%|█▍        | 1/7 [00:06<00:40,  6.77s/it]Reusing dataset med_mentions_dataset (/Users/david/.cache/huggingface/datasets/med_mentions_dataset/medmentions_st21pv_bigbio_kb/1.0.0/b5c8691186d4701f9b18eddbe36d178ccf7e55761dcc6140c57f4410754511ac)
100%|██████████| 3/3 [00:00<00:00, 286.54it/s]
 29%|██▊       | 2/7 [00:10<00:25,  5.07s/it]Reusing dataset bc5cdr_dataset (/Users/david/.cache/huggingface/datasets/bc5cdr_dataset/bc5cdr_bigbio_kb/1.0.0/f01f16ea9b65ead985bedadf7335195c32297c8f1b09417fc607b102a6757d6f)
100%|██████████| 3/3 [00:00<00:00, 394.37it/s]
 43%|████▎     | 3/7 [00:11<00:12,  3.20s/it]Reusing dataset gnormplus_dataset (/Users/david/.cache/huggingface/datasets/gnormplus_dataset/gnormplus_bigbio

In [10]:
all_stats

[{'unique_ents': 34724,
  'ent_overlap': 0.6199371930107094,
  'mention_overlap': 0.8221204512041217,
  'unique_types': 127,
  'total_documents': 4392,
  'train_documents': 2635,
  'test_documents': 879,
  'total_mentions': 385098,
  'train_mentions': 230591,
  'test_mentions': 76861,
  'has_validation_set': True,
  'dataset': 'medmentions_full'},
 {'unique_ents': 25419,
  'ent_overlap': 0.5754995861416577,
  'mention_overlap': 0.7740867096645666,
  'unique_types': 21,
  'total_documents': 4392,
  'train_documents': 2635,
  'test_documents': 879,
  'total_mentions': 203282,
  'train_mentions': 122241,
  'test_mentions': 40157,
  'has_validation_set': True,
  'dataset': 'medmentions_st21pv'},
 {'unique_ents': 2348,
  'ent_overlap': 0.5300380228136882,
  'mention_overlap': 0.7733048693707432,
  'unique_types': 2,
  'total_documents': 1500,
  'train_documents': 500,
  'test_documents': 500,
  'total_mentions': 29044,
  'train_mentions': 9494,
  'test_mentions': 9837,
  'has_validation_set

In [11]:
# Lookup tables keyed by the short dataset name used in `all_stats`.

# Kind of source text each dataset is drawn from
dataset_to_doc_type = {
    "medmentions_full": "PubMed Abstracts",
    "medmentions_st21pv": "PubMed Abstracts",
    "bc5cdr": "PubMed Abstracts",
    "gnormplus": "PubMed Abstracts",
    "ncbi_disease": "PubMed Abstracts",
    "nlmchem": "PMC Full-Text",
    "craft": "PMC Full-Text",
    "bc6id": "PMC Figure Captions",
    "bc3gm": "PMC Full-Text",
    "plantnorm": "PubMed Abstracts",
    "nlm_gene": "PMC Full-Text",
}

# Ontology / knowledge base the mentions are linked to
dataset_to_ontology = {
    "medmentions_full": "UMLS",
    "medmentions_st21pv": "UMLS",
    "bc5cdr": "MeSH",
    "gnormplus": "Entrez",
    "ncbi_disease": "MeSH, OMIM",
    "nlmchem": "MeSH",
    "craft": "Many",
    "bc6id": "Many",
    "bc3gm": "Entrez",
    "plantnorm": "NCBI Taxonomy",
    "nlm_gene": "Entrez",
}

# Display name used in the rendered tables
dataset_to_pretty_name = {
    "medmentions_full": "MedMentions Full",
    "medmentions_st21pv": "MedMentions ST21PV",
    "bc5cdr": "BC5CDR",
    "gnormplus": "GNormPlus",
    "ncbi_disease": "NCBI Disease",
    "nlmchem": "NLM Chem",
    "craft": "CRAFT",
    "bc6id": "BC6ID",
    "bc3gm": "BC3GM",
    "plantnorm": "PlantNorm",
    "nlm_gene": "NLM Gene",
}

# Markdown link (author, year -> URL) appended to the dataset name in the markdown table
# NOTE(review): "bc6id" and "bc3gm" point at the same URL; one of the two links
# is presumably a copy-paste leftover -- confirm the intended target.
dataset_to_paper_url = {
    "medmentions_full": "[(Mohan and Li, 2019)](https://github.com/chanzuckerberg/MedMentions)",
    "medmentions_st21pv": "[(Mohan and Li, 2019)](https://github.com/chanzuckerberg/MedMentions)",
    "bc5cdr": "[(Li et al, 2016)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4860626)",
    "gnormplus": "[(Wei et al, 2016)](https://www.ncbi.nlm.nih.gov/research/bionlp/Tools/gnormplus)",
    "ncbi_disease": "[(Dogan et al, 2014)](https://www.sciencedirect.com/science/article/pii/S1532046413001974?via%3Dihub)",
    "nlmchem": "[(Islamaj et al, 2021)](https://www.nature.com/articles/s41597-021-00875-1)",
    "craft": "[(Bada et al, 2012)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3476437)",
    "bc6id": "[(Arighi et al, 2017)](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-S8-S2)",
    "bc3gm": "[(Lu et al, 2011)](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-S8-S2)",
    "plantnorm": "[(Cho et al, 2017)](https://pubmed.ncbi.nlm.nih.gov/29029598)",
    "nlm_gene": "[(Islamaj et al, 2021)](https://doi.org/10.1016/j.jbi.2021.103779)",
}

# Citation key per dataset (referenced by the commented-out \cite variant of the LaTeX table)
dataset_to_citation = {
    "medmentions_full": "mohan2019medmentions",
    "medmentions_st21pv": "mohan2019medmentions",
    "bc5cdr": "li2016bc5cdr",
    "gnormplus": "wei2015gnormplus",
    "ncbi_disease": "dougan2014ncbi",
    "nlmchem": "islamaj2021nlm_chem",
    "craft": "bada2012craft_concept",
    "bc6id": "arighi2017bc6id",
    "bc3gm": "lu2011bc3gn",
    "plantnorm": "cho2017plantnorm",
    "nlm_gene": "islamaj2021nlm_gene",
}

# One row per dataset, annotated with its document type and linked ontology
output = pd.DataFrame.from_records(all_stats).assign(
    source_documents=lambda frame: frame.dataset.map(dataset_to_doc_type),
    linked_ontology=lambda frame: frame.dataset.map(dataset_to_ontology),
)

# Render the large counts with thousands separators (these columns become strings)
for col in ["unique_ents", "total_documents", "total_mentions"]:
    output[col] = output[col].map("{:,}".format)

# Columns shown in the published tables, in display order
table_columns = [
    "dataset",
    "total_documents",
    "total_mentions",
    "unique_ents",
    'unique_types',
    "ent_overlap",
    "mention_overlap",
    "source_documents",
    "linked_ontology",
]
output_subset = (
    output.loc[:, table_columns]
    .rename(columns={"unique_ents": "unique_entities", "ent_overlap": "entity_overlap"})
    .round(4)
)



# Make Markdown Tables
def _markdown_dataset_label(name):
    """Pretty dataset name followed by its markdown paper link."""
    return dataset_to_pretty_name[name] + " " + dataset_to_paper_url[name]


markdown_output = output_subset.assign(
    dataset=output_subset["dataset"].map(_markdown_dataset_label)
)
# snake_case column names -> "Title Case" headers
markdown_output.columns = [
    column.replace("_", " ").title() for column in markdown_output.columns
]
print(markdown_output.to_markdown(index=False))


# Make Latex Tables
latex_output = output_subset.copy()
latex_output["dataset"] = latex_output.dataset.map(lambda x: dataset_to_pretty_name[x])
# Alternative that appends a citation to each dataset name:
# latex_output['dataset'] = latex_output.dataset.map(lambda x: dataset_to_pretty_name[x] + ' ' + r'\cite{%s}' % dataset_to_citation[x])

# Short column headers so the tables fit the page width
latex_output = latex_output.rename(
    {
        "dataset": "Dataset",
        "total_documents": "Num Docs",
        "total_mentions": "Mentions",
        "unique_entities": "Unique Ents",
        'unique_types': 'Ent Types',
        "entity_overlap": "Ent. Overlap",
        "mention_overlap": "Ment. Overlap",
        "source_documents": "Doc Type",
        "linked_ontology": "Ontology",
    },
    axis=1,
)


def _bold_header(name):
    """Wrap a column header in LaTeX bold markup."""
    # Raw string is required: in a normal literal "\t" is a TAB character, so
    # "\textbf{" would emit "<TAB>extbf{" instead of the \textbf command.
    return r"\textbf{" + name + "}"


# Table 1: corpus size and provenance; Table 2: train/test overlap
latex_output_1 = latex_output[
    ["Dataset", "Num Docs", "Mentions", "Unique Ents", 'Ent Types', 'Doc Type', 'Ontology']
].rename(columns=_bold_header)
latex_output_2 = latex_output[
    ["Dataset", 'Ent. Overlap', "Ment. Overlap"]
].rename(columns=_bold_header)

# escape=False keeps the \textbf{...} markup in the headers intact
print(latex_output_1.to_latex(index=False, escape=False, bold_rows=True))
print(latex_output_2.to_latex(index=False, escape=False, bold_rows=True))




| Dataset                                                                                                            | Total Documents   | Total Mentions   | Unique Entities   |   Unique Types |   Entity Overlap |   Mention Overlap | Source Documents   | Linked Ontology   |
|:-------------------------------------------------------------------------------------------------------------------|:------------------|:-----------------|:------------------|---------------:|-----------------:|------------------:|:-------------------|:------------------|
| MedMentions Full [(Mohan and Li, 2019)](https://github.com/chanzuckerberg/MedMentions)                             | 4,392             | 385,098          | 34,724            |            127 |           0.6199 |            0.8221 | PubMed Abstracts   | UMLS              |
| MedMentions ST21PV [(Mohan and Li, 2019)](https://github.com/chanzuckerberg/MedMentions)                           | 4,392             | 203,282          | 25,419           

In [84]:
pd.DataFrame.to_latex?

[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mDataFrame[0m[0;34m.[0m[0mto_latex[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mself[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbuf[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolumns[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_space[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mheader[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mna_rep[0m[0;34m=[0m[0;34m'NaN'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mformatters[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfloat_format[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msparsify[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex_names[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0

## Convert the models .csv into a Markdown table


# Compare statistics of all datasets in BigBio

In [17]:
def _is_small_local_ned_config(helper):
    """Keep BigBio-schema, local, non-large configs that list the NED task."""
    if not (helper.is_bigbio_schema and helper.is_local):
        return False
    return "NAMED_ENTITY_DISAMBIGUATION" in helper.tasks and not helper.is_large


# Despite the name, these are the named-entity-disambiguation configs
ner_helpers = conhelps.filtered(_is_small_local_ned_config)

In [18]:
[(ch.dataset_name, ch.config.name, ch.tasks) for ch in ner_helpers]

[('msh_wsd', 'msh_wsd_bigbio_kb', {'NAMED_ENTITY_DISAMBIGUATION'}),
 ('nlm_wsd',
  'nlm_wsd_non_reviewed_bigbio_kb',
  {'NAMED_ENTITY_DISAMBIGUATION'}),
 ('nlm_wsd', 'nlm_wsd_reviewed_bigbio_kb', {'NAMED_ENTITY_DISAMBIGUATION'})]

In [19]:
# Collect the per-split metadata of every selected config, one record per (config, split)
all_statistics = []
for ch in ner_helpers:
    try:
        metadata = ner_helpers.for_config_name(ch.config.name).get_metadata()
        for split, split_metadata in metadata.items():
            # Copy the attribute dict: writing into `__dict__` directly would add
            # these bookkeeping fields to the metadata object itself.
            s = dict(vars(split_metadata))
            s['split'] = split
            s['config_name'] = ch.config.name
            s['dataset_name'] = ch.dataset_name
            s['display_name'] = ch.display_name
            all_statistics.append(s)
    except Exception as e:
        # Best effort: skip configs that cannot be loaded, but say which one failed
        print(f"Skipping {ch.config.name}: could not collect metadata ({e})")

df = pd.DataFrame(all_statistics)
df

Downloading builder script:   0%|          | 0.00/9.93k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/19.3k [00:00<?, ?B/s]

Downloading and preparing dataset msh_wsd/msh_wsd_bigbio_kb to /nethome/dkartchner3/.cache/huggingface/datasets/bigbio___msh_wsd/msh_wsd_bigbio_kb/1.0.0/21843cdf3630cd9ffb7dfd324f296850bddfdf1c9cb125746eba43ccecb97116...
Exception!!!!
This is a local dataset. Please pass the data_dir kwarg to load_dataset.


Downloading builder script:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/19.3k [00:00<?, ?B/s]

Downloading and preparing dataset nlm_wsd/nlm_wsd_non_reviewed_bigbio_kb to /nethome/dkartchner3/.cache/huggingface/datasets/bigbio___nlm_wsd/nlm_wsd_non_reviewed_bigbio_kb/1.0.0/d24f852bf92e5ec11cbf0973a22d3d58b88550a01b8e70295c169f324ae18d66...
Exception!!!!
This is a local dataset. Please pass the data_dir kwarg to load_dataset.
Downloading and preparing dataset nlm_wsd/nlm_wsd_reviewed_bigbio_kb to /nethome/dkartchner3/.cache/huggingface/datasets/bigbio___nlm_wsd/nlm_wsd_reviewed_bigbio_kb/1.0.0/d24f852bf92e5ec11cbf0973a22d3d58b88550a01b8e70295c169f324ae18d66...
Exception!!!!
This is a local dataset. Please pass the data_dir kwarg to load_dataset.


In [None]:
# Metadata columns of interest for the dataset summary
cols_to_keep = [
    'dataset_name',
    'display_name',
    'samples_count',
    'passages_count',
    'passages_char_count',
    'passages_type_counter',
    'entities_count',
    'entities_normalized_count',
    'entities_type_counter',
    'entities_db_name_counter',
    'entities_unique_db_ids_count',
    'split',
]

def dict_agg(x):
    """Merge an iterable of ``{key: number}`` dicts by summing the values of equal keys.

    Keys appear in the result in the order they are first encountered; a plain
    ``dict`` is returned.
    """
    totals = {}
    for counts in x:
        for key, amount in counts.items():
            totals[key] = totals.get(key, 0) + amount
    return totals


# df.loc[~df.dataset_name.isin(['muchmore','twadrl','ask_a_patient', 'mantra_gsc']), cols_to_keep]

# Collapse the per-split records into one row per config: plain counts are
# summed across splits, the counter dicts are merged key-wise with dict_agg.
summary_aggregations = {
    'display_name': 'first',
    'dataset_name': 'first',
    'samples_count': 'sum',
    'passages_count': 'sum',
    'passages_char_count': 'sum',
    'passages_type_counter': dict_agg,
    'entities_count': 'sum',
    'entities_normalized_count': 'sum',
    'entities_type_counter': dict_agg,
    'entities_db_name_counter': dict_agg,
    'entities_unique_db_ids_count': 'sum',
}
aggregated = df.groupby('config_name').agg(summary_aggregations)

# aggregated.loc[~aggregated.dataset_name.isin(['muchmore','twadrl','ask_a_patient', 'mantra_gsc'])].to_excel('../collaboration/dataset_summary.xlsx')
# aggregated.loc[~aggregated.dataset_name.isin(['muchmore','twadrl','ask_a_patient', 'mantra_gsc'])]

aggregated


Unnamed: 0_level_0,display_name,dataset_name,samples_count,passages_count,passages_char_count,passages_type_counter,entities_count,entities_normalized_count,entities_type_counter,entities_db_name_counter,entities_unique_db_ids_count
config_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ask_a_patient_bigbio_kb,AskAPatient,ask_a_patient,173240,173240,2238807,{'social_media_text': 173240},173240,173240,{'social_media_text': 173240},{'SNOMED-CT|AMT': 173240},16548
bc5cdr_bigbio_kb,BC5CDR,bc5cdr,1500,3000,1975456,"{'title': 1500, 'abstract': 1500}",29271,29335,"{'Chemical': 15953, 'Disease': 13318}",{'MESH': 29335},3907
bionlp_st_2019_bb_bigbio_kb,BioNLP 2019 BB,bionlp_st_2019_bb,295,295,305827,{'abstract': 295},3456,3628,"{'Habitat': 1752, 'Microorganism': 1174, 'Phen...","{'OntoBiotope': 2452, 'NCBI_Taxonomy': 1176}",834
biorelex_bigbio_kb,BioRelEx,biorelex,1606,1606,270233,{'sentence': 1606},7880,10593,"{'protein': 3730, 'protein-family': 1125, 'che...","{'uniprot': 3190, 'hgnc': 2947, 'NCBI gene': 2...",3291
cadec_bigbio_kb,CADEC,cadec,1250,1250,575453,{'abstract': 1250},9111,15932,"{'ADR': 6318, 'Drug': 1800, 'Finding': 435, 'D...","{'Snomed CT': 8930, 'Meddra': 6222, '': 780}",1736
cantemist_bigbio_kb,CANTEMIST,cantemist,1301,1301,6344976,{'abstract': 1301},16032,16032,{'MORFOLOGIA_NEOPLASIA': 16032},{'eCIE-O-3.1': 16032},1399
citation_gia_test_collection_bigbio_kb,Citation GIA Test Collection,citation_gia_test_collection,151,302,229882,"{'title': 151, 'abstract': 151}",1382,1382,"{'Gene': 1205, 'FamilyName': 160, 'DomainMotif...",{'': 1382},1
codiesp_X_bigbio_kb,CodiEsp,codiesp,1000,0,0,{},18435,18435,"{'DIAGNOSTICO': 14305, 'PROCEDIMIENTO': 4130}","{'ICD10-CM': 14305, 'ICD10-PCS': 4130}",5377
distemist_entities_bigbio_kb,DisTEMIST,distemist,750,750,1758414,{'clinical_case': 750},8065,0,{'ENFERMEDAD': 8065},{},0
distemist_linking_bigbio_kb,DisTEMIST,distemist,583,583,1246383,{'clinical_case': 583},5136,5374,{'ENFERMEDAD': 5136},{'SNOMED_CT': 5374},2430


In [None]:
all_db_names = defaultdict(list)

# The original selected the column '' which `aggregated` does not have (KeyError).
# NOTE(review): assumes the intent was to inspect the db-name counters next to
# each dataset name -- confirm the intended column.
aggregated[['dataset_name', 'entities_db_name_counter']]

In [6]:
metadata = ner_helpers.for_config_name('bc5cdr_bigbio_kb').get_metadata()

Found cached dataset bc5cdr (/home/dkartchner3/.cache/huggingface/datasets/bc5cdr/bc5cdr_bigbio_kb/1.0.0/f01f16ea9b65ead985bedadf7335195c32297c8f1b09417fc607b102a6757d6f)


  0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
train = metadata['train']

In [14]:
metadata.keys()

dict_keys(['train', 'test', 'validation'])

In [15]:
train.__dict__

{'samples_count': 500,
 'passages_count': 1000,
 'passages_char_count': 652177,
 'passages_type_counter': {'title': 500, 'abstract': 500},
 'entities_count': 9570,
 'entities_normalized_count': 9599,
 'entities_type_counter': {'Chemical': 5207, 'Disease': 4363},
 'entities_db_name_counter': {'MESH': 9599},
 'entities_unique_db_ids_count': 1328,
 'events_count': 0,
 'events_type_counter': {},
 'events_arguments_count': 0,
 'events_arguments_role_counter': {},
 'coreferences_count': 0,
 'relations_count': 15072,
 'relations_type_counter': {'CID': 15072},
 'relations_db_name_counter': {},
 'relations_unique_db_ids_count': 0}

In [7]:
df = pd.read_excel('../collaboration/dataset_summary.xlsx')

In [14]:
import ast

# Group every db_name spelling found in the summary sheet under its lowercase form.
# The counters are stored as strings in the spreadsheet, hence literal_eval.
spellings_by_key = defaultdict(set)
for d in df.entities_db_name_counter.tolist():
    for x in ast.literal_eval(d):
        spellings_by_key[x.lower()].add(x)

# Sort both the keys and the spellings: set iteration order for strings is not
# stable across interpreter runs, which made the printed listing vary.
db_keys = sorted(spellings_by_key)
db_names = {key: sorted(spellings_by_key[key]) for key in db_keys}

# Print every spelling; keys with more than one spelling get a heading line first
for key, val in db_names.items():
    if len(val) > 1:
        print(key)
    for v in val:
        print(v)

# Only the lowercase keys are written out, as a JSON list
with open('../collaboration/db_names.json', 'w') as f:
    f.write(ujson.dumps(list(db_names), indent=2))
# for x in sorted([x.lower() for x in db_names.keys()]):
#     print(x)


chemspider
dbsnp
dbsnp
dbSNP
DrugBank
eCIE-O-3.1
hgnc
HGVS-like
ICD10-CM
ICD10-PCS
intenz
interpro
kegg
Meddra
MedDRA v18.1
mesh
mesh
MESH
miRNA-corpus
ncbi
ncbi gene
NCBI Gene
NCBI gene
ncbi_gene
ncbi_taxon
NCBI_Taxonomy
NCBIGene
OMIM
OntoBiotope
pfam
pubchem:compound
pubchem:substance
Snomed CT
SNOMED_CT
tmVar
UMLS
uniprot


In [55]:
# `"\n".join()` without an argument raises TypeError, and because mode 'w'
# truncates on open, the failed cell also emptied the file. Build the text
# first, then write it.
# NOTE(review): assumes the intent was one lowercase db name per line, i.e. the
# keys of `db_names` built in the cell above -- confirm.
db_names_text = "\n".join(db_names)
with open('../collaboration/db_names.txt', 'w') as f:
    f.write(db_names_text)

['', 'chemspider', 'dbsnp', 'drugbank', 'ecie-o-3.1', 'hgnc', 'hgvs-like', 'icd10-cm', 'icd10-pcs', 'intenz', 'interpro', 'kegg', 'meddra', 'meddra v18.1', 'mesh', 'mirna-corpus', 'ncbi', 'ncbi gene', 'ncbi_gene', 'ncbi_taxon', 'ncbi_taxonomy', 'ncbigene', 'omim', 'ontobiotope', 'pfam', 'pubchem:compound', 'pubchem:substance', 'snomed ct', 'snomed_ct', 'tmvar', 'umls', 'uniprot']


In [38]:
df.entities_db_name_counter.tolist()

["{'MESH': 29335}",
 "{'OntoBiotope': 2452, 'NCBI_Taxonomy': 1176}",
 "{'uniprot': 3190, 'hgnc': 2947, 'NCBI gene': 2946, 'interpro': 816, 'pubchem:compound': 459, 'intenz': 116, 'pfam': 98, 'DrugBank': 13, 'pubchem:substance': 6, 'kegg': 1, 'chemspider': 1}",
 "{'Snomed CT': 8930, 'Meddra': 6222, '': 780}",
 "{'eCIE-O-3.1': 16032}",
 "{'': 1382}",
 "{'ICD10-CM': 14305, 'ICD10-PCS': 4130}",
 '{}',
 "{'SNOMED_CT': 5374}",
 "{'NCBIGene': 6430}",
 "{'ncbi': 4259}",
 "{'ncbi': 2851}",
 "{'UMLS': 385098}",
 "{'UMLS': 203282}",
 "{'miRNA-corpus': 8318}",
 "{'MESH': 6487, 'OMIM': 567}",
 "{'NCBIGene': 18056}",
 "{'MESH': 41300}",
 "{'NCBI Gene': 689, 'HGVS-like': 291, 'dbSNP': 250}",
 "{'mesh': 139, 'ncbi_taxon': 49, 'ncbi_gene': 11, 'tmVar': 9}",
 "{'UMLS': 7202}",
 "{'UMLS': 9079}",
 "{'MedDRA v18.1': 2417}",
 '{}',
 "{'dbSNP': 527}",
 "{'dbsnp': 625}",
 "{'NCBI Gene': 6023, 'dbSNP': 2861}"]

In [3]:
# Use a context manager so the file handle is closed after parsing
with open('../collaboration/db_names.json') as f:
    name_list = ujson.load(f)