In [5]:
import json

from collections import Counter

In [13]:
def load_stat_files(
    start: int = 1,
    end: int = 21,
    path: str = "data/statistic/featurize_stat/",
    file_naming: str = "featurized_sents_pubmed20n"
):
    """Aggregate the per-file featurize statistics into overall totals.

    For every index in ``range(start, end)`` the file
    ``path + file_naming + <index zero-padded to 4 digits> + "_stat.json"``
    is read, and its ``"featurize_stat"``, ``"drug_names_kicked_out"`` and
    ``"reactions_kicked_out"`` mappings are added to running counters.

    ``"max_drug_ents"`` and ``"max_reactions"`` are maxima, not sums: the
    returned ``featurize_stat`` counter holds the largest value found in any
    file (0 if no file provides the key). The two keys are handled
    independently, so a file lacking one of them still contributes the other.
    The path of every file lacking at least one of them is printed.

    Args:
        start: First file index to load (inclusive).
        end: File index to stop at (exclusive).
        path: Directory prefix; concatenated as-is, so it must end with a
            path separator.
        file_naming: File name prefix preceding the zero-padded index.

    Returns:
        A tuple ``(total_featurize_stat, total_drug_names_kicked_out,
        total_reactions_kicked_out)`` of ``collections.Counter`` objects.

    Raises:
        OSError: If a statistics file cannot be opened.
        json.JSONDecodeError: If a statistics file is not valid JSON.
        KeyError: If a file lacks one of the three top-level sections.
    """
    total_featurize_stat = Counter()
    total_drug_names_kicked_out = Counter()
    total_reactions_kicked_out = Counter()

    max_drug_ents_list_len = 0
    max_reacts_list_len = 0

    for file_index in range(start, end):
        feat_stat_file = f"{path}{file_naming}{file_index:04d}_stat.json"
        # Explicit encoding so the result does not depend on the platform default.
        with open(feat_stat_file, "r", encoding="utf-8") as json_file:
            file_data = json.load(json_file)

        featurize_stat = file_data["featurize_stat"]
        # The two max_* entries get summed here as well; they are overwritten
        # with the real maxima after the loop.
        total_featurize_stat += featurize_stat
        total_drug_names_kicked_out += file_data["drug_names_kicked_out"]
        total_reactions_kicked_out += file_data["reactions_kicked_out"]

        # Look at each key on its own: a missing "max_drug_ents" must not
        # cause the file's "max_reactions" to be ignored.
        missing_max_key = False
        if "max_drug_ents" in featurize_stat:
            if featurize_stat["max_drug_ents"] > max_drug_ents_list_len:
                max_drug_ents_list_len = featurize_stat["max_drug_ents"]
        else:
            missing_max_key = True

        if "max_reactions" in featurize_stat:
            if featurize_stat["max_reactions"] > max_reacts_list_len:
                max_reacts_list_len = featurize_stat["max_reactions"]
        else:
            missing_max_key = True

        if missing_max_key:
            # Report files with incomplete statistics, once per file.
            print(feat_stat_file)

    total_featurize_stat["max_reactions"] = max_reacts_list_len
    total_featurize_stat["max_drug_ents"] = max_drug_ents_list_len

    return total_featurize_stat, total_drug_names_kicked_out, total_reactions_kicked_out


In [14]:
# `end` is exclusive, so this aggregates file indices 1 through 1015.
(
    total_featurize_stat,
    total_drug_names_kicked_out,
    total_reactions_kicked_out,
) = load_stat_files(end=1016)

data/statistic/featurize_stat/featurized_sents_pubmed20n0654_stat.json


In [15]:
# Pretty-print the aggregated featurize statistics as JSON, indented by 4 spaces.
print(json.dumps(total_featurize_stat, indent=4))

{
    "10_combinations_same_sent": 2240,
    "12_combinations_same_sent": 2002,
    "15_combinations_same_sent": 400,
    "18_combinations_same_sent": 162,
    "1_combinations_same_sent": 1647860,
    "21_combinations_same_sent": 48,
    "2_combinations_same_sent": 706106,
    "3_combinations_same_sent": 195509,
    "4_combinations_same_sent": 116237,
    "5_combinations_same_sent": 19775,
    "6_combinations_same_sent": 33327,
    "8_combinations_same_sent": 8079,
    "9_combinations_same_sent": 3143,
    "drug_first": 607331,
    "drugs_checked": 3336289,
    "entity_names_and_entity_pos_names_not_comparable": 1144,
    "max_drug_ents": 105,
    "max_reactions": 40,
    "reaction_first": 4620368,
    "reactions_check_off_set_names_failed": 1007,
    "reactions_checked": 5228706,
    "total_featurized_sentences": 5227699,
    "total_number_drugs": 3105545,
    "total_number_featurized_sents_incl_sent_doubles_possible": 4596103,
    "total_number_reactions": 4053833,
    "7_combination

In [8]:
# Add up every "<n>_combinations_same_sent" entry of the aggregated statistics.
total_combinations = sum(
    count
    for stat_name, count in total_featurize_stat.items()
    if stat_name.endswith("_combinations_same_sent")
)
print(total_combinations)

2738336


In [6]:
# Print the aggregated "drug_names_kicked_out" counts; a Counter's repr lists
# entries from most to least common.
print(total_drug_names_kicked_out)

Counter({'cholesterol': 377491, 'sodium': 55039, 'uracil': 54427, 'ac': 53500, 'peptide': 52240, 'cardiac': 52092, 'calcium': 42878, 'potassium': 39198, 'pred': 38532, 'medium': 35875, 'heparin': 31708, 'propranolol': 30800, 'basal': 23148, 'technetium': 22501, 'cisplatin': 21955, 'dopamine': 19400, 'methotrexate': 18923, 'endocrine': 17580, 'magnesium': 17331, 'doxorubicin': 17331, 'polypeptide': 17163, 'globulin': 16054, 'gallium': 15813, 'ammonium': 15371, 'sal': 14435, 'lithium': 14306, 'tamoxifen': 13850, 'immunoglobulin': 13730, 'nifedipine': 13430, 'adriamycin': 13242, 'lidocaine': 12972, 'chromium': 12330, 'leptin': 12307, 'diazepam': 11858, 'verapamil': 11505, 'captopril': 11487, 'surfactant': 11100, 'paclitaxel': 10615, 'histidine': 10514, 'thallium': 10266, 'pindolol': 10237, '5-fluorouracil': 10219, 'diclofenac': 9759, 'pregnenolone': 9195, 'double-blind': 8900, 'cocaine': 8527, 'beta-carotene': 8480, 'vasopressin': 7956, 'fluoxetine': 7550, 'enalapril': 7347, 'pentoxifylli

In [7]:
# Print the aggregated "reactions_kicked_out" counts; a Counter's repr lists
# entries from most to least common.
print(total_reactions_kicked_out)

Counter({'surgery': 116892, 'hypertension': 73715, 'mass': 60131, 'breast cancer': 53965, 'epinephrine': 50204, 'renin': 46176, 'cardiac arrest': 38348, 'chemotherapy': 37993, 'infarction': 29911, 'infection': 28482, 'cardiac output': 28320, 'blood pressure': 27621, 'pregnancy': 27563, 'myocardial infarction': 26863, 'arthritis': 26395, 'death': 24545, 'asthma': 23861, 'cardiomyopathy': 23746, 'rheumatoid arthritis': 23155, 'lipids': 22834, 'depression': 20731, 'schizophrenia': 19995, 'pain': 19790, 'stress': 19001, 'injury': 18937, 'cardiac death': 18750, 'vitamin d': 18739, 'ulcer': 18342, 'ubiquinone': 17680, 'weight': 17658, 'renal failure': 17479, 'lymphoma': 15798, 'colorectal cancer': 15628, 'gastric cancer': 15444, 'essential hypertension': 15433, 'sudden cardiac death': 14920, 'basal cell carcinoma': 14917, 'acute myocardial infarction': 14208, 'thrombosis': 13970, 'dependence': 13537, 'duodenal ulcer': 13282, 'atrial fibrillation': 13267, 'diabetes mellitus': 13189, 'growth':