In [1]:
import pandas as pd
import os
import json
from collections import defaultdict
import numpy as np
import graphvite
import graphvite.application as gap
from graphvite.dataset import Dataset

ModuleNotFoundError: No module named 'graphvite'

In [2]:
snomed_dir = '/home/dc925/project/data/snomed'
data_dir = 'data'

In [3]:
relations_path = os.path.join(snomed_dir, 'active_relations.txt')
semantic_types_path = os.path.join(snomed_dir, 'semantic_types.txt')
concepts_path = os.path.join(snomed_dir, 'active_concepts.txt')
semgroups_path = os.path.join(snomed_dir, 'SemGroups.txt')

In [5]:
relations = pd.read_csv(relations_path, sep='\t', header=None)
relations.columns = ['CUI1', 'REL', 'CUI2', 'RELA']
relations = relations[-relations.duplicated()]

semantic_types = pd.read_csv(semantic_types_path, sep='\t', header=None)
semantic_types.columns = ['CUI', 'TUI', 'STY']

semantic_groups = pd.read_csv(semgroups_path, sep='|', header=None)
semantic_groups.columns = ['SG', 'SG_string', 'TUI', 'STY']
semantic_groups = semantic_groups.set_index('TUI')

tui2sg = semantic_groups['SG'].to_dict()
semantic_types['SemGroup'] = [tui2sg[tui] for tui in semantic_types['TUI']]
# semantic_types = semantic_types[semantic_types['SemGroup'].isin(include_groups)]

In [7]:
# filter semantic types and groups
# We want to include these groups: ANAT, CHEM, CONC, DEVI, DISO, PHEN, PHYS, PROC
# And exclude semantic types: 
exclude_types = ['Cell', 'Cell Component', 'Embryonic Structure', 'Biomedical or Dental Material', 'Chemical Viewed Functionally', 'Chemical Viewed Structurally', 'Regulation or Law', \
                'Experimental Model of Disease', 'Molecular Function', 'Cell Function', 'Genetic Function']
include_groups = ['CHEM', 'DISO', 'ANAT', 'PROC', 'CONC', 'DEVI', 'PHEN', 'PHYS']
filtered_semantic_types = semantic_types[semantic_types['SemGroup'].isin(include_groups)]
filtered_semantic_types = filtered_semantic_types[-filtered_semantic_types['STY'].isin(exclude_types)]

In [50]:
semantic_types.to_csv('/home/dc925/project/clinical_kge/semantic_info.csv',sep='\t')

In [51]:
filtered_semantic_types.to_csv('/home/dc925/project/clinical_kge/filtered_semantic_info.csv', sep='\t')

In [63]:
filtered_semantic_types

Unnamed: 0,CUI,TUI,STY,SemGroup
0,C0000039,T109,Organic Chemical,CHEM
1,C0000039,T121,Pharmacologic Substance,CHEM
2,C0000052,T116,"Amino Acid, Peptide, or Protein",CHEM
3,C0000052,T126,Enzyme,CHEM
4,C0000097,T109,Organic Chemical,CHEM
...,...,...,...,...
425865,C4759626,T037,Injury or Poisoning,DISO
425866,C4759627,T046,Pathologic Function,DISO
425867,C4759628,T046,Pathologic Function,DISO
425868,C4759629,T170,Intellectual Product,CONC


In [64]:
cui2sg = filtered_semantic_types.set_index('CUI')['SemGroup'].to_dict()
cui2sty = filtered_semantic_types.set_index('CUI')['STY'].to_dict()

In [9]:
active_concepts = pd.read_csv(concepts_path, sep='\t', header=None)
active_concepts.columns = ['CUI', 'STR']
cui2string = active_concepts.set_index('CUI')['STR'].to_dict()
with open(os.path.join(data_dir, 'cui2string.json'), 'w') as fp:
    json.dump(cui2string, fp)

In [10]:
def filter_triplets_by_cuis(triplets, cui_iterable):
    filtered = triplets[(triplets['CUI1'].isin(cui_iterable)) & (triplets['CUI2'].isin(cui_iterable))]
    return filtered

def create_datasets(triplets, data_dir):
    """
    4 settings:
    1. full (with reciprocals and leakage)
    2. no reciprocal relations at all
    3. no leakage, augment train only
    4. no leakage, augment all splits
    """
    
    # Case 1: (with reciprocal relations and leakage)
    case1 = triplets[triplets['RELA'].isin(cleaned_relations)]
    case1 = case1.sample(frac=1) #shuffle
    case1.to_csv(os.path.join(data_dir, 'triplets_case1.txt'), sep='\t', header=None, index=None)

    ds = Dataset(name='case1')
    graph_file = os.path.join(data_dir, 'triplets_case1.txt')
    files = ['case1_train.txt', 'case1_valid.txt', 'case1_test.txt']
    files = [os.path.join(data_dir, f) for f in files]
    portions = [95, 2.5, 2.5]
    ds.edge_split(graph_file, files, portions)
    
    # Case 2: no reprical relations at all, so no leakage
    case2 = triplets[triplets['RELA'].isin(reciprocal_relations_dict.keys())]
    case2 = case2.sample(frac=1)
    case2.to_csv(os.path.join(data_dir, 'triplets_case2.txt'), sep='\t', header=None, index=None)

    ds = Dataset(name='case2')
    graph_file = os.path.join(data_dir, 'triplets_case2.txt')
    files = ['case2_train.txt', 'case2_valid.txt', 'case2_test.txt']
    files = [os.path.join(data_dir, f) for f in files]
    portions = [95, 2.5, 2.5]
    ds.edge_split(graph_file, files, portions)
    
    # Case 3: reciprocal in train only, no leakage
    case3_train = pd.read_csv(os.path.join(data_dir, 'case2_train.txt'), sep='\t', header=None)
    case3_train.columns = ['CUI1', 'RELA', 'CUI2']
    case3_train_augmented = pd.concat([case3_train['CUI2'], case3_train['RELA'].apply(lambda x: reciprocal_relations_dict[x]), case3_train['CUI1']], axis=1)
    case3_train_augmented.columns = ['CUI1', 'RELA', 'CUI2']
    case3_train_augmented = pd.concat([case3_train, case3_train_augmented], axis=0)
    case3_train_augmented = case3_train_augmented[-case3_train_augmented.duplicated()]
    case3_train_augmented.to_csv(os.path.join(data_dir, 'case3_train.txt'), sep='\t', header=None, index=None)
    case3_valid = pd.read_csv(os.path.join(data_dir, 'case2_valid.txt'), sep='\t', header=None)
    case3_valid.columns = ['CUI1', 'RELA', 'CUI2']
    case3_test = pd.read_csv(os.path.join(data_dir, 'case2_test.txt'), sep='\t', header=None)
    case3_test.columns = ['CUI1', 'RELA', 'CUI2']
    case3_valid.to_csv(os.path.join(data_dir, 'case3_valid.txt'), sep='\t', header=None)
    case3_test.to_csv(os.path.join(data_dir, 'case3_test.txt'), sep='\t', header=None)
    
    # Case 4: reciprocal in all, no leakage
    case4_train = case3_train_augmented
    case4_valid = pd.concat([case3_valid['CUI2'], case3_valid['RELA'].apply(lambda x: reciprocal_relations_dict[x]), case3_valid['CUI1']], axis=1)
    case4_valid.columns = ['CUI1', 'RELA', 'CUI2']
    case4_valid = pd.concat([case3_valid, case4_valid], axis=0)
    case4_valid = case4_valid[-case4_valid.duplicated()]
    case4_test = pd.concat([case3_test['CUI2'], case3_test['RELA'].apply(lambda x: reciprocal_relations_dict[x]), case3_test['CUI1']], axis=1)
    case4_test.columns = ['CUI1', 'RELA', 'CUI2']
    case4_test = pd.concat([case3_test, case4_test], axis=0)
    case4_test = case4_test[-case4_test.duplicated()]
    
    case4_train, case4_valid, case4_test = move_unseen_to_train(case4_train, case4_valid, case4_test)

    case4_train.to_csv(os.path.join(data_dir, 'case4_train.txt'), sep='\t', header=None, index=None)
    case4_valid.to_csv(os.path.join(data_dir, 'case4_valid.txt'), sep='\t', header=None, index=None)
    case4_test.to_csv(os.path.join(data_dir, 'case4_test.txt'), sep='\t', header=None, index=None)

def move_unseen_to_train(train, valid, test):
    train_cuis = set(train['CUI1']) | set(train['CUI2'])
    valid_unseen_idx = -((valid['CUI1'].isin(train_cuis)) & (valid['CUI2'].isin(train_cuis)))
    train = pd.concat([train, valid[valid_unseen_idx]], axis=0)
    test_unseen_idx = -((test['CUI1'].isin(train_cuis)) & (test['CUI2'].isin(train_cuis)))
    train = pd.concat([train, test[test_unseen_idx]], axis=0)
    valid = valid[-valid_unseen_idx]
    test = test[-test_unseen_idx]
    return train, valid, test
    
def create_transitive_closure_triplets(tc_file, data_dir, cuis):
    transitive_closure = pd.read_csv(os.path.join(data_dir, tc_file), sep='\t', header=None)
    transitive_closure.columns = ['CUI1', 'RELA', 'CUI2']
    inverse_transitive_closure = pd.concat([transitive_closure['CUI2'], transitive_closure['CUI1']], axis=1)
    inverse_transitive_closure['RELA'] = 'inverse_isa'
    inverse_transitive_closure = inverse_transitive_closure[['CUI2','RELA','CUI1']]
    inverse_transitive_closure.columns = ['CUI1', 'RELA', 'CUI2']
    transitive_closure = pd.concat([transitive_closure, inverse_transitive_closure], axis=0)
    transitive_closure_triplets = filter_triplets_by_cuis(transitive_closure, cuis)
    transitive_closure_triplets.to_csv(os.path.join(data_dir, 'transitive_closure_triplets.txt'), sep='\t', header=None, index=None)
    

In [11]:
#Filter relations on active concepts to get final triplets
#also flipping the directions because UMLS does (tail relation head)
filtered_relations = filter_triplets_by_cuis(relations, cui2string)
filtered_relations['string1'] = [cui2string[cui] for cui in filtered_relations['CUI1']]
filtered_relations['string2'] = [cui2string[cui] for cui in filtered_relations['CUI2']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [12]:
filtered_relations

Unnamed: 0,CUI1,REL,CUI2,RELA,string1,string2
1,C0000052,PAR,C0019495,inverse_isa,"1,4-alpha-Glucan branching enzyme",Hexosyltransferase
2,C0000052,PAR,C0443499,inverse_isa,"1,4-alpha-Glucan branching enzyme",Carbohydrate metabolism disorder marker
3,C0000052,RO,C0523417,has_component,"1,4-alpha-Glucan branching enzyme","1,4-alpha-Glucan branching enzyme measurement"
4,C0000052,RO,C4541016,disposition_of,"1,4-alpha-Glucan branching enzyme",Transferase
5,C0000097,PAR,C0576798,inverse_isa,Methylphenyltetrahydropyridine,Pyridine and pyridine derivative
...,...,...,...,...,...,...
2567070,C4759628,PAR,C3887277,inverse_isa,Pathological fracture of right scapula due to ...,Pathological fracture of right scapula
2567071,C4759628,RO,C0016663,associated_morphology_of,Pathological fracture of right scapula due to ...,Pathological fracture
2567072,C4759628,RO,C0027651,associated_morphology_of,Pathological fracture of right scapula due to ...,Neoplasm
2567073,C4759628,RO,C1290251,cause_of,Pathological fracture of right scapula due to ...,Neoplasm of scapula


In [13]:
relation_counts = filtered_relations['RELA'].value_counts()

In [14]:
filtered_relations['REL'].value_counts()
#RO: has relationship Other than synonymous, narrower, or broader
#PAR: has parent relationship
#CHD: has child relationship
#SY: synonymy
#RB: has a broader relationship
#RN: has a narrower relationship

RO     1150667
CHD     543762
PAR     543762
SY       36776
RN        6525
RB        6525
Name: REL, dtype: int64

In [15]:
#unimportant relations we might take out

relatedness_relations = ["same_as", "possibly_equivalent_to", "associated_with", "temporally_related_to"]
exclude_relations = ["mth_plain_text_form_of", "mth_has_xml_form", "mth_has_plain_text_form", "mth_xml_form_of", "replaced_by", "replaces", "uses_energy", "energy_used_by", "has_dependent", "dependent_of", \
"part_referred_to_by", "relative_to_part_of", "inherent_location_of", "has_inherent_location", "has_process_output", "process_output_of", "has_precondition", "precondition_of", \
"definitional_manifestation_of", "has_definitional_manifestation", "has_technique", "technique_of"]

In [16]:
cleaned_relations = [r for r in relation_counts.index if r not in exclude_relations]
reciprocal_relations = [r for r in cleaned_relations if r not in relatedness_relations]

In [17]:
reciprocal_relations_dict = {}
for i,k in zip(reciprocal_relations[1::2], reciprocal_relations[0::2]):
    reciprocal_relations_dict[i] = k
for i in relatedness_relations:
    reciprocal_relations_dict[i] = i

In [55]:
len(reciprocal_relations_dict) #180 relations total; 176/2=88 reciprocals, 4 symmetric

92

In [56]:
with open(os.path.join(data_dir, 'reciprocal_relations.json'), 'w') as fp:
    json.dump(reciprocal_relations_dict, fp)

In [59]:
# snomed subset
snomed = filter_triplets_by_cuis(filtered_relations, filtered_semantic_types['CUI'])
snomed_triplets = snomed[['CUI1', 'RELA','CUI2']]
snomed_triplets = snomed_triplets[snomed_triplets['RELA'].isin(snomed_triplets['RELA'].value_counts()[snomed_triplets['RELA'].value_counts()>15].index)]


In [53]:
snomed_triplets[snomed_triplets['CUI1']=='C0037585']

Unnamed: 0,CUI1,RELA,CUI2,SG
281998,C0037585,isa,C0870393,CONC
281999,C0037585,isa,C3873651,CONC
282000,C0037585,isa,C3873721,CONC
282001,C0037585,isa,C3873784,CONC


Original relations has 386692 concepts and 2386877 active relations.
After filtering 346108 active concepts, we get 2288017 relations.
After filtering by relevant semantic types/groups, we get 2074088 relations for 293892 concepts, among which 40240 only appear once and 158314 appear less than 5 times.
After filtering out rare relations, we get 2073848 triplets, 293884 concepts, and 170 relations

In [32]:

create_datasets(snomed_triplets, data_dir)
case4_train = pd.read_csv(os.path.join(data_dir, 'case4_train.txt'), sep='\t', header=None)
case4_train.columns = ['CUI1', 'RELA', 'CUI2']
final_cuis = set(case4_train['CUI1'])
create_transitive_closure_triplets('transitive_closure_full.txt', data_dir, final_cuis)

splitting graph ../../../project/clinical_kge/data/triplets_case1.txt into ../../../project/clinical_kge/data/case1_train.txt, ../../../project/clinical_kge/data/case1_valid.txt, ../../../project/clinical_kge/data/case1_test.txt
splitting graph ../../../project/clinical_kge/data/triplets_case2.txt into ../../../project/clinical_kge/data/case2_train.txt, ../../../project/clinical_kge/data/case2_valid.txt, ../../../project/clinical_kge/data/case2_test.txt


In [33]:
rela = pd.DataFrame(snomed_triplets['RELA'].unique())
rela.columns = ['relations']
rela.to_csv('snomed_relations.csv', index=None)
# rela.to_csv('relation_strings.csv', index=None)

In [34]:
snomed_cui2string = snomed.set_index('CUI1')['string1'].to_dict()
with open('snomed_cui2string.json', 'w') as fp:
    json.dump(snomed_cui2string, fp)

In [36]:
# two ways of looking at broader relation type metrics is to break them down to 1. broad types (RO, CHD, PAR, SY, RB, RN) and 2. one-or-many types
broad_rel_types = filtered_relations.set_index('RELA')['REL'].to_dict()

In [37]:
with open(os.path.join(data_dir, 'relation2broad.json'), 'w') as fp:
    json.dump(broad_rel_types, fp)

In [38]:
relation2one_or_many = {}
for rela in set(snomed_triplets['RELA']):
    headlist = []
    taillist = []
#     relation_vocab[rela] = list(snomed_triplets[snomed_triplets['RELA']==rela][['CUI1','CUI2']].itertuples(index=False, name=None))
    pairs = snomed_triplets[snomed_triplets['RELA']==rela][['CUI1','CUI2']]
    head_per_tail = len(pairs) / len(set(pairs['CUI2']))
    tail_per_head = len(pairs) / len(set(pairs['CUI1']))
    if head_per_tail < 1.5 and tail_per_head < 1.5:
        relation2one_or_many[rela] = 'one_to_one'
    elif head_per_tail >= 1.5 and tail_per_head < 1.5:
        relation2one_or_many[rela] = 'many_to_one'
    elif head_per_tail < 1.5 and tail_per_head >= 1.5:
        relation2one_or_many[rela] = 'one_to_many'
    else:
        relation2one_or_many[rela] = 'many_to_many'
    
with open(os.path.join(data_dir, 'relation2oneormany.json'), 'w') as fp:
    json.dump(relation2one_or_many, fp)

In [81]:
##TODO: do the same thing for semantic types/groups (type_one_to_many, group_one_to_many, etc)
snomed_triplets['STY1'] = [cui2sty[cui] for cui in snomed_triplets['CUI1']]
snomed_triplets['STY2'] = [cui2sty[cui] for cui in snomed_triplets['CUI2']]

snomed_triplets['SG1'] = [cui2sg[cui] for cui in snomed_triplets['CUI1']]
snomed_triplets['SG2'] = [cui2sg[cui] for cui in snomed_triplets['CUI2']]


In [82]:
snomed_triplets

Unnamed: 0,CUI1,RELA,CUI2,STY1,STY2,SG1,SG2
1,C0000052,inverse_isa,C0019495,Enzyme,Enzyme,CHEM,CHEM
2,C0000052,inverse_isa,C0443499,Enzyme,Biologically Active Substance,CHEM,CHEM
3,C0000052,has_component,C0523417,Enzyme,Laboratory Procedure,CHEM,PROC
4,C0000052,disposition_of,C4541016,Enzyme,Qualitative Concept,CHEM,CONC
5,C0000097,inverse_isa,C0576798,Hazardous or Poisonous Substance,Organic Chemical,CHEM,CHEM
...,...,...,...,...,...,...,...
2567070,C4759628,inverse_isa,C3887277,Pathologic Function,Pathologic Function,DISO,DISO
2567071,C4759628,associated_morphology_of,C0016663,Pathologic Function,Pathologic Function,DISO,DISO
2567072,C4759628,associated_morphology_of,C0027651,Pathologic Function,Neoplastic Process,DISO,DISO
2567073,C4759628,cause_of,C1290251,Pathologic Function,Neoplastic Process,DISO,DISO


In [95]:
relation2group_oneormany = {}
for rela in set(snomed_triplets['RELA']):
    if rela not in exclude_relations:
        headlist = []
        taillist = []
        pairs = snomed_triplets[snomed_triplets['RELA']==rela][['SG1', 'SG2']]
        num_source = len(set(pairs['SG1']))
        num_target = len(set(pairs['SG2']))
        target_cardinality = (num_target/num_source)
        homo = (sum(pairs['SG1'] == pairs['SG2']) / len(pairs))
        #how homogeneous is this relation? measures whether relation is within same types/groups or not
        if num_source < 1.1 and num_target < 1.1 and homo < 0.9:
            relation2group_oneormany[rela] = 'one_to_one'
        elif num_source < 1.1 and num_target < 1.1 and homo > 0.9:
            relation2group_oneormany[rela] = 'one_to_one_homogeneous'
        elif num_source >= 1.1 and num_target < 1.1:
            relation2group_oneormany[rela] = 'many_to_one'
        elif num_source < 1.1 and num_target >= 1.1:
            relation2group_oneormany[rela] = 'one_to_many'
        elif num_source >= 1.1 and num_target >= 1.1 and homo < 0.9:
            relation2group_oneormany[rela] = 'many_to_many'
        else:
            relation2group_oneormany[rela] = 'many_to_many_homogeneous'
        
#         print(rela, relation2group_oneormany[rela])

#         print('{}: {:.3}, {:.3}'.format(rela, target_cardinality, homo))

with open(os.path.join(data_dir, 'relation2sg_oneormany.json'), 'w') as fp:
    json.dump(relation2group_oneormany, fp)

"""
target cardinality: bigger means the relation spans more groups (heterogeneous)
homogeneity: how often does it occur for concepts within same group

classes:
many_to_one: if it's n>1 to 1
one_to_many: 1 to n>1
many_to_many: n>1 to n>1
multi_homo: n>1 to n>1 and homo
one_to_one: 1 to 1 (not homo)
homogeneous: 1 to 1 and same group

"""

has_indirect_procedure_site many_to_many
has_indirect_procedure_site: 2.5, 0.000719
has_presentation_strength_denominator_unit one_to_many
has_presentation_strength_denominator_unit: 2.0, 0.0
characterized_by many_to_many
characterized_by: 0.4, 0.177
component_of many_to_many
component_of: 1.67, 0.000128
has_direct_procedure_site many_to_many
has_direct_procedure_site: 1.0, 0.000217
presentation_strength_numerator_unit_of many_to_one
presentation_strength_numerator_unit_of: 0.5, 0.0
intent_of many_to_many
intent_of: 0.4, 0.019
has_realization one_to_one_homogeneous
has_realization: 1.0, 1.0
temporal_context_of many_to_one
temporal_context_of: 0.2, 0.00969
has_focus many_to_many
has_focus: 0.667, 0.546
indirect_morphology_of one_to_many
indirect_morphology_of: 4.0, 0.0
has_entire_anatomy_structure many_to_many_homogeneous
has_entire_anatomy_structure: 1.0, 0.999
has_presentation_strength_numerator_unit one_to_many
has_presentation_strength_numerator_unit: 2.0, 0.0
concentration_strength

"\ntarget cardinality: bigger means the relation spans more groups (heterogeneous)\nhomogeneity: how often does it occur for concepts within same group\n\nclasses:\nmany_to_one: if it's n>1 to 1\none_to_many: 1 to n>1\nmany_to_many: n>1 to n>1\nmulti_homo: n>1 to n>1 and homo\none_to_one: 1 to 1 (not homo)\nhomogeneous: 1 to 1 and same group\n\n"