# Data prep
the disease definitions consist of two parts, the first part contains the primary disease classification (i.e. what medical speciality does the disease fall into) and the second, the disease description. Only the second part is required for the comparison to the clinical annotations

0. import nomenclature data
1. put everything in lower case
2. to split the disease on term 'characterized by'
3. keep both parts, classif for different analysis and description for tokenisation


In [40]:
#import re
import pandas as pd

In [94]:
#import files
df_nom=pd.read_csv('processed_data/orpha_nomenclature.csv')
df_dis=pd.read_csv('processed_data/orpha_linearized.csv')
df_nom.columns

Index(['OrphaCode', 'Name', 'DisorderType', 'ClassificationLevel',
       'Definition'],
      dtype='object')

In [95]:
df_dis.set_index(['OrphaCode'], inplace=True)
df_nom['Classification']=df_nom['OrphaCode'].map(df_dis['ClassNames'])

In [96]:
df_nom.to_csv('processed_data/nomenclature_class.csv',index=False)

In [97]:
df_endo=df_nom.loc[df_nom['Classification']=='Rare endocrine disease']
df_endo.shape
df_endo.to_csv('processed_data/endocrine_diseases.csv')

In [98]:
df_bone=df_nom.loc[df_nom['Classification']=='Rare bone disease']
df_bone.to_csv('processed_data/bone_diseases.csv')

In [99]:
fname_endo=('processed_data/endocrine_diseases.csv')
fname_bone='processed_data/bone_diseases.csv'
fname_all='processed_data/nomenclature_class.csv'

In [100]:
def definition_split(fname):
    #import file
    df=pd.read_csv(fname,index_col=False)
    display(df.columns)

    #convert sentences to lowercase
    df['Definition']=df['Definition'].str.lower()
    
    # split sentence into first part with classification and second part with clinical description
    ## looking for term 'characterized by'
    pattern = r'(.*?)\s*characteri(?:zed|sed)+\s*by\s*(.*)'
    df[['ParsedClassification','ParsedDescription']]=df['Definition'].str.extract(pattern).apply(lambda x:x.str.strip())
    
    pattern2= r'\bA rare\b.*?[.!?]'
    #df[['ParsedClassification','parsed 2']]=df['Definition'].str.extract(pattern).apply(lambda x:x.str.strip())
    
    display(df.isna().sum())
    return df
df=definition_split(fname_all)
df.head()

Index(['OrphaCode', 'Name', 'DisorderType', 'ClassificationLevel',
       'Definition', 'Classification'],
      dtype='object')

OrphaCode                  0
Name                       0
DisorderType               0
ClassificationLevel        0
Definition                 0
Classification          1356
ParsedClassification    1261
ParsedDescription       1261
dtype: int64

Unnamed: 0,OrphaCode,Name,DisorderType,ClassificationLevel,Definition,Classification,ParsedClassification,ParsedDescription
0,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",Disease,Disorder,a rare primary bone dysplasia characterized by...,Rare bone disease,a rare primary bone dysplasia,the association of multiple epiphyseal dysplas...
1,58,Alexander disease,Disease,Disorder,a rare neurodegenerative disorder of the astro...,Rare neurologic disease,,
2,166032,"Multiple epiphyseal dysplasia, with miniepiphyses",Disease,Disorder,"multiple epiphyseal dysplasia, with miniepiphy...",Rare bone disease,"multiple epiphyseal dysplasia, with miniepiphy...",strikingly small secondary ossification center...
3,61,Alpha-mannosidosis,Disease,Disorder,an inherited lysosomal storage disorder charac...,Rare inborn errors of metabolism,an inherited lysosomal storage disorder,"immune deficiency, facial and skeletal abnorma..."
4,166029,"Multiple epiphyseal dysplasia, with severe pro...",Disease,Disorder,"multiple epiphyseal dysplasia, with severe pro...",Rare bone disease,"multiple epiphyseal dysplasia, with severe pro...","severe, early-onset dysplasia of the proximal ..."


In [101]:
df=df[['OrphaCode', 'Name', 'ParsedClassification','ParsedDescription','Classification','ClassificationLevel']]
display(df.isna().sum())
df.head()
#drop_na
df=df.loc[(df['ParsedClassification'].notna())& (df['Classification'].notna())]
display(df.isna().sum())
display(df.shape)
df.drop_duplicates(subset=['OrphaCode','Classification'])
display(df.shape)
# no duplicats
df.to_csv('processed_data/nomenclature_class.csv', index=False)

OrphaCode                  0
Name                       0
ParsedClassification    1261
ParsedDescription       1261
Classification          1356
ClassificationLevel        0
dtype: int64

OrphaCode               0
Name                    0
ParsedClassification    0
ParsedDescription       0
Classification          0
ClassificationLevel     0
dtype: int64

(6099, 6)

(6099, 6)

In [83]:
df.columns

Index(['OrphaCode', 'Name', 'ParsedClassification', 'Classification'], dtype='object')

# Entity recognition
so far not worked with pre-trained models - they require more training
thus looking at ranking with fuzzy wuzzy

In [12]:
import fuzzywuzzy
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
from pywsd.utils import lemmatize_sentence
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gemma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gemma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\gemma\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\gemma\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Warming up PyWSD (takes ~10 secs)... took 5.33002781867981 secs.


In [102]:
def drop_na(df):
    ## extract codes and description for the disorder level entities
    orpha_description_disorder_level= df['ParsedDescription'].loc[(df['ParsedClassification'].notna())& (df['ClassificationLevel']=='Disorder')]
    orpha_codes_disorder_level= df['OrphaCode'].loc[(df['ParsedClassification'].notna())& (df['ClassificationLevel']=='Disorder')]

    # check lengths of each column extracted:
    display(len(orpha_codes_disorder_level))
    display(len(orpha_description_disorder_level))
    
    return  orpha_codes_disorder_level , orpha_description_disorder_level
df_orpha, df_def= drop_na(df)

5270

5270

In [106]:
#Function:split the defintion strings into clinical components
#input: the column for the orpha codes and the column containing the disease definition
#output: dictionary containing the orpha codes (keys) with a list of the extracted terms for that orpha code

def extract_clinical_terms(df_codes,df_text, file_name,lemmatize=True):

    stop_words=list(STOP_WORDS)

    token_split=[]
    token_clean=[]
    token_list_clean=[]
    orpha_tokens={}

    lmtzr = WordNetLemmatizer()

    #look at adding (onset, reported, neonatal != neonatal hypotonia, juvenile, adult, manifest ing ation \s) to the pattern

    if lemmatize == True:
        for orpha, row in zip(df_codes,df_text):
            pattern=', |\s+and+\s|includ(?:es|e|ing)|associat(?:es|ed|e|ing)|\.|show(?:s|ing)|marked|resulting|\s+or+\s|present(?:\s|s|ing)|with onset|such as|linked to|combined with|with worsening|with\s'
            row= re.sub(r'\([^)]*\)', '',row) # remove text in parenthesis as not needed
            token_list=re.split(pattern, row)
            # strip
            token_list=[token.strip() for token in token_list]

            #remove stop words and lemmatize terms to improve search and mathcing later
            lemmatized = [[lmtzr.lemmatize(word) for word in word_tokenize(w) if word not in stop_words]
                      for w in token_list]
            token_clean=[' '.join(lem) for lem in lemmatized]
            #removing empty string elements from list
            token_clean = [ele for ele in token_clean if ele.strip()]
            token_list_clean.append(token_clean)

            #dictionary of orpha code with corresponding list of clinical phenotypes
            orpha_tokens[orpha]=token_clean
    else:
        

    #look at adding (onset, reported, neonatal != neonatal hypotonia, juvenile, adult, manifest ing ation \s) to the pattern

        for orpha, row in zip(df_codes,df_text):
            pattern=', |\s+and+\s|includ(?:es|e|ing)|associat(?:es|ed|e|ing)|\.|show(?:s|ing)|marked|resulting|\s+or+\s|present(?:\s|s|ing)|with onset|such as|linked to|combined with|with worsening|with\s'
            row= re.sub(r'\([^)]*\)', '',row) # remove text in parenthesis as not needed
            token_list=re.split(pattern, row)
            # strip
            token_list=[token.strip() for token in token_list]

            #remove stop words and lemmatize terms to improve search and mathcing later
            remove_stop = [[word for word in word_tokenize(w) if word not in stop_words]
                      for w in token_list]
            token_clean=[' '.join(lem) for lem in remove_stop]
            #removing empty string elements from list
            token_clean = [ele for ele in token_clean if ele.strip()]
            token_list_clean.append(token_clean)

            #dictionary of orpha code with corresponding list of clinical phenotypes
            orpha_tokens[orpha]=token_clean
            with open(f'processed_data/{file_name}.pickle', 'wb') as f:
                pickle.dump(orpha_tokens,f)
    return orpha_tokens
    
orpha_terms=extract_clinical_terms(df_orpha, df_def, file_name='test',lemmatize=False)    

In [27]:
#Function:split the defintion strings into clinical components
#input: the column for the orpha codes and the column containing the disease definition
#output: dictionary containing the orpha codes (keys) with a list of the extracted terms for that orpha code

# without lemmatizer to see if affects results
def extract_clinical_terms(df_codes,df_text,file_name):
    import pickle

    
    stop_words=list(STOP_WORDS)

    token_split=[]
    token_clean=[]
    token_list_clean=[]
    orpha_tokens={}

    lmtzr = WordNetLemmatizer()

    #look at adding (onset, reported, neonatal != neonatal hypotonia, juvenile, adult, manifest ing ation \s) to the pattern

    for orpha, row in zip(df_codes,df_text):
        pattern=', |\s+and+\s|includ(?:es|e|ing)|associat(?:es|ed|e|ing)|\.|show(?:s|ing)|marked|resulting|\s+or+\s|present(?:\s|s|ing)|with onset|such as|linked to|combined with|with worsening|with\s'
        row= re.sub(r'\([^)]*\)', '',row) # remove text in parenthesis as not needed
        token_list=re.split(pattern, row)
        # strip
        token_list=[token.strip() for token in token_list]

        #remove stop words and lemmatize terms to improve search and mathcing later
        remove_stop = [[word for word in word_tokenize(w) if word not in stop_words]
                  for w in token_list]
        token_clean=[' '.join(lem) for lem in remove_stop]
        #removing empty string elements from list
        token_clean = [ele for ele in token_clean if ele.strip()]
        token_list_clean.append(token_clean)

        #dictionary of orpha code with corresponding list of clinical phenotypes
        orpha_tokens[orpha]=token_clean
        with open(f'processed_data/{file_name}.pickle', 'wb') as f:
            pickle.dump(orpha_tokens,f)
    return orpha_tokens
    
orpha_terms=extract_clinical_terms(df['OrphaCode'].loc[df['ParsedClassification'].notna()],df['ParsedDescription'].loc[df['ParsedClassification'].notna()],'orpha_ner')    

In [108]:
#Function : remove html encoded elements in the text e.g. html code fro alpha ect.

def replace_html_entities(match):
    import html
    import html.entities
    entity = match.group(0)
    if entity in html.entities.html5:
        return chr(html.entities.html5[entity])
    else:
        return entity

# Decode the HTML-encoded text in the dictionary
decoded_orpha_terms = {}
for key, value in orpha_terms.items():
    if isinstance(key, (str, bytes)):
        decoded_key = html.unescape(re.sub(r'&[a-zA-Z0-9#]+;', replace_html_entities, key))
    else:
        decoded_key = key

    if isinstance(value, str):
        decoded_value = html.unescape(re.sub(r'&[a-zA-Z0-9#]+;', replace_html_entities, value))
    else:
        decoded_value = value

    decoded_orpha_terms[decoded_key] = decoded_value

# Now the 'decoded_terms' dictionary contains the decoded data
#print(decoded_terms)



display(len(decoded_orpha_terms))
display(len(orpha_terms))
decoded_orpha_terms.items()


5270

5270

dict_items([(166024, ['association multiple epiphyseal dysplasia', 'macrocephaly', 'dysmorphic facial features', 'patients normal stature', 'joint swelling', 'genu valgum', 'additional reported manifestations', 'clinodactyly', 'spindle-shaped fingers', 'pectus excavatum']), (166032, ['strikingly small secondary ossification centers', 'joints', 'severe bone dysplasia proximal femoral heads', 'short stature', 'increased lumbar lordosis', 'genua vara', 'generalized joint laxity reported']), (61, ['immune deficiency', 'facial', 'skeletal abnormalities', 'hearing impairment', 'intellectual deficit']), (166029, ['severe', 'early-onset dysplasia proximal femurs', 'complete absence secondary ossification centers', 'abnormal development femoral necks', 'gait abnormality', 'mild short stature', 'arthralgia', 'joint stiffness', 'limited mobility hips', 'irregular acetabula', 'hip', 'knee pain', 'coxa vara', 'mild spinal changes']), (166038, ['disproportionate short stature', 'short limbs', 'digit

# HPO Mapping

In [107]:
#function prepare hpo df for mapping
#input: HPO csv file (not determined in argument)
# output : datafram with prepped terms
def prep_hpo_file():
    df_hpo=pd.read_csv('processed_data\HPO.csv')
    # put all in lower case and remove commas and dash
    # convert terms to list, so that fuzzy can iterate over it
    #df_hpo.drop(index=df.index[0], axis=0, inplace=True)
    df_hpo['Term']=df_hpo['Term'].str.lower().replace(',','').replace('-','')
    hpo_list=df_hpo['Term'].to_list()
    hpo_list.sort()
    
    return hpo_list
hpo_list=prep_hpo_file()

In [109]:
# FOR TESTING PRUPOSES ONLY
# create a small test dictionary
# Python code to convert into dictionary
def Convert(tup, di):
    di = dict(tup)
    return di
     
# Driver Code
tups = list(decoded_orpha_terms.items())[0:100]
dictionary = {}
dictionary=Convert(tups, dictionary)
list(dictionary.items())[0][1]

['association multiple epiphyseal dysplasia',
 'macrocephaly',
 'dysmorphic facial features',
 'patients normal stature',
 'joint swelling',
 'genu valgum',
 'additional reported manifestations',
 'clinodactyly',
 'spindle-shaped fingers',
 'pectus excavatum']

In [110]:
#Function:Matching of extracted Orphanet terms to the HPO terms 
#Input 1: dictionary of extracted orphanet terms, key= orphacode, values= list of extracted terms
#Input 2: list of HPO terms for matching
#output 1: dictionary of the HPO matched terms, key = orphacode, value = list of HPO terms that have been matched
#Output 2: print of number of input terms and number of matches, where the ratio is above 90

def match_HPO_to_extracted_terms(dictionary, hpo_list):
    from fuzzywuzzy import fuzz
    from fuzzywuzzy import process
    import logging
    import pickle
    
    logging.basicConfig(filename='fuzzy_hpo_match.log', encoding='utf-8', level=logging.INFO,\
                       format='%(asctime)s:%(name)s:%(message)s')

    word_count=0
    match_count=0
    orpha_count=0
    returned_terms={}



    for orpha, disease in dictionary.items():
        HPO_list=[]
        orpha_count+=1
        for word in disease:
            word_count+=1
            hpo_match=process.extract(word, hpo_list, scorer=fuzz.ratio, limit=1)
            #
            if hpo_match[0][1]>90:
                match_count+=1
                logging.info('{} : {} : {}, orpha count= {}, word count = {}, match count= {}'\
                             .format(orpha,word,hpo_match, orpha_count, word_count, match_count))
                HPO_list.append(hpo_match[0][0])
        if len(HPO_list)==0:
                HPO_list.append(None)
        returned_terms[orpha]=HPO_list

   # print(f'{match_count} hpo matchess were found out of {word_count} total input terms')
    

    with open ('extracted_HPO_terms_ratio.pickle', 'wb') as f:
        pickle.dump(returned_terms,f)

    return returned_terms
result=match_HPO_to_extracted_terms(decoded_orpha_terms,hpo_list)

KeyboardInterrupt: 

In [22]:
#Function:Matching of extracted Orphanet terms to the HPO terms 
#Input 1: dictionary of extracted orphanet terms, key= orphacode, values= list of extracted terms
#Input 2: list of HPO terms for matching
#output 1: dictionary of the HPO matched terms, key = orphacode, value = list of HPO terms that have been matched
#Output 2: print of number of input terms and number of matches, where the ratio is above 90

def match_HPO_to_extracted_terms_all(dictionary, hpo_list):
    from fuzzywuzzy import fuzz
    from fuzzywuzzy import process
    import logging
    import pickle
    
    logging.basicConfig(filename='fuzzy_hpo_match_all.log', encoding='utf-8', level=logging.INFO,\
                       format='%(asctime)s:%(name)s:%(message)s')

    word_count=0
    match_count=0
    orpha_count=0
    returned_terms={}



    for orpha, disease in dictionary.items():
        HPO_list=[]
        orpha_count+=1
        for word in disease:
            word_count+=1
            hpo_match=process.extract(word, hpo_list, limit=1)
            #
            if hpo_match[0][1]>90:
                match_count+=1
                logging.info('{} : {} : {}, orpha count= {}, word count = {}, match count= {}'\
                             .format(orpha,word,hpo_match, orpha_count, word_count, match_count))
                HPO_list.append(hpo_match[0][0])
        if len(HPO_list)==0:
                HPO_list.append(None)
        returned_terms[orpha]=HPO_list

    print(f'{match_count} hpo matchess were found out of {word_count} total input terms')
    

    with open ('extracted_HPO_terms_all.pickle', 'wb') as f:
        pickle.dump(returned_terms,f)

    return returned_terms
result_all=match_HPO_to_extracted_terms_all(decoded_orpha_terms,hpo_list)

14701 hpo matchess were found out of 53618 total input terms


In [29]:
list(dictionary.items())

[(166024,
  ['association multiple epiphyseal dysplasia',
   'macrocephaly',
   'dysmorphic facial feature',
   'patient normal stature',
   'joint swelling',
   'genu valgum',
   'additional reported manifestation',
   'clinodactyly',
   'spindle-shaped finger',
   'pectus excavatum']),
 (166032,
  ['strikingly small secondary ossification center',
   'joint',
   'severe bone dysplasia proximal femoral head',
   'short stature',
   'increased lumbar lordosis',
   'genu vara',
   'generalized joint laxity reported'])]

In [113]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import logging
import pickle
    
logging.basicConfig(filename='manual_curation.log', encoding='utf-8', level=logging.INFO,\
                       format='%(name)s:%(message)s')


def manual_curation(dictionary, start_key):
    
    curated_list=[]
    word_count=0
    orpha_count=0

    stop_flag=False
    #start_key=start_key

    for orpha, disease in sorted(dictionary.items()):
        if orpha<start_key:
            continue
        if stop_flag:
            break

        HPO_list=[]
        orpha_count+=1
        list_len=len(disease)
        print(f'orpha code : {orpha}, orpha count:')
        
        print(f'number of words in list = {list_len}')


        for word in disease:
            if stop_flag:
                break
            
            word_list=[]
            word_count+=1
           # print(f'word number {word_count}')
            hpo_match=process.extract(word, hpo_list, scorer=fuzz.ratio, limit=2)
            
            logging.info('{} : {} : {}, orpha count= {}, word count = {}'\
                             .format(orpha,word,hpo_match, orpha_count, word_count))

            if hpo_match[0][1]>90 and hpo_match[0][1]<96:

                print(f'{word} : {hpo_match}')
                choice= input()
               # print(choice)

                try:
                    if choice != '0' and choice !='m' and choice !='f':
                        word_list.append(orpha)
                        word_list.append(word)
                        word_list.append(hpo_match[int(choice)-1][0])
                        curated_list.append(word_list)
                    #    print(choice)


                    elif choice == 'm':
                        word_list.append(orpha)
                        word_list.append(word)
                        word_list.append('m')
                        curated_list.append(word_list)
                     #   print(choice)
                    elif choice =='f':
                        stop_flag=True
                      #  print(choice)

                        with open(f'processed_data\pickle\manual_curation_{orpha}_{word_count}.pickle', 'wb') as f:
                            pickle.dump(curated_list,f)

                        break
                except ValueError:
                    continue

        with open(f'processed_data\pickle\manual_curation_{orpha}_{word_count}.pickle', 'wb') as f:
            pickle.dump(curated_list,f)

    return curated_list

manual_curation(decoded_orpha_terms, 0)        

orpha code : 6, orpha count:
number of words in list = 1
orpha code : 7, orpha count:
number of words in list = 3
orpha code : 14, orpha count:
number of words in list = 7
orpha code : 15, orpha count:
number of words in list = 6
orpha code : 16, orpha count:
number of words in list = 7
orpha code : 17, orpha count:
number of words in list = 11
orpha code : 18, orpha count:
number of words in list = 7
orpha code : 20, orpha count:
number of words in list = 3
orpha code : 23, orpha count:
number of words in list = 13
orpha code : 24, orpha count:
number of words in list = 9
orpha code : 25, orpha count:
number of words in list = 3
orpha code : 26, orpha count:
number of words in list = 7
seizures : [('seizure', 93), ('seizure cluster', 70)]
1
orpha code : 27, orpha count:
number of words in list = 8
orpha code : 28, orpha count:
number of words in list = 9
orpha code : 30, orpha count:
number of words in list = 8
orpha code : 31, orpha count:
number of words in list = 11
seizures : [('s

KeyboardInterrupt: Interrupted by user

In [140]:
curated_list

#split and/or

[[166024,
  'association multiple epiphyseal dysplasia',
  'multiple epiphyseal dysplasia'],
 [166032, 'strikingly small secondary ossification center', 'm'],
 [166032,
  'severe bone dysplasia proximal femoral head',
  'dysplasia of the femoral head'],
 [166032, 'increased lumbar lordosis', 'm'],
 [166032, 'genu vara', 'genu varum'],
 [166032, 'generalized joint laxity reported', 'generalized joint laxity'],
 [61, 'skeletal abnormality', 'm'],
 [61, 'intellectual deficit', 'intellectual disability'],
 [166029, 'early-onset dysplasia proximal femur', 'm'],
 [166029, 'complete absence secondary ossification center', 'm'],
 [166029,
  'abnormal development femoral neck',
  'abnormal femoral neck morphology'],
 [166029, 'gait abnormality', 'm'],
 [166029, 'limited mobility hip', 'm'],
 [166029, 'irregular acetabulum', 'irregular acetabular roof'],
 [166038, 'short limb', 'm'],
 [166038, 'tracheobronchial malacia', 'tracheobronchomalacia']]

In [30]:
        word_list=[]
        mapped_words=process.extract(word, hpo_list, scorer = fuzz.ratio, limit=3)
        print(word, ': ',process.extract(word, hpo_list, scorer = fuzz.ratio, limit=3))
        choice= input()
        if choice != 0 and choice !='m':
            word_list.append(word)
            word_list.append(mapped_words[choice])
            curated_list.append()
        
        # better hip give 55 and gives immunodeficiency for immune deficiency, intellectual disability fairly good, 79

166024


TypeError: expected string or bytes-like object