# Data prep
the disease definitions consist of two parts, the first part contains the primary disease classification (i.e. what medical speciality does the disease fall into) and the second, the disease description. Only the second part is required for the comparison to the clinical annotations

0. import nomenclature data
1. put everything in lower case
2. to split the disease on term 'characterized by'
3. keep both parts, classif for different analysis and description for tokenisation


In [1]:
#import re
import pandas as pd

df=pd.read_csv('processed_data/orpha_nomenclature.csv')

In [2]:
#convert sentences to lowercase
df['Definition']=df['Definition'].str.lower()

In [3]:
df.columns

Index(['OrphaCode', 'Name', 'DisorderType', 'ClassificationLevel',
       'Definition'],
      dtype='object')

In [6]:
test='A rare severe, X-linked, neurodevelopmental disorder characterised by rapid developmental regression in infancy, partial or complete loss of purposeful hand movements, loss of speech, gait abnormalities, and stereotypic hand movements, commonly associated with deceleration of head growth, severe intellectual disability, seizures, and breathing abnormalities. The disorder has a progressive clinical course and \
        may associate various comorbidities including gastrointestinal diseases, scoliosis, and behavioral disorders.'

#split_results =re.split(r'characteri(?:zed|sed)+\sby',test)[1]

pattern = r'(.*?)\s*characteri(?:zed|sed)+\s*by\s*(.*)'

df[['ParsedClassification','ParsedDescription']]=df['Definition'].str.extract(pattern).apply(lambda x:x.str.strip())


    

In [7]:
#df.drop(labels=['ParsedClassification','ParsedDescription'], axis=1,inplace=True)
display(df['ParsedClassification'][0])

df['ParsedDescription'][0:10]
df.isna().sum()

'a rare primary bone dysplasia'

OrphaCode                  0
Name                       0
DisorderType               0
ClassificationLevel        0
Definition                 0
ParsedClassification    1261
ParsedDescription       1261
dtype: int64

In [8]:
def_to_check=df[['Definition','ClassificationLevel']].loc[df['ParsedClassification'].isna()]
def_to_check.to_csv('processed_data\def_to_check.csv')

Trying to split on several different patterns, but proving difficult, COME BACK TO
from trying three patterns gives back 600 results, which is 2/3 of the definitions at disorder level

In [9]:
def_to_check['ClassificationLevel'].value_counts()
# in general need to drop group of disorders and subtype of disorder
# from looking at the left over definitions we need to include the following terms in the extractions

pattern1 = r'(.*?)\s*manifest(?:s|ing)\s*(.*)'
pattern2 = r'(.*?)\s*present(?:s|ing)\s*(.*)'
pattern3= r'(.*?)\s*characterized\s*(.*)'

def_to_check[['ParsedClassification','ParsedDescription']]=def_to_check['Definition'].str.extract(pattern3).apply(lambda x:x.str.strip())
def_to_check.notna().sum()

Definition              1261
ClassificationLevel     1261
ParsedClassification     209
ParsedDescription        209
dtype: int64

In [None]:
[(.*?)\s*manifest(?:s|ing)\s*(.*)']
[(.*?)\s*present(?:s|ing)\s*(.*)']
 [(.*?)\s*characterized\s*(.*)']

# Entity recognition
so far not worked with pre-trained models - they require more training
thus looking at ranking with fuzzy wuzzy

In [11]:
import fuzzywuzzy
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
from pywsd.utils import lemmatize_sentence
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gemma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gemma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\gemma\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\gemma\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [12]:
display(df.shape)
df.head()

(8527, 7)

Unnamed: 0,OrphaCode,Name,DisorderType,ClassificationLevel,Definition,ParsedClassification,ParsedDescription
0,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",Disease,Disorder,a rare primary bone dysplasia characterized by...,a rare primary bone dysplasia,the association of multiple epiphyseal dysplas...
1,58,Alexander disease,Disease,Disorder,a rare neurodegenerative disorder of the astro...,,
2,166032,"Multiple epiphyseal dysplasia, with miniepiphyses",Disease,Disorder,"multiple epiphyseal dysplasia, with miniepiphy...","multiple epiphyseal dysplasia, with miniepiphy...",strikingly small secondary ossification center...
3,61,Alpha-mannosidosis,Disease,Disorder,an inherited lysosomal storage disorder charac...,an inherited lysosomal storage disorder,"immune deficiency, facial and skeletal abnorma..."
4,166029,"Multiple epiphyseal dysplasia, with severe pro...",Disease,Disorder,"multiple epiphyseal dysplasia, with severe pro...","multiple epiphyseal dysplasia, with severe pro...","severe, early-onset dysplasia of the proximal ..."


In [13]:
len(df['ParsedDescription'].loc[df['ParsedClassification'].notna()])
## extract codes and description for the disorder level entities
orpha_description_disorder_level= df['ParsedDescription'].loc[(df['ParsedClassification'].notna())& (df['ClassificationLevel']=='Disorder')]
orpha_codes_disorder_level= df['OrphaCode'].loc[(df['ParsedClassification'].notna())& (df['ClassificationLevel']=='Disorder')]

# check lengths of each column extracted:
display(len(orpha_codes_disorder_level))
len(orpha_description_disorder_level)

5884

5884

5954

5954

In [14]:
#Function:split the defintion strings into clinical components
#input: the column for the orpha codes and the column containing the disease definition
#output: dictionary containing the orpha codes (keys) with a list of the extracted terms for that orpha code

def extract_clinical_terms(df_codes,df_text):

    stop_words=list(STOP_WORDS)

    token_split=[]
    token_clean=[]
    token_list_clean=[]
    orpha_tokens={}

    lmtzr = WordNetLemmatizer()

    #look at adding (onset, reported, neonatal != neonatal hypotonia, juvenile, adult, manifest ing ation \s) to the pattern

    for orpha, row in zip(df_codes,df_text):
        pattern=', |\s+and+\s|includ(?:es|e|ing)|associat(?:es|ed|e|ing)|\.|show(?:s|ing)|marked|resulting|\s+or+\s|present(?:\s|s|ing)|with onset|such as|linked to|combined with|with worsening|with\s'
        row= re.sub(r'\([^)]*\)', '',row) # remove text in parenthesis as not needed
        token_list=re.split(pattern, row)
        # strip
        token_list=[token.strip() for token in token_list]

        #remove stop words and lemmatize terms to improve search and mathcing later
        lemmatized = [[lmtzr.lemmatize(word) for word in word_tokenize(w) if word not in stop_words]
                  for w in token_list]
        token_clean=[' '.join(lem) for lem in lemmatized]
        #removing empty string elements from list
        token_clean = [ele for ele in token_clean if ele.strip()]
        token_list_clean.append(token_clean)

        #dictionary of orpha code with corresponding list of clinical phenotypes
        orpha_tokens[orpha]=token_clean
    return orpha_tokens
    
orpha_terms=extract_clinical_terms(orpha_codes_disorder_level,orpha_description_disorder_level)    

In [27]:
#Function:split the defintion strings into clinical components
#input: the column for the orpha codes and the column containing the disease definition
#output: dictionary containing the orpha codes (keys) with a list of the extracted terms for that orpha code

# without lemmatizer to see if affects results
def extract_clinical_terms(df_codes,df_text,file_name):
    import pickle

    
    stop_words=list(STOP_WORDS)

    token_split=[]
    token_clean=[]
    token_list_clean=[]
    orpha_tokens={}

    lmtzr = WordNetLemmatizer()

    #look at adding (onset, reported, neonatal != neonatal hypotonia, juvenile, adult, manifest ing ation \s) to the pattern

    for orpha, row in zip(df_codes,df_text):
        pattern=', |\s+and+\s|includ(?:es|e|ing)|associat(?:es|ed|e|ing)|\.|show(?:s|ing)|marked|resulting|\s+or+\s|present(?:\s|s|ing)|with onset|such as|linked to|combined with|with worsening|with\s'
        row= re.sub(r'\([^)]*\)', '',row) # remove text in parenthesis as not needed
        token_list=re.split(pattern, row)
        # strip
        token_list=[token.strip() for token in token_list]

        #remove stop words and lemmatize terms to improve search and mathcing later
        remove_stop = [[word for word in word_tokenize(w) if word not in stop_words]
                  for w in token_list]
        token_clean=[' '.join(lem) for lem in remove_stop]
        #removing empty string elements from list
        token_clean = [ele for ele in token_clean if ele.strip()]
        token_list_clean.append(token_clean)

        #dictionary of orpha code with corresponding list of clinical phenotypes
        orpha_tokens[orpha]=token_clean
        with open(f'processed_data/{file_name}.pickle', 'wb') as f:
            pickle.dump(orpha_tokens,f)
    return orpha_tokens
    
orpha_terms=extract_clinical_terms(df['OrphaCode'].loc[df['ParsedClassification'].notna()],df['ParsedDescription'].loc[df['ParsedClassification'].notna()],'orpha_ner')    

In [15]:


def replace_html_entities(match):
    import html
    import html.entities
    entity = match.group(0)
    if entity in html.entities.html5:
        return chr(html.entities.html5[entity])
    else:
        return entity

# Decode the HTML-encoded text in the dictionary
decoded_orpha_terms = {}
for key, value in orpha_terms.items():
    if isinstance(key, (str, bytes)):
        decoded_key = html.unescape(re.sub(r'&[a-zA-Z0-9#]+;', replace_html_entities, key))
    else:
        decoded_key = key

    if isinstance(value, str):
        decoded_value = html.unescape(re.sub(r'&[a-zA-Z0-9#]+;', replace_html_entities, value))
    else:
        decoded_value = value

    decoded_orpha_terms[decoded_key] = decoded_value

# Now the 'decoded_terms' dictionary contains the decoded data
#print(decoded_terms)



display(len(decoded_orpha_terms))
display(len(orpha_terms))
list(decoded_orpha_terms.items())[0:10]


5884

5884

[(166024,
  ['association multiple epiphyseal dysplasia',
   'macrocephaly',
   'dysmorphic facial feature',
   'patient normal stature',
   'joint swelling',
   'genu valgum',
   'additional reported manifestation',
   'clinodactyly',
   'spindle-shaped finger',
   'pectus excavatum']),
 (166032,
  ['strikingly small secondary ossification center',
   'joint',
   'severe bone dysplasia proximal femoral head',
   'short stature',
   'increased lumbar lordosis',
   'genu vara',
   'generalized joint laxity reported']),
 (61,
  ['immune deficiency',
   'facial',
   'skeletal abnormality',
   'hearing impairment',
   'intellectual deficit']),
 (166029,
  ['severe',
   'early-onset dysplasia proximal femur',
   'complete absence secondary ossification center',
   'abnormal development femoral neck',
   'gait abnormality',
   'mild short stature',
   'arthralgia',
   'joint stiffness',
   'limited mobility hip',
   'irregular acetabulum',
   'hip',
   'knee pain',
   'coxa vara',
   'mild s

# HPO comparison

In [16]:
df_hpo=pd.read_csv('processed_data\HPO.csv')

In [17]:
df_hpo.columns

Index(['Unnamed: 0', 'Term', 'ID', 'Definition', 'Synonyms', 'HPO_ID'], dtype='object')

In [18]:
# put all in lower case and remove commas and dash
# convert terms to list, so that fuzzy can iterate over it
#df_hpo.drop(index=df.index[0], axis=0, inplace=True)
df_hpo['Term']=df_hpo['Term'].str.lower().replace(',','').replace('-','')
hpo_list=df_hpo['Term'].to_list()
hpo_list.sort()
type(orpha_terms)

dict

In [321]:
# FOR TESTING PRUPOSES ONLY
# create a small test dictionary
# Python code to convert into dictionary
def Convert(tup, di):
    di = dict(tup)
    return di
     
# Driver Code
tups = list(decoded_orpha_terms.items())[0:100]
dictionary = {}
dictionary=Convert(tups, dictionary)

In [21]:
#Function:Matching of extracted Orphanet terms to the HPO terms 
#Input 1: dictionary of extracted orphanet terms, key= orphacode, values= list of extracted terms
#Input 2: list of HPO terms for matching
#output 1: dictionary of the HPO matched terms, key = orphacode, value = list of HPO terms that have been matched
#Output 2: print of number of input terms and number of matches, where the ratio is above 90

def match_HPO_to_extracted_terms(dictionary, hpo_list):
    from fuzzywuzzy import fuzz
    from fuzzywuzzy import process
    import logging
    import pickle
    
    logging.basicConfig(filename='fuzzy_hpo_match.log', encoding='utf-8', level=logging.INFO,\
                       format='%(asctime)s:%(name)s:%(message)s')

    word_count=0
    match_count=0
    orpha_count=0
    returned_terms={}



    for orpha, disease in dictionary.items():
        HPO_list=[]
        orpha_count+=1
        for word in disease:
            word_count+=1
            hpo_match=process.extract(word, hpo_list, scorer=fuzz.ratio, limit=1)
            #
            if hpo_match[0][1]>90:
                match_count+=1
                logging.info('{} : {} : {}, orpha count= {}, word count = {}, match count= {}'\
                             .format(orpha,word,hpo_match, orpha_count, word_count, match_count))
                HPO_list.append(hpo_match[0][0])
        if len(HPO_list)==0:
                HPO_list.append(None)
        returned_terms[orpha]=HPO_list

    print(f'{match_count} hpo matchess were found out of {word_count} total input terms')
    

    with open ('extracted_HPO_terms_ratio.pickle', 'wb') as f:
        pickle.dump(returned_terms,f)

    return returned_terms
result=match_HPO_to_extracted_terms(decoded_orpha_terms,hpo_list)

12015 hpo matchess were found out of 53618 total input terms


In [22]:
#Function:Matching of extracted Orphanet terms to the HPO terms 
#Input 1: dictionary of extracted orphanet terms, key= orphacode, values= list of extracted terms
#Input 2: list of HPO terms for matching
#output 1: dictionary of the HPO matched terms, key = orphacode, value = list of HPO terms that have been matched
#Output 2: print of number of input terms and number of matches, where the ratio is above 90

def match_HPO_to_extracted_terms_all(dictionary, hpo_list):
    from fuzzywuzzy import fuzz
    from fuzzywuzzy import process
    import logging
    import pickle
    
    logging.basicConfig(filename='fuzzy_hpo_match_all.log', encoding='utf-8', level=logging.INFO,\
                       format='%(asctime)s:%(name)s:%(message)s')

    word_count=0
    match_count=0
    orpha_count=0
    returned_terms={}



    for orpha, disease in dictionary.items():
        HPO_list=[]
        orpha_count+=1
        for word in disease:
            word_count+=1
            hpo_match=process.extract(word, hpo_list, limit=1)
            #
            if hpo_match[0][1]>90:
                match_count+=1
                logging.info('{} : {} : {}, orpha count= {}, word count = {}, match count= {}'\
                             .format(orpha,word,hpo_match, orpha_count, word_count, match_count))
                HPO_list.append(hpo_match[0][0])
        if len(HPO_list)==0:
                HPO_list.append(None)
        returned_terms[orpha]=HPO_list

    print(f'{match_count} hpo matchess were found out of {word_count} total input terms')
    

    with open ('extracted_HPO_terms_all.pickle', 'wb') as f:
        pickle.dump(returned_terms,f)

    return returned_terms
result_all=match_HPO_to_extracted_terms_all(decoded_orpha_terms,hpo_list)

14701 hpo matchess were found out of 53618 total input terms


NameError: name 'result' is not defined

In [229]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

word_count=0
match_count=0

for disease in terms_to_compare[1:10]:

    for word in disease:
        word_count+=1
        hpo_match=process.extract(word, hpo_list, scorer = fuzz.ratio, limit=1)
        if hpo_match[0][1]>90:
            match_count+=1
            print(word, ': ',process.extract(word, hpo_list, scorer = fuzz.ratio, limit=1))
print(f'{match_count} hpo matchess were found out of {word_count} total input terms')

# consider getting rid of modifiers from the HPO terminology

#for now i might just go with 100 or 95 cut off, and then work on improving the matching for the model
        
    # immune deficiency not great, hip 55, id 79, hyptonia 100 not bad

short stature :  [('short stature', 100)]
immune deficiency :  [('immunodeficiency', 91)]
hearing impairment :  [('hearing impairment', 100)]
severe :  [('severe', 100)]
mild short stature :  [('mild short stature', 100)]
arthralgia :  [('arthralgia', 100)]
joint stiffness :  [('joint stiffness', 100)]
knee pain :  [('knee pain', 100)]
coxa vara :  [('coxa vara', 100)]
disproportionate short stature :  [('disproportionate short stature', 100)]
brachydactyly :  [('brachydactyly', 100)]
retinal degeneration :  [('retinal degeneration', 100)]
hypotonia :  [('hypotonia', 100)]
coarse facial features :  [('coarse facial features', 100)]
ichthyosis :  [('ichthyosis', 100)]
hepatomegaly :  [('hepatomegaly', 100)]
progressive neurologic deterioration :  [('progressive neurologic deterioration', 100)]
hydrocephalus :  [('hydrocephalus', 100)]
18 hpo matchess were found out of 63 total input terms


In [232]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

word_count=0
match_count=0

for disease in terms_to_compare[1:10]:

    for word in disease:
        word_count+=1
        hpo_match=process.extract(word, hpo_list, scorer = fuzz.token_set_ratio, limit=1)
        if hpo_match[0][1]>90:
            match_count+=1
            print(word, ': ',process.extract(word, hpo_list, scorer = fuzz.token_set_ratio, limit=1))
print(f'{match_count} hpo matchess were found out of {word_count} total input terms')

# consider getting rid of modifiers from the HPO terminology

#for now i might just go with 100 or 95 cut off, and then work on improving the matching for the model
        
    # immune deficiency not great, hip 55, id 79, hyptonia 100 not bad

joints :  [('abnormality of the radioulnar joints', 100)]
severe bone dysplasia proximal femoral heads :  [('proximal', 100)]
short stature :  [('asymmetric short stature', 100)]
generalized joint laxity reported :  [('generalized', 100)]
facial :  [('abnormal facial artery morphology', 100)]
hearing impairment :  [('adult onset sensorineural hearing impairment', 100)]
severe :  [('intellectual disability, severe', 100)]
early-onset dysplasia proximal femurs :  [('onset', 100)]
mild short stature :  [('mild', 100)]
arthralgia :  [('arthralgia', 100)]
joint stiffness :  [('joint stiffness', 100)]
hip :  [('abnormal hip bone morphology', 100)]
knee pain :  [('knee pain', 100)]
coxa vara :  [('coxa vara', 100)]
mild spinal changes :  [('mild', 100)]
disproportionate short stature :  [('disproportionate short stature', 100)]
short limbs :  [('short lower limbs', 100)]
digits :  [('autoamputation of digits', 100)]
progressive thoracolumbar scoliosis :  [('progressive', 100)]
progression :  

KeyboardInterrupt: 

In [233]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

word_count=0
match_count=0

for disease in terms_to_compare[1:10]:

    for word in disease:
        word_count+=1
        hpo_match=process.extract(word, hpo_list, limit=1)
        if hpo_match[0][1]>90:
            match_count+=1
            print(word, ': ',process.extract(word, hpo_list, limit=1))
print(f'{match_count} hpo matchess were found out of {word_count} total input terms')

# consider getting rid of modifiers from the HPO terminology

#for now i might just go with 100 or 95 cut off, and then work on improving the matching for the model
        
    # immune deficiency not great, hip 55, id 79, hyptonia 100 not bad

short stature :  [('short stature', 100)]
generalized joint laxity reported :  [('generalized joint laxity', 95)]
immune deficiency :  [('immunodeficiency', 91)]
hearing impairment :  [('hearing impairment', 100)]
severe :  [('severe', 100)]
mild short stature :  [('mild short stature', 100)]
arthralgia :  [('arthralgia', 100)]
joint stiffness :  [('joint stiffness', 100)]
knee pain :  [('knee pain', 100)]
coxa vara :  [('coxa vara', 100)]
disproportionate short stature :  [('disproportionate short stature', 100)]
broad bones :  [('broad long bones', 95)]
mild severe short stature :  [('mild short stature', 95)]
brachydactyly :  [('brachydactyly', 100)]
retinal degeneration :  [('retinal degeneration', 100)]
variable intellectual disability :  [('intellectual disability', 95)]
hypotonia :  [('hypotonia', 100)]
coarse facial features :  [('coarse facial features', 100)]
ichthyosis :  [('ichthyosis', 100)]
hepatomegaly :  [('hepatomegaly', 100)]
developmental delay :  [('global developme

In [211]:
for disease in terms_to_compare[1:10]:
    for word in disease:
        print(word, ': ',process.extract(word, hpo_list, scorer = fuzz.partial_ratio, limit=1))
        
        #not this one, look at hip 100

strikingly small secondary ossification centers :  [('large sternal ossification centers', 79)]
joints :  [('abnormality of the radioulnar joints', 100)]
severe bone dysplasia proximal femoral heads :  [('proximal', 100)]
short stature :  [('asymmetric short stature', 100)]
increased lumbar lordosis :  [('increased dlco', 79)]
genua vara :  [('genu varum', 80)]
generalized joint laxity reported :  [('generalized', 100)]
immune deficiency :  [('cellular immunodeficiency', 91)]
facial :  [('abnormal facial artery morphology', 100)]
skeletal abnormalities :  [('multiple skeletal anomalies', 90)]
hearing impairment :  [('adult onset sensorineural hearing impairment', 100)]
intellectual deficit :  [('intellectual disability', 80)]
severe :  [('intellectual disability, severe', 100)]
early-onset dysplasia proximal femurs :  [('onset', 100)]
complete absence secondary ossification centers :  [('fused sternal ossification centers', 79)]
abnormal development femoral necks :  [('abnormal ekg', 8

In [210]:
for disease in terms_to_compare[1:10]:
    for word in disease:
        print(word, ': ',process.extract(word, hpo_list, scorer = fuzz.ratio, limit=2))
        
        # better hip give 55 and gives immunodeficiency for immune deficiency, intellectual disability fairly good, 79

strikingly small secondary ossification centers :  [('large sternal ossification centers', 69), ('fused sternal ossification centers', 67)]
joints :  [('colitis', 62), ('joint crepitus', 60)]
severe bone dysplasia proximal femoral heads :  [('dysplasia of the femoral head', 66), ('aplasia of the femoral head', 59)]
short stature :  [('short stature', 100), ('mild short stature', 84)]
increased lumbar lordosis :  [('increased male libido', 70), ('increased urinary orosomucoid', 70)]
genua vara :  [('genu varum', 80), ('genu valgum', 67)]
generalized joint laxity reported :  [('generalized joint laxity', 84), ('generalized tonic seizure', 66)]
immune deficiency :  [('immunodeficiency', 91), ('complement deficiency', 74)]
facial :  [('acral', 73), ('axial', 73)]
skeletal abnormalities :  [('multiple skeletal anomalies', 73), ('rib segmentation abnormalities', 73)]
hearing impairment :  [('hearing impairment', 100), ('mild hearing impairment', 88)]
intellectual deficit :  [('intellectual d

In [192]:
for disease in terms_to_compare[1:10]:
    for word in disease:
        print(word, ': ',process.extract(word, hpo_list, scorer = fuzz.token_set_ratio, limit=2))
        
        #immune deficiency way off, hip way off and hypotonia

strikingly small secondary ossification center :  [('Ectopic ossification', 75), ('Fused sternal ossification centers', 72)]
joint :  [('Abnormal hip joint morphology', 100), ('Abnormal joint morphology', 100)]
severe bone dysplasia proximal femoral head :  [('Proximal', 100), ('Severe', 100)]
short stature :  [('Asymmetric short stature', 100), ('Childhood onset short-limb short stature', 100)]
increased lumbar lordosis :  [('Increased KCO', 82), ('Increased DLCO', 78)]
genu vara :  [('Genu varum', 84), ('Genu valgum', 70)]
generalized joint laxity reported :  [('Generalized', 100), ('Generalized joint laxity', 100)]
immune deficiency :  [('Adrenocorticotropic hormone deficiency', 74), ('Aldehyde oxidase deficiency', 74)]
facial :  [('Abnormal facial artery morphology', 100), ('Abnormal facial expression', 100)]
skeletal abnormality :  [('Abnormality of skeletal maturation', 100), ('Abnormality of skeletal muscle fiber size', 100)]
hearing impairment :  [('Adult onset sensorineural he

In [193]:
for disease in terms_to_compare[1:10]:
    for word in disease:
        print(word, ': ',process.extract(word, hpo_list, limit=2))
        
        # immune deficiency 91, hip 90, not this one, other stuff a bit off too

strikingly small secondary ossification center :  [('Abnormal bone ossification', 86), ('Abnormal humeral ossification', 86)]
joint :  [('Abnormal hip joint morphology', 90), ('Abnormal joint morphology', 90)]
severe bone dysplasia proximal femoral head :  [('Proximal', 90), ('Severe', 90)]
short stature :  [('Short stature', 100), ('Mild short stature', 95)]
increased lumbar lordosis :  [('Absent spinous processes of lower thoracic and lumbar vertebrae', 86), ('Decreased anterioposterior diameter of lumbar vertebral bodies', 86)]
genu vara :  [('Genu recurvatum', 86), ('Genu varum', 84)]
generalized joint laxity reported :  [('Generalized joint laxity', 95), ('Generalized', 90)]
immune deficiency :  [('Immunodeficiency', 91), ('Abnormal cellular immune system morphology', 86)]
facial :  [('Abnormal facial artery morphology', 90), ('Abnormal facial expression', 90)]
skeletal abnormality :  [('Abnormal fetal skeletal morphology', 86), ('Abnormal skeletal muscle morphology', 86)]
hearing

In [151]:
#display(token_list)
lemmatized
token_list_clean[7899:7935]
len(orpha_tokens)
list(orpha_terms.items())[400:440]

[(168555,
  ['disproportionate short stature',
   'severe femoral neck deformity',
   'metaphyseal abnormality',
   'platyspondyly consisting ovoid vertebral body anterior tongue-like deformity']),
 (1162,
  ['disproportionate short stature',
   'severe femoral neck deformity',
   'metaphyseal abnormality',
   'platyspondyly consisting ovoid vertebral body anterior tongue-like deformity']),
 (168558,
  ['severe',
   'early-onset',
   'salt-wasting adrenal insufficiency',
   'ambiguous/female external genitalia mutation < > cyp11a1 < /i > gene',
   'milder case',
   'delayed onset adrenal gland dysfunction',
   'genitalia phenotype range normal male female individual',
   '46 , xy karyotype',
   'imaging study reveal hypoplastic/absent adrenal gland',
   'biochemical finding',
   'low serum cortisol',
   'mineralocorticoid',
   'androgen',
   'sodium',
   'elevated potassium level']),
 (168563,
  ['partial',
   'complete gonadal dysgenesis',
   'usually manifesting',
   'primary amenorr

In [99]:
orpha_tokens

{3186: ['holoprosencephaly',
  'predominantly radial limb deficiency',
  'heart defects',
  'kidney malformations',
  'absence of gallbladder',
  'variable manifestations',
  'vertebral anomalies',
  'cleft lip/palate',
  'microphthalmia',
  'absent nose',
  'dysplastic ears',
  'hearing loss',
  'colobomas of the iris',
  'retina and/or bifid uvula',
  ''],
 3191: ['the association of short stature',
  'progressive discrete subaortic stenosis',
  'additional variable manifestations',
  'upturned nose',
  'voice',
  'vocal cord abnormalities',
  'obstructive lung disease',
  'inguinal hernia',
  'kyphoscoliosis and',
  'occasionally',
  'epicanthus',
  'strabismus',
  'microphthalmos',
  'widely spaced teeth',
  'there have been no further descriptions in the literature since 1984',
  ''],
 3193: ['the narrowing of the aorta lumen',
  '',
  'not',
  'stenosis of other arteries',
  'this narrowing of the aorta',
  'pulmonary branches may impede blood flow',
  '',
  'in heart murmur',
  

In [None]:
##installation of modules
##!pip install -U pip setuptools wheel
##!pip install spacy
#! pip install transformers
#import spacy
#from spacy.lang.en.stop_words import STOP_WORDS
# start with small pretrained model
#nlp = spacy.load('en_core_web_sm')
#import medspacy

# start with small pretrained model
#nlp = spacy.load('en_core_web_sm')

In [8]:
test=[.']
nlp = medspacy.load()'A rare severe, X-linked, neurodevelopmental disorder characterised by rapid developmental regression in infancy, partial or complete loss of purposeful hand movements, loss of speech, gait abnormalities, and stereotypic hand movements, commonly associated with deceleration of head growth, severe intellectual disability, seizures, and breathing abnormalities. The disorder has a progressive clinical course and \
        may associate various comorbidities including gastrointestinal diseases, scoliosis, and behavioral disorders
print(nlp.pipe_names)
# nlp.get_pipe(test)
doc = nlp(test)
doc

NameError: name 'medspacy' is not defined

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

In [12]:
from transformers import pipeline
# Load the clinicalBERT tokenizer and model for token classification
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModelForTokenClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=3)
nerpipeline = pipeline('ner', model=model, tokenizer=tokenizer)
nerpipeline(input_text)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint 

[{'entity': 'LABEL_2',
  'score': 0.37690538,
  'index': 1,
  'word': 'a',
  'start': 0,
  'end': 1},
 {'entity': 'LABEL_0',
  'score': 0.41044927,
  'index': 2,
  'word': 'rare',
  'start': 2,
  'end': 6},
 {'entity': 'LABEL_0',
  'score': 0.4596424,
  'index': 3,
  'word': 'severe',
  'start': 7,
  'end': 13},
 {'entity': 'LABEL_0',
  'score': 0.4858439,
  'index': 4,
  'word': ',',
  'start': 13,
  'end': 14},
 {'entity': 'LABEL_0',
  'score': 0.47528505,
  'index': 5,
  'word': 'x',
  'start': 15,
  'end': 16},
 {'entity': 'LABEL_0',
  'score': 0.3844964,
  'index': 6,
  'word': '-',
  'start': 16,
  'end': 17},
 {'entity': 'LABEL_0',
  'score': 0.46289796,
  'index': 7,
  'word': 'linked',
  'start': 17,
  'end': 23},
 {'entity': 'LABEL_0',
  'score': 0.4469681,
  'index': 8,
  'word': ',',
  'start': 23,
  'end': 24},
 {'entity': 'LABEL_0',
  'score': 0.47196826,
  'index': 9,
  'word': 'ne',
  'start': 25,
  'end': 27},
 {'entity': 'LABEL_0',
  'score': 0.43037152,
  'index': 10

In [10]:
# Define the input text to be processed
input_text = 'A rare severe, X-linked, neurodevelopmental disorder characterised by rapid developmental regression in infancy, partial or complete loss of purposeful hand movements, loss of speech, gait abnormalities, and stereotypic hand movements, commonly associated with deceleration of head growth, severe intellectual disability, seizures, and breathing abnormalities. The disorder has a progressive clinical course and may associate various comorbidities including gastrointestinal diseases, scoliosis, and behavioral disorders'

# Tokenize the input text
tokens = tokenizer.encode_plus(input_text, padding=True, truncation=True, return_tensors="pt")

# Classify the tokens using clinicalBERT
with torch.no_grad():
    outputs = model(tokens['input_ids'], tokens['attention_mask'])

# Get the predicted entity labels from the output
entity_labels = torch.argmax(outputs.logits, dim=2)[0]

# Map the entity labels to the corresponding entities
entities = []
for i, token in enumerate(tokenizer.tokenize(input_text)):
    if entity_labels[i] == 1:
        entities.append(token)

# Print the entities found in the input text
print("Entities found in input text:", entities)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint 

Entities found in input text: ['a', 'rare', ',', 'x', '-', 'linked', ',', 'ne', '##uro', '##vel', '##op', 'disorder', 'by', 'rapid', 'in', 'partial', 'complete', 'loss', 'of', 'purpose', '##ful', 'hand', 'movements', 'loss', 'of', 'g', 'abnormal', 'and', 'stereo', '##pic', 'hand', 'movements', 'commonly', 'with', 'de', '##eration', 'of', 'head', 'growth', 'severe', 'intellectual', 'disability', 'seizure', '##s', ',', 'and', 'breathing', 'abnormal', 'the', 'disorder', 'a', 'progressive', 'clinical', 'course', 'and', 'may', 'associate', 'various', 'com', '##or', '##bid', '##ities', 'including', 'gas', '##tro', '##int', '##est', '##inal', 'diseases', ',', 's', '##co', '##lio', ',', 'and', 'behavioral', 'disorders']


In [9]:
# the below doesnt work
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModelForTokenClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

text = ["The patient was diagnosed with acute myeloid leukemia and started on chemotherapy."]

# Tokenize the text and convert to input format expected by the model
inputs = tokenizer(test, return_tensors="pt")

# Run the model on the inputs
outputs = model(**inputs)

# Get the predicted labels for each token
predicted_labels = outputs.logits.argmax(-1).squeeze()

# Get the list of entity labels used by the model
labels = model.config.id2label.values()

# Map the predicted labels to the entity labels
entities = []
for i, label in enumerate(predicted_labels):
    if label != 0:
        entities.append((tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][i]), labels[label]))

# Print the list of entities found in the text
print(entities)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint 

TypeError: iteration over a 0-d tensor

In [None]:
doc=nlp(doct)
for entity in doc.ents:
    print(entity.label, '|', entity.text)

In [None]:
stop_words=list(STOP_WORDS)

processed_tokens=[token.lemma for token in doc if  token.text not in stop_words]

for token in doc:
    print(token ,'|',token.lemma_)

In [None]:
process

In [None]:
entities
processed_tokens
test

# HPO mapping

In [None]:
import requests
import json
from bs4 import BeautifulSoup as bs

In [None]:
#url=https://hpo.jax.org/api/

# here need to list all the terms and loop through them in order to retrieve the HPO terms:
#example term for testing
clinical_term = 'Loss of developmental milestones'


# Construct the API URL
url = f'https://hpo.jax.org/api/hpo/search?q={clinical_term}&category=terms'

# Send the API request
response = requests.get(url)

# checking connection
##display(response)

# Parse the response JSON
response_data = json.loads(response.text)

In [None]:
response_data['terms'][0]
# Extract the HPO term ID and name from the response

In [None]:
hpo_id = response_data['terms'][0]['name']
hpo_name = response_data['terms'][0]['id']

# Print the results
print(f"HPO ID: {hpo_id}")
print(f"HPO Name: {hpo_name}")