### Importing necessary libraries

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import confusion_matrix, recall_score, precision_score
import re

### Reading the provided datasets

In [3]:
# Training sheet for the "glucose in blood" query; the column names sit on
# the third row of the sheet, hence header=2.
old_glucose_df = pd.read_excel(
    'loinc_dataset-v2.xlsx',
    sheet_name='glucose in blood',
    header=2,
)
old_glucose_df.head()

Unnamed: 0,loinc_num,long_common_name,component,system,property,relevance
0,1988-5,C reactive protein [Mass/volume] in Serum or P...,C reactive protein,Ser/Plas,MCnc,0
1,1959-6,Bicarbonate [Moles/volume] in Blood,Bicarbonate,Bld,SCnc,1
2,10331-7,Rh [Type] in Blood,Rh,Bld,Type,1
3,18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility],Trimethoprim+Sulfamethoxazole,Isolate,Susc,0
4,1975-2,Bilirubin.total [Mass/volume] in Serum or Plasma,Bilirubin,Ser/Plas,MCnc,0


In [4]:
# Training sheet for the "bilirubin in plasma" query; the column names sit on
# the third row of the sheet, hence header=2.
old_bilirubin_df = pd.read_excel(
    'loinc_dataset-v2.xlsx',
    sheet_name='bilirubin in plasma',
    header=2,
)
old_bilirubin_df.head()

Unnamed: 0,loinc_num,long_common_name,component,system,property,relevance
0,934-0,Blood product unit ID [#],Blood product unit ID,Dose,Num,0
1,1742-6,Alanine aminotransferase [Enzymatic activity/v...,Alanine aminotransferase,Ser/Plas,CCnc,0
2,20565-8,"Carbon dioxide, total [Moles/volume] in Blood",Carbon dioxide,Bld,SCnc,0
3,1959-6,Bicarbonate [Moles/volume] in Blood,Bicarbonate,Bld,SCnc,0
4,18906-8,Ciprofloxacin [Susceptibility],Ciprofloxacin,Isolate,Susc,0


In [5]:
# Training sheet for the "White blood cells count" query; the column names sit
# on the third row of the sheet, hence header=2.
old_white_cells_df = pd.read_excel(
    'loinc_dataset-v2.xlsx',
    sheet_name='White blood cells count',
    header=2,
)
old_white_cells_df.head()

Unnamed: 0,loinc_num,long_common_name,component,system,property,relevance
0,33870-7,Bilirubin.total [Presence] in Unspecified spec...,Bilirubin,XXX,PrThr,0
1,29265-6,Calcium [Moles/volume] corrected for albumin i...,Calcium^^corrected for albumin,Ser/Plas,SCnc,0
2,14423-8,Bilirubin.total [Mass/volume] in Synovial fluid,Bilirubin,Synv fld,MCnc,0
3,23658-8,Other Antibiotic [Susceptibility],Antibiotic XXX,Isolate,Susc,0
4,19000-9,Vancomycin [Susceptibility],Vancomycin,Isolate,Susc,0


### Reading dataset with all LOINC codes

In [6]:
# Full LOINC core table. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk.
# NOTE(review): the recorded output of this cell echoes the read_csv line,
# which looks like the trace of a pandas warning (presumably the mixed-dtype
# DtypeWarning) - confirm it is gone after re-running.
new_df = pd.read_csv('LoincTableCore.csv', low_memory=False)
new_df.head()

  new_df = pd.read_csv('LoincTableCore.csv')


Unnamed: 0,LOINC_NUM,COMPONENT,PROPERTY,TIME_ASPCT,SYSTEM,SCALE_TYP,METHOD_TYP,CLASS,CLASSTYPE,LONG_COMMON_NAME,SHORTNAME,EXTERNAL_COPYRIGHT_NOTICE,STATUS,VersionFirstReleased,VersionLastChanged
0,100000-9,Health informatics pioneer and the father of L...,Hx,Pt,^Patient,Nar,,H&P.HX,2,Health informatics pioneer and the father of L...,Health Info Pioneer+Father of LOINC,,ACTIVE,2.74,2.74
1,100001-7,Health informatics pioneer and cofounder of LOINC,Hx,Pt,^Patient,Nar,,H&P.HX,2,Health informatics pioneer and cofounder of LOINC,Health Info Pioneer+Cofound LOINC,,ACTIVE,2.74,2.74
2,100002-5,Specimen care is maintained,Find,Pt,^Patient,Ord,,SURVEY.PNDS,4,Specimen care is maintained,,,ACTIVE,2.72,2.72
3,100003-3,Team communication is maintained throughout care,Find,Pt,^Patient,Ord,,SURVEY.PNDS,4,Team communication is maintained throughout care,,,ACTIVE,2.72,2.72
4,100004-1,Demonstrates knowledge of the expected psychos...,Find,Pt,^Patient,Ord,,SURVEY.PNDS,4,Demonstrates knowledge of the expected psychos...,,,ACTIVE,2.72,2.72


### Reducing the full LOINC dataset to the attributes present in the training dataset

In [7]:
# Keep only the LOINC columns whose lower-cased name also exists in the
# training sheets, then lower-case the kept names so both schemas line up.
# rename() without inplace=True returns a new frame, which avoids the
# SettingWithCopyWarning raised when renaming in place on a selection of new_df.
training_columns = set(old_glucose_df.columns)
shared_columns = [col for col in new_df.columns if col.lower() in training_columns]
df_extended = new_df.loc[:, shared_columns].rename(columns=str.lower)
df_extended.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_extended.rename(columns={col: col.lower()}, inplace=True)


Unnamed: 0,loinc_num,component,property,system,long_common_name
0,100000-9,Health informatics pioneer and the father of L...,Hx,^Patient,Health informatics pioneer and the father of L...
1,100001-7,Health informatics pioneer and cofounder of LOINC,Hx,^Patient,Health informatics pioneer and cofounder of LOINC
2,100002-5,Specimen care is maintained,Find,^Patient,Specimen care is maintained
3,100003-3,Team communication is maintained throughout care,Find,^Patient,Team communication is maintained throughout care
4,100004-1,Demonstrates knowledge of the expected psychos...,Find,^Patient,Demonstrates knowledge of the expected psychos...


### Mapping property abbreviations to their meanings

In [8]:
def read_property_dict(filepath: str) -> dict:
    """Build an abbreviation -> meaning lookup from a property CSV file.

    The file is read with its first line as the header. For every data row,
    the value in the second column becomes the key and the value in the third
    column becomes the value; when a key repeats, the last row wins.

    Args:
        filepath: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A plain ``dict`` mapping second-column values to third-column values.
    """
    table = pd.read_csv(filepath, header=0)
    # Columns are addressed by position: 1 holds the key, 2 holds the value.
    return {record[1]: record[2] for record in table.to_numpy()}


### Calling property map function

In [9]:
# Load the property abbreviation lookup and print it for inspection.
prop_filepath = 'loinc_property.csv'
property_dict = read_property_dict(filepath=prop_filepath)
print(property_dict)

{'CAct': '*Catalytic Activity', 'CCnc': 'Catalytic Concentration', 'CRto': 'Catalytic Ratio', 'CCnt': '*Catalytic Content', 'CFr': '*Catalytic Fraction', 'CFr.DF': 'Decimal Catalytic Fraction', 'CRat': 'Catalytic Rate', 'RelCCnc': 'Relative Catalytic Concentration', 'CSub': 'Catalytic Substance', 'EntCat': '*Entitic Catalytic Activity', 'EntLen': 'Entitic Length', 'EntLogNum': 'Logarithmic Entitic Number', 'EntMass': 'Entitic Mass', 'EntNum': '*Entitic Number', 'EntVol': '*Entitic Volume', 'EntSub': 'Entitic Substance', 'EntSRto': 'Entitic Substance Ratio', 'Mass': 'Mass', 'ArMass': 'Mass/Area', 'MCnc': '*Mass Concentration', 'MCncSq': 'Mass Concentration Squared', 'MCnt': 'Mass Content', 'MDiff': 'Mass Difference', 'MFr': '*Mass Fraction', 'MFr.DF': 'Mass Decimal Fraction', 'MFrDiff': 'Mass Fraction Difference', 'MRat': 'Mass Rate', 'MRto': 'Mass Ratio', 'MSCnc': 'Mass or Substance Concentration', 'RelMCnc': '*Relative Mass Concentration', 'RelMRat': 'Relative Mass Rate', 'ThrMCnc': '

### Mapping system abbreviations to their meanings

In [10]:
def read_system_dict(filepath: str) -> dict:
    """Build an abbreviation -> meaning lookup from a system CSV file.

    The file is read with its first line as the header. For every data row,
    the value in the first column becomes the key and the value in the second
    column becomes the value; when a key repeats, the last row wins.

    Args:
        filepath: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A plain ``dict`` mapping first-column values to second-column values.
    """
    table = pd.read_csv(filepath, header=0)
    # Columns are addressed by position: 0 holds the key, 1 holds the value.
    return {record[0]: record[1] for record in table.to_numpy()}

### Calling system map function

In [11]:
# Load the system abbreviation lookup and print it for inspection.
syst_filepath = 'loinc_system.csv'
system_dict = read_system_dict(filepath=syst_filepath)
print(system_dict)

{'Abscess': 'Abscess', 'Amnio fld': 'Amniotic fluid', 'Anal': 'Anus', 'Asp': 'Aspirate', 'Bil fld': 'Bile fluid', 'BldA': 'Blood arterial', 'BldL': 'Blood bag', 'BldC': 'Blood capillary', 'BldCo': 'Blood – cord', 'BldCV': 'Blood – central venous', 'BldMV': 'Blood – mixed venous', 'BldP': 'Blood – peripheral', 'BldV': 'Blood venous', 'Bld.dot': 'Blood filter paper', 'Body fld': 'Body fluid, unsp', 'Bone': 'Bone', 'Brain': 'Brain', 'Bronchial': 'Bronchial', 'Burn': 'Burn', 'Calculus': 'Calculus (=Stone)', 'Cnl': 'Cannula', 'CTp': 'Catheter tip', 'CSF': 'Cerebral spinal fluid', 'Cvm': 'Cervical mucus', 'Cvx': 'Cervix', 'Col': 'Colostrum', 'Cnjt': 'Conjunctiva', 'Crn': 'Cornea', 'Dentin': 'Dentin', 'Dial fld': 'Dialysis fluid', 'Dose': 'Dose med or substance', 'Drain': 'Drain', 'Duod fld': 'Duodenal fluid', 'Ear': 'Ear', 'Endomet': 'Endometrium', 'Environmental Specimen': 'Environmental Specimen', 'RBC': 'Erythrocytes', 'Eye': 'Eye', 'Exhl gas': 'Exhaled gas (breath)', 'Fibroblasts': 'Fibr

### Updating dataset with meanings instead of abbreviations

In [12]:
def update_df(df, prop_dict, system_dict):
    """Return a copy of ``df`` with abbreviations replaced by their meanings.

    Each value of the ``property`` column that is a key of ``prop_dict`` is
    replaced by the corresponding dictionary value, and likewise for the
    ``system`` column with ``system_dict``. Values without a dictionary entry
    are kept unchanged.

    The replacement is positional (``Series.map``), so it stays correct when
    the index has repeated labels and avoids a Python-level loop over rows.

    Args:
        df: DataFrame with ``property`` and ``system`` columns.
        prop_dict: Lookup applied to the ``property`` column.
        system_dict: Lookup applied to the ``system`` column.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    df_copy = df.copy()
    # dict.get(value, value) mirrors "replace if known, otherwise keep".
    df_copy['property'] = df['property'].map(lambda value: prop_dict.get(value, value))
    df_copy['system'] = df['system'].map(lambda value: system_dict.get(value, value))
    return df_copy


In [13]:
# Expand the property/system abbreviations in all three training frames.
old_bilirubin_df, old_glucose_df, old_white_cells_df = (
    update_df(frame, property_dict, system_dict)
    for frame in (old_bilirubin_df, old_glucose_df, old_white_cells_df)
)

In [14]:
# Same abbreviation expansion, applied to the reduced full LOINC table.
mapped_df = update_df(
    df=df_extended,
    prop_dict=property_dict,
    system_dict=system_dict,
)
mapped_df

Unnamed: 0,loinc_num,component,property,system,long_common_name
0,100000-9,Health informatics pioneer and the father of L...,History,^Patient,Health informatics pioneer and the father of L...
1,100001-7,Health informatics pioneer and cofounder of LOINC,History,^Patient,Health informatics pioneer and cofounder of LOINC
2,100002-5,Specimen care is maintained,Finding,^Patient,Specimen care is maintained
3,100003-3,Team communication is maintained throughout care,Finding,^Patient,Team communication is maintained throughout care
4,100004-1,Demonstrates knowledge of the expected psychos...,Finding,^Patient,Demonstrates knowledge of the expected psychos...
...,...,...,...,...,...
102460,99994-6,"Fluid, electrolyte, and acid-base balances are...",Finding,^Patient,"Fluid, electrolyte, and acid-base balances are..."
102461,99995-3,Respiratory status is maintained at or improve...,Finding,^Patient,Respiratory status is maintained at or improve...
102462,99996-1,Cardiovascular status is maintained at or impr...,Finding,^Patient,Cardiovascular status is maintained at or impr...
102463,99997-9,Demonstrates &or reports adequate pain control,Finding,^Patient,Demonstrates AndOr reports adequate pain control


### Computing relevance score with TF-IDF

In [15]:
def calculate_ranking(df, query: str):
    """Score every row of ``df`` against ``query`` with TF-IDF cosine similarity.

    The text of a row is its ``long_common_name`` and ``component`` joined by
    a space. The TF-IDF vocabulary is restricted to the words of the query
    (minus the standalone word "in"), so only those words contribute to the
    score.

    Args:
        df: DataFrame with ``long_common_name`` and ``component`` columns.
        query: Free-text query such as ``"glucose in blood"``. Matching is
            case-insensitive.

    Returns:
        A copy of ``df`` (all original columns kept) with an added ``score``
        column. ``df`` itself is not modified. If the query has no usable
        words, every score is ``0.0``.
    """
    # TfidfVectorizer lowercases documents and queries by default but uses a
    # supplied vocabulary verbatim, so a capitalised term such as 'White'
    # would never match anything. Normalise the query first.
    terms = re.sub(r'\bin\b', '', query.lower()).split()
    # A fixed vocabulary may not contain duplicates; keep first-seen order.
    vocabulary = list(dict.fromkeys(terms))

    ranked_df = df.copy()
    if not vocabulary:
        # Fitting with an empty vocabulary raises; nothing can match anyway.
        ranked_df['score'] = 0.0
        return ranked_df

    # fillna/astype guard against missing or non-string cells, which would
    # otherwise make the string concatenation fail.
    combined_text = (
        df['long_common_name'].fillna('').astype(str)
        + ' '
        + df['component'].fillna('').astype(str)
    ).tolist()

    vectorizer = TfidfVectorizer(vocabulary=vocabulary)
    doc_matrix = vectorizer.fit_transform(combined_text)
    query_vector = vectorizer.transform([' '.join(terms)])
    ranked_df['score'] = cosine_similarity(doc_matrix, query_vector).flatten()
    return ranked_df

In [16]:
def evaluation(df, query, threshold):
    """Print confusion matrix, precision and recall for a thresholded ranking.

    ``df`` is scored with ``calculate_ranking``; rows whose score is at least
    ``threshold`` are predicted as relevant (1), all others as not relevant
    (0). Predictions are compared with the ``relevance`` column.

    Args:
        df: Labelled DataFrame containing a ``relevance`` column.
        query: Query string forwarded to ``calculate_ranking``.
        threshold: Minimum score for a row to count as predicted relevant.

    Returns:
        None. The metrics are printed.
    """
    scored = calculate_ranking(df, query)
    y_true = scored['relevance']
    y_pred = (scored['score'] >= threshold).astype(int)

    matrix = confusion_matrix(y_true, y_pred)
    precision_value = precision_score(y_true, y_pred)
    recall_value = recall_score(y_true, y_pred)

    print("Results for the query:", query)
    print("\nConfusion Matrix:")
    print(matrix)
    print("\nPrecision:", precision_value)
    print("Recall:", recall_value)

### Checking TF-IDF reliability on the provided training datasets

In [17]:
# Bilirubin training sheet, relevance cut-off 0.5.
evaluation(df=old_bilirubin_df, query='bilirubin in plasma', threshold=0.5)

Results for the query: bilirubin in plasma

Confusion Matrix:
[[60  0]
 [ 1  6]]

Precision: 1.0
Recall: 0.8571428571428571


In [18]:
# Score the bilirubin training rows, order them best-first and save them.
old_bilirubin_df = (
    calculate_ranking(old_bilirubin_df, "bilirubin in plasma")
    .sort_values(by='score', ascending=False)
)
old_bilirubin_df.to_excel(
    './training/init_bilirubin_in_plasma.xlsx',
    index=False,
)

In [19]:
# Glucose training sheet, relevance cut-off 0.1.
evaluation(df=old_glucose_df, query='glucose in blood', threshold=0.1)

Results for the query: glucose in blood

Confusion Matrix:
[[44  0]
 [ 0 23]]

Precision: 1.0
Recall: 1.0


In [20]:
# Score the glucose training rows, order them best-first and save them.
old_glucose_df = (
    calculate_ranking(old_glucose_df, "glucose in blood")
    .sort_values(by='score', ascending=False)
)
old_glucose_df.to_excel(
    './training/init_glucose_in_blood.xlsx',
    index=False,
)

In [23]:
# White-blood-cell training sheet, relevance cut-off 0.1.
evaluation(df=old_white_cells_df, query='White blood cells count', threshold=0.1)

Results for the query: White blood cells count

Confusion Matrix:
[[49  4]
 [ 0 14]]

Precision: 0.7777777777777778
Recall: 1.0




In [180]:
# Score the white-blood-cell training rows, order them best-first and save them.
old_white_cells_df = (
    calculate_ranking(old_white_cells_df, "white blood cells count")
    .sort_values(by='score', ascending=False)
)
old_white_cells_df.to_excel(
    './training/init_white_blood_cells_count.xlsx',
    index=False,
)

### Extending dataset with all LOINC data relevant for each query

In [None]:
# Rank the full LOINC table for the bilirubin query and export only the rows
# with a strictly positive score.
bilirubin_df = (
    calculate_ranking(mapped_df, "bilirubin in plasma")
    .sort_values(by='score', ascending=False)
)
bilirubin_hits = bilirubin_df.loc[bilirubin_df['score'] > 0]
bilirubin_hits.to_excel('./training/bilirubin_in_plasma.xlsx', index=False)

In [None]:
# Rank the full LOINC table for the glucose query and export only the rows
# with a strictly positive score.
glucose_in_blood_df = (
    calculate_ranking(mapped_df, "glucose in blood")
    .sort_values(by='score', ascending=False)
)
glucose_hits = glucose_in_blood_df.loc[glucose_in_blood_df['score'] > 0]
glucose_hits.to_excel('./training/glucose_in_blood.xlsx', index=False)

In [None]:
# Rank the full LOINC table for the white-blood-cell query and export only the
# rows with a strictly positive score.
white_blood_cells_count_df = (
    calculate_ranking(mapped_df, "white blood cells count")
    .sort_values(by='score', ascending=False)
)
white_cells_hits = white_blood_cells_count_df.loc[white_blood_cells_count_df['score'] > 0]
white_cells_hits.to_excel('./training/white_blood_cells_count.xlsx', index=False)

In [126]:
# Rank the full LOINC table for the breast-cancer query and export only the
# rows with a strictly positive score.
breast_cancer_df = (
    calculate_ranking(mapped_df, "breast cancer")
    .sort_values(by='score', ascending=False)
)
breast_cancer_hits = breast_cancer_df.loc[breast_cancer_df['score'] > 0]
breast_cancer_hits.to_excel('./training/breast_cancer.xlsx', index=False)