In [22]:
import pandas as pd
import numpy as np

In [23]:
%pwd

'/home/anushka/FYP/pyHGT-implementation/data'

In [24]:
# Load the two input tables from the current working directory:
#   patient-test.csv       - one row per patient test result
#   test-disease-organ.csv - per-test reference range (min/max) plus the
#                            organ and disease columns the test is mapped to
CSV_ENCODING = 'latin1'

patient_test = pd.read_csv('patient-test.csv', encoding=CSV_ENCODING)
test_map = pd.read_csv('test-disease-organ.csv', encoding=CSV_ENCODING)

# Plain-text preview of the first rows of each table.
for preview in (patient_test, test_map):
    print(preview.head())

   patient_id      report_date                  test_name  test_value
0      139760    9/2/2024 6:13          Blood Urea Result        66.0
1      139760    9/2/2024 6:14               ASOT  Result       200.0
2      139760    9/2/2024 6:15                 ALP Result       168.0
3      200041  10/7/2024 20:59     Fasting Plasma Glucose        83.0
4      200041  10/7/2024 20:59  1 Hr After Plasma Glucose        83.0
                       test_name     min     max          organ organ.1  \
0                            TSH    0.40    4.00        thyroid     NaN   
1  Fasting Venous Plasma Glucose  100.00  126.00       pancreas     NaN   
2          Lymphocytes#  % Value   20.00   40.00  immune system     NaN   
3          RDW-SD Absolute Value   39.00   46.00          blood     NaN   
4             RBC Absolute Value    4.11    5.51          blood     NaN   

             disease         disease.1 disease.2  
0  thyroid disorders               NaN       NaN  
1  diabetes mellitus       

In [25]:
# Attach the reference range and organ/disease mapping to every patient test.
# A left join keeps every patient row; tests missing from the mapping get NaN
# in the mapping columns.
# NOTE(review): if test_map ever holds the same test_name twice, this join
# multiplies the matching patient rows - confirm test_name is unique there.
df = pd.merge(patient_test, test_map, how='left', on='test_name')
df.head()

Unnamed: 0,patient_id,report_date,test_name,test_value,min,max,organ,organ.1,disease,disease.1,disease.2
0,139760,9/2/2024 6:13,Blood Urea Result,66.0,7.0,20.0,kidney,liver,chronic kidney disease,,
1,139760,9/2/2024 6:14,ASOT Result,200.0,0.0,200.0,Immune system,,infections,,
2,139760,9/2/2024 6:15,ALP Result,168.0,44.0,147.0,liver,,biliary obstruction,,
3,200041,10/7/2024 20:59,Fasting Plasma Glucose,83.0,70.0,99.0,blood,,diabetes mellitus,,
4,200041,10/7/2024 20:59,1 Hr After Plasma Glucose,83.0,0.0,180.0,blood,,diabetes mellitus,,


In [26]:
# Remove test-scale bias: express every result relative to its own reference
# range so that tests measured in different units become comparable.

def abnormality_score(value, low, high):
    """Return how far `value` lies from the centre of the range [low, high].

    The distance from the range midpoint is divided by the half-width of the
    range, so a value sitting exactly on `low` or `high` scores 1.0 and a
    value at the midpoint scores 0.0. Values inside the range therefore score
    between 0 and 1; the result is capped at 3.

    Parameters
    ----------
    value : measured test value
    low, high : lower and upper bounds of the reference range

    Returns
    -------
    A float in [0, 3]. A degenerate range (low == high) yields 0.0.
    """
    half_width = (high - low) / 2

    # A zero-width range cannot be used as a scale; report "no abnormality".
    if half_width == 0:
        return 0.0

    centre = (low + high) / 2
    deviation = abs((value - centre) / half_width)
    return np.clip(deviation, 0, 3)


# Score every merged row against its own reference range.
# NOTE: the column name is misspelt ("abnoramlity"), but the aggregation cell
# reads it under exactly this name, so it is kept as is.
def _row_abnormality(row):
    """Score one merged row using its test value and min/max range."""
    return abnormality_score(row["test_value"], row["min"], row["max"])


df["abnoramlity"] = df.apply(_row_abnormality, axis=1)

df[["patient_id", "test_name", "abnoramlity"]].head()

Unnamed: 0,patient_id,test_name,abnoramlity
0,139760,Blood Urea Result,3.0
1,139760,ASOT Result,1.0
2,139760,ALP Result,1.407767
3,200041,Fasting Plasma Glucose,0.103448
4,200041,1 Hr After Plasma Glucose,0.077778


In [27]:
# Aggregate abnormality per (patient, disease) pair: the mean score of all
# tests mapped to that disease, and how many scored tests contributed.
# Only the primary "disease" column is used as the grouping key here.
per_patient_disease = df.groupby(["patient_id", "disease"])

disease_scores = per_patient_disease.agg(
    disease_scores=("abnoramlity", "mean"),
    test_count=("abnoramlity", "count"),
).reset_index()

# Number of (patient, disease) pairs produced.
print(len(disease_scores))
disease_scores.head()


79138


Unnamed: 0,patient_id,disease,disease_scores,test_count
0,139760,biliary obstruction,1.407767,1
1,139760,chronic kidney disease,3.0,1
2,139760,infections,1.0,1
3,200041,diabetes mellitus,0.622812,14
4,201519,diabetes mellitus,0.242857,1


In [28]:
# Minimum-evidence filtering: keep only (patient, disease) pairs that are
# backed by at least MIN_TESTS scored tests.

MIN_TESTS = 1

# .copy() detaches the filtered rows from the unfiltered frame, so adding
# columns to the result later is unambiguous and cannot raise
# SettingWithCopyWarning once the filter actually removes rows.
disease_scores = disease_scores[disease_scores["test_count"] >= MIN_TESTS].copy()

num_rows = disease_scores.shape[0]
print(f"Number of rows after filtering: {num_rows}")

# Preview goes last so the notebook actually renders it; a bare expression in
# the middle of a cell is evaluated and silently discarded.
disease_scores.head()

Number of rows after filtering: 79138


In [29]:
# Disease-score normalisation: within each patient, divide every disease score
# by the sum of that patient's disease scores so they can be compared as
# relative weights.

# Small constant added to the denominator so a patient whose scores sum to
# zero produces 0 instead of a division by zero.
NORMALIZATION_EPS = 1e-8

# .assign builds a new frame rather than writing a column into
# `disease_scores` in place. The frame may be the result of a boolean filter,
# where in-place column assignment can raise SettingWithCopyWarning.
disease_scores = disease_scores.assign(
    probability=(
        disease_scores
        .groupby("patient_id")["disease_scores"]
        .transform(lambda scores: scores / (scores.sum() + NORMALIZATION_EPS))
    )
)

disease_scores.head()

Unnamed: 0,patient_id,disease,disease_scores,test_count,probability
0,139760,biliary obstruction,1.407767,1,0.260323
1,139760,chronic kidney disease,3.0,1,0.554758
2,139760,infections,1.0,1,0.184919
3,200041,diabetes mellitus,0.622812,14,1.0
4,201519,diabetes mellitus,0.242857,1,1.0


In [30]:
# Maximum number of candidate diseases kept per patient.
TOP_K = 3

# Order rows by patient, most probable disease first ...
ranked_scores = disease_scores.sort_values(
    by=["patient_id", "probability"],
    ascending=[True, False],
)

# ... then keep the first TOP_K rows of every patient.
top_disease = ranked_scores.groupby("patient_id").head(TOP_K)

top_disease.head()

Unnamed: 0,patient_id,disease,disease_scores,test_count,probability
1,139760,chronic kidney disease,3.0,1,0.554758
0,139760,biliary obstruction,1.407767,1,0.260323
2,139760,infections,1.0,1,0.184919
3,200041,diabetes mellitus,0.622812,14,1.0
4,201519,diabetes mellitus,0.242857,1,1.0


In [31]:
def prune_dominance(group, ratio=0.25):
    """Drop rows that are dominated by the strongest row in `group`.

    Parameters
    ----------
    group : DataFrame with a "probability" column
    ratio : fraction of the maximum probability a row must reach to be kept

    Returns
    -------
    The rows of `group` whose probability is at least `ratio` times the
    largest probability in `group`, with their original index.
    """
    threshold = group["probability"].max() * ratio
    keep = group["probability"] >= threshold
    return group[keep]

# Prune dominated diseases patient by patient.
#
# The groups are iterated explicitly instead of going through
# DataFrameGroupBy.apply. Having apply operate on the grouping column is
# deprecated in pandas 2.2, and later versions exclude that column from the
# frame passed to the function. That would drop "patient_id" from the result.
# Iterating the groupby always yields the full sub-frame, grouping column
# included.
pruned_groups = [
    prune_dominance(patient_rows)
    for _, patient_rows in top_disease.groupby("patient_id")
]

# pd.concat raises on an empty list, so fall back to an empty frame with the
# same columns when there are no groups at all.
top_disease = (
    pd.concat(pruned_groups) if pruned_groups else top_disease.iloc[0:0]
)


  top_disease


In [32]:
# Export the final patient -> disease edges; only the identifying columns and
# the normalised probability are written out.
export_columns = ["patient_id", "disease", "probability"]
patient_disease = top_disease[export_columns]

# index=False keeps the DataFrame index out of the CSV file.
patient_disease.to_csv("patient-disease.csv", index=False)

print("Saved patient-disease.csv")

Saved patient-disease.csv
