In [10]:
import pandas as pd
import json
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb

In [11]:
# load the patient data
data = pd.read_csv('/Users/julianlink/Documents/Uni/3. Semester/ADLM/adlm-lung-cancer/data/dataset_all.csv')
data.head()

Unnamed: 0,patient_id,timepoint,original_image,nodule_path,age,educat,ethnic,gender,race,diagcopd,height,weight,smokeage,pkyr,smokeday,smokeyr,cigsmok,lung_cancer
0,101706,T0,/local_ssd/practical_wise24/lung_cancer/prasan...,/local_ssd/practical_wise24/lung_cancer/adlm-l...,70,5,2,2,1,0.0,60.0,127.0,40.0,36.0,40,18,0,0
1,101363,T0,/local_ssd/practical_wise24/lung_cancer/prasan...,/local_ssd/practical_wise24/lung_cancer/adlm-l...,71,7,2,1,1,0.0,72.0,240.0,14.0,52.0,20,52,0,0
2,101363,T1,/local_ssd/practical_wise24/lung_cancer/prasan...,/local_ssd/practical_wise24/lung_cancer/adlm-l...,71,7,2,1,1,0.0,72.0,240.0,14.0,52.0,20,52,0,0
3,101363,T2,/local_ssd/practical_wise24/lung_cancer/prasan...,/local_ssd/practical_wise24/lung_cancer/adlm-l...,71,7,2,1,1,0.0,72.0,240.0,14.0,52.0,20,52,0,0
4,102604,T2,/local_ssd/practical_wise24/lung_cancer/prasan...,/local_ssd/practical_wise24/lung_cancer/adlm-l...,69,2,2,1,1,0.0,70.0,300.0,17.0,50.0,20,50,0,0


In [13]:
# load the embeddings
# embeddings are generated from the following template for each patient:
#
# template = f"The patient is {age} years old, has {educat} years of education,
# belongs to ethnic group {ethnic}, and has a height of {height} inches and weight of {weight} pounds.
#The patient has a history of {pkyr} package years of smoking and has smoked for {smokeyr} years. 
# The patient started smoking at {smokeage} years and smokes an average of {smokeday} cigarettes per day. 
# At the time of the trial the patient was {cigsmok} smoking."

with open('embeddings_no_gender.json', 'r') as f:
    data_j = json.load(f)
    print(len(data_j))
    print(len(data_j[0]))

df_embedding = pd.DataFrame(data_j)

# add the lung cancer label to the embeddings
df_embedding['lung_cancer'] = data['lung_cancer']
df_embedding['patient_id'] = data['patient_id']

df_embedding.drop_duplicates(subset='patient_id', keep='first', inplace=True)

print(df_embedding.shape)

df_embedding.head()

5754
1536
(2393, 1538)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1528,1529,1530,1531,1532,1533,1534,1535,lung_cancer,patient_id
0,0.03218,-0.034982,0.027041,0.041289,-0.012346,0.014987,0.015065,-0.005213,0.010127,-0.012872,...,-0.013008,-0.004837,0.016375,0.003396,0.000865,-0.02529,-0.038719,-0.018438,0,101706
1,0.033155,-0.040217,0.029235,0.027911,-0.017409,0.009548,0.013722,-0.007627,0.00357,-0.011606,...,-0.008029,0.004596,0.007932,0.004368,-0.003177,-0.029962,-0.04271,-0.022718,0,101363
4,0.0336,-0.031577,0.030748,0.037281,-0.015361,0.006877,0.007914,-0.007616,0.005843,-0.011355,...,-0.017098,0.004365,0.01212,0.001159,0.004456,-0.020183,-0.0378,-0.017928,0,102604
7,0.037863,-0.035208,0.031747,0.039502,-0.0229,0.015613,0.015262,-0.003705,0.010337,-0.015145,...,-0.012335,-0.001427,0.011704,-0.0043,0.000466,-0.029327,-0.041115,-0.018892,0,100945
10,0.042185,-0.038375,0.03696,0.028981,-0.013474,0.013808,0.012818,-0.015173,-0.004404,-0.017373,...,-0.015211,0.001742,0.013345,-0.00076,0.00601,-0.022971,-0.036805,-0.010565,0,100002


In [18]:
# Separate the features and the target
X = df_embedding.drop(['lung_cancer'], axis=1)
y = df_embedding['lung_cancer']

X.columns = X.columns.astype(str)

# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

classifier = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', scale_pos_weight=0.2)

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

# Evaluate the classifier
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[498  22]
 [172  26]]
              precision    recall  f1-score   support

           0       0.74      0.96      0.84       520
           1       0.54      0.13      0.21       198

    accuracy                           0.73       718
   macro avg       0.64      0.54      0.52       718
weighted avg       0.69      0.73      0.66       718

