In [71]:
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [72]:
# data = pd.read_parquet('/Users/gufran/Desktop/PfsPredictionLungCancer/clinical_data/PDL1_SCORE.parquet', engine='pyarrow')
# data = pd.DataFrame(data["Sauter PD-L1 Score"]).reset_index()
# data.columns = ["dmp_pt_id","Sauter PD-L1 Score"]

# data.dmp_pt_id.unique()

### Reading Dataframes and separating test and train

In [73]:
df = pd.read_csv('/Users/gufran/Desktop/PfsPredictionLungCancer/clinical_data/clinical_data.csv')

# df["dmp_pt_id"] = pd.to_numeric(df["dmp_pt_id"], errors='coerce').astype(int)
df = df[(df['pfs'] != 0) | (df['pfs_censor'] == 1)]
df.shape

(366, 53)

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 53 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   radiology_accession_number  297 non-null    float64
 1   dmp_pt_id                   332 non-null    object 
 2   main_index                  366 non-null    object 
 3   record_id                   366 non-null    int64  
 4   de_i_id                     247 non-null    float64
 5   albumin                     365 non-null    float64
 6   smoking_status              366 non-null    int64  
 7   pack_years                  366 non-null    object 
 8   dnlr                        364 non-null    float64
 9   pfs                         366 non-null    float64
 10  pfs_censor                  366 non-null    int64  
 11  os_int                      232 non-null    float64
 12  age                         366 non-null    int64  
 13  histo                       366 non

In [75]:
df_splits = pd.read_csv('/Users/gufran/Desktop/PfsPredictionLungCancer/clinical_data/train_test_splits.csv')
df_splits = df_splits.rename(columns={'patient_id': 'id'})

train_ids = df_splits[df_splits.split == 'train']
train_ids = train_ids.rename(columns={'id': 'dmp_pt_id'})

test_ids1 = df_splits[df_splits.split == 'rad_valid']
test_ids1 = test_ids1.rename(columns={'id': 'radiology_accession_number'})
test_ids1["radiology_accession_number"] = pd.to_numeric(test_ids1["radiology_accession_number"], errors='coerce').astype(float)

test_ids2 = df_splits[df_splits.split == 'path_valid']
test_ids2 = test_ids2.rename(columns={'id': 'pdl1_image_id'})
# test_ids2["pdl1_image_id"] = pd.to_numeric(test_ids2["pdl1_image_id"], errors='coerce').astype(float)

In [76]:
df_train = pd.merge(df, train_ids, on='dmp_pt_id', how='inner')
df_train.shape

(247, 54)

In [77]:
df_test1 = pd.merge(df.drop(["dmp_pt_id"], axis=1), test_ids1, on='radiology_accession_number', how='inner')
df_test1.shape

(50, 53)

In [78]:
df_test2 = pd.merge(df.drop(["dmp_pt_id"], axis=1), test_ids2, on='pdl1_image_id', how='inner')
df_test2.shape

(71, 53)

In [79]:
# df = pd.merge(df, data, on='dmp_pt_id', how='inner')

In [80]:
def process_clinical(df, selected_columns=None, correlation_threshold = 0.2, drop_na = True):
    object_float_columns = ["pack_years","halo_tumor_quality"]
    good_object_columns = ["histo","pdl1_tiss_site"]
    
    remove_columns = list(df.select_dtypes(include=['object']).columns)
    if "dmp_pt_id" in remove_columns: remove_columns.append("dmp_pt_id")
    remove_columns.append("record_id")
    remove_columns.append("pfs_censor")
    remove_columns.append("radiology_accession_number")
    remove_columns.append("did_acc")
    remove_columns.append("os_int")
    remove_columns.append("bor")
    remove_columns.append("sex")
    remove_columns.append("deid")
    remove_columns.append("label")
    remove_columns.append("clinical_pdl1_score") #removing because of empty cells in test data
    remove_columns.append("js_pdl1_score") #removing because of empty cells in test data
    
    for c in object_float_columns:
        remove_columns.remove(c)
        
    df = df.drop(columns=remove_columns)
    
    for c in object_float_columns:
        df[c] = pd.to_numeric(df[c], errors='coerce').astype(float)
    df["hist_adeno"] = df["hist_adeno"].astype(int)
    
    if selected_columns is None:
        correlations = df.corrwith(df['pfs'])
        selected_columns = correlations[(correlations >= correlation_threshold) | (correlations <= -correlation_threshold)].index

        selected_columns = list(selected_columns)
        
    for sc in selected_columns:
        # if sc in ["dmp_pt_id", "pfs"]: continue
        print(f"Correlation with {sc}: {df['pfs'].corr(df[sc])}")
    
    df = df[selected_columns]        
    if drop_na: df = df.dropna().reset_index(drop=True)
    
    return df, selected_columns

In [81]:
df_train, selected_columns = process_clinical(df_train, correlation_threshold = 0.2)
df_train

Correlation with albumin: 0.2965218696773043
Correlation with pfs: 1.0


Unnamed: 0,albumin,pfs
0,4.0,5.6
1,3.4,7.5
2,4.4,1.2
3,3.2,5.0
4,2.7,5.5
...,...,...
242,3.5,8.7
243,3.8,6.4
244,3.6,1.2
245,4.1,5.0


In [82]:
df_test1, _ = process_clinical(df_test1, selected_columns=selected_columns)
df_test1.head()

Correlation with albumin: 0.20610220760106246
Correlation with pfs: 1.0


Unnamed: 0,albumin,pfs
0,3.9,26.1
1,3.6,1.8
2,3.8,1.3
3,3.7,5.7
4,3.9,1.7


In [83]:
df_test2, _ = process_clinical(df_test2, selected_columns=selected_columns)
df_test2.head()

Correlation with albumin: 0.21085271678339737
Correlation with pfs: 1.0


Unnamed: 0,albumin,pfs
0,3.4,1.6
1,3.3,7.2
2,3.3,3.8
3,3.6,1.9
4,4.0,14.9


In [84]:
df_test = pd.concat([df_test1, df_test1], axis=0)

In [85]:
df_train.shape, df_test.shape

((247, 2), (100, 2))

### Training and Testing

In [86]:
X_train, y_train = df_train.drop(["pfs"], axis=1), df_train["pfs"]
X_test, y_test = df_test.drop(["pfs"], axis=1), df_test["pfs"]

In [87]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test= scaler.transform(X_test)

In [88]:
svm_regressor = SVR(kernel='linear', C=1.0)
svm_regressor.fit(X_train, y_train)

y_pred = svm_regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

pearson_coefficient, _ = pearsonr(y_test, y_pred)
print(f'Pearson Correlation Coefficient: {pearson_coefficient}')

Mean Squared Error: 146.4731052249045
Pearson Correlation Coefficient: 0.2061022076010618


In [89]:
classifiers_reg = {
    'SVM': {
        'name': 'Support Vector Machine',
        'classifier': SVR(),
        'param_grid': {'C': [0.1, 1.0, 10.0], 'kernel': ['linear', 'rbf']}
    },
    'XGBoost': {
        'name': 'XGBoost',
        'classifier': XGBRegressor(),
        'param_grid': {'n_estimators': [50, 100, 200], 'max_depth': [3, 4, 5]}
    },
    'AdaBoost': {
        'name': 'AdaBoost',
        'classifier': AdaBoostRegressor(),
        'param_grid': {'n_estimators': [50, 100, 200], 'learning_rate': [0.1, 0.5, 1.0]}
    },
    'RandomForest': {
        'name': 'Random Forest',
        'classifier': RandomForestRegressor(),
        'param_grid': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
    },
    'DecisionTree': {
        'name': 'Decision Tree',
        'classifier': DecisionTreeRegressor(),
        'param_grid': {'max_depth': [None, 10, 20]}
    }
}

In [90]:
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

best_models = {}

for clf_name, clf_info in classifiers_reg.items():
    print(f"Performing GridSearchCV for {clf_info['name']}...")
    
    clf = clf_info['classifier']
    param_grid = clf_info['param_grid']
    
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_models[clf_name] = grid_search.best_estimator_

print()

for clf_name, best_model in best_models.items():
    print(f"Evaluating {clf_name} on test data...")
    
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    pearson_coefficient, _ = pearsonr(y_test, y_pred)
    
    print(f"Mean Squared Error on Test Data: {mse:.4f}")
    print(f"Pearson Correlation Coefficient on Test Data: {pearson_coefficient:.4f}\n")

Performing GridSearchCV for Support Vector Machine...
Performing GridSearchCV for XGBoost...
Performing GridSearchCV for AdaBoost...
Performing GridSearchCV for Random Forest...
Performing GridSearchCV for Decision Tree...

Evaluating SVM on test data...
Mean Squared Error on Test Data: 145.1503
Pearson Correlation Coefficient on Test Data: 0.1802

Evaluating XGBoost on test data...
Mean Squared Error on Test Data: 126.3373
Pearson Correlation Coefficient on Test Data: 0.1888

Evaluating AdaBoost on test data...
Mean Squared Error on Test Data: 130.2567
Pearson Correlation Coefficient on Test Data: 0.1281

Evaluating RandomForest on test data...
Mean Squared Error on Test Data: 126.0702
Pearson Correlation Coefficient on Test Data: 0.1947

Evaluating DecisionTree on test data...
Mean Squared Error on Test Data: 126.3793
Pearson Correlation Coefficient on Test Data: 0.1882

