In [1]:
import pandas as pd
import os

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor, BaggingRegressor, GradientBoostingRegressor

In [2]:
# Directory holding the pathology feature parquet files. It can be overridden
# with the PFS_PATHOLOGY_FEATURES_DIR environment variable so the notebook is
# not tied to a single machine; the original local path remains the default.
BASE_DATA = os.environ.get(
    'PFS_PATHOLOGY_FEATURES_DIR',
    '/Users/gufran/Developer/Projects/AI/PfsForNSCLC/pathology/pathology_features',
)
# Show which feature files are available in that directory.
os.listdir(BASE_DATA)

['NEW_LUNG_glcm_Autocorrelation_v2_20x_stain1_PDL1.parquet',
 'NEW_LUNG_glcm_Autocorrelation_v2_20x_stain1_PDL1_VALIDATION.parquet',
 'LUNG_PATHOLOGY_PDL1_GLCM_V3_VALIDATION.parquet',
 'LUNG_PATHOLOGY_PDL1_GLCM_V3.parquet']

In [4]:
# Read the training and validation pathology feature tables, turn the index
# into a regular column, and rename 'main_index' to the identifier each table
# is later merged on.
df_st = (
    pd.read_parquet(
        os.path.join(BASE_DATA, "NEW_LUNG_glcm_Autocorrelation_v2_20x_stain1_PDL1.parquet"),
        engine='pyarrow',
    )
    .reset_index()
    .rename(columns={'main_index': 'dmp_pt_id'})
)
df_sv = (
    pd.read_parquet(
        os.path.join(BASE_DATA, "NEW_LUNG_glcm_Autocorrelation_v2_20x_stain1_PDL1_VALIDATION.parquet"),
        engine='pyarrow',
    )
    .reset_index()
    .rename(columns={'main_index': 'pdl1_image_id'})
)

df_st.shape, df_sv.shape

((105, 19), (52, 19))

In [5]:
def process_clinical(df, selected_columns=None, correlation_threshold = 0.2, drop_na = True):
    """Clean a clinical table and keep the columns correlated with ``pfs``.

    Steps:
      1. Drop every object-typed column (except the identifier columns and the
         numeric-as-text columns ``pack_years`` / ``halo_tumor_quality``) plus
         a fixed list of bookkeeping / outcome columns.
      2. Coerce ``pack_years`` / ``halo_tumor_quality`` to float (unparseable
         values become NaN) and cast ``hist_adeno`` to int.
      3. If ``selected_columns`` is None, select the columns whose Pearson
         correlation with ``pfs`` is at least ``correlation_threshold`` in
         absolute value, and add the identifier column(s) present in ``df``.
      4. Print the correlation of every selected feature with ``pfs``.
      5. Restrict the frame to the selected columns and optionally drop rows
         containing NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Clinical table. Must contain ``pfs`` and one of ``dmp_pt_id`` /
        ``pdl1_image_id``.
    selected_columns : list of str, optional
        Columns to keep. When given, the list is copied (the caller's list is
        never modified). If ``df`` has no ``dmp_pt_id`` column, that name is
        replaced by ``pdl1_image_id`` in the copy, so a selection computed on
        the training frame can be reused on the validation frame.
    correlation_threshold : float
        Minimum absolute correlation with ``pfs``; only used when
        ``selected_columns`` is None.
    drop_na : bool
        Whether to drop rows with missing values in the selected columns.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The processed frame and the list of columns it contains.

    Raises
    ------
    KeyError
        If ``pfs``, the identifier column, or a selected column is missing
        from ``df``.
    """
    object_float_columns = ["pack_years", "halo_tumor_quality"]
    protected_columns = {"dmp_pt_id", "pdl1_image_id", *object_float_columns}

    # Object columns are removed wholesale, except identifiers and the
    # numeric-as-text columns. Filtering (instead of list.remove) also works
    # when those columns are already numeric and hence not in the object list.
    remove_columns = [
        c for c in df.select_dtypes(include=['object']).columns
        if c not in protected_columns
    ]
    remove_columns += [
        "record_id",
        "pfs_censor",
        "radiology_accession_number",
        "did_acc",
        "os_int",
        "bor",
        "sex",
        "deid",
        "label",
        "clinical_pdl1_score",  # removing because of empty cells in test data
        "js_pdl1_score",  # removing because of empty cells in test data
    ]
    # De-duplicate (a listed column may also be object-typed) and tolerate
    # listed columns that this particular table does not have.
    remove_columns = list(dict.fromkeys(remove_columns))
    df = df.drop(columns=remove_columns, errors="ignore")

    for c in object_float_columns:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce').astype(float)
    if "hist_adeno" in df.columns:
        df["hist_adeno"] = df["hist_adeno"].astype(int)

    if selected_columns is None:
        id_column = "dmp_pt_id" if "dmp_pt_id" in df.columns else "pdl1_image_id"
        # NOTE(review): iloc[:, :-1] excludes whichever column happens to be
        # last after the id column is dropped; kept as in the original, but
        # this is position dependent and should be confirmed against the data.
        correlations = df.drop(columns=[id_column]).iloc[:, :-1].corrwith(df['pfs'])
        selected_columns = list(
            correlations[(correlations >= correlation_threshold) | (correlations <= -correlation_threshold)].index
        )
        if "dmp_pt_id" in df.columns: selected_columns.append("dmp_pt_id")
        if "pdl1_image_id" in df.columns: selected_columns.append("pdl1_image_id")
    else:
        # Work on a copy so the caller's list is left untouched.
        selected_columns = list(selected_columns)

    if "dmp_pt_id" not in df.columns:
        # Frame is keyed by pdl1_image_id: swap the identifier without
        # failing when dmp_pt_id is absent or pdl1_image_id is already listed.
        selected_columns = [c for c in selected_columns if c != "dmp_pt_id"]
        if "pdl1_image_id" not in selected_columns:
            selected_columns.append("pdl1_image_id")

    for sc in selected_columns:
        if sc in ["dmp_pt_id", "pdl1_image_id","pfs"]: continue
        print(f"Correlation with {sc}: {df['pfs'].corr(df[sc])}")

    df = df[selected_columns]
    if drop_na: df = df.dropna().reset_index(drop=True)

    return df, selected_columns

In [15]:
# Location of the clinical table. It can be overridden with the
# PFS_CLINICAL_DATA_CSV environment variable; the original local path remains
# the default so existing runs are unaffected.
CLINICAL_DATA_CSV = os.environ.get(
    'PFS_CLINICAL_DATA_CSV',
    '/Users/gufran/Developer/Projects/AI/PfsForNSCLC/clinical_data/clinical_data.csv',
)
df_clinical = pd.read_csv(CLINICAL_DATA_CSV)

# The clinical table carries both identifiers; each view keeps only the key
# that its pathology table is merged on (dmp_pt_id for training,
# pdl1_image_id for validation).
df_clinical_tr = df_clinical.drop(["pdl1_image_id"], axis=1)
df_clinical_te = df_clinical.drop(["dmp_pt_id"], axis=1)

# Select clinical features on the training view, then reuse exactly the same
# columns on the validation view (correlation_threshold is not used by
# process_clinical when selected_columns is supplied).
df_clinical_tr, selected_columns = process_clinical(df_clinical_tr, correlation_threshold = 0.11)
df_clinical_te, _ = process_clinical(df_clinical_te, correlation_threshold = 0.11, selected_columns=selected_columns)

Correlation with albumin: 0.26071203044553243
Correlation with dnlr: -0.11717087983081641
Correlation with ecog: -0.1665222793734818
Correlation with albumin: 0.26071203044553243
Correlation with dnlr: -0.11717087983081641
Correlation with ecog: -0.1665222793734818


In [6]:
# Attach the selected clinical columns to the pathology features. The inner
# join keeps only rows whose identifier appears in both tables.
df_st = df_st.merge(df_clinical_tr, how='inner', on='dmp_pt_id')
df_sv = df_sv.merge(df_clinical_te, how='inner', on='pdl1_image_id')

df_st.shape, df_sv.shape

((105, 23), (50, 23))

### Take columns with significant correlation only

In [7]:
# Recompute each clinical feature's correlation with pfs on the merged
# training frame; identifier and target columns are skipped.
for sc in selected_columns:
    if sc not in ("dmp_pt_id", "pfs", "pdl1_image_id"):
        print(f"Correlation with {sc}: {df_st['pfs'].corr(df_st[sc])}")

Correlation with albumin: 0.2677753499725636
Correlation with dnlr: -0.09873474547928056
Correlation with ecog: -0.21379495017420738


In [8]:
# Keep every column of the merged training frame whose correlation with pfs
# has magnitude of at least the threshold (pfs itself is therefore included).
correlation_threshold = 0.2
correlations = df_st.drop(columns="dmp_pt_id").corrwith(df_st['pfs'])
selected_columns = correlations[correlations.abs().ge(correlation_threshold)].index.tolist()

selected_columns

['pixel_original_glcm_Autocorrelation_scale_None_channel_1_lognorm_fit_p0',
 'pixel_original_glcm_None_scale_None_channel_1_skewness',
 'pixel_original_glcm_None_scale_None_channel_1_lognorm_fit_p0',
 'albumin',
 'pfs',
 'ecog']

### These are the selected features

In [9]:
# Report the correlation with pfs for each selected feature (the target and
# the patient identifier are not features, so they are skipped).
for sc in selected_columns:
    if sc not in ("dmp_pt_id", "pfs"):
        print(f"Correlation with {sc}: {df_st['pfs'].corr(df_st[sc])}")

Correlation with pixel_original_glcm_Autocorrelation_scale_None_channel_1_lognorm_fit_p0: 0.22536458161216197
Correlation with pixel_original_glcm_None_scale_None_channel_1_skewness: -0.21465594696499923
Correlation with pixel_original_glcm_None_scale_None_channel_1_lognorm_fit_p0: 0.21478831109583116
Correlation with albumin: 0.2677753499725636
Correlation with ecog: -0.21379495017420738


In [10]:
# Restrict both frames to the selected columns plus their identifier, then
# discard incomplete rows and renumber the index.
df_st = df_st.loc[:, selected_columns + ["dmp_pt_id"]].dropna().reset_index(drop=True)

df_sv = df_sv.loc[:, selected_columns + ["pdl1_image_id"]].dropna().reset_index(drop=True)

In [11]:
# Split each frame into the feature matrix (everything except the target and
# the identifier) and the pfs target vector.
X_train = df_st.drop(columns=["pfs", "dmp_pt_id"])
y_train = df_st["pfs"]
X_test = df_sv.drop(columns=["pfs", "pdl1_image_id"])
y_test = df_sv["pfs"]

In [12]:
# Sanity check: (rows, features) of the training and test design matrices.
tuple(matrix.shape for matrix in (X_train, X_test))

((105, 5), (50, 5))

### Modeling

In [13]:
# classifiers_reg = {
#     'SVM': {
#         'name': 'Support Vector Machine',
#         'classifier': SVR(),
#         'param_grid': {'C': [0.1, 1.0, 10.0], 'kernel': ['linear', 'rbf']}
#     },
#     'XGBoost': {
#         'name': 'XGBoost',
#         'classifier': XGBRegressor(),
#         'param_grid': {'n_estimators': [50, 100, 200], 'max_depth': [3, 4, 5]}
#     },
#     'AdaBoost': {
#         'name': 'AdaBoost',
#         'classifier': AdaBoostRegressor(),
#         'param_grid': {'n_estimators': [50, 100, 200], 'learning_rate': [0.1, 0.5, 1.0]}
#     },
#     'RandomForest': {
#         'name': 'Random Forest',
#         'classifier': RandomForestRegressor(),
#         'param_grid': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
#     },
#     'DecisionTree': {
#         'name': 'Decision Tree',
#         'classifier': DecisionTreeRegressor(),
#         'param_grid': {'max_depth': [None, 10, 20]}
#     }
# }

# classifiers_reg = {
#     'SVM': {
#         'name': 'Support Vector Machine',
#         'classifier': SVR(),
#         'param_grid': {'C': [0.1, 1.0, 5.0, 10.0], 'kernel': ['linear', 'rbf']}
#     },
#     'XGBoost': {
#         'name': 'XGBoost',
#         'classifier': XGBRegressor(),
#         'param_grid': {'n_estimators': [50, 100, 200], 'max_depth': [3, 4, 5]}
#     },
#     'AdaBoost': {
#         'name': 'AdaBoost',
#         'classifier': AdaBoostRegressor(),
#         'param_grid': {'n_estimators': [50, 100, 200], 'learning_rate': [0.1, 0.5, 1.0]}
#     },
#     'RandomForest': {
#         'name': 'Random Forest',
#         'classifier': RandomForestRegressor(),
#         'param_grid': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
#     },
#     'DecisionTree': {
#         'name': 'Decision Tree',
#         'classifier': DecisionTreeRegressor(),
#         'param_grid': {'max_depth': [None, 10, 20]}
#     }
# }

from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor

# Seed shared by every estimator that has a stochastic component, so that the
# tuned models and the reported test metrics are reproducible across runs.
# 42 matches the seed already used for the bagging ensemble in this notebook.
RANDOM_STATE = 42

# Registry of candidate regressors. Each entry provides:
#   'name'       - label printed while tuning,
#   'classifier' - the (unfitted) estimator instance handed to GridSearchCV,
#   'param_grid' - hyper-parameter grid searched for that estimator
#                  (an empty dict means the defaults are evaluated as-is).
classifiers_reg = {
    'Linear Regression': {
        'name': 'Linear Regression',
        'classifier': LinearRegression(),
        'param_grid': {}
    },
    'Ridge Regression': {
        'name': 'Ridge Regression',
        'classifier': Ridge(),
        'param_grid': {'alpha': [0.01, 0.1, 1.0, 10.0]}
    },
    'Lasso Regression': {
        'name': 'Lasso Regression',
        'classifier': Lasso(),
        'param_grid': {'alpha': [0.01, 0.1, 1.0, 10.0]}
    },
    'ElasticNet': {
        'name': 'ElasticNet',
        'classifier': ElasticNet(),
        'param_grid': {'alpha': [0.01, 0.1, 1.0, 10.0], 'l1_ratio': [0.1, 0.5, 0.9]}
    },
    'K-Nearest Neighbors': {
        'name': 'K-Nearest Neighbors',
        'classifier': KNeighborsRegressor(),
        'param_grid': {'n_neighbors': [3, 5, 10], 'weights': ['uniform', 'distance']}
    },
    'Gradient Boosting': {
        'name': 'Gradient Boosting',
        'classifier': GradientBoostingRegressor(random_state=RANDOM_STATE),
        'param_grid': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.5], 'max_depth': [3, 4, 5]}
    },
    'Huber Regressor': {
        'name': 'Huber Regressor',
        'classifier': HuberRegressor(),
        'param_grid': {'epsilon': [1.1, 1.2, 1.3], 'alpha': [0.0001, 0.001, 0.01]}
    },
    'SGD Regressor': {
        'name': 'Stochastic Gradient Descent Regressor',
        'classifier': SGDRegressor(random_state=RANDOM_STATE),
        'param_grid': {'alpha': [0.0001, 0.001, 0.01], 'penalty': ['l1', 'l2', 'elasticnet']}
    },
    'SVM': {
        'name': 'Support Vector Machine',
        'classifier': SVR(),
        'param_grid': {'C': [0.1, 1.0, 5.0, 10.0], 'kernel': ['linear', 'rbf']}
    },
    'XGBoost': {
        'name': 'XGBoost',
        'classifier': XGBRegressor(random_state=RANDOM_STATE),
        'param_grid': {'n_estimators': [50, 100, 200], 'max_depth': [3, 4, 5]}
    },
    'AdaBoost': {
        'name': 'AdaBoost',
        'classifier': AdaBoostRegressor(random_state=RANDOM_STATE),
        'param_grid': {'n_estimators': [50, 100, 200], 'learning_rate': [0.1, 0.5, 1.0]}
    },
    'RandomForest': {
        'name': 'Random Forest',
        'classifier': RandomForestRegressor(random_state=RANDOM_STATE),
        'param_grid': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
    },
    'DecisionTree': {
        'name': 'Decision Tree',
        'classifier': DecisionTreeRegressor(random_state=RANDOM_STATE),
        'param_grid': {'max_depth': [None, 10, 20]}
    }
}


In [14]:
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

# Tuned estimator per registry key, filled in by the grid searches below.
best_models = {}

# Stage 1: 5-fold grid search on the training set for every candidate,
# scored by negative mean squared error; keep the refitted best estimator.
for clf_name, clf_info in classifiers_reg.items():
    print(f"Performing GridSearchCV for {clf_info['name']}...")

    grid_search = GridSearchCV(
        estimator=clf_info['classifier'],
        param_grid=clf_info['param_grid'],
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
    )
    best_models[clf_name] = grid_search.fit(X_train, y_train).best_estimator_

print()

# Stage 2: score each tuned model on the held-out data with MSE and the
# Pearson correlation between true and predicted pfs.
for clf_name, best_model in best_models.items():
    print(f"Evaluating {clf_name} on test data...")

    y_pred = best_model.predict(X_test)
    pearson_coefficient, _ = pearsonr(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    print(f"Mean Squared Error on Test Data: {mse:.4f}")
    print(f"Pearson Correlation Coefficient on Test Data: {pearson_coefficient:.4f}\n")

Performing GridSearchCV for Linear Regression...
Performing GridSearchCV for Ridge Regression...
Performing GridSearchCV for Lasso Regression...
Performing GridSearchCV for ElasticNet...
Performing GridSearchCV for K-Nearest Neighbors...
Performing GridSearchCV for Gradient Boosting...
Performing GridSearchCV for Huber Regressor...
Performing GridSearchCV for Stochastic Gradient Descent Regressor...
Performing GridSearchCV for Support Vector Machine...
Performing GridSearchCV for XGBoost...
Performing GridSearchCV for AdaBoost...
Performing GridSearchCV for Random Forest...
Performing GridSearchCV for Decision Tree...

Evaluating Linear Regression on test data...
Mean Squared Error on Test Data: 55.1353
Pearson Correlation Coefficient on Test Data: 0.4188

Evaluating Ridge Regression on test data...
Mean Squared Error on Test Data: 44.8238
Pearson Correlation Coefficient on Test Data: 0.4224

Evaluating Lasso Regression on test data...
Mean Squared Error on Test Data: 54.1430
Pearson C

In [18]:
from sklearn.ensemble import VotingRegressor

# Average the predictions of all tuned models: each (name, estimator) pair
# from best_models becomes one member of the voting ensemble, which is then
# fitted on the training data.
voting_reg = VotingRegressor(
    estimators=[(model_name, model) for model_name, model in best_models.items()]
).fit(X_train, y_train)

# Evaluate the ensemble on the held-out data.
y_pred_voting = voting_reg.predict(X_test)
pearson_coefficient_voting, _ = pearsonr(y_test, y_pred_voting)
mse_voting = mean_squared_error(y_test, y_pred_voting)

print(f"Mean Squared Error on Test Data (Voting): {mse_voting:.4f}")
print(f"Pearson Correlation Coefficient on Test Data (Voting): {pearson_coefficient_voting:.4f}\n")

Mean Squared Error on Test Data (Voting): 35.3262
Pearson Correlation Coefficient on Test Data (Voting): 0.4615



### Best score for `clinical + pathology` features

In [26]:
from sklearn.ensemble import BaggingRegressor

# Bagging ensemble of 10 k-nearest-neighbour regressors with a fixed seed.
# The wrapped estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name was
# later removed), while the first positional argument works on both old and
# new releases.
bagging_reg = BaggingRegressor(KNeighborsRegressor(), n_estimators=10, random_state=42)  # You can customize parameters
bagging_reg.fit(X_train, y_train)

# Evaluate on the held-out data with MSE and Pearson correlation.
y_pred_bagging = bagging_reg.predict(X_test)
mse_bagging = mean_squared_error(y_test, y_pred_bagging)
pearson_coefficient_bagging, _ = pearsonr(y_test, y_pred_bagging)

print(f"Mean Squared Error on Test Data (Bagging): {mse_bagging:.4f}")
print(f"Pearson Correlation Coefficient on Test Data (Bagging): {pearson_coefficient_bagging:.4f}\n")

Mean Squared Error on Test Data (Bagging): 32.8512
Pearson Correlation Coefficient on Test Data (Bagging): 0.4950





In [17]:
# df_st.to_csv("/Users/gufran/Desktop/PfsPredictionLungCancer/data/train_path_clin_t2.csv", index=False)
# df_sv.to_csv("/Users/gufran/Desktop/PfsPredictionLungCancer/data/val_path_clin_t2.csv", index=False)