In [1]:
#deployment model without use of scripts.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import Lasso
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder

In [2]:
#open file and clean
# Load the raw CSV, turn `sex` into a single 0/1 column, drop the columns that
# are not used as model inputs, and fix the column order so that the target
# (`diagnosis`) ends up last.
file_path = '../data/Debernardi et al 2020 data.csv'
data = pd.read_csv(file_path)
# drop='first' discards one category, so a two-valued `sex` column yields
# exactly one indicator column.
# NOTE(review): naming that column `is_male` assumes the retained category is
# the male label (e.g. values 'F'/'M') — confirm against the CSV.
encoder = OneHotEncoder(drop='first', sparse_output=False, dtype=int)
encoded_sex = encoder.fit_transform(data[['sex']])
data['is_male'] = encoded_sex[:, 0]
data = data.drop(columns=['sex', 'sample_id', 'patient_cohort', 'sample_origin', 'stage', 'benign_sample_diagnosis'])
col_order = ['age', 'is_male', 'plasma_CA19_9', 'creatinine', 'LYVE1', 'REG1B', 'TFF1', 'REG1A', 'diagnosis']
# Take an explicit copy: `df` has its `diagnosis` column reassigned later, and
# writing to a bare column selection of `data` raises SettingWithCopyWarning.
df = data[col_order].copy()
def classify_diagnosis(value):
    """Collapse a raw diagnosis code into a binary class label.

    Parameters
    ----------
    value
        The raw `diagnosis` entry for one row.

    Returns
    -------
    str
        '0' when `value` is 1 or 2, otherwise '1'. Every other value,
        including a missing one, therefore lands in class '1'.
    """
    # NOTE(review): presumably codes 1 and 2 are the non-cancer groups and the
    # remaining code is the cancer group — confirm against the data dictionary.
    return '0' if value in (1, 2) else '1'
# Binarise the target column on `df`, then separate the predictors (every
# column except the last) from the target (the last column).
df['diagnosis'] = df['diagnosis'].apply(classify_diagnosis)
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [3]:
#replace null with mean
# Columns whose missing entries are filled with the column mean.
missing_value_columns = [
    'plasma_CA19_9',
    'REG1A',
]
def mean_missing(data, columns=None):
    """Return a copy of `data` with missing values replaced by column means.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is not modified.
    columns : iterable of str, optional
        Columns to impute. Defaults to the notebook-level
        `missing_value_columns` list.

    Returns
    -------
    pandas.DataFrame
        Copy of `data` in which every missing entry of the selected columns
        is replaced by that column's mean, computed from `data` itself.

    Raises
    ------
    KeyError
        If a requested column is not present in `data`.
    """
    if columns is None:
        columns = missing_value_columns
    df_copy = data.copy()
    for column in columns:
        # Assign the filled column back rather than calling
        # fillna(inplace=True) on the column selection: the chained in-place
        # form is deprecated and does not write through to the frame under
        # copy-on-write. The mean is taken from the frame that was passed in,
        # not from a notebook global.
        df_copy[column] = df_copy[column].fillna(data[column].mean())
    return df_copy
# Impute the feature matrix; `X` itself is left untouched because
# mean_missing works on a copy of its argument.
X_imputed_df = mean_missing(X)

In [4]:
#use robust scaling
def robust_scaler(data):
    """Fit a fresh `RobustScaler` on `data` and return the scaled values.

    The scaler is created and fitted inside the call and is not returned, so
    its fitted statistics are not available to the caller afterwards.

    Parameters
    ----------
    data
        Feature matrix passed straight to `RobustScaler.fit_transform`.

    Returns
    -------
    The result of `RobustScaler.fit_transform(data)`.
    """
    return RobustScaler().fit_transform(data)
# Scale the imputed features, then wrap the scaled values back into a
# DataFrame that reuses the imputed frame's column names.
X_scaled = robust_scaler(X_imputed_df)
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X_imputed_df.columns)

In [5]:
#use rfe feature selection
def rfe_feature(X, y, n_features_to_select=5):
    """Choose feature columns by recursive feature elimination around a Lasso.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features; its `columns` are indexed with the selection mask.
    y
        Target passed to the selector's `fit`.
    n_features_to_select : int, default 5
        Number of features the selector is asked to keep.

    Returns
    -------
    The entries of `X.columns` flagged by the fitted selector's `support_`
    mask.
    """
    # NOTE(review): Lasso() runs with its default regularisation strength; if
    # that shrinks every coefficient to zero the elimination order carries
    # little information — worth confirming on this data.
    rfe = RFE(Lasso(), n_features_to_select=n_features_to_select).fit(X, y)
    return X.columns[rfe.support_]
# Keep 6 of the scaled features (overrides the function's default of 5).
selected_features = rfe_feature(X_scaled_df, y, n_features_to_select = 6)

In [6]:
#data split
# Hold out 20% of the rows as a test set, using only the RFE-selected columns;
# random_state=42 fixes the shuffle.
# NOTE(review): imputation, scaling and feature selection in the cells above
# were all fitted on the full dataset before this split, so the test rows have
# already influenced the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X_scaled_df[selected_features], y, test_size=0.2, random_state=42)

In [11]:
#hyperparameters
# Search space handed to the grid search. Every entry lists exactly one
# candidate value, so the grid contains a single parameter combination.
param_grid = dict(
    n_estimators=[50],
    learning_rate=[0.18],
    max_depth=[3],
    min_samples_split=[2],
    min_samples_leaf=[1],
    max_features=[None],
    subsample=[0.9],
    loss=['exponential'],
)

In [12]:
#GBM model and Grid Search
# Cross-validate a gradient-boosting classifier over `param_grid` with 5 folds,
# scoring each candidate by accuracy on the training split.
gbm_model = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=gbm_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Keep the estimator that the search selected as best.
best_gbm_model = grid_search.best_estimator_

In [16]:
# Persist the tuned classifier to the working directory (`dump` is imported in
# the notebook's import cell).
# NOTE(review): only the classifier is saved; the fitted scaler, the imputation
# means and the selected feature list are not, so anything loading this file
# must reproduce that preprocessing separately.
dump(best_gbm_model, 'best_gbm_model.joblib')

['best_gbm_model.joblib']