In [40]:
import pandas as pd
import numpy as np

# open data 

file_path = 'data/Debernardi et al 2020 data.csv'
df = pd.read_csv(file_path)

data = df[['age', 'sex', 'diagnosis', 'stage', 'benign_sample_diagnosis', 'plasma_CA19_9', 'creatinine', 'LYVE1', 'REG1B', 'TFF1', 'REG1A']]

In [41]:
data = data.drop(['stage', 'benign_sample_diagnosis'], axis =1)

from sklearn.preprocessing import OneHotEncoder

# Assuming 'sex' is a categorical column in your DataFrame

encoder = OneHotEncoder(drop='first', sparse_output=False, dtype=int)  # Instantiate the encoder
encoded_sex = encoder.fit_transform(data[['sex']])  # Fit and transform the 'sex' column

data['sex'] = encoded_sex
data.rename(columns={'sex': 'is_male'}, inplace=True)

In [43]:
data

Unnamed: 0,age,is_male,diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
0,33,0,1,11.7,1.83222,0.893219,52.948840,654.282174,1262.000
1,81,0,1,,0.97266,2.037585,94.467030,209.488250,228.407
2,51,1,1,7.0,0.78039,0.145589,102.366000,461.141000,
3,61,1,1,8.0,0.70122,0.002805,60.579000,142.950000,
4,62,1,1,9.0,0.21489,0.000860,65.540000,41.088000,
...,...,...,...,...,...,...,...,...,...
585,68,1,3,,0.52026,7.058209,156.241000,525.178000,
586,71,0,3,,0.85956,8.341207,16.915000,245.947000,
587,63,1,3,,1.36851,7.674707,289.701000,537.286000,
588,75,0,3,,1.33458,8.206777,205.930000,722.523000,


In [48]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

X = data.drop(['diagnosis'], axis = 1)
y = data[['diagnosis']] - 1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

xgb_classifier = XGBClassifier()

xgb_classifier.fit(X_train, y_train)

In [49]:
y_pred = xgb_classifier.predict(X_test)

In [50]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7542372881355932


In [51]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[32  9  0]
 [10 22  7]
 [ 0  3 35]]


In [52]:
from sklearn.metrics import classification_report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.78      0.77        41
           1       0.65      0.56      0.60        39
           2       0.83      0.92      0.88        38

    accuracy                           0.75       118
   macro avg       0.75      0.76      0.75       118
weighted avg       0.75      0.75      0.75       118



In [None]:
# trying with HistGradientBoostingClassifier

In [81]:
from sklearn.ensemble import HistGradientBoostingClassifier

classifier = HistGradientBoostingClassifier()
classifier.fit(X_train, y_train)

In [83]:
y_pred = xgb_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Accuracy: 0.652542372881356
Confusion Matrix:
 [[27  5  9]
 [ 6 17 16]
 [ 1  4 33]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.66      0.72        41
           1       0.65      0.44      0.52        39
           2       0.57      0.87      0.69        38

    accuracy                           0.65       118
   macro avg       0.67      0.65      0.64       118
weighted avg       0.68      0.65      0.64       118



# Now with the means

In [54]:
# Overwrite NaN values in 'plasma_CA19_9' with mean
data['plasma_CA19_9'].fillna(data['plasma_CA19_9'].mean(), inplace=True)

# Overwrite NaN values in 'REG1A' with mean
data['REG1A'].fillna(data['REG1A'].mean(), inplace=True)

In [57]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

X = data.drop(['diagnosis'], axis = 1)
y = data[['diagnosis']] - 1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

xgb_classifier = XGBClassifier()

xgb_classifier.fit(X_train, y_train)

y_pred = xgb_classifier.predict(X_test)

# trying something else here

In [59]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


# Create a pipeline with two imputers and a classifier
pipeline = Pipeline([
    ('imputer_mean', SimpleImputer(strategy='mean')),  # Imputer with strategy='mean'
    ('imputer_median', SimpleImputer(strategy='median')),  # Imputer with strategy='median'
    ('standard_scaler', StandardScaler()),  # StandardScaler
    ('min_max_scaler', MinMaxScaler()),
    ('log_reg_classifier', LogisticRegression()),
    ('forest_classifier', RandomForestClassifier()),
    ('svc_classifier', SVC()),
    ('knn_classifier', KNeighborsClassifier()),
    ('gb_classifier', GradientBoostingClassifier())
])

# Fit the pipeline on training data
pipeline.fit(X_train, y_train)

# Evaluate the pipeline
accuracy = pipeline.score(X_test, y_test)
print("Accuracy:", accuracy)

TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'LogisticRegression()' (type <class 'sklearn.linear_model._logistic.LogisticRegression'>) doesn't

In [65]:
import pandas as pd
import numpy as np

# open data 

file_path = 'data/Debernardi et al 2020 data.csv'
df = pd.read_csv(file_path)

data = df[['age', 'sex', 'diagnosis', 'stage', 'benign_sample_diagnosis', 'plasma_CA19_9', 'creatinine', 'LYVE1', 'REG1B', 'TFF1', 'REG1A']]

In [66]:
data = data.drop(['stage', 'benign_sample_diagnosis'], axis =1)

from sklearn.preprocessing import OneHotEncoder

# Assuming 'sex' is a categorical column in your DataFrame

encoder = OneHotEncoder(drop='first', sparse_output=False, dtype=int)  # Instantiate the encoder
encoded_sex = encoder.fit_transform(data[['sex']])  # Fit and transform the 'sex' column

data['sex'] = encoded_sex
data.rename(columns={'sex': 'is_male'}, inplace=True)

In [78]:
# Overwrite NaN values in 'plasma_CA19_9' with mean
data['plasma_CA19_9'].fillna(data['plasma_CA19_9'].mean(), inplace=True)

# Overwrite NaN values in 'REG1A' with mean
data['REG1A'].fillna(data['REG1A'].mean(), inplace=True)

In [79]:
X = data.drop(['diagnosis'], axis = 1)
y = data[['diagnosis']] - 1

In [71]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert y_train and y_test to numpy arrays
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

# Create a pipeline with imputers, scalers, and classifiers
pipeline = Pipeline([
    ('imputer', SimpleImputer()),  # Imputer with default strategy (mean)
    ('scaler', StandardScaler()),  # StandardScaler
    ('classifier', LogisticRegression())  # Default classifier
])

# Define hyperparameters grid for GridSearchCV
param_grid = {
    'imputer__strategy': ['mean', 'median'],  # Imputer strategies to try
    'scaler': [StandardScaler(), MinMaxScaler()],  # Scalers to try
    'classifier': [LogisticRegression(), RandomForestClassifier(), GradientBoostingClassifier(), SVC(), KNeighborsClassifier()]  # Classifiers to try
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Best Model Accuracy:", accuracy)

# View classification report for the best model
print("Classification Report:")
print(classification_report(y_test, y_pred))

Best Model Accuracy: 0.711864406779661
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.71      0.72        41
           1       0.58      0.54      0.56        39
           2       0.81      0.89      0.85        38

    accuracy                           0.71       118
   macro avg       0.71      0.71      0.71       118
weighted avg       0.71      0.71      0.71       118



In [72]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert y_train and y_test to numpy arrays
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

# Create a pipeline with imputers, scalers, and classifiers
pipeline = Pipeline([
    ('imputer', SimpleImputer()),  # Imputer with default strategy (mean)
    ('scaler', StandardScaler()),  # StandardScaler
    ('classifier', LogisticRegression())  # Default classifier
])

# Define hyperparameters grid for GridSearchCV
param_grid = {
    'imputer__strategy': ['mean', 'median'],  # Imputer strategies to try
    'scaler': [StandardScaler(), MinMaxScaler()],  # Scalers to try
    'classifier': [LogisticRegression(), RandomForestClassifier(), GradientBoostingClassifier(), SVC(), KNeighborsClassifier()]  # Classifiers to try
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print out all the combinations that were tried
print("All combinations that were tried:")
results = grid_search.cv_results_
for mean_score, params in zip(results['mean_test_score'], results['params']):
    print(f"Mean accuracy: {mean_score:.3f} - Parameters: {params}")

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nBest Model Accuracy:", accuracy)

# View classification report for the best model
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

All combinations that were tried:
Mean accuracy: 0.604 - Parameters: {'classifier': LogisticRegression(), 'imputer__strategy': 'mean', 'scaler': StandardScaler()}
Mean accuracy: 0.551 - Parameters: {'classifier': LogisticRegression(), 'imputer__strategy': 'mean', 'scaler': MinMaxScaler()}
Mean accuracy: 0.631 - Parameters: {'classifier': LogisticRegression(), 'imputer__strategy': 'median', 'scaler': StandardScaler()}
Mean accuracy: 0.544 - Parameters: {'classifier': LogisticRegression(), 'imputer__strategy': 'median', 'scaler': MinMaxScaler()}
Mean accuracy: 0.663 - Parameters: {'classifier': RandomForestClassifier(), 'imputer__strategy': 'mean', 'scaler': StandardScaler()}
Mean accuracy: 0.652 - Parameters: {'classifier': RandomForestClassifier(), 'imputer__strategy': 'mean', 'scaler': MinMaxScaler()}
Mean accuracy: 0.686 - Parameters: {'classifier': RandomForestClassifier(), 'imputer__strategy': 'median', 'scaler': StandardScaler()}
Mean accuracy: 0.688 - Parameters: {'classifier': R

In [77]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert y_train and y_test to numpy arrays
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

# Create a pipeline with imputers, scalers, and classifiers
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # StandardScaler
    ('classifier', LogisticRegression())  # Default classifier
])

# Define hyperparameters grid for GridSearchCV
param_grid = {
    'scaler': [StandardScaler(), MinMaxScaler()],  # Scalers to try
    'classifier': [LogisticRegression(), RandomForestClassifier(), GradientBoostingClassifier()]  # Classifiers to try
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print out all the combinations that were tried
print("All combinations that were tried:")
results = grid_search.cv_results_
for mean_score, params in zip(results['mean_test_score'], results['params']):
    print(f"Mean accuracy: {mean_score:.3f} - Parameters: {params}")

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nBest Model Accuracy:", accuracy)

# View classification report for the best model
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

ValueError: 
All the 30 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1207, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\base.py", line 621, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1147, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 959, in check_array
    _assert_all_finite(
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 124, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 173, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 348, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\base.py", line 621, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1147, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 959, in check_array
    _assert_all_finite(
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 124, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 173, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\ensemble\_gb.py", line 416, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\base.py", line 621, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1147, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 959, in check_array
    _assert_all_finite(
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 124, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "C:\Users\ellis\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 173, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
GradientBoostingClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


Unnamed: 0,sample_id,patient_cohort,sample_origin,age,sex,diagnosis,stage,benign_sample_diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
0,S1,Cohort1,BPTB,33,F,1,,,11.7,1.83222,0.893219,52.948840,654.282174,1262.000
1,S10,Cohort1,BPTB,81,F,1,,,,0.97266,2.037585,94.467030,209.488250,228.407
2,S100,Cohort2,BPTB,51,M,1,,,7.0,0.78039,0.145589,102.366000,461.141000,
3,S101,Cohort2,BPTB,61,M,1,,,8.0,0.70122,0.002805,60.579000,142.950000,
4,S102,Cohort2,BPTB,62,M,1,,,9.0,0.21489,0.000860,65.540000,41.088000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,S549,Cohort2,BPTB,68,M,3,IV,,,0.52026,7.058209,156.241000,525.178000,
586,S558,Cohort2,BPTB,71,F,3,IV,,,0.85956,8.341207,16.915000,245.947000,
587,S560,Cohort2,BPTB,63,M,3,IV,,,1.36851,7.674707,289.701000,537.286000,
588,S583,Cohort2,BPTB,75,F,3,IV,,,1.33458,8.206777,205.930000,722.523000,
