In [1]:
import sys
sys.path.append('../scripts')

In [65]:
from open_file import open_file
import data_preprocessing
import model_training
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Pretrial 1:
## Simplest model performance

In [62]:
file_path = '../data/Debernardi et al 2020 data.csv'
df = pd.read_csv(file_path)

In [66]:
#choose subset without null values and assign X / y

data = df[['age', 'creatinine', 'LYVE1', 'REG1B', 'TFF1', 'diagnosis']]
X = data.iloc[:,:-1]
y = data.iloc[:,-1:]

In [77]:
from sklearn.utils.validation import column_or_1d

# Step 1: Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert y_train and y_test to 1D arrays
y_train = column_or_1d(y_train)
y_test = column_or_1d(y_test)

# Step 2: Instantiate the model and fit
clf = LogisticRegression(max_iter=10000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Step 3: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.6186440677966102
              precision    recall  f1-score   support

           1       0.67      0.59      0.62        41
           2       0.47      0.44      0.45        39
           3       0.70      0.84      0.76        38

    accuracy                           0.62       118
   macro avg       0.61      0.62      0.61       118
weighted avg       0.61      0.62      0.61       118

[[24 16  1]
 [ 9 17 13]
 [ 3  3 32]]


#### My finished model must have an accuracy of better than 61.8%

# Pretrial 2:
## Change classification task to BINARY

In [79]:
def classify_diagnosis(value):
    if value in [1, 2]:
        return 'no cancer'
    else:
        return 'cancer'

# Apply the classification function to each value in the 'diagnosis' column
y = y['diagnosis'].apply(classify_diagnosis)

In [80]:
y.value_counts()

diagnosis
no cancer    391
cancer       199
Name: count, dtype: int64

In [81]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Instantiate the model
clf = LogisticRegression()

# Step 3: Fit the model
clf.fit(X_train, y_train)

# Step 4: Predict on test data
y_pred = clf.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Other evaluation metrics
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.7966101694915254
              precision    recall  f1-score   support

      cancer       0.68      0.68      0.68        38
   no cancer       0.85      0.85      0.85        80

    accuracy                           0.80       118
   macro avg       0.77      0.77      0.77       118
weighted avg       0.80      0.80      0.80       118

[[26 12]
 [12 68]]


#### My finished model must have an accuracy of better than 79.7%

# Pretrial 3:
## Model with mean replacement. Increase complexity slightly

In [87]:
data = df[['age', 'plasma_CA19_9', 'creatinine', 'LYVE1', 'REG1B', 'TFF1', 'REG1A', 'diagnosis']]

#filled with means

# Create a copy of the DataFrame
data_copy = data.copy()

# Fill missing values in 'plasma_CA19_9' column with its mean
data_copy['plasma_CA19_9'].fillna(data_copy['plasma_CA19_9'].mean(), inplace=True)

# Fill missing values in 'REG1A' column with its mean
data_copy['REG1A'].fillna(data_copy['REG1A'].mean(), inplace=True)

def classify_diagnosis(value):
    if value in [1, 2]:
        return 'no cancer'
    else:
        return 'cancer'

# Apply the classification function to each value in the 'diagnosis' column
y = y['diagnosis'].apply(classify_diagnosis)

X = data_copy.iloc[:,:-1]

In [91]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Instantiate the model
clf = LogisticRegression(max_iter = 1000)

# Step 3: Fit the model
clf.fit(X_train, y_train)

# Step 4: Predict on test data
y_pred = clf.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Other evaluation metrics
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8050847457627118
              precision    recall  f1-score   support

      cancer       0.68      0.74      0.71        38
   no cancer       0.87      0.84      0.85        80

    accuracy                           0.81       118
   macro avg       0.78      0.79      0.78       118
weighted avg       0.81      0.81      0.81       118

[[28 10]
 [13 67]]


#### Simple model has improve drastically from the beginning
#### My finished model should have an accuracy better than 80.5%

# Trial 1: 
# Optimize Missing Value Imputation / Scaling

Accuracy has been chosen as a quick way to determine these early model's performance

In [60]:
df = open_file()
df

Unnamed: 0,age,is_male,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A,diagnosis
0,33,0,11.7,1.83222,0.893219,52.948840,654.282174,1262.000,1
1,81,0,,0.97266,2.037585,94.467030,209.488250,228.407,1
2,51,1,7.0,0.78039,0.145589,102.366000,461.141000,,1
3,61,1,8.0,0.70122,0.002805,60.579000,142.950000,,1
4,62,1,9.0,0.21489,0.000860,65.540000,41.088000,,1
...,...,...,...,...,...,...,...,...,...
585,68,1,,0.52026,7.058209,156.241000,525.178000,,3
586,71,0,,0.85956,8.341207,16.915000,245.947000,,3
587,63,1,,1.36851,7.674707,289.701000,537.286000,,3
588,75,0,,1.33458,8.206777,205.930000,722.523000,,3


In [4]:
#iterate over missing value imputations functions
#iterate over scaling / normalization functions
# using a standard:
    # feature selection
    # data split
    # log reg classifier
    
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import Lasso

missing_value_functions = data_preprocessing.missing_value_functions
scaling_normalization_functions = data_preprocessing.scaling_normalization_functions
feature_selection_functions = data_preprocessing.feature_selection_functions

for missing_func, missing_desc in missing_value_functions.items():
    imputed_df = missing_func(df)
    
    X = imputed_df.iloc[:, :-1]
    y = imputed_df.iloc[:, -1:]
    
    def classify_diagnosis(value):
        if value in [1, 2]:
            return '0'
        else:
            return '1'

    # Apply the classification function to each value in the 'diagnosis' column
    y = y['diagnosis'].apply(classify_diagnosis)
    
    for scaling_func, scaling_desc in scaling_normalization_functions.items():
        X_transformed_data = scaling_func(X)
        X_transformed_df = pd.DataFrame(X_transformed_data, columns=imputed_df.columns[:-1])
        
        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(X_transformed_df, y, test_size=0.2, random_state=42)
        
        logistic_regression = LogisticRegression(max_iter=1000)
        logistic_regression.fit(X_train, y_train)

        y_pred = logistic_regression.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        print(f"Missing Value Imputation: {missing_desc}")
        print(f'Scaling/Normalization: {scaling_desc}')

        print(f'Accuracy: {accuracy}')
        print("\n" + "-"*50 + "\n")  # Add a separator between dataframes

Missing Value Imputation: Remove Missing Values
Scaling/Normalization: Min-Max Scaling
Accuracy: 0.8050847457627118

--------------------------------------------------

Missing Value Imputation: Remove Missing Values
Scaling/Normalization: Robust Scaling
Accuracy: 0.8050847457627118

--------------------------------------------------

Missing Value Imputation: Remove Missing Values
Scaling/Normalization: Quantile Transformation
Accuracy: 0.8389830508474576

--------------------------------------------------

Missing Value Imputation: Remove Missing Values
Scaling/Normalization: Log Transformation
Accuracy: 0.8305084745762712

--------------------------------------------------

Missing Value Imputation: Replace with Mean
Scaling/Normalization: Min-Max Scaling
Accuracy: 0.8220338983050848

--------------------------------------------------

Missing Value Imputation: Replace with Mean
Scaling/Normalization: Robust Scaling
Accuracy: 0.8050847457627118

-------------------------------------

## Results:
**Best Missing Value Handler**
* KNN Imputer
* replace with mean also seems robust across different scalers
* remove columns seems unstable with different setups

**Best Scaler / Normalization**
* quantile transformation
* log transformation seems pretty good
* min max scale / robust scale performed worse

# Trial 2:
# Optimize Feature Selection

In [5]:
#iterate over feature selection methods 
# using a standard:
    # knn replacement
    # min max scaler
    # train test split
    # log reg classifier

imputed_df = data_preprocessing.knn_missing(df)

X = imputed_df.iloc[:, :-1]
y = imputed_df.iloc[:, -1:]

y = y['diagnosis'].apply(classify_diagnosis)

In [6]:
X_scaled = data_preprocessing.min_max_scaler(X)
X_df_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [7]:
# for evaluating and printing results
def train_logistic_regression(X, y, selected_features):
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X[selected_features], y, test_size=0.2, random_state=42)

    # Initialize and train the logistic regression model
    logistic_regression = LogisticRegression(max_iter=1000)
    logistic_regression.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = logistic_regression.predict(X_test)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

In [8]:
# Feature selection using RFE
selected_features_rfe = data_preprocessing.rfe_feature(X_df_scaled, y, n_features_to_select=5)

# Train logistic regression using RFE selected features
accuracy_rfe = train_logistic_regression(X_df_scaled, y, selected_features_rfe)
print(selected_features_rfe)
print(f'RFE Accuracy: {accuracy_rfe}')

Index(['creatinine', 'LYVE1', 'REG1B', 'TFF1', 'REG1A'], dtype='object')
RFE Accuracy: 0.847457627118644


In [9]:
# Feature selection using Lasso
selected_features_lasso = data_preprocessing.lasso_feature(X_df_scaled, y, alpha=0.003)
len(X_df_scaled[selected_features_lasso].columns)

7

In [10]:
# Train logistic regression using Lasso selected features
accuracy_lasso = train_logistic_regression(X_df_scaled, y, selected_features_lasso)
print(f'Lasso Accuracy: {accuracy_lasso}')

Lasso Accuracy: 0.8220338983050848


In [11]:
# Feature selection using PCA
pca = data_preprocessing.pca_feature(X_df_scaled, n_components=7)
X_pca_transformed = pca.transform(X_df_scaled)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_pca_transformed, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
logistic_regression = LogisticRegression(max_iter=1000)
logistic_regression.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logistic_regression.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

print(f'PCA Accuracy: {accuracy}')

PCA Accuracy: 0.8305084745762712


## Results:
**Best Missing Value Handler**
* RFE

**RFE**
* 84.7%
* 5 features is the best with this setup

**Lasso**
* 82.2%
* (>6) is the best with this setup
* alpha < 0.003

**PCA**
* 83.1%
* PC6 - PC 7 best
* not viable as PCA

# Trial 3:
# Optimize Data Split

In [12]:
#iterate over data split methods
# using a standard:
    # knn replacement
    # min max scaler
    # rfe feature selection of 5
    # log reg classifier


In [13]:
print(X_df_scaled)
print(y)

          age  is_male  plasma_CA19_9  creatinine     LYVE1     REG1B  \
0    0.111111      0.0       0.000377    0.437326  0.037383  0.037715   
1    0.873016      0.0       0.039329    0.225627  0.085284  0.067288   
2    0.396825      1.0       0.000226    0.178273  0.006089  0.072915   
3    0.555556      1.0       0.000258    0.158774  0.000112  0.043150   
4    0.571429      1.0       0.000290    0.038997  0.000031  0.046684   
..        ...      ...            ...         ...       ...       ...   
585  0.666667      1.0       0.003497    0.114206  0.295438  0.111290   
586  0.714286      0.0       0.062715    0.197772  0.349142  0.012048   
587  0.587302      1.0       0.065407    0.323120  0.321244  0.206354   
588  0.777778      0.0       0.015365    0.314763  0.343515  0.146684   
589  0.761905      1.0       0.048000    0.356546  0.343272  0.293424   

         TFF1     REG1A  
0    0.049030  0.095606  
1    0.015698  0.017304  
2    0.034557  0.014277  
3    0.010712  0.01

In [14]:
# Feature selection using RFE
selected_features_rfe = data_preprocessing.rfe_feature(X_df_scaled, y, n_features_to_select=5)

In [15]:
data_splitting_functions = data_preprocessing.data_splitting_functions

for split_func, split_desc in data_splitting_functions.items():           
    # Check if using Train Test Split
    if split_func == data_preprocessing.train_test_datasplit:
       
        X_train, X_test, y_train, y_test = split_func(X_df_scaled[selected_features_rfe], y, test_size = 0.2)

         # Initialize and train the logistic regression model
        logistic_regression = LogisticRegression(max_iter=1000)
        logistic_regression.fit(X_train, y_train)
        y_pred = logistic_regression.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        print(f'Train Test Split: {accuracy}')
        
    # Else using Repeated K Fold
    # n_splits = 5, n_repeats = 2
    else: 
        splits = list(split_func(X_df_scaled[selected_features_rfe], y))
        
        for fold, (train_index, test_index) in enumerate(splits):
            # Split data into train and test sets
            X_train, X_test = X_df_scaled[selected_features_rfe].iloc[train_index], X_df_scaled[selected_features_rfe].iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Initialize and train the logistic regression model
            logistic_regression = LogisticRegression(max_iter=1000)
            logistic_regression.fit(X_train, y_train)
            y_pred = logistic_regression.predict(X_test)
            fold_accuracy = accuracy_score(y_test, y_pred)
            fold_accuracies = []
            fold_accuracies.append(fold_accuracy)

            print(f'Repeated K Fold Fold {fold+1} Accuracy: {fold_accuracy}')

        # Calculate and print average accuracy across all folds
        avg_accuracy = sum(fold_accuracies) / len(fold_accuracies)
        print(f'Average Accuracy across all folds: {avg_accuracy}')
   

Train Test Split: 0.847457627118644
Repeated K Fold Fold 1 Accuracy: 0.847457627118644
Repeated K Fold Fold 2 Accuracy: 0.7542372881355932
Repeated K Fold Fold 3 Accuracy: 0.8050847457627118
Repeated K Fold Fold 4 Accuracy: 0.7288135593220338
Repeated K Fold Fold 5 Accuracy: 0.8220338983050848
Repeated K Fold Fold 6 Accuracy: 0.8389830508474576
Repeated K Fold Fold 7 Accuracy: 0.7711864406779662
Repeated K Fold Fold 8 Accuracy: 0.7372881355932204
Repeated K Fold Fold 9 Accuracy: 0.7711864406779662
Repeated K Fold Fold 10 Accuracy: 0.8050847457627118
Average Accuracy across all folds: 0.8050847457627118


## Results:
**Data Splitting**
* Train test split produced better results

**Train Test Split**
* 84.7%
* much less computational resources

**Repeated K Fold**
* 80.5%
* Will continue to experiment, but may not be worth the effort due to increased computational complexity

# Trial 4:
# Choose best model

In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
#iterate over classification models
# using a standard:
    # knn replacement
    # min max scaler
    # rfe feature selection of 5
    # train test split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_df_scaled[selected_features_rfe], y, test_size = 0.2, random_state = 42)

In [39]:
# Iterate through all models
for model_func, (param_grid, model_name) in models_params.items():
    # Perform Grid Search to find the best hyperparameters
    grid_search = GridSearchCV(model_func(X_train, X_test, y_train, y_test), param_grid, cv=3, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    
    # Get the best hyperparameters and the corresponding accuracy
    best_params = grid_search.best_params_
    best_accuracy = grid_search.best_score_
    
    # Train the model with the best hyperparameters
    best_model = model_func(X_train, X_test, y_train, y_test, **best_params)
    best_model.fit(X_train, y_train)
    
    # Evaluate the model on the test set
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    # Print the results
    print(f'{model_name}:')
    print(f'Best Hyperparameters: {best_params}')
    print(f'Accuracy: {accuracy}')
    print("\n" + "-"*50 + "\n")  # Add a separator between models


Logistic Regression:
Best Hyperparameters: {'max_iter': 100}
Accuracy: 0.847457627118644

--------------------------------------------------

Random Forest:
Best Hyperparameters: {'max_depth': 5, 'n_estimators': 50}
Accuracy: 0.8305084745762712

--------------------------------------------------

Gradient Boosting:
Best Hyperparameters: {'learning_rate': 0.01, 'n_estimators': 200}
Accuracy: 0.8135593220338984

--------------------------------------------------

Support Vector Machine:
Best Hyperparameters: {'C': 1.0, 'kernel': 'rbf'}
Accuracy: 0.8389830508474576

--------------------------------------------------

K-Nearest Neighbors:
Best Hyperparameters: {'n_neighbors': 3}
Accuracy: 0.788135593220339

--------------------------------------------------



## Results:

**Classification Models**

**Logistic Regression:**
* Best Results

**Random Forest:**
* Decent Results

**Gradient Boosting:**
* Decent Results

**Support Vector Machine:**
* Decent Reults

**K-Nearest Neighbors:**
* Worst Results

**Very general look at the models and will have to be iterated through with other data_preprocessing steps included to find the best outcome**