In [1]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import pandas as pd
import os
from sklearn.decomposition import PCA 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, DetCurveDisplay, roc_auc_score
from joblib import dump, load

In [2]:
k_mers = ["2","2","4","4","6","6"]
checks = [True,False,True,False,True,False]
colors = ["b","g","r","c","m","y"]
model_probs = []

In [3]:
results = []
model_probs = []
for i in range(len(k_mers)):

    #os.chdir(os.pardir)
    Meth = pd.read_csv("processed_data\combined_data\with_background\combined_" + k_mers[i] + "mers_meth_with_background.tsv", sep="\t")
    Unmeth = pd.read_csv("processed_data\combined_data\with_background\combined_" + k_mers[i] + "mers_unmeth_with_background.tsv", sep="\t")
    Meth = Meth.drop(306)
    Unmeth = Unmeth.drop(306)
    Combined = [Meth,Unmeth]
    Healthy_Meth = Meth.loc[Meth["cancer"]=="Healthy"]
    Healthy_Unmeth = Unmeth.loc[Unmeth["cancer"]=="Healthy"]
    Cancer_Meth = Meth.loc[Meth["cancer"]!="Healthy"]
    Cancer_Unmeth = Unmeth.loc[Unmeth["cancer"]=="Healthy"]
    Data0 = Healthy_Meth
    Data1 = Healthy_Unmeth
    Combined = [Healthy_Meth,Healthy_Unmeth,Cancer_Meth,Cancer_Unmeth]
    X = pd.concat(Combined)
    X = X.iloc [:, :-1]
    dim_reduction = PCA()
    y = [0] * (Healthy_Meth.shape[0]+Healthy_Unmeth.shape[0]) + [1] * (Cancer_Meth.shape[0] + Cancer_Unmeth.shape[0])
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
    X_train = dim_reduction.fit_transform(X_train)
    X_test = dim_reduction.transform(X_test)

    # Create a pipeline with standardization and logistic regression with L1 regularization
    pipe = make_pipeline(StandardScaler(),LogisticRegression(penalty='l1', solver='liblinear',max_iter=1000))
    # Define hyperparameters grid for GridSearchCV
    param_grid = {
        'logisticregression__C': np.logspace(-3, 1, 100)  # Values for regularization parameter C
    }

    # Create GridSearchCV object
    grid_search = GridSearchCV(pipe, param_grid, cv=10, n_jobs=-1)
    # Fit the model using GridSearchCV
    grid_search.fit(X_train, y_train)



    # Get the best model from GridSearchCV
    best_model = grid_search.best_estimator_

    # Evaluate the best model
    train_accuracy = best_model.score(X_train, y_train)
    test_accuracy = best_model.score(X_test, y_test)
    num_selected_variables = np.sum(best_model.named_steps['logisticregression'].coef_ != 0)

    results.append([test_accuracy, grid_search.best_params_["logisticregression__C"], num_selected_variables,[k_mers[i],checks[i]]])


    plt.scatter(grid_search.cv_results_['param_logisticregression__C'], grid_search.cv_results_['mean_test_score'], color = colors[i])
    plt.plot(grid_search.cv_results_['param_logisticregression__C'], grid_search.cv_results_['mean_test_score'], color = colors [i])
    plt.xlabel("C")
    plt.ylabel("Accuracy")
    plt.savefig("Cross_Val " + k_mers[i] + "-mer, PCA = " + str(checks[i]))
    plt.close()
    
   
    dump(best_model,"Straight_Cross_Val_" + k_mers[i] + "_mer__PCA_=_" + str(checks[i]) + ".joblib")


    print("Best model:", best_model)
    print("Best parameter (C):", grid_search.best_params_)
    print("Train accuracy:", train_accuracy)
    print("Test accuracy:", test_accuracy)




Best model: Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.08697490026177834, max_iter=1000,
                                    penalty='l1', solver='liblinear'))])
Best parameter (C): {'logisticregression__C': 0.08697490026177834}
Train accuracy: 0.6125
Test accuracy: 0.6
Best model: Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=9.111627561154895, max_iter=1000,
                                    penalty='l1', solver='liblinear'))])
Best parameter (C): {'logisticregression__C': 9.111627561154895}
Train accuracy: 0.6166666666666667
Test accuracy: 0.6041666666666666
Best model: Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.049770235643321115, max_iter=1000,
                                    penalty='l1', solver='liblinear'))])
Be

In [4]:
C = []
N_var = []
Acc = []
k_mer = []
princ = []
for i in range(len(results)):
    C.append(results[i][1])
    Acc.append(results[i][0])
    N_var.append(results[i][2])
    k_mer.append(results[i][3][0])
    princ.append(results[i][3][1])
Extra = pd.DataFrame({'k-mer':k_mer, 'PCA': princ, 'C': C, 'Accuracy': Acc, 'Number of variables': N_var})
Extra

Unnamed: 0,k-mer,PCA,C,Accuracy,Number of variables
0,2,True,0.086975,0.6,6
1,2,False,9.111628,0.604167,16
2,4,True,0.04977,0.595833,12
3,4,False,0.04977,0.545833,15
4,6,True,0.114976,0.472917,174
5,6,False,0.065793,0.470833,74
