In [13]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import pandas as pd

# Load the methylated / unmethylated 2-mer tables (tab-separated).
# Raw strings keep the Windows-style backslashes literal: in a normal string
# "\c" and "\w" are invalid escape sequences (SyntaxWarning on Python 3.12+).
Meth = pd.read_csv(
    r"processed_data\combined_data\with_background\combined_2mers_meth_with_background.tsv",
    sep="\t",
)
Unmeth = pd.read_csv(
    r"processed_data\combined_data\with_background\combined_2mers_unmeth_with_background.tsv",
    sep="\t",
)

# Drop the row whose index label is 306 from both tables.
# NOTE(review): the reason this row is excluded is not recorded here --
# TODO document it (presumably a bad or duplicated sample; confirm).
Meth = Meth.drop(306)
Unmeth = Unmeth.drop(306)

# Keep only the rows labelled "Healthy" in the "cancer" column.
Healthy_Meth = Meth.loc[Meth["cancer"] == "Healthy"]
Healthy_Unmeth = Unmeth.loc[Unmeth["cancer"] == "Healthy"]

# Class 0 = healthy methylated rows, class 1 = healthy unmethylated rows.
Data0 = Healthy_Meth
Data1 = Healthy_Unmeth
Combined = [Data0, Data1]

# Stack both classes and discard the last column before modelling.
# NOTE(review): assumes the last column is the non-feature "cancer" label --
# confirm against the TSV header.
X = pd.concat(Combined)
X = X.iloc[:, :-1]
y = [0] * Data0.shape[0] + [1] * Data1.shape[0]

# Split data into training and testing halves (fixed seed for repeatability).
# NOTE(review): the split is not stratified; consider stratify=y if the two
# classes differ in size.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# Pipeline: standardize the features, then fit an L1-regularized logistic
# regression (liblinear is used as the solver for the L1 penalty).
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(penalty='l1', solver='liblinear'),
)

# Hyperparameter grid: inverse regularization strength C over 1e-3 ... 1e3.
param_grid = {
    'logisticregression__C': np.logspace(-3, 3, 7),
}

# 10-fold cross-validated grid search over C, using all available cores.
grid_search = GridSearchCV(pipe, param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best pipeline found by the search (refit on the whole training split).
best_model = grid_search.best_estimator_

# Accuracy of the best pipeline on the training and the held-out halves.
train_accuracy = best_model.score(X_train, y_train)
test_accuracy = best_model.score(X_test, y_test)

print("Best model:", best_model)
print("Best parameter (C):", grid_search.best_params_)
print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)


Best model: Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=100.0, penalty='l1',
                                    solver='liblinear'))])
Best parameter (C): {'logisticregression__C': 100.0}
Train accuracy: 1.0
Test accuracy: 1.0
