# Model building and Testing

In [16]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression as lr
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.model_selection import train_test_split

### Pre requisites

In [17]:
data = pd.read_csv("breast+cancer+wisconsin+diagnostic\data_resampled.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,ID,Diagnosis,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
0,0,895100,1,20.34,21.51,135.9,1264.0,0.117,0.1875,0.2565,...,25.3,31.86,171.1,1938.0,0.1592,0.4492,0.5344,0.2685,0.5558,0.1024
1,1,905686,0,11.89,21.17,76.39,433.8,0.09773,0.0812,0.02555,...,13.05,27.21,85.09,522.9,0.1426,0.2187,0.1164,0.08263,0.3075,0.07351
2,2,884180,1,19.4,23.5,129.1,1155.0,0.1027,0.1558,0.2049,...,21.65,30.53,144.9,1417.0,0.1463,0.2968,0.3458,0.1564,0.292,0.07614
3,3,859983,1,13.8,15.79,90.43,584.1,0.1007,0.128,0.07789,...,16.57,20.86,110.3,812.4,0.1411,0.3542,0.2779,0.1383,0.2589,0.103
4,4,871122,0,12.06,12.74,76.84,448.6,0.09311,0.05241,0.01972,...,13.14,18.41,84.08,532.8,0.1275,0.1232,0.08636,0.07025,0.2514,0.07898


In [18]:
data = data.drop('Unnamed: 0', axis=1)
data.head()

Unnamed: 0,ID,Diagnosis,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave points1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
0,895100,1,20.34,21.51,135.9,1264.0,0.117,0.1875,0.2565,0.1504,...,25.3,31.86,171.1,1938.0,0.1592,0.4492,0.5344,0.2685,0.5558,0.1024
1,905686,0,11.89,21.17,76.39,433.8,0.09773,0.0812,0.02555,0.02179,...,13.05,27.21,85.09,522.9,0.1426,0.2187,0.1164,0.08263,0.3075,0.07351
2,884180,1,19.4,23.5,129.1,1155.0,0.1027,0.1558,0.2049,0.08886,...,21.65,30.53,144.9,1417.0,0.1463,0.2968,0.3458,0.1564,0.292,0.07614
3,859983,1,13.8,15.79,90.43,584.1,0.1007,0.128,0.07789,0.05069,...,16.57,20.86,110.3,812.4,0.1411,0.3542,0.2779,0.1383,0.2589,0.103
4,871122,0,12.06,12.74,76.84,448.6,0.09311,0.05241,0.01972,0.01963,...,13.14,18.41,84.08,532.8,0.1275,0.1232,0.08636,0.07025,0.2514,0.07898


In [20]:
data.shape

(422, 27)

### Train Test Split

In [22]:
X = data.drop(columns=['Diagnosis', 'ID'])
y = data['Diagnosis']
X.head() 

Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave points1,symmetry1,radius2,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
0,20.34,21.51,135.9,1264.0,0.117,0.1875,0.2565,0.1504,0.2569,0.5702,...,25.3,31.86,171.1,1938.0,0.1592,0.4492,0.5344,0.2685,0.5558,0.1024
1,11.89,21.17,76.39,433.8,0.09773,0.0812,0.02555,0.02179,0.2019,0.2747,...,13.05,27.21,85.09,522.9,0.1426,0.2187,0.1164,0.08263,0.3075,0.07351
2,19.4,23.5,129.1,1155.0,0.1027,0.1558,0.2049,0.08886,0.1978,0.5243,...,21.65,30.53,144.9,1417.0,0.1463,0.2968,0.3458,0.1564,0.292,0.07614
3,13.8,15.79,90.43,584.1,0.1007,0.128,0.07789,0.05069,0.1662,0.2787,...,16.57,20.86,110.3,812.4,0.1411,0.3542,0.2779,0.1383,0.2589,0.103
4,12.06,12.74,76.84,448.6,0.09311,0.05241,0.01972,0.01963,0.159,0.1822,...,13.14,18.41,84.08,532.8,0.1275,0.1232,0.08636,0.07025,0.2514,0.07898


In [32]:
X_Train, X_Test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### Model Fitting and testing

In [33]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from xgboost import XGBClassifier

In [38]:
models = {
    'Logistic Regression' : lr(),
    'Descision Tree' : dt(),
    'Random Forest' : rf(),
    'SVC' : SVC(),
    'XGB Classifier' : XGBClassifier()
}

results = []
for name, model in models.items():
    model.fit(X_Train, y_train)
    y_pred = model.predict(X_Test)
    results.append({
        'Model' : name,
        'Accuracy' : accuracy_score(y_test, y_pred),
        'Precision' : precision_score(y_test, y_pred, pos_label=1),
        'Recall' : recall_score(y_test, y_pred, pos_label=1),
        'F1 Score' : f1_score(y_test, y_pred, pos_label=1)
    })

scoreboard = pd.DataFrame(results)
scoreboard

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.924528,0.957447,0.882353,0.918367
1,Descision Tree,0.962264,0.960784,0.960784,0.960784
2,Random Forest,0.95283,0.979167,0.921569,0.949495
3,SVC,0.867925,0.911111,0.803922,0.854167
4,XGB Classifier,0.962264,0.979592,0.941176,0.96


XGBoost and Descision Tree Classifers had the best F1 score. So they will now be used during hyperparameter training

### Hyperparameter training

In [31]:
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

In [44]:
def objective_dt(trial):
    params = {
        'max_depth' : trial.suggest_int('max_depth', 2, 50),
        'min_samples_split' : trial.suggest_int('min_samples_split', 2, 50 ),
        'min_samples_leaf' : trial.suggest_int('min_samples_leaf', 2, 50),
        'min_weight_fraction_leaf' : trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5),
        'max_features' : trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    }

    model = dt(**params, random_state=42)
    score = cross_val_score(model, X_Train, y_train, cv=5, scoring=make_scorer(f1_score, pos_label="M")).mean()
    return score




In [40]:
import optuna.visualization as vis



In [45]:
study = optuna.create_study(direction="maximize")
study.optimize(objective_dt, n_trials=50)

op_his = vis.plot_optimization_history(study)
op_his.show()
par_imp = vis.plot_param_importances(study)
par_imp.show()

best_score = study.best_value 
best_params = study.best_params    

print("Best recall:", best_score)
print("Best parameters:", best_params)


[I 2025-09-28 19:37:20,929] A new study created in memory with name: no-name-0cea0b0a-666e-49fe-8936-dcb93a920dce
Traceback (most recent call last):
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\metrics\_scorer.py", line 152, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\metrics\_scorer.py", line 400, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\metrics\_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_response.py", line 207, in _get_response_values
    raise ValueError(
ValueError: pos_label=M is not a valid label: It should be one of [0 1]

Traceback (most recent call l

ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.

In [None]:
def objective_xgb(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 3000),
        "max_depth": trial.suggest_int("max_depth", 2, 20),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 5)
    }

    model = XGBClassifier(**params, random_state=42, eval_metric='logloss')
    score = cross_val_score(model, X_Train, y_train, cv=5, scoring=make_scorer(f1_score, pos_label="M")).mean()
    return score
