In [88]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.svm import SVC
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier

In [89]:
df = pd.read_csv("data.csv")
df.drop(columns=["id"], inplace=True)
df.sample(5)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
176,B,9.904,18.06,64.6,302.4,0.09699,0.1294,0.1307,0.03716,0.1669,...,11.26,24.39,73.07,390.2,0.1301,0.295,0.3486,0.0991,0.2614,0.1162
12,M,19.17,24.8,132.4,1123.0,0.0974,0.2458,0.2065,0.1118,0.2397,...,20.96,29.94,151.7,1332.0,0.1037,0.3903,0.3639,0.1767,0.3176,0.1023
364,B,13.4,16.95,85.48,552.4,0.07937,0.05696,0.02181,0.01473,0.165,...,14.73,21.7,93.76,663.5,0.1213,0.1676,0.1364,0.06987,0.2741,0.07582
239,M,17.46,39.28,113.4,920.6,0.09812,0.1298,0.1417,0.08811,0.1809,...,22.51,44.87,141.2,1408.0,0.1365,0.3735,0.3241,0.2066,0.2853,0.08496
426,B,10.48,14.98,67.49,333.6,0.09816,0.1013,0.06335,0.02218,0.1925,...,12.13,21.57,81.41,440.4,0.1327,0.2996,0.2939,0.0931,0.302,0.09646


In [90]:
imp_features = [
    "radius_mean",
    "texture_mean",
    "perimeter_mean",
    "area_mean",
    "smoothness_mean",
    "compactness_mean",
    "concavity_mean",
    "concave points_mean",
    "symmetry_mean",
    "fractal_dimension_mean"
]

[
    "radius_mean",
    "texture_mean",
    "perimeter_mean",
    "area_mean",
    "smoothness_mean",
    "compactness_mean",
    "concavity_mean",
    "concave points_mean",
    "symmetry_mean",
    "fractal_dimension_mean"
]


In [91]:
skewness = df.skew(numeric_only=True)
high_skew_cols = skewness[abs(skewness) > 1].index.tolist()
high_skew_cols = [col for col in imp_features if col in high_skew_cols]
print(high_skew_cols)

['area_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'fractal_dimension_mean']


In [92]:
X = df.drop("diagnosis", axis=1)
y = df["diagnosis"]
y = y.replace({'B':0, 'M':1})
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=780, shuffle=True)
rest_cols = X.drop(high_skew_cols, axis=1)
rest_cols.drop(columns=['radius_mean', 'texture_mean', 'perimeter_mean', 'texture_worst'], axis=1, inplace=True)
# X.columns


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [93]:
# These columns are rest of high skew columns
rest_cols_test = [col for col in imp_features if col not in high_skew_cols]
print(rest_cols_test)
# These columns have high scaled data, so we'll use minmax scaling on them
remove_list = ['radius_mean', 'texture_mean', 'perimeter_mean', 'texture_worst']
rest_cols_test = [col for col in rest_cols_test if col not in remove_list]
print(rest_cols_test)


['radius_mean', 'texture_mean', 'perimeter_mean', 'smoothness_mean', 'symmetry_mean']
['smoothness_mean', 'symmetry_mean']


In [94]:
trf = ColumnTransformer(transformers=[
	('log_transform', FunctionTransformer(np.log1p), high_skew_cols),
	('minmax_scaler', MinMaxScaler(), ['radius_mean', 'texture_mean', 'perimeter_mean']),
	('std_scaler', StandardScaler(), rest_cols_test)

], remainder='drop')

In [95]:
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.2,
    max_depth=6, #10
    subsample=0.7,
    colsample_bytree=0.6,
    min_child_weight=1,
    gamma=0.1,
    reg_alpha=0,
    reg_lambda=1,
    tree_method="hist",
    eval_metric="logloss"
)
gbc = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=12,
    learning_rate=0.1511391862713393,
    min_samples_split=18,
    min_samples_leaf=12,
    subsample=0.5667448143125229,
    max_features='sqrt',
    min_weight_fraction_leaf=0.03155619870376579,
    max_leaf_nodes=27,
    min_impurity_decrease=0.06913906881688014,
    ccp_alpha=1.1886265397500649e-05,
)

"""
Best Parameters:
  n_estimators: 100
  max_depth: 12
  learning_rate: 0.1511391862713393
  min_samples_split: 18
  min_samples_leaf: 12
  subsample: 0.5667448143125229
  max_features: sqrt
  min_weight_fraction_leaf: 0.03155619870376579
  max_leaf_nodes: 27
  min_impurity_decrease: 0.06913906881688014
  ccp_alpha: 1.1886265397500649e-05

"""

'\nBest Parameters:\n  n_estimators: 100\n  max_depth: 12\n  learning_rate: 0.1511391862713393\n  min_samples_split: 18\n  min_samples_leaf: 12\n  subsample: 0.5667448143125229\n  max_features: sqrt\n  min_weight_fraction_leaf: 0.03155619870376579\n  max_leaf_nodes: 27\n  min_impurity_decrease: 0.06913906881688014\n  ccp_alpha: 1.1886265397500649e-05\n\n'

In [96]:
pipeline = Pipeline([
	('trf', trf),
	# ('smote', SMOTE()),
	('model', gbc)
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(scores.mean())

0.9314547430523211


In [97]:
print(confusion_matrix(y_test, y_pred)) # no, yes 
print(recall_score(y_test, y_pred, pos_label=1))

[[71  2]
 [ 2 39]]
0.9512195121951219


In [100]:
import joblib
joblib.dump(pipeline.named_steps['model'], 'model.pkl')
joblib.dump(pipeline.named_steps['trf'], 'scaler.pkl')

['scaler.pkl']

In [None]:
# import optuna
# from optuna.samplers import TPESampler
# from sklearn.model_selection import StratifiedKFold, cross_val_score
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.metrics import recall_score, make_scorer, accuracy_score, confusion_matrix, classification_report
# import numpy as np

# # Custom scorer - prioritizing recall for cancer detection
# recall_scorer = make_scorer(recall_score, pos_label=1)

# def objective(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 800, step=50),
#         'max_depth': trial.suggest_int('max_depth', 3, 15),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
#         'min_samples_split': trial.suggest_int('min_samples_split', 2, 30),
#         'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
#         'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.3),
#         'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 10, 100),
#         'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0.0, 0.1),
#         'ccp_alpha': trial.suggest_float('ccp_alpha', 0.0, 0.05),  # Pruning parameter
#         'validation_fraction': 0.1,
#         'n_iter_no_change': 10,  # Early stopping
#         'tol': 1e-4,
#         'random_state': 42
#     }
    
#     model = GradientBoostingClassifier(**params)
    
#     # Stratified K-Fold for imbalanced data
#     cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
#     # Use recall as primary metric
#     scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=recall_scorer, n_jobs=-1)
    
#     return scores.mean()

# # Create study
# study = optuna.create_study(
#     direction='maximize',
#     sampler=TPESampler(seed=42),
#     pruner=optuna.pruners.MedianPruner(n_warmup_steps=10)
# )

# # Run optimization
# study.optimize(
#     objective,
#     n_trials=150,        # GBC is slower, so fewer trials
#     timeout=3600,        # 1 hour timeout
#     show_progress_bar=True
# )

# # Results
# print("=" * 60)
# print(f"Best Recall Score: {study.best_value:.4f}")
# print("=" * 60)
# print("Best Parameters:")
# for key, value in study.best_params.items():
#     print(f"  {key}: {value}")

# # Train final model with best params
# best_params = study.best_params.copy()
# best_params['random_state'] = 42
# best_params['validation_fraction'] = 0.1
# best_params['n_iter_no_change'] = 10

# best_gbc = GradientBoostingClassifier(**best_params)
# best_gbc.fit(X_train, y_train)

# # Evaluate
# y_pred = best_gbc.predict(X_test)
# print("\n" + "=" * 60)
# print("Final Evaluation:")
# print(f"Accuracy:  {accuracy_score(y_test, y_pred):.4f}")
# print(f"Recall:    {recall_score(y_test, y_pred, pos_label=1):.4f}")
# print(f"\nConfusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
# print(f"\nClassification Report:\n{classification_report(y_test, y_pred, digits=4)}")

# # Visualization
# optuna.visualization.plot_optimization_history(study).show()
# optuna.visualization.plot_param_importances(study).show()