In [1]:
# import matplotlib as plt
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sb
import warnings as datawarnings
datawarnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
        
        

In [2]:
# df = pd.read_pickle('df_final_feature.pkl')
df = pd.read_pickle('df_final_features.pkl')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    int8   
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

# split data into train, dev, and test

In [5]:
X = df.drop(columns=['diagnosis'])
y = df['diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Model selction

Evaluate the performance of four different classification models:

SVC, XGBoost, Gradient Boosting, and Random Forest

since the data did not follow a normal distribution, models relying on normality assumptions were excluded.

Outputs: For each model, it prints:

The confusion matrix, which shows the performance of the model in terms of true and false predictions.

The classification report, which provides precision, recall, F1-score, and support for each class.

In [7]:
import sklearn.metrics as metrics 
def classificationMetrics(y, yhat):
    prf1 = metrics.precision_recall_fscore_support(y,yhat)
    res = {'Accuracy': metrics.accuracy_score(y,yhat),
           'Precision':prf1[0][1],
           'Recall': prf1[1][1],
           'f1-score': prf1[2][1],
           'Log-loss': metrics.log_loss(y,yhat),
           'AUC': metrics.roc_auc_score(y,yhat)
          }
    return res

In [8]:
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, log_loss, roc_auc_score

# Define the scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='macro'),
    'recall': make_scorer(recall_score, average='macro'),
    'f1': make_scorer(f1_score, average='macro'),
    'log_loss': make_scorer(log_loss, needs_proba=True),
    'auc': make_scorer(roc_auc_score, average='macro', multi_class='ovr')
}

In [9]:
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

models = [SVC(), XGBClassifier(), GradientBoostingClassifier(), RandomForestClassifier()]
model_names = ["SVM", "XGBoost", "Gradient Boosting", "Random Forest"]
models_list = pd.DataFrame()
for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    model_dict = {'model': name}

    print(f"\n\nmodel: {name}\n")
    print(confusion_matrix(y_test,y_pred))
    print()
    print(classification_report(y_test,y_pred))
    





model: SVM

[[108   0]
 [ 11  52]]

              precision    recall  f1-score   support

           0       0.91      1.00      0.95       108
           1       1.00      0.83      0.90        63

    accuracy                           0.94       171
   macro avg       0.95      0.91      0.93       171
weighted avg       0.94      0.94      0.93       171



model: XGBoost

[[107   1]
 [  2  61]]

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       108
           1       0.98      0.97      0.98        63

    accuracy                           0.98       171
   macro avg       0.98      0.98      0.98       171
weighted avg       0.98      0.98      0.98       171



model: Gradient Boosting

[[105   3]
 [  4  59]]

              precision    recall  f1-score   support

           0       0.96      0.97      0.97       108
           1       0.95      0.94      0.94        63

    accuracy                           0.96       

## Summary of model selection results
Overall Conclusion

**XGBoost has the highest overall** accuracy (0.98) and performs consistently well across all metrics (precision, recall, F1-score). It's the most balanced and robust model among the four.
SVM has slightly lower recall for Class 1, which could be a concern if capturing all positives in this class is crucial.
Gradient Boosting and Random Forest are also strong contenders, both achieving high accuracy and balanced precision and recall. However, they fall slightly short of XGBoost in overall performance.

**Recommendation: continue to the next step of hyperparameter tuning with XGBoost and SVC**

# Hyperparameters tuning

Defines Hyperparameter Grids: Specifies ranges of hyperparameters for SVC and XGBoost to be tuned.

Uses GridSearchCV: To find the best hyperparameters by evaluating all possible combinations using cross-validation.

Evaluates Models: After tuning, the models are evaluated on the test set, and results are printed, including confusion matrices and classification reports, to understand their performance.


In [13]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, KFold

**Define Hyperparameter Grids:**

In [15]:
svc_param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']}

xgb_param_grid = {
    'n_estimators': [100, 200],  # Number of trees to build
    'max_depth': [3, 4],         # Depth of each tree, generally controls overfitting
    'learning_rate': [0.1, 0.2],  # Step size at each iteration, balances learning
    'subsample': [0.8, 1.0],     # Fraction of samples to use for training each tree
    'colsample_bytree': [0.8, 1.0],  # Fraction of features to use for each tree
    'gamma': [0, 0.1],           # Regularization parameter, helps with model complexity
    'min_child_weight': [1, 3]   # Minimum sum of instance weight for a child, controls overfitting
}


**Perform Hyperparameter Tuning:**

In [17]:
# SVM Tuning
svc_cv = GridSearchCV(SVC(),svc_param_grid,refit=True)
svc_cv.fit(X_train, y_train)

# Retrieve Best Parameters
best_svc_params = svc_cv.best_params_
best_svc_estimators = svc_cv.best_estimator_

In [18]:
# XGBoost Tuning
# xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb = XGBClassifier()
xgb_cv = GridSearchCV(xgb, xgb_param_grid, refit=True)
xgb_cv.fit(X_train, y_train)

# Retrieve Best Parameters
best_xgb_params = xgb_cv.best_params_
best_xgb_estimators = xgb_cv.best_estimator_

In [19]:
print("SVC result:")
print(best_svc_params)
print(best_svc_estimators)

SVC result:
{'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(C=100, gamma=0.0001)


In [20]:
print("XGBoost result:")
print(best_xgb_params)
print(best_xgb_estimators)

XGBoost result:
{'colsample_bytree': 1.0, 'gamma': 0.1, 'learning_rate': 0.2, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.8}
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1.0,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.1, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.2, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)


**Evaluate Tuned Models:**

In [22]:
tuned_models = [xgb_cv, XGBClassifier(), svc_cv, SVC()]
tuned_model_names = ["Tuned_XGBoost","XGBoost", "Tuned_SVC", "SVC"] 
tuned_models_list = pd.DataFrame()
for model, name in zip(tuned_models, tuned_model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n\nmodel: {name}\n")
    print(f"\nTest:")
    print(confusion_matrix(y_test,y_pred))
    print()
    print(classification_report(y_test,y_pred))



model: Tuned_XGBoost


Test:
[[104   4]
 [  3  60]]

              precision    recall  f1-score   support

           0       0.97      0.96      0.97       108
           1       0.94      0.95      0.94        63

    accuracy                           0.96       171
   macro avg       0.95      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



model: XGBoost


Test:
[[107   1]
 [  2  61]]

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       108
           1       0.98      0.97      0.98        63

    accuracy                           0.98       171
   macro avg       0.98      0.98      0.98       171
weighted avg       0.98      0.98      0.98       171



model: Tuned_SVC


Test:
[[106   2]
 [  6  57]]

              precision    recall  f1-score   support

           0       0.95      0.98      0.96       108
           1       0.97      0.90      0.93        63

    accuracy               

## Tuning results

**XGBoost (not tuned)**

**shows the best overall performance** with high accuracy, precision, recall, and F1-scores for both classes. It appears to generalize well and could be the preferred model if you want high performance across the board.

**Tuned XGBoost** 

performs slightly worse than the not-tuned version, which suggests that the tuning might not have been effective.

**Tuned SVC** 

shows improvements over the untuned version and performs quite well, especially in precision for class 1. However, it has slightly lower recall for class 1 compared to the XGBoost models.

**SVC (not tuned)**

has lower overall performance, especially in terms of recall for malignant cases, which could be a concern in medical diagnostics.
Recommendations

**XGBoost (not tuned) appears to be the most robust model, achieving the highest accuracy and balanced performance metrics. Consider using this model if maximizing overall performance is the goal.**

Tuned XGBoost and Tuned SVC both offer good performance but with slightly different strengths and weaknesses.
Depending on your focus (e.g., precision vs. recall), you might choose between these models.

SVC (not tuned), while showing high precision for malignant cases, might be less reliable due to lower recall. 
In a medical context where detecting malignant cases accurately is crucial, XGBoost or Tuned SVC might be preferable due to their higher recall for malignant cases.

# cross-validation

**XGB cross-validation**

In [26]:
from sklearn.model_selection import cross_val_score

# Define your model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Perform cross-validation
cv_scores = cross_val_score(xgb_model, X, y, cv=5, scoring='accuracy')

# Print the cross-validation results
print(f'XGBoost Cross-validation scores: {cv_scores}')
print(f'Mean CV accuracy: {np.mean(cv_scores)}')
print(f'Standard deviation of CV accuracy: {np.std(cv_scores)}')

XGBoost Cross-validation scores: [0.97368421 0.95614035 0.99122807 0.98245614 0.98230088]
Mean CV accuracy: 0.9771619313771154
Standard deviation of CV accuracy: 0.011885244147814054


**Tuned XGB cross-validation**

In [28]:
from sklearn.model_selection import cross_val_score

# Define your model
xgb_model = xgb_cv

# Perform cross-validation
cv_scores = cross_val_score(xgb_model, X, y, cv=5, scoring='accuracy')

# Print the cross-validation results
print(f'XGBoost TUNED  Cross-validation scores: {cv_scores}')
print(f'Mean CV accuracy: {np.mean(cv_scores)}')
print(f'Standard deviation of CV accuracy: {np.std(cv_scores)}')

XGBoost TUNED  Cross-validation scores: [0.96491228 0.96491228 0.99122807 0.97368421 0.97345133]
Mean CV accuracy: 0.9736376339077782
Standard deviation of CV accuracy: 0.009609619188189153


**SVC cross-validation**

In [30]:
# Define your model
svc_model = SVC(kernel='rbf', C=1, gamma=0.1)

# Perform cross-validation
cv_scores = cross_val_score(svc_model, X, y, cv=5, scoring='accuracy')

# Print the cross-validation results
print(f'SVC Cross-validation scores: {cv_scores}')
print(f'Mean CV accuracy: {np.mean(cv_scores)}')
print(f'Standard deviation of CV accuracy: {np.std(cv_scores)}')

SVC Cross-validation scores: [0.62280702 0.62280702 0.63157895 0.63157895 0.62831858]
Mean CV accuracy: 0.6274181027790716
Standard deviation of CV accuracy: 0.003948679172659169


**Tuned SVC cross-validation**

In [32]:
# Define your model
svc_model = svc_cv

# Perform cross-validation
cv_scores = cross_val_score(svc_model, X, y, cv=5, scoring='accuracy')

# Print the cross-validation results
print(f'SVC TUNED Cross-validation scores: {cv_scores}')
print(f'Mean CV accuracy: {np.mean(cv_scores)}')
print(f'Standard deviation of CV accuracy: {np.std(cv_scores)}')

SVC TUNED Cross-validation scores: [0.90350877 0.93859649 0.94736842 0.93859649 0.92035398]
Mean CV accuracy: 0.9296848315478963
Standard deviation of CV accuracy: 0.0157720990301339


## cross-validation results
**Models:**

XGBoost (Tuned and Not Tuned)
SVC (Tuned and Not Tuned)

**Conclusion:**

<u>XGBoost:</u>

Both tuned and not-tuned models show excellent performance, with high mean accuracy scores and relatively low variability.
Tuning had a minor impact on the performance of XGBoost in this case. Both versions of XGBoost are performing similarly well.

<u>SVC:</u>

Tuning the SVC model significantly improved its performance. The not tuned model had much lower accuracy compared to the tuned version.
Performance variability is higher in the tuned SVC model but still maintains a good average accuracy.
Recommendations
XGBoost: Both versions perform well. If further improvement is needed, consider exploring additional hyperparameter tuning or advanced techniques such as feature engineering or ensemble methods.

SVC: The tuned SVC model is significantly better than the not tuned version. Continue using the tuned model for its improved performance. However, be mindful of the variability in performance and consider adjusting the hyperparameters further if necessary.

# Overall conclusion

## XGBoost consistently outperformed other models, making it the preferred choice 


### confusion matrix
Test:
[[107   1]
 [  2  61]]

### classification_report
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       108
           1       0.98      0.97      0.98        63

    accuracy                           0.98       171
   macro avg       0.98      0.98      0.98       171
weighted avg       0.98      0.98      0.98       171

### Cross-validation: 

scores: [0.97368421 0.95614035 0.99122807 0.98245614 0.98230088]

Mean CV accuracy: 0.9771619313771154

Standard deviation of CV accuracy: 0.011885244147814054