In [None]:
!pip install catboost

# Imports

In [2]:
# Basics
import pandas as pd
import numpy as np

# Visualizations
import plotly.express as px
import matplotlib.pyplot as plt

# CatBoost
from catboost import CatBoostClassifier
from catboost import Pool

# Dataset
from sklearn.datasets import load_breast_cancer

# Train test
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import confusion_matrix, f1_score

# Dataset

In [5]:
# Load data
data = load_breast_cancer()

# X
X = pd.DataFrame(data.data, columns=data.feature_names)
# y
y = data.target

In [6]:
# Missing values
X.isnull().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
dtype: int64

# Train Test Split

In [7]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'Train shapes: {X_train.shape} | {y_train.shape}')
print(f'Test shapes: {X_test.shape} | {y_test.shape}')

Train shapes: (455, 30) | (455,)
Test shapes: (114, 30) | (114,)


# Best variables

In [8]:
# Creating a Pool for training and validation sets
train_pool = Pool( data=X_train, label=y_train)
test_pool = Pool( data=X_test, label=y_test)

In [9]:
# Fit
model = CatBoostClassifier(iterations=500)
model.fit(train_pool, eval_set=test_pool, verbose=100)

Learning rate set to 0.035268
0:	learn: 0.6378119	test: 0.6355785	best: 0.6355785 (0)	total: 57.6ms	remaining: 28.8s
100:	learn: 0.0276004	test: 0.0846649	best: 0.0846649 (100)	total: 903ms	remaining: 3.57s
200:	learn: 0.0097813	test: 0.0876955	best: 0.0806486 (120)	total: 1.73s	remaining: 2.58s
300:	learn: 0.0064072	test: 0.0891089	best: 0.0806486 (120)	total: 2.54s	remaining: 1.68s
400:	learn: 0.0048285	test: 0.0860776	best: 0.0806486 (120)	total: 3.36s	remaining: 829ms
499:	learn: 0.0041032	test: 0.0842442	best: 0.0806486 (120)	total: 4.34s	remaining: 0us

bestTest = 0.08064857592
bestIteration = 120

Shrink model to first 121 iterations.


<catboost.core.CatBoostClassifier at 0x7f3c4c325280>

In [10]:
# Predict
preds = model.predict(X_test)
f1_score(y_test, preds)

0.9722222222222222

In [19]:
# Feature importances to dataframe
feature_importances = (
    pd.DataFrame({'feature': data.feature_names, 
                  'importance': model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
# Plot
px.bar(feature_importances,
       x= data.feature_names, y=model.feature_importances_,
       height=600, width=1000).update_layout(xaxis={'categoryorder':'total descending'})

Looking at those importances, we could try a model with the top 11 features, with the y axis > 3, eliminating those where the importance have a big drop, so we can make our model simpler.

In [24]:
# Simpler model
features = feature_importances.feature[:10]
# Creating a Pool for training and validation sets
train_pool2 = Pool( data=X_train[features], label=y_train)
test_pool2 = Pool( data=X_test[features], label=y_test)

# Model 
model2 = CatBoostClassifier(iterations=600)
model2.fit(train_pool2, eval_set=test_pool2, verbose=100)

# Score
preds2 = model2.predict(test_pool2)
f1_score(y_test, preds2)

Learning rate set to 0.032579
0:	learn: 0.6459214	test: 0.6461827	best: 0.6461827 (0)	total: 15.1ms	remaining: 9.02s
100:	learn: 0.0316963	test: 0.0815492	best: 0.0815492 (100)	total: 916ms	remaining: 4.52s
200:	learn: 0.0121172	test: 0.0782300	best: 0.0776804 (155)	total: 1.4s	remaining: 2.78s
300:	learn: 0.0064701	test: 0.0753114	best: 0.0751282 (296)	total: 1.72s	remaining: 1.71s
400:	learn: 0.0045198	test: 0.0756171	best: 0.0749792 (304)	total: 2.05s	remaining: 1.02s
500:	learn: 0.0035669	test: 0.0769357	best: 0.0749792 (304)	total: 2.38s	remaining: 470ms
599:	learn: 0.0031642	test: 0.0773270	best: 0.0749792 (304)	total: 2.69s	remaining: 0us

bestTest = 0.07497916933
bestIteration = 304

Shrink model to first 305 iterations.


0.979020979020979

As observed, we didn't have a major drop in test score. So let's move on with this simpler model.

# Threshold
As we are working with medical diagnosis, we should not be very tolerant to false negatives. We would want our model to say the pacient is healthy only if we have a huge certainty that he is actually healthy. Otherwise, we might want this person to take another test.
But we know that Catboost algorithm uses the 50% threshold to predict the outcome. Meaning that, if the probability is under 50%, the pacient will be diagnosed as negative for breast cancer. But we can tweak that number to make it give a negative prediction only for a higher amount of certainty.

Let's see how's that done.

In [39]:
# Regular predictions
default_preds = pd.DataFrame(model2.predict_proba(test_pool2).round(3))
default_preds['classification'] = model2.predict(test_pool2)
default_preds.sample(10)

Unnamed: 0,0,1,classification
49,0.001,0.999,1
2,0.999,0.001,0
67,1.0,0.0,0
82,0.634,0.366,0
86,0.995,0.005,0
59,0.0,1.0,1
34,0.0,1.0,1
103,0.999,0.001,0
52,0.123,0.877,1
5,1.0,0.0,0


In [40]:
from catboost.utils import select_threshold
# Finding the right threshold
print(select_threshold(model2, test_pool2, FNR=0.01))

0.1420309044590601


Now that we know which threshold gives us the false negative rate of 1% only, we can run the predictions and make the final classification.

In [42]:
from logging import error
# Creat function to predict with new threshold
def predict_threshold(df, threshold, rate_type):
  '''Input the dataframe with the explaining variable and have the classifications returned
  * df: dataframe = Explanatory variables
  * FNR: float = amount allowed for False Negatives
  * FPR: float = amount allowed for False positives 
  Returns: predictions array'''
  
  # Check either it's false or positive rate threshold
  if rate_type == "FNR":
    thld = select_threshold(model2, df, FNR=threshold)
    print(thld)
    #Predict
    predictions = model2.predict_proba(df)
    negative_proba = np.array( [predictions[i][0] for i in range(len(predictions))] )
    # Calculate predictions with threshold
    predictions_threshold = [0 if pred >= (1-thld) else 1 for pred in negative_proba]

  elif rate_type == "FPR":
    thld = select_threshold(model2, df, FPR=threshold)
    #Predict
    predictions = model2.predict_proba(df)
    positive_proba = np.array( [predictions[i][1] for i in range(len(predictions))] )
    # Calculate predictions with threshold
    predictions_threshold = [1 if pred >= (1-thld) else 0 for pred in positive_proba]

  else:
    raise ValueError('rate_type must be either "FPR" or "FNR"')

  #Return
  return predictions_threshold


In [44]:
# Predict
new_predictions = predict_threshold(test_pool2, threshold= 0.01, rate_type="FNR")
normal_predictions = model2.predict(test_pool2)

0.1420309044590601


In [45]:
# Confusion Matrix 50% standard threshold
pd.DataFrame( confusion_matrix(y_true=y_test, y_pred=normal_predictions) )

Unnamed: 0,0,1
0,41,2
1,1,70


In [46]:
# Confusion Matrix 1% of false negatives allowed threshold
pd.DataFrame( confusion_matrix(y_true=y_test, y_pred=new_predictions) )

Unnamed: 0,0,1
0,40,3
1,0,71
