# Librerías

In [None]:
# Instalar la librería CatBoost

!pip install catboost



In [None]:
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, Pool
from catboost.datasets import titanic
from catboost.utils import get_gpu_device_count

# Generar dataset y particiones

In [None]:
# Build a synthetic classification dataset with scikit-learn's generator.
# `data` is the tuple returned by make_classification; later cells read
# data[0] as the feature matrix and data[1] as the labels.
data = make_classification(
    n_samples=10000,
    n_features=10,
    n_informative=6,
    n_redundant=0,
    random_state=0,
)
# Show the first feature row as a quick sanity check.
data[0][:1]

array([[ 1.04393981,  0.80049521,  2.1372495 ,  0.80725272, -1.76199261,
        -0.84779888, -1.18563387,  0.06822139, -0.73341224,  1.24358963]])

In [None]:
# Hold out 25% of the samples for testing; the fixed seed keeps the split reproducible.
# `*data` unpacks the tuple into its two elements (data[0], data[1]).
train_X, test_X, train_y, test_y = train_test_split(
    *data,
    test_size=0.25,
    random_state=0,
)

# Comparación de performance entre modelos
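Cada modelo se entrena con sus hiperparámetros por defecto (solo se fija la semilla aleatoria) y se compara su AUC sobre la partición de test.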

In [None]:
# Logistic regression baseline: fit on the training split, then score the
# test split with ROC AUC.
lr = LogisticRegression(random_state=0).fit(train_X, train_y)
lr_preds = lr.predict_proba(test_X)
# Only the second probability column (index 1) is passed to the AUC metric.
lr_auc = roc_auc_score(test_y, lr_preds[:, 1])
print("El desempeño de una regresión logística es:", f"{lr_auc:.2f}")

El desempeño de una regresión logística es: 0.87


In [None]:
# Gradient boosting (scikit-learn): fit on the training split, then score the
# test split with ROC AUC.
gbt = GradientBoostingClassifier(random_state=0).fit(train_X, train_y)
gbt_preds = gbt.predict_proba(test_X)
# Only the second probability column (index 1) is passed to the AUC metric.
gbt_auc = roc_auc_score(test_y, gbt_preds[:, 1])
print("El desempeño de un gradient boosting es:", f"{gbt_auc:.2f}")

El desempeño de un gradient boosting es: 0.95


# CatBoost

In [None]:
# CatBoost classifier: fit on the training split, then score the test split
# with ROC AUC.

cbc = CatBoostClassifier(random_state=0)
# verbose=0 silences the per-iteration training log.
cbc.fit(train_X, train_y, verbose=0)
# Score the test split exactly once and keep the result for the metric.
cbc_preds = cbc.predict_proba(test_X)
# Only the second probability column (index 1) is passed to the AUC metric.
cbc_auc = roc_auc_score(test_y, cbc_preds[:, 1])
print("El desempeño de un catboost es:", "{:.2f}".format(cbc_auc))

El desempeño de un catboost es: 0.98


### CatBoost con datos heterogéneos

In [None]:
# Load the Titanic train/test DataFrames through catboost.datasets.

train_df, test_df = titanic()
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Count missing values per column and report only the columns that have any.
null_value_stats = train_df.isna().sum()
print(null_value_stats.loc[null_value_stats.ne(0)])

# Replace every missing value with -999, a sentinel meant to fall outside the
# range of the real data. Both frames are modified in place.
train_df.fillna(value=-999, inplace=True)
test_df.fillna(value=-999, inplace=True)

Age         177
Cabin       687
Embarked      2
dtype: int64


In [None]:
# Target is the `Survived` column; the features are every other column.
y = train_df['Survived']
X = train_df.drop(columns='Survived')

# Carve a validation set out of the training data (75% train, the rest validation).
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=42,
)

In [None]:
# Treat every column whose dtype is `category` or `object` as categorical.
cat_features = [
    column
    for column, dtype in X_train.dtypes.items()
    if dtype.name in ('category', 'object')
]

# Wrap the training and validation splits in CatBoost Pool objects, passing the
# same categorical column list to both.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
valid_pool = Pool(data=X_validation, label=y_validation, cat_features=cat_features)

In [None]:
# Hyperparameters for the CatBoost model trained on the Titanic data.
params = {'loss_function': 'Logloss',
          'eval_metric': 'AUC',
          'cat_features': cat_features,
          # Train on GPU only when CatBoost can see a usable device; otherwise
          # fall back to CPU so the notebook still runs on machines without one.
          # NOTE: CPU and GPU training may yield slightly different metrics.
          'task_type': 'GPU' if get_gpu_device_count() > 0 else 'CPU',
          # Logging period; the fit call in the next cell passes its own `verbose`.
          'verbose': 200,
          'random_seed': 0,
          'iterations': 1000,
          'early_stopping_rounds': 200}

# Build the (still unfitted) classifier from the parameter dictionary.
catboost_model = CatBoostClassifier(**params)

In [None]:
# Train on the training pool while evaluating on the validation pool.
print("Entrenando el modelo ... \n")
catboost_model.fit(
    train_pool,
    eval_set=valid_pool,
    use_best_model=True,  # keep the model at the best eval-set iteration
    verbose=100,          # print a progress line every 100 iterations
)
print("\nSe ha hecho fit del modelo")

Entrenando el modelo ... 

Learning rate set to 0.073692
0:	learn: 0.7648840	test: 0.7718430	best: 0.7718430 (0)	total: 75.4ms	remaining: 1m 15s
100:	learn: 0.8865803	test: 0.8992956	best: 0.9027754 (86)	total: 4.65s	remaining: 41.4s
200:	learn: 0.8918996	test: 0.9025658	best: 0.9029851 (191)	total: 8.87s	remaining: 35.2s
300:	learn: 0.8975142	test: 0.9048298	best: 0.9048298 (297)	total: 12.8s	remaining: 29.7s
400:	learn: 0.9060765	test: 0.9070099	best: 0.9073453 (394)	total: 16.7s	remaining: 24.9s
500:	learn: 0.9089957	test: 0.9054167	best: 0.9075968 (424)	total: 20.5s	remaining: 20.4s
600:	learn: 0.9118101	test: 0.9064229	best: 0.9075968 (424)	total: 24.2s	remaining: 16.1s
bestTest = 0.9075968266
bestIteration = 424
Shrink model to first 425 iterations.

Se ha hecho fit del modelo


In [None]:
# Feature importance: print one "name: score" line per feature, highest score first.
feature_names = X_train.columns
feature_importances = catboost_model.get_feature_importance(train_pool)
# Sorting the (score, name) pairs in reverse puts the largest scores on top.
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print(f'{name}: {score}')

Sex: 45.37530916091101
Ticket: 13.456363952172055
Pclass: 13.392484516286162
Age: 9.220809005493265
Fare: 5.375663443436288
PassengerId: 4.308462494892371
Embarked: 3.239776975375722
Cabin: 3.156052956304829
SibSp: 1.219844721947187
Name: 0.825181536844796
Parch: 0.43005123633633097
