In [1]:
!pip install --force-reinstall catboost
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.datasets import fetch_openml # Correcting typo here: 'datasetes' to 'datasets'

Collecting catboost
  Using cached catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Using cached graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting matplotlib (from catboost)
  Using cached matplotlib-3.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting numpy<2.0,>=1.16.0 (from catboost)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting pandas>=0.24 (from catboost)
  Using cached pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting scipy (from catboost)
  Using cached scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting plotly (from catboost)
  Using cached plotly-6.0.1-py3-none-any.whl.metadata (6.7 kB)
Collecting six (from catboost)
  Using cached six-1.17.0-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting python-dateu

In [10]:
#load titanic dataset from openml
titanic = fetch_openml(name='titanic', version=1, as_frame=True)
df= titanic.data
df['survived']=titanic.target  #add target column

#display frist 5 row
print(df.head())

   pclass                                             name     sex      age  \
0       1                    Allen, Miss. Elisabeth Walton  female  29.0000   
1       1                   Allison, Master. Hudson Trevor    male   0.9167   
2       1                     Allison, Miss. Helen Loraine  female   2.0000   
3       1             Allison, Mr. Hudson Joshua Creighton    male  30.0000   
4       1  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female  25.0000   

   sibsp  parch  ticket      fare    cabin embarked boat   body  \
0      0      0   24160  211.3375       B5        S    2    NaN   
1      1      2  113781  151.5500  C22 C26        S   11    NaN   
2      1      2  113781  151.5500  C22 C26        S  NaN    NaN   
3      1      2  113781  151.5500  C22 C26        S  NaN  135.0   
4      1      2  113781  151.5500  C22 C26        S  NaN    NaN   

                         home.dest survived  
0                     St Louis, MO        1  
1  Montreal, PQ / Chesterville

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   int64   
 1   name       1309 non-null   object  
 2   sex        1309 non-null   category
 3   age        1046 non-null   float64 
 4   sibsp      1309 non-null   int64   
 5   parch      1309 non-null   int64   
 6   ticket     1309 non-null   object  
 7   fare       1308 non-null   float64 
 8   cabin      295 non-null    object  
 9   embarked   1307 non-null   category
 10  boat       486 non-null    object  
 11  body       121 non-null    float64 
 12  home.dest  745 non-null    object  
 13  survived   1309 non-null   category
dtypes: category(3), float64(3), int64(3), object(5)
memory usage: 116.8+ KB


In [16]:
#drop column with too many missing values or irrelevant info
df= df.drop(columns=[col for col in ['name', 'ticket','cabin', 'boat', 'body', 'home.dest'] if col in df.columns])

# The list comprehension above filters the columns to be dropped
# only if they exist in the current DataFrame (df).
# This way, it avoids KeyError by only attempting to
# drop existing columns.

#fill missing values
df['age'].fillna(df['age'].median(), inplace=True)
df['fare'].fillna(df['fare'].median(), inplace=True)
df['embarked'].fillna(df['embarked'].mode()[0], inplace=True)

#convet categorical feature to string (catboost handles them automatically)
cat_features= ['sex', 'embarked', 'pclass']
df[cat_features]=df[cat_features].astype(str)
df['age']=df['age'].astype(int)
df['fare']=df['fare'].astype(int)
#split data into features and target
X=df.drop(columns=['survived'])
y=df['survived']

# train test split
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['fare'].fillna(df['fare'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   pclass    1309 non-null   object  
 1   sex       1309 non-null   object  
 2   age       1309 non-null   int64   
 3   sibsp     1309 non-null   int64   
 4   parch     1309 non-null   int64   
 5   fare      1309 non-null   int64   
 6   embarked  1309 non-null   object  
 7   survived  1309 non-null   category
dtypes: category(1), int64(4), object(3)
memory usage: 73.1+ KB


In [30]:
#initialize catboost classifier
catboost_model =CatBoostClassifier(iterations=200,learning_rate=0.1,depth=6,cat_features=cat_features ,verbose=100 , random_seed=42)

#train model
catboost_model.fit(x_train,y_train)

0:	learn: 0.6558254	total: 4.62ms	remaining: 920ms
100:	learn: 0.3382045	total: 416ms	remaining: 408ms
199:	learn: 0.2726543	total: 930ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x79182023e890>

In [31]:

#predictions
y_pred = catboost_model.predict(x_test)
#accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:')
#classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.7824
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.92      0.82       144
           1       0.87      0.61      0.72       118

    accuracy                           0.78       262
   macro avg       0.81      0.77      0.77       262
weighted avg       0.80      0.78      0.78       262



In [None]:
#why use carboost for this
-handles categorical feathers automatically (no needfor one-hot encoding)
-performs well with misssing data
-bossing algorithm id faster and more accurate
-efficient and scalable for large datasets
-provides feature importance analysis
-supports GPU acceleration for faster training