## Use GPU

In [35]:
!nvidia-smi

Fri Aug 18 19:50:34 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P0    29W /  70W |    131MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [36]:
!pip install catboost



# Import Libraries

In [37]:
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# EDA

In [38]:
data = pd.read_csv("https://raw.githubusercontent.com/pooja2512/Adult-Census-Income/master/adult.csv")
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [39]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [40]:
columns_with_q_mark = [col for col in data.columns if (data[col] == "?").any()]
columns_with_q_mark

['workclass', 'occupation', 'native.country']

In [41]:
for col in columns_with_q_mark:
  data[col] = data[col].replace("?" , "Unkknown")

In [42]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Unkknown,77053,HS-grad,9,Widowed,Unkknown,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,Unkknown,186061,Some-college,10,Widowed,Unkknown,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [43]:
data["income"].value_counts(normalize=True) * 100

<=50K    75.919044
>50K     24.080956
Name: income, dtype: float64

# Model

In [44]:
X , y = data.drop(["income"], axis=1), data["income"]

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=34589, stratify=y)

In [46]:
X_train.shape, X_test.shape

((26048, 14), (6513, 14))

In [47]:
cat_features = list(X_train.select_dtypes(include="object").columns)
cat_features

['workclass',
 'education',
 'marital.status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native.country']

In [48]:
train_pool = Pool(X_train, y_train, cat_features = cat_features)
test_pool = Pool(X_test, y_test, cat_features = cat_features)

In [49]:
model = CatBoostClassifier(
    n_estimators=300,
    task_type = "GPU")

In [52]:
param_dist = {
    "learning_rate": [0.03, 0.1, 0.3],
    "depth": [3, 5, 7],
    "l2_leaf_reg": [1, 3, 5]
}

In [53]:
search_result = model.randomized_search(
    param_dist, train_pool,
    cv=3, n_iter= 10,
    partition_random_seed= 548574, verbose=1)

0:	learn: 0.4804988	test: 0.4804323	best: 0.4804323 (0)	total: 42.4ms	remaining: 12.7s
1:	learn: 0.4102101	test: 0.4102591	best: 0.4102591 (1)	total: 80.2ms	remaining: 12s
2:	learn: 0.3751384	test: 0.3758885	best: 0.3758885 (2)	total: 109ms	remaining: 10.8s
3:	learn: 0.3512458	test: 0.3511141	best: 0.3511141 (3)	total: 134ms	remaining: 9.94s
4:	learn: 0.3403176	test: 0.3403817	best: 0.3403817 (4)	total: 164ms	remaining: 9.66s
5:	learn: 0.3317684	test: 0.3317538	best: 0.3317538 (5)	total: 202ms	remaining: 9.89s
6:	learn: 0.3258417	test: 0.3264275	best: 0.3264275 (6)	total: 232ms	remaining: 9.7s
7:	learn: 0.3207672	test: 0.3216888	best: 0.3216888 (7)	total: 257ms	remaining: 9.38s
8:	learn: 0.3185424	test: 0.3195656	best: 0.3195656 (8)	total: 280ms	remaining: 9.07s
9:	learn: 0.3156246	test: 0.3161867	best: 0.3161867 (9)	total: 304ms	remaining: 8.81s
10:	learn: 0.3123519	test: 0.3136092	best: 0.3136092 (10)	total: 328ms	remaining: 8.61s
11:	learn: 0.3104949	test: 0.3120250	best: 0.3120250 

In [54]:
search_result["params"]

{'depth': 5, 'l2_leaf_reg': 5, 'learning_rate': 0.3}