# Census Income Adult Data - Catboost Classification

## Imports

In [484]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load Train Data

In [485]:
column_names = ["age", "work_class", "final_weight", "education", "education_num", 
                "marital_status", "occupation", "relationship", "race","sex","capital_gain",
               "capital_loss", "hours_per_week", "native_country","income_over_50k"]

In [486]:
train = pd.read_csv('adult.data', names=column_names)

In [487]:
train.head(2)

Unnamed: 0,age,work_class,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_over_50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [488]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1   work_class       32561 non-null  object
 2   final_weight     32561 non-null  int64 
 3   education        32561 non-null  object
 4   education_num    32561 non-null  int64 
 5   marital_status   32561 non-null  object
 6   occupation       32561 non-null  object
 7   relationship     32561 non-null  object
 8   race             32561 non-null  object
 9   sex              32561 non-null  object
 10  capital_gain     32561 non-null  int64 
 11  capital_loss     32561 non-null  int64 
 12  hours_per_week   32561 non-null  int64 
 13  native_country   32561 non-null  object
 14  income_over_50k  32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


## Data Cleaning - Train data

In [489]:
# Check the unique values
train['work_class'].unique()
# it seems there is a space before each element

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [490]:
# Let's remove the space at the beginning of the work_class column elements
train['work_class'] = train['work_class'].str.lstrip()
# Let's drop the question marks
train = train[train['work_class'] != '?']

In [491]:
# Let's remove the space at the beginning fpr education column
train['education'] = train['education'].str.lstrip()

In [492]:
# Let's remove the space at the beginning of the marital_status column
train['marital_status'] = train['marital_status'].str.lstrip()

In [493]:
# Let's remove the space at the beginning of the occupation column 
train['occupation'] = train['occupation'].str.lstrip()
# Let's drop the columns with question marks
train = train[train['occupation'] != '?']

In [494]:
# Let's remove additional space of the relationship column
train['relationship'] = train['relationship'].str.lstrip()

In [495]:
# Let's remove additional space of the race column
train['race'] = train['race'].str.lstrip()

In [496]:
# Let's remove additional space of the sex column
train['sex'] = train['sex'].str.lstrip()

In [497]:
# Let's remove additional space and drop the columns with question marks from Native_country column
train['native_country'] = train['native_country'].str.lstrip()
train = train[train['native_country'] != '?']

In [498]:
# Let's remove additional space from income_over_50k column
train['income_over_50k'] = train['income_over_50k'].str.lstrip()
train['income_over_50k'].unique()

array(['<=50K', '>50K'], dtype=object)

In [499]:
# Convert income_over_50k values into 0s and 1s
train['income_over_50k'] = train['income_over_50k'].replace({'<=50K': 0, '>50K': 1})

In [500]:
train.head(2)

Unnamed: 0,age,work_class,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_over_50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0


## Load Test Data

In [501]:
test = pd.read_csv('adult.test',names=column_names)

In [502]:
test.head(2)

Unnamed: 0,age,work_class,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_over_50k
0,|1x3 Cross validator,,,,,,,,,,,,,,
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.


In [503]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16282 entries, 0 to 16281
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              16282 non-null  object 
 1   work_class       16281 non-null  object 
 2   final_weight     16281 non-null  float64
 3   education        16281 non-null  object 
 4   education_num    16281 non-null  float64
 5   marital_status   16281 non-null  object 
 6   occupation       16281 non-null  object 
 7   relationship     16281 non-null  object 
 8   race             16281 non-null  object 
 9   sex              16281 non-null  object 
 10  capital_gain     16281 non-null  float64
 11  capital_loss     16281 non-null  float64
 12  hours_per_week   16281 non-null  float64
 13  native_country   16281 non-null  object 
 14  income_over_50k  16281 non-null  object 
dtypes: float64(5), object(10)
memory usage: 1.9+ MB


In [504]:
# Remove the first row which is wrongly created
test = test.drop([0])

In [505]:
test.head(2)

Unnamed: 0,age,work_class,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_over_50k
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.


In [506]:
# Remove additional space at the beginning of the string
test['work_class'] = test['work_class'].str.lstrip()
test['education'] = test['education'].str.lstrip()
test['marital_status'] = test['marital_status'].str.lstrip()
test['occupation'] = test['occupation'].str.lstrip()
test['relationship'] = test['relationship'].str.lstrip()
test['race'] = test['race'].str.lstrip()
test['sex'] = test['sex'].str.lstrip()
test['native_country'] = test['native_country'].str.lstrip()
test['income_over_50k'] = test['income_over_50k'].str.lstrip()

In [507]:
# Let's drop the rows with question marks
test = test[test['work_class'] != '?']
test = test[test['occupation'] != '?']
test = test[test['native_country'] != '?']

In [508]:
# convert the target column values to 0s and 1s
test['income_over_50k'] = test['income_over_50k'].replace({'<=50K.': 0, '>50K.': 1})

In [509]:
test['income_over_50k'].unique()

array([0, 1], dtype=int64)

### Dummy Variables

### Training Data

In [510]:
tn_work_class = pd.get_dummies(train['work_class'], prefix='wc')
tn_education = pd.get_dummies(train['education'], prefix='edu')
tn_marital_status = pd.get_dummies(train['marital_status'], prefix='mari')
tn_occupation = pd.get_dummies(train['occupation'], prefix='occup')
tn_relationship = pd.get_dummies(train['relationship'], prefix='rela')
tn_race = pd.get_dummies(train['race'], prefix='race')
tn_native_country = pd.get_dummies(train['native_country'], prefix='nat')

In [511]:
# Rename the sex variable to fit the new variable as Male or not
train.rename(columns={'sex':'male'}, inplace=True)

In [512]:
# Converting the male into 1 and Female into 0
train['male'] = train['male'].replace({'Male': 1, 'Female': 0})

In [513]:
# Let's drop the columns that we have already created Dummies of
col_drop = ['work_class','education','marital_status','occupation','relationship','race','native_country']
train = train.drop(col_drop, axis=1)

In [514]:
# join the Dummy variables with the DF
join_col = [tn_work_class,tn_education,tn_marital_status,tn_occupation,tn_relationship,tn_race,tn_native_country]
train = train.join(join_col)

In [515]:
train.head(2)

Unnamed: 0,age,final_weight,education_num,male,capital_gain,capital_loss,hours_per_week,income_over_50k,wc_Federal-gov,wc_Local-gov,...,nat_Portugal,nat_Puerto-Rico,nat_Scotland,nat_South,nat_Taiwan,nat_Thailand,nat_Trinadad&Tobago,nat_United-States,nat_Vietnam,nat_Yugoslavia
0,39,77516,13,1,2174,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,1,0,0,13,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### Test Data

In [516]:
# Repeat the same and create Dummy Variables for for Test Data
ts_work_class = pd.get_dummies(test['work_class'], prefix='wc')
ts_education = pd.get_dummies(test['education'], prefix='edu')
ts_marital_status = pd.get_dummies(test['marital_status'], prefix='mari')
ts_occupation = pd.get_dummies(test['occupation'], prefix='occup')
ts_relationship = pd.get_dummies(test['relationship'], prefix='rela')
ts_race = pd.get_dummies(test['race'], prefix='race')
ts_native_country = pd.get_dummies(test['native_country'], prefix='nat')

In [517]:
test.rename(columns={'sex':'male'}, inplace=True)

In [518]:
test['male'] = test['male'].replace({'Male': 1, 'Female': 0})

In [519]:
col_drop = ['work_class','education','marital_status','occupation','relationship','race','native_country']
test = test.drop(col_drop, axis=1)

In [520]:
join_col1 = [ts_work_class,ts_education,ts_marital_status,ts_occupation,ts_relationship,ts_race,ts_native_country]
test = test.join(join_col1)

In [521]:
test.head(2)

Unnamed: 0,age,final_weight,education_num,male,capital_gain,capital_loss,hours_per_week,income_over_50k,wc_Federal-gov,wc_Local-gov,...,nat_Portugal,nat_Puerto-Rico,nat_Scotland,nat_South,nat_Taiwan,nat_Thailand,nat_Trinadad&Tobago,nat_United-States,nat_Vietnam,nat_Yugoslavia
1,25,226802.0,7.0,1,0.0,0.0,40.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,89814.0,9.0,1,0.0,0.0,50.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


- We can notice that the training data has 104 columns whereas testing data has only 103 columns.
- Let's check which column

In [522]:
# Checking the column as not matching
test_cols = test.columns
train_cols = train.columns
different = []
for i in train_cols:
    if i not in test_cols:
        different.append(i)
        
different

['nat_Holand-Netherlands']

In [523]:
# Since nat_Holand-Netherlands is only available in training data, 
# Lets, drop the rows that contains nat_Holand-Netherlands as 1
# And drop the nat_Holand-Netherlands column all together
train = train[train['nat_Holand-Netherlands'] != 1]
train = train.drop('nat_Holand-Netherlands', axis=1)

In [524]:
train.head(2)

Unnamed: 0,age,final_weight,education_num,male,capital_gain,capital_loss,hours_per_week,income_over_50k,wc_Federal-gov,wc_Local-gov,...,nat_Portugal,nat_Puerto-Rico,nat_Scotland,nat_South,nat_Taiwan,nat_Thailand,nat_Trinadad&Tobago,nat_United-States,nat_Vietnam,nat_Yugoslavia
0,39,77516,13,1,2174,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,1,0,0,13,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Split the Data

In [525]:
X_train = train.drop('income_over_50k', axis=1)
y_train = train['income_over_50k']
X_test = test.drop('income_over_50k', axis=1)
y_test = test['income_over_50k']

In [526]:
X_train.head(2)

Unnamed: 0,age,final_weight,education_num,male,capital_gain,capital_loss,hours_per_week,wc_Federal-gov,wc_Local-gov,wc_Private,...,nat_Portugal,nat_Puerto-Rico,nat_Scotland,nat_South,nat_Taiwan,nat_Thailand,nat_Trinadad&Tobago,nat_United-States,nat_Vietnam,nat_Yugoslavia
0,39,77516,13,1,2174,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,1,0,0,13,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [527]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(30161, 102)
(30161,)
(15060, 102)
(15060,)


## Create and Train the Model

In [528]:
from catboost import CatBoostClassifier

In [529]:
cb_model = CatBoostClassifier()

In [530]:
cb_model.fit(X_train, y_train)

Learning rate set to 0.044122
0:	learn: 0.6529138	total: 16.9ms	remaining: 16.9s
1:	learn: 0.6109888	total: 28.1ms	remaining: 14s
2:	learn: 0.5772290	total: 37.6ms	remaining: 12.5s
3:	learn: 0.5496532	total: 47.3ms	remaining: 11.8s
4:	learn: 0.5241371	total: 57.2ms	remaining: 11.4s
5:	learn: 0.5029628	total: 66.9ms	remaining: 11.1s
6:	learn: 0.4815837	total: 76.7ms	remaining: 10.9s
7:	learn: 0.4660410	total: 86.4ms	remaining: 10.7s
8:	learn: 0.4521784	total: 96.1ms	remaining: 10.6s
9:	learn: 0.4389611	total: 106ms	remaining: 10.5s
10:	learn: 0.4267121	total: 115ms	remaining: 10.4s
11:	learn: 0.4148187	total: 125ms	remaining: 10.3s
12:	learn: 0.4067762	total: 135ms	remaining: 10.2s
13:	learn: 0.3989914	total: 145ms	remaining: 10.2s
14:	learn: 0.3940333	total: 154ms	remaining: 10.1s
15:	learn: 0.3873155	total: 164ms	remaining: 10.1s
16:	learn: 0.3809712	total: 174ms	remaining: 10s
17:	learn: 0.3751540	total: 184ms	remaining: 10.1s
18:	learn: 0.3709501	total: 196ms	remaining: 10.1s
19:	le

168:	learn: 0.2891792	total: 1.64s	remaining: 8.08s
169:	learn: 0.2890833	total: 1.65s	remaining: 8.07s
170:	learn: 0.2890138	total: 1.66s	remaining: 8.06s
171:	learn: 0.2886702	total: 1.67s	remaining: 8.05s
172:	learn: 0.2885613	total: 1.68s	remaining: 8.04s
173:	learn: 0.2884394	total: 1.69s	remaining: 8.03s
174:	learn: 0.2883598	total: 1.7s	remaining: 8.02s
175:	learn: 0.2881251	total: 1.71s	remaining: 8.01s
176:	learn: 0.2879670	total: 1.72s	remaining: 8s
177:	learn: 0.2878598	total: 1.73s	remaining: 7.98s
178:	learn: 0.2877687	total: 1.74s	remaining: 7.97s
179:	learn: 0.2877010	total: 1.75s	remaining: 7.96s
180:	learn: 0.2875988	total: 1.76s	remaining: 7.95s
181:	learn: 0.2875301	total: 1.76s	remaining: 7.93s
182:	learn: 0.2874373	total: 1.77s	remaining: 7.92s
183:	learn: 0.2873263	total: 1.78s	remaining: 7.91s
184:	learn: 0.2872451	total: 1.79s	remaining: 7.9s
185:	learn: 0.2871723	total: 1.8s	remaining: 7.89s
186:	learn: 0.2871252	total: 1.81s	remaining: 7.88s
187:	learn: 0.2868

340:	learn: 0.2702737	total: 3.26s	remaining: 6.3s
341:	learn: 0.2702167	total: 3.27s	remaining: 6.29s
342:	learn: 0.2701416	total: 3.28s	remaining: 6.28s
343:	learn: 0.2700886	total: 3.29s	remaining: 6.27s
344:	learn: 0.2700140	total: 3.3s	remaining: 6.26s
345:	learn: 0.2699897	total: 3.31s	remaining: 6.25s
346:	learn: 0.2699328	total: 3.32s	remaining: 6.24s
347:	learn: 0.2698745	total: 3.33s	remaining: 6.23s
348:	learn: 0.2697987	total: 3.33s	remaining: 6.22s
349:	learn: 0.2697345	total: 3.34s	remaining: 6.21s
350:	learn: 0.2696515	total: 3.35s	remaining: 6.2s
351:	learn: 0.2696093	total: 3.36s	remaining: 6.19s
352:	learn: 0.2695609	total: 3.37s	remaining: 6.18s
353:	learn: 0.2694904	total: 3.38s	remaining: 6.17s
354:	learn: 0.2694478	total: 3.39s	remaining: 6.16s
355:	learn: 0.2693801	total: 3.4s	remaining: 6.15s
356:	learn: 0.2693346	total: 3.41s	remaining: 6.14s
357:	learn: 0.2690869	total: 3.42s	remaining: 6.13s
358:	learn: 0.2689891	total: 3.43s	remaining: 6.12s
359:	learn: 0.26

514:	learn: 0.2598170	total: 4.89s	remaining: 4.61s
515:	learn: 0.2597632	total: 4.9s	remaining: 4.6s
516:	learn: 0.2596951	total: 4.91s	remaining: 4.59s
517:	learn: 0.2596576	total: 4.92s	remaining: 4.58s
518:	learn: 0.2596122	total: 4.93s	remaining: 4.57s
519:	learn: 0.2595764	total: 4.94s	remaining: 4.56s
520:	learn: 0.2595210	total: 4.95s	remaining: 4.55s
521:	learn: 0.2594904	total: 4.96s	remaining: 4.54s
522:	learn: 0.2594369	total: 4.97s	remaining: 4.53s
523:	learn: 0.2593806	total: 4.98s	remaining: 4.52s
524:	learn: 0.2593485	total: 4.99s	remaining: 4.51s
525:	learn: 0.2592882	total: 5s	remaining: 4.5s
526:	learn: 0.2592152	total: 5.01s	remaining: 4.49s
527:	learn: 0.2591347	total: 5.02s	remaining: 4.48s
528:	learn: 0.2590887	total: 5.03s	remaining: 4.47s
529:	learn: 0.2590336	total: 5.03s	remaining: 4.46s
530:	learn: 0.2589928	total: 5.04s	remaining: 4.46s
531:	learn: 0.2589504	total: 5.05s	remaining: 4.45s
532:	learn: 0.2589049	total: 5.06s	remaining: 4.43s
533:	learn: 0.2588

689:	learn: 0.2514049	total: 6.52s	remaining: 2.93s
690:	learn: 0.2513488	total: 6.53s	remaining: 2.92s
691:	learn: 0.2513119	total: 6.54s	remaining: 2.91s
692:	learn: 0.2512747	total: 6.55s	remaining: 2.9s
693:	learn: 0.2512134	total: 6.56s	remaining: 2.89s
694:	learn: 0.2511811	total: 6.57s	remaining: 2.88s
695:	learn: 0.2511383	total: 6.58s	remaining: 2.87s
696:	learn: 0.2510929	total: 6.58s	remaining: 2.86s
697:	learn: 0.2510307	total: 6.59s	remaining: 2.85s
698:	learn: 0.2510060	total: 6.6s	remaining: 2.84s
699:	learn: 0.2509092	total: 6.61s	remaining: 2.83s
700:	learn: 0.2508776	total: 6.62s	remaining: 2.82s
701:	learn: 0.2508564	total: 6.63s	remaining: 2.81s
702:	learn: 0.2508128	total: 6.64s	remaining: 2.81s
703:	learn: 0.2507838	total: 6.65s	remaining: 2.79s
704:	learn: 0.2507505	total: 6.66s	remaining: 2.79s
705:	learn: 0.2507023	total: 6.67s	remaining: 2.78s
706:	learn: 0.2506566	total: 6.67s	remaining: 2.77s
707:	learn: 0.2505969	total: 6.68s	remaining: 2.76s
708:	learn: 0.

864:	learn: 0.2441445	total: 8.15s	remaining: 1.27s
865:	learn: 0.2440937	total: 8.17s	remaining: 1.26s
866:	learn: 0.2440798	total: 8.18s	remaining: 1.25s
867:	learn: 0.2440376	total: 8.18s	remaining: 1.24s
868:	learn: 0.2440018	total: 8.19s	remaining: 1.24s
869:	learn: 0.2439879	total: 8.2s	remaining: 1.23s
870:	learn: 0.2439627	total: 8.21s	remaining: 1.22s
871:	learn: 0.2439254	total: 8.22s	remaining: 1.21s
872:	learn: 0.2438977	total: 8.23s	remaining: 1.2s
873:	learn: 0.2438375	total: 8.24s	remaining: 1.19s
874:	learn: 0.2437910	total: 8.25s	remaining: 1.18s
875:	learn: 0.2437415	total: 8.26s	remaining: 1.17s
876:	learn: 0.2437060	total: 8.27s	remaining: 1.16s
877:	learn: 0.2436577	total: 8.28s	remaining: 1.15s
878:	learn: 0.2436118	total: 8.28s	remaining: 1.14s
879:	learn: 0.2435665	total: 8.29s	remaining: 1.13s
880:	learn: 0.2435345	total: 8.3s	remaining: 1.12s
881:	learn: 0.2434854	total: 8.31s	remaining: 1.11s
882:	learn: 0.2434599	total: 8.32s	remaining: 1.1s
883:	learn: 0.24

<catboost.core.CatBoostClassifier at 0x1bdeb43b3d0>

## Predict and Evaluate the Model

In [531]:
predict = cb_model.predict(X_test)

In [532]:
train_pred = cb_model.predict(X_train)

In [533]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [534]:
print('Train Accuracy score is:')
print(accuracy_score(y_train, train_pred))
print('---------------------------------')
print('Test Accuracy score is:')
print(accuracy_score(y_test, predict))
print('---------------------------------')
print('Confusion matrix:')
print(confusion_matrix(y_test, predict))
print('---------------------------------')
print('Classification Report:')
print(classification_report(y_test, predict))

Train Accuracy score is:
0.8954610258280561
---------------------------------
Test Accuracy score is:
0.8689907038512616
---------------------------------
Confusion matrix:
[[10656   704]
 [ 1269  2431]]
---------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.94      0.92     11360
           1       0.78      0.66      0.71      3700

    accuracy                           0.87     15060
   macro avg       0.83      0.80      0.81     15060
weighted avg       0.86      0.87      0.87     15060

