In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from catboost import CatBoostClassifier
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [3]:
def resumetable(df):
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary = summary.reset_index()
    summary['Name'] = summary['index']
    summary = summary[['Name','dtypes']]
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values
    summary['First Value'] = df.loc[0].values
    summary['Second Value'] = df.loc[1].values
    return summary


In [4]:
!ls

sample_data  ToyData.txt


In [5]:
df=pd.read_csv("ToyData.txt")

In [6]:
df.head()


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,5849.0,0.0,0.0,360.0,Yes,1
1,4583.0,1508.0,128.0,360.0,Yes,0
2,3000.0,0.0,66.0,360.0,Yes,1
3,2583.0,2358.0,120.0,360.0,Yes,1
4,6000.0,0.0,141.0,360.0,Yes,1


In [7]:

print ("Total number of rows in dataset = {}".format(df.shape[0]))
print ("Total number of columns in dataset = {}".format(df.shape[1]))

Total number of rows in dataset = 614
Total number of columns in dataset = 6


In [8]:
result = resumetable(df)
result


Dataset Shape: (614, 6)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value
0,ApplicantIncome,float64,2,503,5849.0,4583.0
1,CoapplicantIncome,float64,2,287,0.0,1508.0
2,LoanAmount,float64,3,203,0.0,128.0
3,Loan_Amount_Term,float64,2,11,360.0,360.0
4,Credit_History,object,0,2,Yes,Yes
5,Loan_Status,int64,0,2,1,0


In [9]:

target_col = "Loan_Status"
X = df.loc[:, df.columns != target_col]
y = df.loc[:, target_col]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=42)

In [11]:

X_train.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
244,3406.0,4417.0,123.0,360.0,Yes
393,1993.0,1625.0,113.0,180.0,Yes
310,2917.0,0.0,84.0,360.0,Yes
408,8300.0,0.0,152.0,300.0,No
572,16666.0,0.0,275.0,360.0,Yes


In [12]:
features = list(X_train.columns)

In [13]:
cat_features = ["Credit_History"]

In [15]:
#no data preprocessing is done, directly applied model
#this is because catboost require less preprocessing
#not convert cat. variables to num.
#havent removed missing
model_cb = CatBoostClassifier(task_type='GPU', iterations=100,
                              random_state = 2021,
                              eval_metric="F1")

In [16]:
model_cb.fit(X_train, y_train, cat_features= cat_features, plot=True,
             eval_set=(X_test, y_test))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.199227
0:	learn: 0.8571429	test: 0.7885305	best: 0.7885305 (0)	total: 43.7ms	remaining: 4.33s
1:	learn: 0.8553259	test: 0.8275862	best: 0.8275862 (1)	total: 67.3ms	remaining: 3.29s
2:	learn: 0.8562300	test: 0.8251748	best: 0.8275862 (1)	total: 93.8ms	remaining: 3.03s
3:	learn: 0.8576000	test: 0.8309859	best: 0.8309859 (3)	total: 104ms	remaining: 2.51s
4:	learn: 0.8557692	test: 0.8309859	best: 0.8309859 (3)	total: 133ms	remaining: 2.54s
5:	learn: 0.8539326	test: 0.8339223	best: 0.8339223 (5)	total: 146ms	remaining: 2.28s
6:	learn: 0.8520900	test: 0.8309859	best: 0.8339223 (5)	total: 165ms	remaining: 2.19s
7:	learn: 0.8502415	test: 0.8398577	best: 0.8398577 (7)	total: 225ms	remaining: 2.59s
8:	learn: 0.8520900	test: 0.8309859	best: 0.8398577 (7)	total: 282ms	remaining: 2.85s
9:	learn: 0.8520900	test: 0.8368794	best: 0.8398577 (7)	total: 308ms	remaining: 2.77s
10:	learn: 0.8520900	test: 0.8309859	best: 0.8398577 (7)	total: 331ms	remaining: 2.67s
11:	learn: 0.8539326

<catboost.core.CatBoostClassifier at 0x7845b6160b80>

In [17]:
y_pred = model_cb.predict(X_test)

In [18]:
f1_score(y_test, y_pred)

0.8398576512455516

In [19]:
accuracy_score(y_test, y_pred)

0.7783251231527094