In [1]:
!pip list

Package           Version
----------------- -----------
asttokens         2.4.1
colorama          0.4.6
comm              0.2.2
contourpy         1.3.0
cycler            0.12.1
debugpy           1.8.5
decorator         5.1.1
executing         2.1.0
fonttools         4.53.1
ipykernel         6.29.5
ipython           8.27.0
jedi              0.19.1
joblib            1.4.2
jupyter_client    8.6.2
jupyter_core      5.7.2
kiwisolver        1.4.7
matplotlib        3.9.2
matplotlib-inline 0.1.7
nest-asyncio      1.6.0
numpy             2.1.1
packaging         24.1
pandas            2.2.2
parso             0.8.4
pillow            10.4.0
pip               24.2
platformdirs      4.3.2
prompt_toolkit    3.0.47
psutil            6.0.0
pure_eval         0.2.3
Pygments          2.18.0
pyparsing         3.1.4
python-dateutil   2.9.0.post0
pytz              2024.1
pywin32           306
pyzmq             26.2.0
scikit-learn      1.5.1
scipy             1.14.1
seaborn           0.13.2
six               

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
train_dat = pd.read_csv("../data/train.csv",index_col='Loan_ID')
test_dat = pd.read_csv("../data/test.csv",index_col='Loan_ID')

In [4]:
train_dat.info()

<class 'pandas.core.frame.DataFrame'>
Index: 614 entries, LP001002 to LP002990
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 62.4+ KB


In [5]:
train_dat.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
train_dat.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [7]:
y = train_dat.Loan_Status.map({'Y':1,'N':0})
train_dat.drop('Loan_Status',axis=1,inplace=True)

In [8]:
row_train_dat = train_dat.shape[0]
X = train_dat
X.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


In [9]:
objList = X.select_dtypes(include = "object").columns
for obj_Column in objList:
    print(X[obj_Column].unique())

['Male' 'Female' nan]
['No' 'Yes' nan]
['0' '1' '2' '3+' nan]
['Graduate' 'Not Graduate']
['No' 'Yes' nan]
['Urban' 'Rural' 'Semiurban']


### Selective Label Encoding

In [10]:
from sklearn.preprocessing import LabelEncoder
encoders = dict()

for col_name in X.columns:
        series = X[col_name]
        label_encoder = LabelEncoder()
        X[col_name] = pd.Series(
            label_encoder.fit_transform(series[series.notnull()]),
            index=series[series.notnull()].index
        )
        encoders[col_name] = label_encoder

X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 614 entries, LP001002 to LP002990
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    float64
 1   Married            611 non-null    float64
 2   Dependents         599 non-null    float64
 3   Education          614 non-null    int64  
 4   Self_Employed      582 non-null    float64
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    int64  
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    int64  
dtypes: float64(7), int64(4)
memory usage: 73.7+ KB


In [11]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(X)

X=pd.DataFrame(X)

In [12]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.0,0.0,0.0,0.0,0.0,376.0,0.0,81.0,8.0,1.0,2.0
1,1.0,1.0,1.0,0.0,0.0,306.0,60.0,81.0,8.0,1.0,0.0
2,1.0,1.0,0.0,0.0,1.0,139.0,0.0,26.0,8.0,1.0,2.0
3,1.0,1.0,0.0,1.0,0.0,90.0,160.0,73.0,8.0,1.0,2.0
4,1.0,0.0,0.0,0.0,0.0,381.0,0.0,94.0,8.0,1.0,2.0


In [13]:
X.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
dtype: int64

### training and test split

In [14]:
train_X = X.iloc[:row_train_dat,]
final_test = X.iloc[:row_train_dat:,]
seed=7
train_X.columns = test_dat.columns
from sklearn.model_selection import train_test_split
train_X,test_X,train_y,test_y = train_test_split(train_X,y,random_state=seed)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

models=[]
models.append(("logreg",LogisticRegression()))
models.append(("tree",DecisionTreeClassifier()))
models.append(("svc",SVC()))
models.append(("rndf",RandomForestClassifier()))

from sklearn.model_selection import cross_val_score
result=[]
names=[]

In [16]:
for name,model in models:
    cv_res = cross_val_score(model,train_X,train_y,scoring='accuracy',cv=10)
    result.append(cv_res.mean())
    names.append(name)

for name,res in zip(names,result):
    print(name," ",res)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

logreg   0.8043478260869567
tree   0.7
svc   0.6869565217391302
rndf   0.7782608695652175


### ada boosting

In [20]:
from sklearn.ensemble import AdaBoostClassifier
#estimator list something
check = [20,30,40,45,50,55,60,70,100,150]
for est in check:
    ada_clf = AdaBoostClassifier(
        RandomForestClassifier(max_depth=10,min_samples_leaf=28),
        n_estimators=est,algorithm="SAMME",
        learning_rate=0.25
    )
    ada_clf.fit(train_X,train_y)
    print("Score for est {} train data {}".format(est,ada_clf.score(train_X,train_y)))
    print("testdata{}".format(ada_clf.score(test_X,test_y)))

Score for est 20 train data 0.8847826086956522
testdata0.7987012987012987
Score for est 30 train data 0.941304347826087
testdata0.8051948051948052
Score for est 40 train data 0.9630434782608696
testdata0.8051948051948052
Score for est 45 train data 0.9760869565217392
testdata0.7987012987012987
Score for est 50 train data 0.9847826086956522
testdata0.8051948051948052
Score for est 55 train data 0.9847826086956522
testdata0.7857142857142857
Score for est 60 train data 0.9826086956521739
testdata0.7922077922077922
Score for est 70 train data 0.991304347826087
testdata0.8116883116883117
Score for est 100 train data 0.9978260869565218
testdata0.8181818181818182
Score for est 150 train data 0.9978260869565218
testdata0.7922077922077922


In [21]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

new_pred = ada_clf.predict(test_X)
print(accuracy_score(test_y,new_pred))
print(confusion_matrix(test_y,new_pred))

0.7922077922077922
[[26 22]
 [10 96]]


In [22]:
import pickle as pkl
pkl.dump(ada_clf,open("../model/adaboost_model.pkl",'wb'))
