In [4]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder,MinMaxScaler

In [5]:
# Read the train and test splits from the packaged datasets folder.
raw_train = pd.read_csv(
    filepath_or_buffer="../packaging_ml_model/prediction_model/datasets/train.csv"
)
raw_test = pd.read_csv(
    filepath_or_buffer="../packaging_ml_model/prediction_model/datasets/test.csv"
)

In [6]:
raw_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [7]:
raw_train.nunique()

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [8]:
raw_train.shape

(614, 13)

In [9]:
# Work on independent copies so the raw frames stay available unchanged.
train_df, test_df = raw_train.copy(), raw_test.copy()

In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [11]:
# Test set: has no Loan_Status column; it can be used for predicting whether a loan is granted or not
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            362 non-null    object 
 1   Gender             351 non-null    object 
 2   Married            362 non-null    object 
 3   Dependents         353 non-null    object 
 4   Education          362 non-null    object 
 5   Self_Employed      339 non-null    object 
 6   ApplicantIncome    362 non-null    int64  
 7   CoapplicantIncome  362 non-null    int64  
 8   LoanAmount         362 non-null    int64  
 9   Loan_Amount_Term   356 non-null    float64
 10  Credit_History     333 non-null    float64
 11  Property_Area      362 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 34.1+ KB


In [12]:
# Keep the target column as its own Series, detached from the feature frame.
train_y = train_df.loc[:, 'Loan_Status'].copy()

In [13]:
train_y.head()

0    Y
1    N
2    Y
3    Y
4    Y
Name: Loan_Status, dtype: object

In [14]:
# The target now lives in train_y, so remove it from the feature frame.
train_df = train_df.drop(columns=['Loan_Status'])

In [15]:
train_df.sample(2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
64,LP001222,Female,No,0,Graduate,No,4166,0.0,116.0,360.0,0.0,Semiurban
214,LP001716,Male,Yes,0,Graduate,No,3173,3021.0,137.0,360.0,1.0,Urban


In [16]:
# Loan_ID is a per-row identifier (614 distinct values across 614 training rows),
# so it is removed from both feature frames.
train_df = train_df.drop(columns=['Loan_ID'])
test_df = test_df.drop(columns=['Loan_ID'])

In [17]:
train_df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

In [18]:
train_df.duplicated().sum()
# Duplicates --> no duplicates

0

In [19]:
# Finding duplicates and removing them
test_df.duplicated().sum()

1

In [20]:
test_df[test_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
192,Male,No,0,Graduate,Yes,5833,0,116,360.0,1.0,Urban


In [21]:
# Keep the first occurrence of each fully identical row and discard the repeats.
test_df = test_df.drop_duplicates()

In [22]:
test_df.duplicated().sum()

0

In [23]:
# Handling Missing values

In [24]:
train_df.isna().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
dtype: int64

In [25]:
# Imputation for missing values
## Numeric --> mean 
## Categorical --> mode 

In [26]:
# Columns handled as numeric features.
num_cols = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]

In [27]:
# Columns handled as categorical features (Credit_History has only two distinct values).
cat_cols = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Credit_History',
    'Property_Area',
]

In [28]:
# Mode imputation for the categorical columns: the most frequent value of each
# column is learned from the training frame and reused to fill the test frame.
cat_imputer = SimpleImputer(strategy='most_frequent')
train_df[cat_cols] = cat_imputer.fit_transform(train_df[cat_cols])
test_df[cat_cols] = cat_imputer.transform(test_df[cat_cols])

In [29]:
# Mean imputation for the numeric columns: each column mean is learned from the
# training frame and reused to fill the test frame.
num_imputer = SimpleImputer(strategy='mean')
train_df[num_cols] = num_imputer.fit_transform(train_df[num_cols])
test_df[num_cols] = num_imputer.transform(test_df[num_cols])

In [30]:
train_df.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [31]:
# Domain-driven preprocessing: fold the co-applicant's income into ApplicantIncome
# so that column holds the combined income of both applicants.
train_df['ApplicantIncome'] = train_df['ApplicantIncome'] + train_df['CoapplicantIncome']
test_df['ApplicantIncome'] = test_df['ApplicantIncome'] + test_df['CoapplicantIncome']

# The co-applicant column is now redundant, so remove it from both frames.
train_df = train_df.drop(columns='CoapplicantIncome')
test_df = test_df.drop(columns='CoapplicantIncome')

In [32]:
train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849.0,146.412162,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,6091.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,4941.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000.0,141.0,360.0,1.0,Urban


In [33]:
# Application of Label Encoding

In [34]:
train_df.nunique()

Gender                2
Married               2
Dependents            4
Education             2
Self_Employed         2
ApplicantIncome     554
LoanAmount          204
Loan_Amount_Term     11
Credit_History        2
Property_Area         3
dtype: int64

In [35]:
train_df.Dependents.unique() # Ordinal data --> Label Encoder

array(['0', '1', '2', '3+'], dtype=object)

In [36]:
# Integer-encode every categorical column.
# The encoder is fitted on the training frame only and the same fitted mapping
# is applied to the test frame, so a category always receives the same code in
# both. Re-fitting on the test frame could assign different codes whenever the
# set of categories present there differs from the training set.
# `transform` raises ValueError if the test frame holds a category never seen
# in training, which surfaces the problem instead of silently mis-encoding it.
label_encoders = {}  # column name -> fitted encoder, kept for reuse/inverse_transform
for col in cat_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    label_encoders[col] = le

In [37]:
train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,0,0,0,5849.0,146.412162,360.0,1,2
1,1,1,1,0,0,6091.0,128.0,360.0,1,0
2,1,1,0,0,1,3000.0,66.0,360.0,1,2
3,1,1,0,1,0,4941.0,120.0,360.0,1,2
4,1,0,0,0,0,6000.0,141.0,360.0,1,2


In [38]:
# CoapplicantIncome has been dropped from the frames, so take it out of the
# numeric column list as well. Filtering (rather than list.remove) keeps this
# cell safe to re-run: a second execution is a no-op instead of a ValueError.
num_cols = [col for col in num_cols if col != 'CoapplicantIncome']

In [39]:
# Apply the natural logarithm to the numeric columns of both frames.
for frame in (train_df, test_df):
    frame[num_cols] = np.log(frame[num_cols])

In [40]:
train_df[num_cols].head()

Unnamed: 0,ApplicantIncome,LoanAmount,Loan_Amount_Term
0,8.674026,4.986426,5.886104
1,8.714568,4.85203,5.886104
2,8.006368,4.189655,5.886104
3,8.505323,4.787492,5.886104
4,8.699515,4.94876,5.886104


In [41]:
# MinMax scaling
# The per-feature min/max are learned from the training data only and that same
# mapping is applied to the test data. Re-fitting the scaler on the test data
# would scale it with different parameters than the data the model is trained on.
# Test values outside the training range will fall outside [0, 1].
# Note: both names are rebound to the NumPy arrays returned by the scaler.
minmax = MinMaxScaler()
train_df = minmax.fit_transform(train_df)
test_df = minmax.transform(test_df)

In [42]:
train_df

array([[1.        , 0.        , 0.        , ..., 0.9220137 , 1.        ,
        1.        ],
       [1.        , 1.        , 0.33333333, ..., 0.9220137 , 1.        ,
        0.        ],
       [1.        , 1.        , 0.        , ..., 0.9220137 , 1.        ,
        1.        ],
       ...,
       [1.        , 1.        , 0.33333333, ..., 0.9220137 , 1.        ,
        1.        ],
       [1.        , 1.        , 0.66666667, ..., 0.9220137 , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.9220137 , 0.        ,
        0.5       ]])

In [43]:
# Model Building

In [44]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_df,
    train_y,
    test_size=0.3,
    random_state=0,
)

In [45]:
from sklearn.linear_model import LogisticRegression

In [46]:
# Logistic regression classifier created with scikit-learn's default settings.
log_model=LogisticRegression()
# Fit on the training portion of the split (X_train / y_train).
log_model.fit(X_train,y_train)

In [47]:
# Predict labels for the held-out rows (X_test) produced by train_test_split.
y_pred_test=log_model.predict(X_test)

In [48]:
from sklearn.metrics import accuracy_score

# Share of held-out rows whose predicted label equals the true label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print(f'Accuracy is {acc}')

Accuracy is 0.827027027027027


In [49]:
# import joblib

In [50]:
# #serialization
# joblib.dump(log_model,'my_trained_model_v1.pkl')

In [51]:
# #deserialization
# final_model=joblib.load('my_trained_model_v1.pkl')

In [52]:
# final_model

In [53]:
# Every step of a data-processing pipeline must provide fit and transform,
# so we sometimes have to write a custom data transformer.

# Create Custom Data Transformers

In [54]:
# Key points: inherit from BaseEstimator and TransformerMixin,
# implement fit and transform,
# and accept inputs through the __init__ method.

In [55]:
from sklearn.base import BaseEstimator, TransformerMixin


class DemoTransformer(BaseEstimator, TransformerMixin):
    """Minimal pass-through transformer showing the fit/transform skeleton."""

    def __init__(self):
        """Take no configuration."""

    def fit(self, X, y=None):
        """Learn nothing from ``X``; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` exactly as it was received."""
        return X

In [56]:
#Numerical Imputation - mean

In [57]:
class MeanImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in selected columns with the column mean seen in ``fit``.

    Parameters
    ----------
    variables : list of str, optional
        Names of the columns to impute. When ``None`` (the default), every
        numeric column of the frame passed to ``fit`` is imputed.

    Attributes
    ----------
    mean_dict : dict
        Set by ``fit``; maps each imputed column name to its learned mean.
    """

    def __init__(self, variables=None):
        # Stored untouched so the parameter round-trips through get_params/set_params.
        self.variables = variables

    def fit(self, X, y=None):
        """Learn the mean of each target column of ``X``.

        ``y`` is accepted for pipeline compatibility and is not used.
        Returns the fitted imputer itself.
        """
        if self.variables is None:
            # No explicit selection: fall back to all numeric columns.
            columns = list(X.select_dtypes(include='number').columns)
        else:
            columns = list(self.variables)
        self.mean_dict = {col: X[col].mean() for col in columns}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values replaced by the learned means.

        Raises ``KeyError`` if ``X`` lacks a column that was seen in ``fit``.
        """
        X = X.copy()
        for col, mean_value in self.mean_dict.items():
            # Assign the result back instead of calling fillna(inplace=True) on
            # X[col]: that chained in-place form acts on a temporary object and
            # leaves X unchanged when pandas copy-on-write is active.
            X[col] = X[col].fillna(mean_value)
        return X
    

In [59]:
# Small reproducible demo frame for trying out the imputer.
np.random.seed(0)
# Build the columns as float up front: writing NaN into integer columns relies
# on silent upcasting, which newer pandas releases deprecate.
df = pd.DataFrame(
    np.random.randint(0, 100, (10, 2)),
    columns=['A', 'B'],
).astype(float)
# Blank out two entries per column so there is something to impute.
df.loc[[1, 4], 'A'] = np.nan
df.loc[[2, 3], 'B'] = np.nan
df

Unnamed: 0,A,B
0,44.0,47.0
1,,67.0
2,67.0,
3,83.0,
4,,87.0
5,70.0,88.0
6,88.0,12.0
7,58.0,65.0
8,39.0,87.0
9,46.0,88.0


In [60]:
# Impute both demo columns; fit stores each column's mean in mean_imputer.mean_dict.
mean_imputer=MeanImputer(variables=['A','B'])
mean_imputer.fit(df)

In [61]:
mean_imputer.mean_dict

{'A': 61.875, 'B': 67.625}

In [62]:
mean_imputer.transform(df)

Unnamed: 0,A,B
0,44.0,47.0
1,61.875,67.0
2,67.0,67.625
3,83.0,67.625
4,61.875,87.0
5,70.0,88.0
6,88.0,12.0
7,58.0,65.0
8,39.0,87.0
9,46.0,88.0


In [1]:
import numpy
numpy.__version__

'1.23.5'

In [2]:
import pandas as pd
pd.__version__

'2.0.2'

In [3]:
import joblib
joblib.__version__

'1.2.0'

In [4]:
import sklearn
sklearn.__version__

'1.2.2'

In [5]:
import scipy
scipy.__version__

'1.10.1'

In [6]:
import setuptools
setuptools.__version__

'65.5.0'

In [7]:
import wheel
wheel.__version__

'0.40.0'