In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the loan-approval dataset.
# The path is written as a raw string because a Windows path is full of backslashes
# (\P, \B, \C, \l here) that are not valid escape sequences in a normal string literal;
# Python warns about those and is documented to reject them in a future version.
# NOTE(review): this is a machine-specific absolute path - point it at your own copy of the CSV.
df = pd.read_csv(r"E:\Personal_Documents\Big_Data_Syllabus\Projects_For_Skill_Development\Complete_MLOPS\Packaging-ML-Model\loan_approval_dataset.csv")
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [3]:
# (rows, columns) of the frame as loaded
df.shape

(4269, 13)

In [4]:
# Distinct-value count per column; helps spot identifier-like and low-cardinality columns
df.nunique()

loan_id                      4269
 no_of_dependents               6
 education                      2
 self_employed                  2
 income_annum                  98
 loan_amount                  378
 loan_term                     10
 cibil_score                  601
 residential_assets_value     278
 commercial_assets_value      188
 luxury_assets_value          379
 bank_asset_value             146
 loan_status                    2
dtype: int64

In [5]:
# Count rows that are exact duplicates (all columns equal) of an earlier row
df.duplicated().sum()

0

In [6]:
# loan_id has one distinct value per row, so it carries no predictive signal; remove it
df = df.drop(columns=["loan_id"])

In [7]:
# Column dtypes and non-null counts of the remaining columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0    no_of_dependents          4269 non-null   int64 
 1    education                 4269 non-null   object
 2    self_employed             4269 non-null   object
 3    income_annum              4269 non-null   int64 
 4    loan_amount               4269 non-null   int64 
 5    loan_term                 4269 non-null   int64 
 6    cibil_score               4269 non-null   int64 
 7    residential_assets_value  4269 non-null   int64 
 8    commercial_assets_value   4269 non-null   int64 
 9    luxury_assets_value       4269 non-null   int64 
 10   bank_asset_value          4269 non-null   int64 
 11   loan_status               4269 non-null   object
dtypes: int64(9), object(3)
memory usage: 400.3+ KB


In [8]:
# Raw column labels - inspect them for stray whitespace
df.columns

Index([' no_of_dependents', ' education', ' self_employed', ' income_annum',
       ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [9]:
# The CSV headers carry a leading space; trim surrounding whitespace from every label
df.columns = df.columns.str.strip()
df.columns

Index(['no_of_dependents', 'education', 'self_employed', 'income_annum',
       'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value',
       'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
       'loan_status'],
      dtype='object')

In [10]:
# Number of missing (NaN/None) values in each column
df.isna().sum()

no_of_dependents            0
education                   0
self_employed               0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
loan_status                 0
dtype: int64

In [11]:
# Separate the target column (y) from the predictor columns (X)
y = df["loan_status"]
X = df.drop(columns="loan_status")

In [12]:
# Dtypes and non-null counts of the feature frame X
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   no_of_dependents          4269 non-null   int64 
 1   education                 4269 non-null   object
 2   self_employed             4269 non-null   object
 3   income_annum              4269 non-null   int64 
 4   loan_amount               4269 non-null   int64 
 5   loan_term                 4269 non-null   int64 
 6   cibil_score               4269 non-null   int64 
 7   residential_assets_value  4269 non-null   int64 
 8   commercial_assets_value   4269 non-null   int64 
 9   luxury_assets_value       4269 non-null   int64 
 10  bank_asset_value          4269 non-null   int64 
dtypes: int64(9), object(2)
memory usage: 367.0+ KB


In [13]:
# Dtype and non-null count of the target Series y
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 4269 entries, 0 to 4268
Series name: loan_status
Non-Null Count  Dtype 
--------------  ----- 
4269 non-null   object
dtypes: object(1)
memory usage: 33.5+ KB


In [14]:
# Feature engineering: collapse the four individual asset columns into one total
asset_columns = [
    "residential_assets_value",
    "commercial_assets_value",
    "luxury_assets_value",
    "bank_asset_value",
]
# skipna=False keeps the semantics of a plain `a + b + c + d` (a missing part gives a missing total)
X["total_assets_value"] = X[asset_columns].sum(axis=1, skipna=False)

# The individual asset columns are now redundant
X = X.drop(columns=asset_columns)

In [15]:
# Display the first few rows to confirm the asset columns were replaced by total_assets_value
X.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,2,Graduate,No,9600000,29900000,12,778,50700000
1,0,Not Graduate,Yes,4100000,12200000,8,417,17000000
2,3,Graduate,No,9100000,29700000,20,506,57700000
3,3,Graduate,No,8200000,30700000,8,467,52700000
4,5,Not Graduate,Yes,9800000,24200000,20,382,55000000


In [16]:
# Distinct raw labels in education - inspect them for stray whitespace
X["education"].unique()

array([' Graduate', ' Not Graduate'], dtype=object)

In [17]:
# Distinct raw labels in self_employed - inspect them for stray whitespace
X["self_employed"].unique()

array([' No', ' Yes'], dtype=object)

In [18]:
# Distinct raw labels of the target - inspect them for stray whitespace
y.unique()

array([' Approved', ' Rejected'], dtype=object)

In [19]:
# The categorical values themselves carry a leading space (e.g. ' Graduate'); trim them
X = X.assign(
    education=X["education"].str.strip(),
    self_employed=X["self_employed"].str.strip(),
)
y = y.str.strip()

In [20]:
# Confirm the surrounding whitespace is gone from the education labels
X["education"].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [21]:
# Convert the categorical columns into binary format
def transform_categorical_to_binary(df, columns_to_transform):
    """
    Encode the specified categorical columns of a DataFrame as 0/1 integers.

    Each listed column is overwritten on ``df`` itself: a cell becomes 1 when its
    value is one of that column's "positive" values and 0 otherwise (so missing
    values are encoded as 0 as well).

    Args:
        df: DataFrame to modify. It is mutated in place, not copied.
        columns_to_transform: A dictionary where keys are column names and values
            are lists of the original categorical values to encode as 1. A single
            bare string is accepted and treated as a one-element list.

    Returns:
        The same DataFrame object that was passed in, with the listed columns
        replaced by their binary (int64) encoding.

    Raises:
        KeyError: If a key of ``columns_to_transform`` is not a column of ``df``.
    """
    for column_name, positive_values in columns_to_transform.items():
        # A bare string would otherwise be treated as a container of characters
        # (substring matching), which is never what the caller means.
        if isinstance(positive_values, str):
            positive_values = [positive_values]

        # Vectorised membership test instead of a per-row Python lambda
        df[column_name] = df[column_name].isin(positive_values).astype("int64")

    return df

# Per column, the values that are encoded as 1; every other value in that column becomes 0
columns_to_transform = {'education':["Graduate"], "self_employed":['Yes']}

# The function writes the encoded columns into X itself and returns that same object,
# so transformed_X and X refer to one and the same DataFrame.
transformed_X = transform_categorical_to_binary(X, columns_to_transform)
transformed_X

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,2,1,0,9600000,29900000,12,778,50700000
1,0,0,1,4100000,12200000,8,417,17000000
2,3,1,0,9100000,29700000,20,506,57700000
3,3,1,0,8200000,30700000,8,467,52700000
4,5,0,1,9800000,24200000,20,382,55000000
...,...,...,...,...,...,...,...,...
4264,5,1,1,1000000,2300000,12,317,7400000
4265,0,0,1,3300000,11300000,20,559,20000000
4266,2,0,0,6500000,23900000,18,457,39000000
4267,1,0,0,4100000,12800000,8,780,28800000


In [22]:
# Compress the large monetary columns with a natural-log transform.
# Note: the columns are overwritten, so running this cell twice applies the log twice.

log_cols = ["income_annum","loan_amount","total_assets_value"]
X[log_cols] = X[log_cols].apply(np.log)

In [23]:
X.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,2,1,0,16.077274,17.213369,12,778,17.741436
1,0,0,1,15.226498,16.316947,8,417,16.648724
2,3,1,0,16.023785,17.206658,20,506,17.870768
3,3,1,0,15.919645,17.239773,8,467,17.780126
4,5,0,1,16.097893,17.001863,20,382,17.822844


In [24]:
# Encode the target as 1 (Approved) / 0 (Rejected).
# Series.map turns every label that is missing from the mapping into NaN without raising,
# e.g. when the whitespace was not stripped first or this cell is run a second time.
# Validate before overwriting y so a bad run fails loudly and leaves y untouched.
y_encoded = y.map({"Approved":1,"Rejected":0})
if y_encoded.isna().any():
    unexpected_labels = y[y_encoded.isna()].unique().tolist()
    raise ValueError(f"Unexpected loan_status values, cannot encode: {unexpected_labels}")
y = y_encoded

In [25]:
# Peek at the first five encoded target values
y.head()

0    1
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [26]:
# Hold out 30% of the rows for evaluation; random_state pins the shuffle for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((2988, 8), (1281, 8), (2988,), (1281,))

In [27]:
# Build and fit a logistic-regression classifier on the training split
from sklearn.linear_model import LogisticRegression
# NOTE(review): cibil_score, loan_term and no_of_dependents are fed in unscaled; the large
# max_iter is presumably there to let the solver converge on them - confirm.
log = LogisticRegression(max_iter=10000)
log.fit(X_train, y_train)

In [28]:
# Predicted class labels for the held-out rows
y_pred_test = log.predict(X_test)

In [29]:
from sklearn.metrics import accuracy_score
# Fraction of held-out rows whose predicted label equals the true label
acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print(f"Accuracy is: {acc}")

Accuracy is: 0.9149102263856362


# Serialization and Deserialization

In [30]:
import joblib
# Persist the fitted estimator to a file in the current working directory.
# Only the model object is saved: the preprocessing done in the earlier cells (whitespace
# stripping, binary encoding, asset totalling, log transform) is not part of it and has to
# be repeated on any new data before calling predict.
joblib.dump(log, "my_trained_model_v1.pkl")

['my_trained_model_v1.pkl']

In [31]:
# Deserialization: load the estimator back from the file written by joblib.dump
# NOTE(review): joblib files are pickle-based - only load files that come from a trusted source.
final_model = joblib.load("my_trained_model_v1.pkl")

In [32]:
# Learned intercept and coefficients of the reloaded model
final_model.intercept_, final_model.coef_

(array([-12.37363776]),
 array([[-0.00720354,  0.03553901, -0.00388703, -2.73495718,  2.26909398,
         -0.15390255,  0.02453598,  0.34029275]]))

In [33]:
# The same attributes on the in-memory model; matching values show the round trip preserved the fit
log.intercept_, log.coef_

(array([-12.37363776]),
 array([[-0.00720354,  0.03553901, -0.00388703, -2.73495718,  2.26909398,
         -0.15390255,  0.02453598,  0.34029275]]))

### Checking the module search path (`sys.path`)

In [34]:
import sys
# Directories Python searches, in order, when resolving imports
sys.path

['E:\\Personal_Documents\\Big_Data_Syllabus\\Projects_For_Skill_Development\\Complete_MLOPS\\Packaging-ML-Model\\Experiments',
 'C:\\Users\\Dhruv\\anaconda3\\python38.zip',
 'C:\\Users\\Dhruv\\anaconda3\\DLLs',
 'C:\\Users\\Dhruv\\anaconda3\\lib',
 'C:\\Users\\Dhruv\\anaconda3',
 '',
 'C:\\Users\\Dhruv\\AppData\\Roaming\\Python\\Python38\\site-packages',
 'C:\\Users\\Dhruv\\anaconda3\\lib\\site-packages',
 'C:\\Users\\Dhruv\\anaconda3\\lib\\site-packages\\win32',
 'C:\\Users\\Dhruv\\anaconda3\\lib\\site-packages\\win32\\lib',
 'C:\\Users\\Dhruv\\anaconda3\\lib\\site-packages\\Pythonwin']

In [None]:
# To add a new directory to the module search path, append it to sys.path, e.g.:
#sys.path.append("<path-to-directory>")