In [101]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer

from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [102]:
dtrain = pd.read_csv(r"MLPackages/datasets/train.csv")
dtest = pd.read_csv(r"MLPackages/datasets/test.csv")

In [103]:
dtrain.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [104]:
dt1  = dtrain.copy()
dt2 = dtest.copy()

In [105]:
print(dt1.shape)
print()
print(dt2.shape)

(614, 13)

(362, 12)


In [106]:
dt1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [107]:
dt2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            362 non-null    object 
 1   Gender             351 non-null    object 
 2   Married            362 non-null    object 
 3   Dependents         353 non-null    object 
 4   Education          362 non-null    object 
 5   Self_Employed      339 non-null    object 
 6   ApplicantIncome    362 non-null    int64  
 7   CoapplicantIncome  362 non-null    int64  
 8   LoanAmount         362 non-null    int64  
 9   Loan_Amount_Term   356 non-null    float64
 10  Credit_History     333 non-null    float64
 11  Property_Area      362 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 34.1+ KB


In [108]:
# Drop unwanted columns
dt1_target = dt1['Loan_Status']
dt1.drop(columns = ['Loan_ID', 'Loan_Status'], inplace = True)
dt2.drop(columns = ['Loan_ID'], inplace=True)

In [109]:
print(dt1.shape)
print()
print(dt2.shape)

(614, 11)

(362, 11)


In [110]:
# Check for duplicates
print(dt1.duplicated().sum())
print(dt2.duplicated().sum())

0
1


In [111]:
dt2[dt2.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
192,Male,No,0,Graduate,Yes,5833,0,116,360.0,1.0,Urban


In [112]:
dt2.drop_duplicates(inplace=True)

In [113]:
# Check for duplicates again
print(dt1.duplicated().sum())
print(dt2.duplicated().sum())

0
0


In [114]:
# Check for missing values
print(f"Null values in Train dataset: \n{dt1.isnull().sum().sort_values(ascending=False)}")
print()
print(f"Null values in Test dataset: \n{dt2.isnull().sum().sort_values(ascending=False)}")

Null values in Train dataset: 
Credit_History       50
Self_Employed        32
LoanAmount           22
Dependents           15
Loan_Amount_Term     14
Gender               13
Married               3
Education             0
ApplicantIncome       0
CoapplicantIncome     0
Property_Area         0
dtype: int64

Null values in Test dataset: 
Credit_History       29
Self_Employed        23
Gender               11
Dependents            9
Loan_Amount_Term      6
Married               0
Education             0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Property_Area         0
dtype: int64


In [115]:
dt1.nunique().sort_values(ascending=False)

ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Dependents             4
Property_Area          3
Gender                 2
Married                2
Education              2
Self_Employed          2
Credit_History         2
dtype: int64

In [116]:
num_cols_dt1 = dt1.select_dtypes(include=['number']).columns
num_cols_dt1 = [col for col in num_cols_dt1 if col != 'Credit_History']
print(num_cols_dt1)
print()
cat_cols_dt1 = dt1.select_dtypes(include=['object']).columns
cat_cols_dt1 = list(cat_cols_dt1) + ['Credit_History']
print(cat_cols_dt1)

['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Credit_History']


In [117]:
# For numeric
num_imputer = IterativeImputer()
dt1[num_cols_dt1] = num_imputer.fit_transform(dt1[num_cols_dt1])

# For categorical
simp_imp = SimpleImputer(strategy='most_frequent')
dt1[cat_cols_dt1] = simp_imp.fit_transform(dt1[cat_cols_dt1])

In [118]:
# dt1.isnull().sum()
# None for dt1

In [119]:
num_cols_dt2 = dt2.select_dtypes(include=['number']).columns
num_cols_dt2 = [col for col in num_cols_dt2 if col != 'Credit_History']
num_cols_dt2 = list(num_cols_dt2)
print(num_cols_dt2)
print()
cat_cols_dt2 = dt2.select_dtypes(include=['object']).columns
cat_cols_dt2 = list(cat_cols_dt1)
print(cat_cols_dt2)

['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Credit_History']


In [120]:
# For numeric
num_imputer = IterativeImputer()
dt2[num_cols_dt2] = num_imputer.fit_transform(dt2[num_cols_dt2])

# For categorical
simp_imp = SimpleImputer(strategy='most_frequent')
dt2[cat_cols_dt2] = simp_imp.fit_transform(dt2[cat_cols_dt2])

In [121]:
# dt2.isnull().sum()
# None for dt2

In [122]:
dt1['ApplicantIncome'] = dt1['ApplicantIncome'] + dt1['CoapplicantIncome']
dt2['ApplicantIncome'] = dt2['ApplicantIncome'] + dt2['CoapplicantIncome']

dt1.drop(columns = ('CoapplicantIncome'), inplace=True)
dt2.drop(columns = ('CoapplicantIncome'), inplace=True)

In [123]:
dt2.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,Yes,0,Graduate,No,5720.0,110.0,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4576.0,126.0,360.0,1.0,Urban
2,Male,Yes,2,Graduate,No,6800.0,208.0,360.0,1.0,Urban
3,Male,Yes,2,Graduate,No,4886.0,100.0,360.0,1.0,Urban
4,Male,No,0,Not Graduate,No,3276.0,78.0,360.0,1.0,Urban


In [124]:
le = LabelEncoder()
cat_cols = dt1.select_dtypes(include=['object']).columns
num_cols = dt1.select_dtypes(include=['number']).columns

for col in cat_cols:
    dt1[col] = le.fit_transform(dt1[col])
    dt2[col] = le.fit_transform(dt2[col])    

In [125]:
num_cols

Index(['ApplicantIncome', 'LoanAmount', 'Loan_Amount_Term'], dtype='object')

In [126]:
dt1.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,0,0,0,5849.0,138.159244,360.0,1,2
1,1,1,1,0,0,6091.0,128.0,360.0,1,0
2,1,1,0,0,1,3000.0,66.0,360.0,1,2
3,1,1,0,1,0,4941.0,120.0,360.0,1,2
4,1,0,0,0,0,6000.0,141.0,360.0,1,2


In [127]:
print(dt1.skew())
print()
print(dt2.skew())

Gender             -1.648795
Married            -0.644850
Dependents          1.015551
Education           1.367622
Self_Employed       2.159796
ApplicantIncome     5.633449
LoanAmount          2.712439
Loan_Amount_Term   -2.390169
Credit_History     -2.021971
Property_Area      -0.066196
dtype: float64

Gender             -1.577609
Married            -0.547824
Dependents          0.922147
Education           1.289091
Self_Employed       2.682967
ApplicantIncome     7.068664
LoanAmount          2.218821
Loan_Amount_Term   -2.731807
Credit_History     -1.855843
Property_Area      -0.139916
dtype: float64


In [128]:
dt1[num_cols] = np.log(dt1[num_cols])
dt2[num_cols] = np.log(dt2[num_cols])

In [129]:
minmax = MinMaxScaler()
dt1[num_cols] = minmax.fit_transform(dt1[num_cols])
dt2[num_cols] = minmax.fit_transform(dt2[num_cols])

In [130]:
print(dt1.skew())
print()
print(dt2.skew())

Gender             -1.648795
Married            -0.644850
Dependents          1.015551
Education           1.367622
Self_Employed       2.159796
ApplicantIncome     1.076702
LoanAmount         -0.204429
Loan_Amount_Term   -4.868401
Credit_History     -2.021971
Property_Area      -0.066196
dtype: float64

Gender             -1.577609
Married            -0.547824
Dependents          0.922147
Education           1.289091
Self_Employed       2.682967
ApplicantIncome     1.232636
LoanAmount         -0.250101
Loan_Amount_Term   -6.308355
Credit_History     -1.855843
Property_Area      -0.139916
dtype: float64


In [131]:
dt1_target.value_counts()

Loan_Status
Y    422
N    192
Name: count, dtype: int64

In [132]:
dt1_target.replace({'Y': 1, 'N': 0}, inplace =True)

In [133]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dt1, dt1_target, test_size=0.3)

In [134]:
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score
def evaluate_model_performance(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    recall_train = recall_score(y_train, y_pred_train)
    precision_train = precision_score(y_train, y_pred_train)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    f1_train = f1_score(y_train, y_pred_train)

    y_pred_test = model.predict(X_test)
    recall_test = recall_score(y_test, y_pred_test)
    precision_test = precision_score(y_test, y_pred_test)
    accuracy_test = accuracy_score(y_test, y_pred_test)
    f1_test = f1_score(y_test, y_pred_test)

    print(f"Results for {model.__class__.__name__}:")
    print(f"Train Accuracy: {round(accuracy_train, 4) * 100}%")
    print(f"Test Accuracy: {round(accuracy_test, 4) * 100}%")

    print(f"Train Recall: {round(recall_train, 4) * 100}%")
    print(f"Test Recall: {round(recall_test, 4) * 100}%")

    print(f"Train Precision: {round(precision_train, 4) * 100}%")
    print(f"Test Precision: {round(precision_test, 4) * 100}%")

    print(f"Train F1 Score: {round(f1_train, 4) * 100}%")
    print(f"Test F1 Score: {round(f1_test, 4) * 100}%")

In [135]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
evaluate_model_performance(model, X_train, y_train, X_test, y_test)

Results for LogisticRegression:
Train Accuracy: 82.05%
Test Accuracy: 78.38000000000001%
Train Recall: 98.65%
Test Recall: 97.6%
Train Precision: 80.05%
Test Precision: 76.73%
Train F1 Score: 88.39%
Test F1 Score: 85.92%


In [136]:
# import joblib
# joblib.dump(model, "trained_model_loan_exp_LR.pkl")

In [137]:
X_train.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History',
       'Property_Area'],
      dtype='object')

In [138]:
num_cols

Index(['ApplicantIncome', 'LoanAmount', 'Loan_Amount_Term'], dtype='object')

In [139]:
cat_cols

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Credit_History', 'Property_Area'],
      dtype='object')

In [140]:
from sklearn.base import BaseEstimator, TransformerMixin

class MeanImputer(BaseEstimator, TransformerMixin):
    def __init__(self, variables = None):
        self.variables = variables
        
    def fit(self, X, y= None):
        self.mean_dict = {}
        for col in self.variables:
            self.mean_dict[col] = X[col].mean()
        return self
    
    def transform(self, X):
        X = X.copy()
        for col in self.variables:
            X[col].fillna(self.mean_dict[col], inplace = True)
        return X 

In [141]:
np.random.seed(0)
data = pd.DataFrame(np.random.rand(100, 5), columns=['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'])

# Introduce missing values
missing_indices = np.random.choice(data.index, size=20, replace=False)
data.loc[missing_indices, 'Feature1'] = np.nan
data.loc[missing_indices, 'Feature2'] = np.nan

In [142]:
data.head()

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature5
0,0.548814,0.715189,0.602763,0.544883,0.423655
1,0.645894,0.437587,0.891773,0.963663,0.383442
2,0.791725,0.528895,0.568045,0.925597,0.071036
3,,,0.83262,0.778157,0.870012
4,0.978618,0.799159,0.461479,0.780529,0.118274


In [143]:
missing_cols = [cols for cols in data.columns if data[cols].isnull().any()]
print(missing_cols)

['Feature1', 'Feature2']


In [144]:
mean_imputer = MeanImputer(variables=missing_cols)
mean_imputer.fit(data)

In [145]:
mean_imputer.mean_dict

{'Feature1': 0.4812566975663731, 'Feature2': 0.5128573075390306}

In [146]:
new_data = mean_imputer.transform(data)

In [147]:
from MLPackages.processing.preprocessing import DropColumns
cols = ['Feature4', 'Feature5']  
cols2drop = DropColumns(variables_to_drop=cols)
data_transformed = cols2drop.transform(new_data)

In [148]:
data_transformed

Unnamed: 0,Feature1,Feature2,Feature3
0,0.548814,0.715189,0.602763
1,0.645894,0.437587,0.891773
2,0.791725,0.528895,0.568045
3,0.481257,0.512857,0.832620
4,0.978618,0.799159,0.461479
...,...,...,...
95,0.750765,0.699575,0.967966
96,0.070870,0.292794,0.152355
97,0.481257,0.512857,0.895386
98,0.274824,0.592230,0.896761


In [149]:
np.random.seed(0)
data = pd.DataFrame(np.random.rand(1000, 7), columns=['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5', 'Feature6', 'Feature7'])

# Introduce missing values
missing_indices = np.random.choice(data.index, size=20, replace=False)
data.loc[missing_indices, 'Feature1'] = np.nan
data.loc[missing_indices, 'Feature2'] = np.nan

In [150]:
from MLPackages.processing.preprocessing import DropColumns as DC
data.head()

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7
0,,,0.602763,0.544883,0.423655,0.645894,0.437587
1,0.891773,0.963663,0.383442,0.791725,0.528895,0.568045,0.925597
2,0.071036,0.087129,0.020218,0.83262,0.778157,0.870012,0.978618
3,0.799159,0.461479,0.780529,0.118274,0.639921,0.143353,0.944669
4,0.521848,0.414662,0.264556,0.774234,0.45615,0.568434,0.01879


In [151]:
cols = ['Feature7']
drop = DC(variables_to_drop=cols)
new_data = drop.transform(data)

In [152]:
new_data

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6
0,,,0.602763,0.544883,0.423655,0.645894
1,0.891773,0.963663,0.383442,0.791725,0.528895,0.568045
2,0.071036,0.087129,0.020218,0.832620,0.778157,0.870012
3,0.799159,0.461479,0.780529,0.118274,0.639921,0.143353
4,0.521848,0.414662,0.264556,0.774234,0.456150,0.568434
...,...,...,...,...,...,...
995,0.574923,0.138094,0.306970,0.552212,0.401260,0.715995
996,0.470724,0.821046,0.142769,0.894928,0.447246,0.479840
997,0.541834,0.823250,0.186640,0.129905,0.263485,0.997813
998,0.530452,0.773381,0.832596,0.957425,0.995167,0.552256


In [153]:
import os
import pathlib
import MLPackages

path1 = os.path.dirname(MLPackages.__file__)
# print(path1)
path2 = os.path.join(path1, 'classification.pkl')
print(path2)

d:\MLOps\MLOps\MLPackages\classification.pkl


In [154]:
npath1 = os.path.dirname(MLPackages.__file__)
npath2 = os.path.join(npath1, 'trained_models')
npath3 = os.path.join(npath2, 'trained_models', 'classification_model.pkl')
print(npath2)

d:\MLOps\MLOps\MLPackages\trained_models


In [155]:
from MLPackages.config import config
a = config.SAVED_MODEL_PATH
b = config.MODEL_SAVED
print(b)
print(a)

classification_model.pkl
d:\MLOps\MLOps\MLPackages\trained_models


In [156]:
npath1 = os.path.dirname(MLPackages.__file__)
MODEL_SAVED = "classification_model.pkl"
SAVED_MODEL_PATH = os.path.join(npath1, 'trained_models', MODEL_SAVED)
print(SAVED_MODEL_PATH)

d:\MLOps\MLOps\MLPackages\trained_models\classification_model.pkl


In [157]:
npath1 = os.path.dirname(MLPackages.__file__)
save_path = os.path.join(npath1, 'trained_models', config.MODEL_SAVED)
print(save_path)

d:\MLOps\MLOps\MLPackages\trained_models\classification_model.pkl


In [158]:
ROOT_PACKAGES = pathlib.Path(MLPackages.__file__).resolve().parent
SAVED_MODEL_PATH = os.path.join(ROOT_PACKAGES, "trained_models")
print(SAVED_MODEL_PATH)

D:\MLOps\MLOps\MLPackages\trained_models


In [159]:
new_path = pathlib.Path(MLPackages.__file__).resolve().parent
SAVED_MODEL_PATH = os.path.join(new_path, "trained_models", config.MODEL_SAVED)
print(SAVED_MODEL_PATH)

D:\MLOps\MLOps\MLPackages\trained_models\classification_model.pkl


In [160]:
# ValueError: mode must be one of ['r', 'c', 'r+', 'w+', 'readonly', 'copyonwrite', 'readwrite', 'write']
import joblib
npath1 = os.path.dirname(MLPackages.__file__)
save_path = os.path.join(npath1, 'trained_models', config.MODEL_SAVED)
model_loaded = joblib.load(save_path)
print(model_loaded)

# Removed the parameter from the data_handling.load_pipeline

Pipeline(steps=[('MeanImputation',
                 MeanImputer(variables=['ApplicantIncome', 'LoanAmount',
                                        'Loan_Amount_Term'])),
                ('ModeImputation',
                 ModeImputer(variables=['Gender', 'Married', 'Dependents',
                                        'Education', 'Self_Employed',
                                        'Credit_History', 'Property_Area'])),
                ('DomainProcessing',
                 DomainProcessing(variables_to_add='CoapplicantIncome',
                                  variables_to_modify=['Applica...
                 DropColumns(variables_to_drop=['CoapplicantIncome'])),
                ('LabelEncoder',
                 CustomLabelEncoder(variables=['Gender', 'Married',
                                               'Dependents', 'Education',
                                               'Self_Employed',
                                               'Credit_History',
                   

In [161]:
# path1 = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'MLPackages'))
cwd = os.getcwd()
# path1 = os.path.dirname(__file__)
print(cwd)

d:\MLOps\MLOps


In [164]:
import sys
p1 = os.getcwd()
p2 = os.path.join(p1, 'MLPackages')
p3 = os.path.join(p2)
print(p1)
print(p2)
sys.path.append(p1)
sys.path.append(p2) 

d:\MLOps\MLOps
d:\MLOps\MLOps\MLPackages


In [165]:
MLPackages.__version__

AttributeError: module 'MLPackages' has no attribute '__version__'