In [124]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency,f_oneway
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score,classification_report,r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import warnings
import os
from pathlib import Path
import seaborn as sns
import json
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
from sklearn.model_selection import RandomizedSearchCV

In [140]:
Path.cwd()

WindowsPath('c:/Users/dhiraj_choudhary/Music/Credit-Risk-Modelling/notebooks')

In [2]:
# Go to base folder of the project from the current file deitory- one lave up
root_folder_path=Path.cwd().parent
root_folder_path

WindowsPath('c:/Users/dhiraj_choudhary/Music/Credit-Risk-Modelling')

In [3]:
#Go to Raw data folder
raw_data_folder=root_folder_path/'data'/'raw_data'
raw_data_folder

WindowsPath('c:/Users/dhiraj_choudhary/Music/Credit-Risk-Modelling/data/raw_data')

In [4]:
# read the internal data as well as credit bearue one
internal_data_cust=pd.read_excel(raw_data_folder/'Internal_data_Bank.xlsx')
cibil_data_cust=pd.read_excel(raw_data_folder/'Cibil_Data_External.xlsx')

In [7]:
#utills file
#create the function to remove df_internal_data_cust
def null_removal_df_internal_data_cust(internal_data_cust):
    df_internal_data_cust_final=internal_data_cust.loc[internal_data_cust['Age_Oldest_TL'] != -99999]
    return df_internal_data_cust_final

#create function for null removal of df_cibil_data_cust

def null_removal_df_cibil_data_cust(cibil_data_cust):
    threshold = 20 
    # Loop through all columns and drop those with >20% -99999 values
    columns_to_drop = []
    for col in cibil_data_cust.columns:
        null_percentage = (cibil_data_cust[col] == -99999).sum() / len(cibil_data_cust) * 100
        if null_percentage > threshold:
            columns_to_drop.append(col)
    # Drop the identified columns
    cibil_data_cust.drop(columns=columns_to_drop, inplace=True)
    # Step 2: Drop rows where any remaining column contains -99999
    df_cibil_data_cust_final = cibil_data_cust[cibil_data_cust.ne(-99999).all(axis=1)]
    return df_cibil_data_cust_final


In [8]:
df_internal_data_cust=null_removal_df_internal_data_cust(internal_data_cust)
df_cibil_data_cust=null_removal_df_cibil_data_cust(cibil_data_cust)

In [9]:
((df_internal_data_cust==-99999).sum()/len(df_internal_data_cust)*100).sort_values(ascending=False)

PROSPECTID              0.0
Total_TL                0.0
Age_Oldest_TL           0.0
Other_TL                0.0
Unsecured_TL            0.0
Secured_TL              0.0
PL_TL                   0.0
Home_TL                 0.0
Gold_TL                 0.0
Consumer_TL             0.0
CC_TL                   0.0
Auto_TL                 0.0
Tot_Missed_Pmnt         0.0
pct_tl_closed_L12M      0.0
pct_tl_open_L12M        0.0
Tot_TL_closed_L12M      0.0
Total_TL_opened_L12M    0.0
pct_closed_tl           0.0
pct_active_tl           0.0
pct_tl_closed_L6M       0.0
pct_tl_open_L6M         0.0
Tot_TL_closed_L6M       0.0
Total_TL_opened_L6M     0.0
Tot_Active_TL           0.0
Tot_Closed_TL           0.0
Age_Newest_TL           0.0
dtype: float64

In [10]:
((df_cibil_data_cust==-99999).sum()/len(df_cibil_data_cust)*100).sort_values(ascending=False)

PROSPECTID                    0.0
pct_opened_TLs_L6m_of_L12m    0.0
time_since_recent_enq         0.0
enq_L12m                      0.0
enq_L6m                       0.0
enq_L3m                       0.0
MARITALSTATUS                 0.0
EDUCATION                     0.0
AGE                           0.0
GENDER                        0.0
NETMONTHLYINCOME              0.0
Time_With_Curr_Empr           0.0
pct_of_active_TLs_ever        0.0
pct_currentBal_all_TL         0.0
time_since_recent_payment     0.0
CC_Flag                       0.0
PL_Flag                       0.0
pct_PL_enq_L6m_of_L12m        0.0
pct_CC_enq_L6m_of_L12m        0.0
pct_PL_enq_L6m_of_ever        0.0
pct_CC_enq_L6m_of_ever        0.0
HL_Flag                       0.0
GL_Flag                       0.0
last_prod_enq2                0.0
first_prod_enq2               0.0
Credit_Score                  0.0
PL_enq_L12m                   0.0
PL_enq_L6m                    0.0
PL_enq                        0.0
CC_enq_L12m   

In [11]:

# Merge the two dataframes, inner join so that no nulls are present
def merge_df(df1,df2):
     df = pd. merge ( df1, df2, how ='inner', left_on = ['PROSPECTID'], right_on = ['PROSPECTID'] )
     return df

In [12]:
df_final_data_cust=merge_df(df_internal_data_cust,df_cibil_data_cust)

In [13]:
df_final_data_cust.shape

(42064, 79)

In [16]:
## create train,validation and test datasets
def split_data(df, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42):
    """
    Splits a DataFrame into train, validation, and test sets.
    
    Parameters:
        df (pd.DataFrame): The full dataset.
        target_column (str): The name of the target column.
        train_size (float): Proportion of the dataset for training (default: 70%).
        val_size (float): Proportion for validation (default: 15%).
        test_size (float): Proportion for testing (default: 15%).
        random_state (int): Random seed for reproducibility.

    Returns:
        df_train (pd.DataFrame): Training dataset.
        df_validation (pd.DataFrame): Validation dataset.
        df_test (pd.DataFrame): Testing dataset.
    """
    
    # Ensure the split sizes sum to 1
    assert train_size + val_size + test_size == 1, "train, val, and test sizes must sum to 1"
    
    # First, split into train and temp (validation + test)
    df_train, df_temp = train_test_split(df, test_size=(val_size + test_size), random_state=random_state)

    # Then, split temp into validation and test
    df_validation, df_test = train_test_split(df_temp, test_size=(test_size / (val_size + test_size)), random_state=random_state)
    
    return df_train, df_validation, df_test


In [None]:
from sklearn.model_selection import train_test_split

def split_data(df, train_size=0.8, val_size=0.2, random_state=42):
    """
    Splits a DataFrame into training and validation sets.
    
    Parameters:
        df (pd.DataFrame): The full dataset.
        train_size (float): Proportion of the dataset for training (default: 80%).
        val_size (float): Proportion for validation (default: 20%).
        random_state (int): Random seed for reproducibility.

    Returns:
        df_train (pd.DataFrame): Training dataset.
        df_validation (pd.DataFrame): Validation dataset.
    """
    
    # Ensure the split sizes sum to 1
    assert train_size + val_size == 1, "train and validation sizes must sum to 1"
    
    # Split into train and validation
    df_train, df_validation = train_test_split(df, test_size=val_size, random_state=random_state)
    
    return df_train, df_validation

# Example usage
# df_train, df_validation = split_train_validation(df)


In [None]:
df_train,df_validation=split_data(df_final_data_cust,train_size=0.8, val_size=0.2, random_state=42)

In [20]:
df_train.sample(5)

Unnamed: 0,PROSPECTID,Total_TL,Tot_Closed_TL,Tot_Active_TL,Total_TL_opened_L6M,Tot_TL_closed_L6M,pct_tl_open_L6M,pct_tl_closed_L6M,pct_active_tl,pct_closed_tl,...,pct_PL_enq_L6m_of_L12m,pct_CC_enq_L6m_of_L12m,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag,last_prod_enq2,first_prod_enq2,Credit_Score,Approved_Flag
1357,1630,4,2,2,0,1,0.0,0.25,0.5,0.5,...,0.0,0.0,0.0,0.0,1,0,ConsumerLoan,ConsumerLoan,677,P2
37291,45501,1,1,0,0,0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0,0,ConsumerLoan,others,682,P2
701,842,2,0,2,0,0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0,0,PL,others,684,P2
15696,19126,1,1,0,0,1,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0,0,ConsumerLoan,ConsumerLoan,673,P2
19503,23827,1,1,0,0,0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0,0,others,others,700,P2


In [18]:
df_train.shape,df_validation.shape,df_test.shape

((29444, 79), (6310, 79), (6310, 79))

In [25]:
#### store the data to train_data,test_Data,validation data for later for our feature encoding and model_building
def save_df_dir(df,dir):
    #convert df to csv
    df=df.reset_index(drop=True)
    df.to_csv(dir, index=False)


In [26]:
train_data_file=root_folder_path/'data'/'train_data'/'cust_train_data.csv'
validation_data_file =root_folder_path/'data'/'validation_data'/'cust_val_data.csv'
test_data_file =root_folder_path/'data'/'test_data'/'cust_test_data.csv'
save_df_dir(df_train,train_data_file)
save_df_dir(df_validation,validation_data_file)
save_df_dir(df_test,test_data_file)

In [28]:
#### We willl create one function that will do all the feature selection in one go and return the selected features data freame

def feature_selection_data(df_final_data_cust):
    # Chi-square test
    cat_columns_chi2 = []
    for i in ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']:
        chi2, pval, _, _ = chi2_contingency(pd.crosstab(df_final_data_cust[i], df_final_data_cust['Approved_Flag']))
        if pval <=0.05:
            cat_columns_chi2.append(i)

    # Apply sequential VIf for nyumerical columns
    # numerical columns list
    numeric_columns = []
    columns_to_be_kept = []
    for i in df_final_data_cust.columns:
        if df_final_data_cust[i].dtype != 'object' and i not in ['PROSPECTID','Approved_Flag']:
            numeric_columns.append(i)
    vif_data = df_final_data_cust[numeric_columns]
    total_columns = vif_data.shape[1]
    column_index = 0
    for i in range (0,total_columns):  
        vif_value = variance_inflation_factor(vif_data, column_index)
        # print (column_index,'---',vif_value)
        if vif_value <= 6:
            columns_to_be_kept.append( numeric_columns[i] )
            column_index = column_index+1
        else:
            vif_data = vif_data.drop([ numeric_columns[i] ] , axis=1)
    # check Anova for columns_to_be_kept 
    from scipy.stats import f_oneway
    columns_to_be_kept_numerical = []
    for i in columns_to_be_kept:
        a = list(df_final_data_cust[i])  
        b = list(df_final_data_cust['Approved_Flag'])     
        group_P1 = [value for value, group in zip(a, b) if group == 'P1']
        group_P2 = [value for value, group in zip(a, b) if group == 'P2']
        group_P3 = [value for value, group in zip(a, b) if group == 'P3']
        group_P4 = [value for value, group in zip(a, b) if group == 'P4']
        f_statistic, p_value = f_oneway(group_P1, group_P2, group_P3, group_P4)
        if p_value <= 0.05:
            columns_to_be_kept_numerical.append(i)
    #Combine all
    selected_features_list_without_target=columns_to_be_kept_numerical + cat_columns_chi2
    #final_features_df=df_final_data_cust[features_list_without_target + ['Approved_Flag']]
    return selected_features_list_without_target

In [None]:
def save_features_to_json(features_list, dir_path, filename="features.json"):
    """
    Saves a list of features into a JSON file.

    Parameters:
        features_list (list): List of feature names.
        dir_path (str or Path): Directory where the JSON file should be saved.
        filename (str): Name of the JSON file (default: 'features.json').

    Returns:
        None (Saves JSON file in the given directory)
    """
    dir_path = Path(dir_path)
    os.makedirs(dir_path, exist_ok=True)  # Ensure directory exists

    file_path = dir_path / filename  # Full file path

    with open(file_path, "w") as json_file:
        json.dump(features_list, json_file, indent=4)  # Save list as JSON


In [141]:
df_training_cust.shape

(29444, 79)

In [142]:
categorical_cols = df_training_cust.select_dtypes(include=['object']).columns.tolist()
categorical_cols

['MARITALSTATUS',
 'EDUCATION',
 'GENDER',
 'last_prod_enq2',
 'first_prod_enq2',
 'Approved_Flag']

In [47]:
### create features after feature selection
def create_save_features_from_training(train_data_file,dir_path, filename):
    #Read training data
    df_training_cust=pd.read_csv(train_data_file)
    # do the feature selection
    selected_features_list_without_target=feature_selection_data(df_training_cust)
    save_features_to_json(selected_features_list_without_target, dir_path, filename)

In [48]:
train_data_file=train_data_file
dir_path=root_folder_path/'src'/'models'
file_name='features_risk_modeling.json'
create_save_features_from_training(train_data_file,dir_path,file_name)

  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)


In [50]:
#load the save features 

def load_features_from_json(file_path):
    """
    Reads a list of features from a JSON file.

    Parameters:
        file_path (str or Path): Path to the JSON file.

    Returns:
        list: List of feature names.
    """
    file_path = Path(file_path)

    if not file_path.exists():
        print(f"Error: File {file_path} not found!")
        return []

    with open(file_path, "r") as json_file:
        features_list = json.load(json_file)  # Load JSON file

    print(f"Features loaded successfully from: {file_path}")
    return features_list


In [51]:
file_path=root_folder_path/'src'/'models'/'features_risk_modeling.json'
features_list=load_features_from_json(file_path)

Features loaded successfully from: c:\Users\dhiraj_choudhary\Music\Credit-Risk-Modelling\src\models\features_risk_modeling.json


In [53]:
len(features_list)

39

In [54]:

features_list

['pct_tl_open_L6M',
 'pct_tl_closed_L6M',
 'Tot_TL_closed_L12M',
 'pct_tl_closed_L12M',
 'Tot_Missed_Pmnt',
 'CC_TL',
 'Home_TL',
 'PL_TL',
 'Secured_TL',
 'Unsecured_TL',
 'Other_TL',
 'Age_Oldest_TL',
 'Age_Newest_TL',
 'time_since_recent_payment',
 'max_recent_level_of_deliq',
 'num_deliq_6_12mts',
 'num_times_60p_dpd',
 'num_std_12mts',
 'num_sub',
 'num_dbt',
 'num_dbt_12mts',
 'recent_level_of_deliq',
 'CC_enq_L12m',
 'PL_enq_L12m',
 'time_since_recent_enq',
 'enq_L3m',
 'NETMONTHLYINCOME',
 'Time_With_Curr_Empr',
 'CC_Flag',
 'PL_Flag',
 'pct_PL_enq_L6m_of_ever',
 'pct_CC_enq_L6m_of_ever',
 'HL_Flag',
 'GL_Flag',
 'MARITALSTATUS',
 'EDUCATION',
 'GENDER',
 'last_prod_enq2',
 'first_prod_enq2']

In [128]:
#######self modelarize block
df_training_cust=pd.read_csv(train_data_file)
df_training_cust_work_upon=df_training_cust[features_list]

In [133]:
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_colwidth', None)  # Prevent column truncation

In [136]:
os.curdir

'.'

In [137]:
list1=df_training_cust_work_upon.select_dtypes(include=['int64', 'float64']).columns.tolist()
with open('num_features.json', "w") as json_file:
    json.dump(list1, json_file, indent=4)

In [138]:
list2=df_training_cust_work_upon.select_dtypes(include=['object', 'category']).columns.tolist()
with open('cat_features.json', "w") as json_file:
    json.dump(list2, json_file, indent=4)

##### Got the feature list and we need to encode the features now

In training we will use feature list and will cretae the column transformer - and will encode the features and save the classifer
later in training we will us the same classifier and will train the model.

In [104]:
def create_save_feature_encoder_classifier(features_list,train_data_file,dir_path,file_name):
    #train data frame
    df_training_cust=pd.read_csv(train_data_file)
    # select onnly features that we willl work on
    df_training_cust_work_upon=df_training_cust[features_list]
    # Identify numerical columns
    numerical_cols_training_cust = df_training_cust_work_upon.select_dtypes(include=['int64', 'float64']).columns.tolist()
    # Identify categorical columns
    categorical_cols_training_cust = df_training_cust_work_upon.select_dtypes(include=['object', 'category']).columns.tolist()
    # print(categorical_cols_training_cust)
    #ordinal columns
    categorical_cols_training_ordinal = ['EDUCATION']
    categorical_cols_training_cust_non_ordinal = [col for col in categorical_cols_training_cust if col not in categorical_cols_training_ordinal]
    # Define ordinal encoding categories for EDUCATION
    education_categories = [['SSC', '12TH', 'UNDER GRADUATE', 'GRADUATE', 'OTHERS', 'POST-GRADUATE', 'PROFESSIONAL']]

    # Create preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('ordinal', OrdinalEncoder(categories=education_categories), ['EDUCATION']),  # Ordinal encoding for EDUCATION
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False),categorical_cols_training_cust_non_ordinal),  # One-hot encoding
            ('scaler', StandardScaler(), numerical_cols_training_cust)  # Standardize numerical columns
        ]
    )
    preprocessor.fit(df_training_cust_work_upon)
    
    # Save the pipeline
    dir_path = Path(dir_path)
    os.makedirs(dir_path, exist_ok=True)  # Ensure directory exists

    file_path = dir_path / file_name  # Full file path
    joblib.dump(preprocessor,file_path)

    print(f"Preprocessing pipeline for riskreporting saved succesfully")


#trigger the function
features_list=load_features_from_json(file_path)
train_data_file=train_data_file
dir_path=root_folder_path/'src'/'models'
file_name_preprocessor='preprocessing_pipeline_risk_modelling.pkl'


create_save_feature_encoder_classifier(features_list,train_data_file,dir_path,file_name)

Features loaded successfully from: c:\Users\dhiraj_choudhary\Music\Credit-Risk-Modelling\src\models\features_risk_modeling.json
Preprocessing pipeline for riskreporting saved succesfully


In [91]:
len(features_list),train_data_file,dir_path,file_name

(39,
 WindowsPath('c:/Users/dhiraj_choudhary/Music/Credit-Risk-Modelling/data/train_data/cust_train_data.csv'),
 WindowsPath('c:/Users/dhiraj_choudhary/Music/Credit-Risk-Modelling/src/models'),
 'preprocessing_pipeline_risk_modelling.pkl')

##### Training pipeline and save the model

In [102]:
y_train.value_counts()

Approved_Flag
P2    17895
P3     4465
P4     3713
P1     3371
Name: count, dtype: int64

In [105]:
#Prepare the input features and target variable
train_data_file=root_folder_path/'data'/'train_data'/'cust_train_data.csv'
df_training=pd.read_csv(train_data_file)
X_train=df_training[features_list]
y_train=df_training['Approved_Flag']

In [112]:
df_training.shape

(29444, 79)

In [113]:
df_validation.shape

(6310, 79)

In [114]:
df_test.shape

(6310, 79)

In [106]:
###Target label and 

def save_lable_encode(dir_path,file_name,y_train):
    label_encoder_target=LabelEncoder()
    label_encoder_target.fit(y_train)
    file_path=dir_path/file_name
    joblib.dump(label_encoder_target,file_path)

dir_path=root_folder_path/'src'/'models'
file_name_label_encoder='Label_Encode_Risk_modelling.pkl'

# triiger the function
save_lable_encode(dir_path,file_name_label_encoder,y_train)


##### We will use same label encoder and preprocessed pipe line for features to tarining the model.

##### Exp-1 -Random Forest classifier

In [118]:
def training_model(model,param_distributions,preprocessing_pipeline,label_encoder,X_train,y_train,save_dir_path,model_name):

    #Encode the features/input
    X_train_encoded=preprocessing_pipeline.transform(X_train)
    #Encode the label/taregt
    y_train_encoded=label_encoder.transform(y_train)
    #fit the model
    # Hyperparameter tuning using RandomizedSearchCV
    search = RandomizedSearchCV(model, param_distributions=param_distributions, n_iter=4, cv=3, n_jobs=-1, random_state=42)
    search.fit(X_train_encoded, y_train_encoded)

    # Best model & hyperparameters
    best_model = search.best_estimator_
    best_params = search.best_params_

    # Save model
    model_path = os.path.join(save_dir_path, f"{model_name}.pkl")
    joblib.dump(best_model, model_path)

    # Training metric (accuracy)
    y_train_pred = best_model.predict(X_train_encoded)
    train_acc = accuracy_score(y_train_encoded, y_train_pred)

    #log the experiment

     # Experiment log
    experiment_data = {
        "model_name": model_name,
        "best_params": best_params,
        "training_metric": train_acc,
        "model_path": model_path
    }

     # Save experiment log
    exp_log_path = os.path.join(save_dir_path, f"{model_name}_experiment.json")
    with open(exp_log_path, "w") as f:
        json.dump(experiment_data, f, indent=4)

    #log the few result
    print(f"Training complete! Best model saved: {model_path}")
    print(f"Best Parameters: {best_params}")
    print(f"Training Accuracy: {train_acc:.4f}")

    return experiment_data




In [143]:
save_dir_path="src/models/model_"

save_dir_path+'RandomForestClassifier'+'pkl'

'src/models/model_RandomForestClassifierpkl'

In [119]:
#call the above function
dir_path=root_folder_path/'src'/'models'
file_name_label_encoder='Label_Encode_Risk_modelling.pkl'
file_name_preprocessor='preprocessing_pipeline_risk_modelling.pkl'
file_path_label_encoder=dir_path/file_name_label_encoder
file_path_preprocessor=dir_path/file_name_preprocessor
#parameters value of function
label_encoder_risk_modelling=joblib.load(file_path_label_encoder)
preprocessor_risk_model=joblib.load(file_path_preprocessor)
dir_path_expr=root_folder_path/'src'/'experiments'
# Define model & hyperparameter grid
model_name='RandomForest'
model = RandomForestClassifier()
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10]
}

training_model(model,param_grid,preprocessor_risk_model,label_encoder_risk_modelling,X_train,y_train,dir_path_expr,model_name)


Training complete! Best model saved: c:\Users\dhiraj_choudhary\Music\Credit-Risk-Modelling\src\experiments\RandomForest.pkl
Best Parameters: {'n_estimators': 200, 'min_samples_split': 10, 'max_depth': None}
Training Accuracy: 0.9470


{'model_name': 'RandomForest',
 'best_params': {'n_estimators': 200,
  'min_samples_split': 10,
  'max_depth': None},
 'training_metric': 0.9469841054204592,
 'model_path': 'c:\\Users\\dhiraj_choudhary\\Music\\Credit-Risk-Modelling\\src\\experiments\\RandomForest.pkl'}

##### Vlidation of models - we will use same label encoder and preprocessed pipeline of features.

In [120]:
#Prepare the input features and target variable
val_data_file=root_folder_path/'data'/'validation_data'/'cust_val_data.csv'
df_validation=pd.read_csv(val_data_file)
X_val=df_validation[features_list]
y_val=df_validation['Approved_Flag']

In [121]:
df_validation.shape

(6310, 79)

In [122]:
def validate_model(preprocessing_pipeline,label_encoder,experiment_log, X_val, y_val,dir_path):
    """
    Loads the best saved model and evaluates it on the validation dataset.

    Parameters:
    - experiment_log: Path to the experiment JSON log.
    - X_val: Validation features.
    - y_val: Validation labels.

    Returns:
    - Validation accuracy.
    """


    #Encode the features/input
    X_val_encoded=preprocessing_pipeline.transform(X_val)
    #Encode the label/taregt
    y_val_encoded=label_encoder.transform(y_val)
    # dir path
    exp_log_path = os.path.join(dir_path, f"{experiment_log}")
    # Load experiment details
    with open(exp_log_path, "r") as f:
        experiment_data = json.load(f)
    
    # Load saved best model
    model_path = experiment_data["model_path"]
    best_model = joblib.load(model_path)

    # Evaluate on validation set
    y_val_pred = best_model.predict(X_val_encoded)
    val_acc = accuracy_score(y_val_encoded, y_val_pred)

    print(f"Validation Accuracy: {val_acc:.4f} (Using best model: {experiment_data['model_name']})")
    return val_acc

In [123]:
# call the function
dir_path=root_folder_path/'src'/'models'
file_name_label_encoder='Label_Encode_Risk_modelling.pkl'
file_name_preprocessor='preprocessing_pipeline_risk_modelling.pkl'
file_path_label_encoder=dir_path/file_name_label_encoder
file_path_preprocessor=dir_path/file_name_preprocessor
#parameters value of function
label_encoder_risk_modelling=joblib.load(file_path_label_encoder)
preprocessor_risk_model=joblib.load(file_path_preprocessor)
experiment_log='RandomForest_experiment.json'
dir_path_expr=root_folder_path/'src'/'experiments'

validate_model(preprocessor_risk_model,label_encoder_risk_modelling,experiment_log,X_val,y_val,dir_path_expr)

Validation Accuracy: 0.7623 (Using best model: RandomForest)


0.7622820919175911

##### Exp-2 : Decission tree Model

Training

In [126]:
#call the above function
dir_path=root_folder_path/'src'/'models'
file_name_label_encoder='Label_Encode_Risk_modelling.pkl'
file_name_preprocessor='preprocessing_pipeline_risk_modelling.pkl'
file_path_label_encoder=dir_path/file_name_label_encoder
file_path_preprocessor=dir_path/file_name_preprocessor
#parameters value of function
label_encoder_risk_modelling=joblib.load(file_path_label_encoder)
preprocessor_risk_model=joblib.load(file_path_preprocessor)
dir_path_expr=root_folder_path/'src'/'experiments'
# Define model & hyperparameter grid
model_name='DecisionTree'
model = DecisionTreeClassifier()
param_grid = {
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10]
}

training_model(model,param_grid,preprocessor_risk_model,label_encoder_risk_modelling,X_train,y_train,dir_path_expr,model_name)

Training complete! Best model saved: c:\Users\dhiraj_choudhary\Music\Credit-Risk-Modelling\src\experiments\DecisionTree.pkl
Best Parameters: {'min_samples_split': 10, 'max_depth': 10}
Training Accuracy: 0.8065


{'model_name': 'DecisionTree',
 'best_params': {'min_samples_split': 10, 'max_depth': 10},
 'training_metric': 0.8065140605895939,
 'model_path': 'c:\\Users\\dhiraj_choudhary\\Music\\Credit-Risk-Modelling\\src\\experiments\\DecisionTree.pkl'}

Validation/Testing

In [127]:
# call the function
dir_path=root_folder_path/'src'/'models'
file_name_label_encoder='Label_Encode_Risk_modelling.pkl'
file_name_preprocessor='preprocessing_pipeline_risk_modelling.pkl'
file_path_label_encoder=dir_path/file_name_label_encoder
file_path_preprocessor=dir_path/file_name_preprocessor
#parameters value of function
label_encoder_risk_modelling=joblib.load(file_path_label_encoder)
preprocessor_risk_model=joblib.load(file_path_preprocessor)
experiment_log='DecisionTree_experiment.json'
dir_path_expr=root_folder_path/'src'/'experiments'

validate_model(preprocessor_risk_model,label_encoder_risk_modelling,experiment_log,X_val,y_val,dir_path_expr)

Validation Accuracy: 0.7471 (Using best model: DecisionTree)


0.747068145800317