In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, f1_score
from sklearn.utils import class_weight

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras_tuner import HyperModel, RandomSearch
from tensorflow.keras.callbacks import ModelCheckpoint

# 1. Load Data
df = pd.read_csv('estadistical.csv')

# EDA: Understand the data
print("\n--- Head of DataFrame ---")
print(df.head())

print("\n--- Summary Statistics ---")
print(df.describe())

# Visualize numeric columns
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for col in num_cols:
    plt.figure(figsize=(6, 2))
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.tight_layout()
    plt.show()

# Visualize categorical columns
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
for col in cat_cols:
    plt.figure(figsize=(6, 2))
    sns.countplot(y=col, data=df, order=df[col].value_counts().index)
    plt.title(f'Countplot of {col}')
    plt.tight_layout()
    plt.show()

# 2. Data Preprocessing
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    le_dict[col] = le

X = df.drop('Receive/ Not receive credit ', axis=1)
y = df['Receive/ Not receive credit '] - 1  # Convert to 0 and 1

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)

# 3. Handle Class Imbalance (Using Class Weights)
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = dict(enumerate(class_weights))

# 4. Build a Hypermodel for Hyperparameter Tuning
class MyHyperModel(HyperModel):
    def build(self, hp):
        model = keras.Sequential([
            layers.Input(shape=(X_train.shape[1],)),
            layers.Dense(hp.Int('units', min_value=64, max_value=128, step=32), activation='relu'),
            layers.Dropout(0.3),
            layers.Dense(hp.Int('units_2', min_value=32, max_value=64, step=16), activation='relu'),
            layers.Dropout(0.3),
            layers.Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer=keras.optimizers.Adam(hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')),
                      loss='binary_crossentropy',
                      metrics=['accuracy', 'AUC'])
        return model

# Initialize HyperModel
hypermodel = MyHyperModel()
tuner = RandomSearch(hypermodel, objective='val_accuracy', max_trials=5, executions_per_trial=3, directory='tuner_dir')

# 5. Train the Model with Hyperparameter Tuning and Save Best Model Using ModelCheckpoint
checkpoint_cb = ModelCheckpoint("best_model.h5", save_best_only=True)
tuner.search(X_train, y_train, epochs=50, validation_split=0.2, class_weight=class_weight_dict, callbacks=[checkpoint_cb])

# After tuning is complete, explicitly load the best model
try:
    best_model = keras.models.load_model("best_model.h5")
except FileNotFoundError:
    print("Error: 'best_model.h5' file not found. Make sure the model was saved correctly during training.")

# 6. Evaluate the Model
y_pred_prob = best_model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

acc = accuracy_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred_prob)
f1 = f1_score(y_test, y_pred)

print("\nTest Accuracy:", acc)
print("ROC AUC:", auc_roc)
print("F1 Score:", f1)
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.show()

# 7. Save the Model and Report
best_model.save("credit_model_improved.h5")
print("Model saved as 'credit_model_improved.h5'")

# Save classification report
report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
report_df.to_csv("classification_report_improved.csv")
print("Classification report saved as 'classification_report_improved.csv'")

ModuleNotFoundError: No module named 'sklearn'

In [None]:
def predict_with_simplified_input(model_path, df, scaler, label_encoders):
    import random
    from tensorflow.keras.models import load_model

    model = keras.models.load_model(model_path)

    # Get feature names from training data for consistency
    feature_names = df.drop('Receive/ Not receive credit ', axis=1).columns.tolist()

    input_data = {}

    # Important user inputs
    input_data['Duration in month'] = int(input("Enter Duration in months (e.g., 12): "))
    input_data['Credit amount'] = float(input("Enter Credit Amount (e.g., 2500): "))
    input_data['Age in years'] = int(input("Enter Age (e.g., 35): "))
    input_data['Present residence since'] = int(input("Enter Years at Current Residence (e.g., 3): "))
    input_data['Job'] = input("Enter Job code (e.g., A171/A172/A173/A174): ")
    input_data['Number of existing credits at this bank'] = int(input("Enter Number of Existing Credits (e.g., 1): "))
    input_data['Number of people being liable to provide maintenance for'] = int(input("Enter Number of Dependents (e.g., 1): "))

    # Handle missing or default values for other columns (use median)
    for col in feature_names:
        if col == 'Job':
            # Encode Job using label encoder
            input_data[col] = label_encoders['Job'].transform([input_data[col]])[0]
        elif col not in input_data:
            # Use median for other missing columns
            input_data[col] = float(df[col].median())

    # Ensure input_data is in correct order as per training features
    input_data_ordered = [input_data[col] for col in feature_names]

    # Create DataFrame and scale
    input_df = pd.DataFrame([input_data_ordered], columns=feature_names)
    input_scaled = scaler.transform(input_df)

    # Predict
    prediction = model.predict(input_scaled)[0][0]
    result = "✅ Will Receive Credit" if prediction >= 0.5 else "❌ Will Not Receive Credit"
    print(f"\nPrediction: {result}")
    print(f"Confidence Score: {prediction:.2f}")

predict_with_simplified_input("credit_model_improved.h5", df, scaler, le_dict)



Enter Duration in months (e.g., 12): 24
Enter Credit Amount (e.g., 2500): 5000
Enter Age (e.g., 35): 30
Enter Years at Current Residence (e.g., 3): 5
Enter Job code (e.g., A171/A172/A173/A174): A172
Enter Number of Existing Credits (e.g., 1): 2
Enter Number of Dependents (e.g., 1): 1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step

Prediction: ✅ Will Receive Credit
Confidence Score: 0.65
