In [1]:
import os
import pandas as pd
import numpy as np

# Display configuration: do not truncate the column list and allow very wide
# lines when DataFrames are printed.
pd.options.display.max_columns = None
pd.options.display.width = 2000

In [2]:
# Load the raw dataset into `df`.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up holding a mix of parsed
# numbers and raw strings (the situation pandas reports with a DtypeWarning).
df = pd.read_csv('Dataset.csv', low_memory=False)
print('Loaded dataset', 'shape=', df.shape)

Loaded dataset shape= (121856, 40)


  df = pd.read_csv('Dataset.csv')


In [12]:
# Columns that are expected to be numeric but may have been parsed as text
# (e.g. because of '$' signs, thousands separators or other stray characters).
potential_cols = ['Client_Income', 'Credit_Amount', 'Loan_Annuity','Age_Days','Population_Region_Relative','Employed_Days','ID_Days','Registration_Days','Score_Source_3']

for col in potential_cols:
    # Clean every listed column that is not already numeric. Testing for
    # "not numeric" (instead of dtype == 'object') also catches text columns
    # stored with a dedicated string dtype.
    if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
        print(f"Auto-cleaning column: {col}...")
        missing_before = df[col].isna().sum()

        # Cast to str first so every cell, whatever its type, goes through the
        # same string clean-up; missing values become the text 'nan' here and
        # are turned back into NaN right after.
        cleaned = df[col].astype(str).str.replace('$', '', regex=False).str.replace(',', '', regex=False)
        cleaned = cleaned.replace('nan', np.nan)
        df[col] = pd.to_numeric(cleaned, errors='coerce')

        # errors='coerce' drops unparseable values silently; report how many
        # values were lost so the data loss is visible in the notebook.
        n_coerced = int(df[col].isna().sum() - missing_before)
        if n_coerced:
            print(f"  {n_coerced} unparseable value(s) in {col} were set to NaN")

Auto-cleaning column: ID_Days...


In [16]:
import pandas as pd
import numpy as np

# Placeholder strings that stand for "missing" in categorical columns.
cols_to_clean_cat = {
    'Accompany_Client': '##',
    'Client_Gender': 'XNA',
    'Type_Organization': 'XNA'
}

for col, bad_value in cols_to_clean_cat.items():
    if col in df.columns:
        print(f"Cleaning {col}: Replacing '{bad_value}' with NaN")
        df[col] = df[col].replace(bad_value, np.nan)

# Numeric sentinel values that are treated as missing.
numeric_sentinels = {
    'Population_Region_Relative': 100,
    # 365243 usually denotes "Retired" or "Unemployed" in these datasets
    'Employed_Days': 365243,
    'Score_Source_2': 100,
    # NOTE(review): 0 is treated as "unknown" here; confirm it is not a valid value.
    'Phone_Change': 0,
}

for col, sentinel in numeric_sentinels.items():
    if col in df.columns:
        # Convert to numeric BEFORE comparing: if the column is still text,
        # comparing against the number would match nothing and the sentinel
        # would survive the conversion as a real value.
        numeric = pd.to_numeric(df[col], errors='coerce')
        # mask() returns a copy with NaN wherever the condition is True.
        df[col] = numeric.mask(numeric == sentinel)

# Report the number of missing values in every column touched above.
cleaned_cols = [c for c in (*cols_to_clean_cat, *numeric_sentinels) if c in df.columns]
print("\n--- Summary of Missing Values After Cleaning ---")
print(df[cleaned_cols].isnull().sum())


Cleaning Accompany_Client: Replacing '##' with NaN
Cleaning Client_Gender: Replacing 'XNA' with NaN
Cleaning Type_Organization: Replacing 'XNA' with NaN

--- Summary of Missing Values After Cleaning ---
Accompany_Client               1758
Client_Gender                  2416
Type_Organization             24694
Population_Region_Relative     4870
Employed_Days                 24764
Score_Source_2                 5692
Phone_Change                  18219
dtype: int64


In [24]:
# Persist the current state of df. index=False keeps the row index out of the
# file; otherwise it is written as an extra unnamed first column that shows up
# as "Unnamed: 0" when the CSV is read back.
df.to_csv("Dataset2.csv", index=False)

In [5]:
# Summary statistics of df, shown as the cell's output.
# NOTE(review): the recorded output appears to predate the cleaning cells
# (this cell's execution count is lower than theirs, and Score_Source_2 still
# shows a max of 100.0) -- re-run after cleaning to refresh it.
df.describe()

Unnamed: 0,ID,Client_Income,Car_Owned,Bike_Owned,Active_Loan,House_Own,Child_Count,Credit_Amount,Loan_Annuity,Own_House_Age,Mobile_Tag,Homephone_Tag,Workphone_Working,Client_Family_Members,Cleint_City_Rating,Application_Process_Day,Application_Process_Hour,Score_Source_1,Score_Source_2,Social_Circle_Default,Phone_Change,Credit_Bureau,Default
count,121856.0,118234.0,118275.0,118232.0,118221.0,118195.0,118218.0,118219.0,117030.0,41761.0,121856.0,121856.0,121856.0,119446.0,119447.0,119428.0,118193.0,53021.0,116170.0,59928.0,118192.0,103316.0,121856.0
mean,12160930.0,16865.19,0.342854,0.332262,0.499175,0.69206,0.417779,60046.488951,2721.254492,12.157324,0.999992,0.200499,0.281201,2.154329,2.030038,3.159736,12.0631,0.501213,0.518625,0.117428,962.106056,1.891082,0.080792
std,35176.94,11538.15,0.474665,0.471026,0.500001,0.461644,0.728802,40350.663346,1461.466162,12.056079,0.002865,0.400375,0.449587,0.912686,0.504407,1.759045,3.280695,0.211204,0.740248,0.107974,827.976726,1.861921,0.272517
min,12100000.0,2565.0,0.0,0.0,0.0,0.0,0.0,4500.0,217.35,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.014568,5e-06,0.0,0.0,0.0,0.0
25%,12130460.0,11250.0,0.0,0.0,0.0,0.0,0.0,27000.0,1657.35,5.0,1.0,0.0,0.0,2.0,2.0,2.0,10.0,0.333481,0.390164,0.0577,272.0,0.0,0.0
50%,12160930.0,14400.0,0.0,0.0,0.0,1.0,0.0,51750.0,2499.75,9.0,1.0,0.0,0.0,2.0,2.0,3.0,12.0,0.504657,0.564978,0.0887,755.0,1.0,0.0
75%,12191390.0,20250.0,1.0,1.0,1.0,1.0,1.0,80865.0,3466.8,15.0,1.0,0.0,1.0,3.0,2.0,5.0,14.0,0.67389,0.664011,0.1485,1570.0,3.0,0.0
max,12221860.0,1800009.0,1.0,1.0,1.0,1.0,19.0,405000.0,22500.0,69.0,1.0,1.0,1.0,16.0,3.0,6.0,23.0,0.945741,100.0,1.0,4185.0,22.0,1.0


In [23]:
from ydata_profiling import ProfileReport

# Step 1: create the report object for df.
profile = ProfileReport(df, title="Loan Default Deep Dive", explorative=True)

# Step 2: adjust the report configuration in place.
# Correlation settings are looked up by name in a dict; each entry is an
# object whose `calculate` attribute is switched on.
for method in ("pearson", "spearman", "kendall"):
    profile.config.correlations[method].calculate = True

# Interactions
profile.config.interactions.continuous = True

# Missing-value diagrams are toggled by key with a plain boolean.
for diagram in ("heatmap", "dendrogram"):
    profile.config.missing_diagrams[diagram] = True

# Step 3: build the report and write it to an HTML file.
profile.to_file("loan_default_eda_report2.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                                                                                                                                                                | 0/40 [00:00<?, ?it/s][A
  2%|█████▍                                                                                                                                                                                                                  | 1/40 [00:02<01:00,  1.55s/it][A
  5%|██████████▊                                                                                                                                                                                                             | 2/40 [00:02<00:36,  1.04it/s][A
  8%|████████████████▏                                                                                                                                                                                                       | 3/40 [00

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [17]:
import sweetviz as sv
# Run a Sweetviz analysis of df with "Default" as the target feature and
# save the resulting report as an HTML file.
report = sv.analyze(df, target_feat="Default")
# NOTE(review): the recorded output says a browser window may not open from a
# notebook; the HTML file is written either way.
report.show_html("sweetviz_report.html")

                                             |                                                                …

Report sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
