# AltScore: Data Exploration

## 1. Setup & Configuration

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
import os

warnings.filterwarnings('ignore')

# Set display options for better output
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

# Set visualization style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 6)
plt.rcParams['font.size'] = 11

# Create directories if they don't exist
os.makedirs('reports', exist_ok=True)
os.makedirs('visualizations', exist_ok=True)

print("‚úì Environment setup complete")
print(f"Analysis started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

‚úì Environment setup complete
Analysis started at: 2026-02-15 23:36:15


## 2. Data Dictionary Analysis

In [5]:
print("="*100)
print("STEP 1: DATA DICTIONARY ANALYSIS")
print("="*100)

# Read data dictionary
col_desc = pd.read_csv(r"D:\Nithilan\SEM 4\Hackathons\Zenith\data\HomeCredit_columns_description.csv", encoding='latin-1')

print(f"\nüìä Total columns documented: {len(col_desc)}")
print(f"\nüìã Data Dictionary Preview:")
print(col_desc.head(15))

# Analyze column distribution by table
if 'Table' in col_desc.columns:
    print("\nüìë Columns by Table:")
    print(col_desc['Table'].value_counts())

# Save for reference
col_desc.to_csv(r"D:\Nithilan\SEM 4\Hackathons\Zenith\reports\column_descriptions.csv", index=False)
print("\n‚úì Data dictionary saved to reports/column_descriptions.csv")

STEP 1: DATA DICTIONARY ANALYSIS

üìä Total columns documented: 219

üìã Data Dictionary Preview:
    Unnamed: 0                         Table                  Row  \
0            1  application_{train|test}.csv           SK_ID_CURR   
1            2  application_{train|test}.csv               TARGET   
2            5  application_{train|test}.csv   NAME_CONTRACT_TYPE   
3            6  application_{train|test}.csv          CODE_GENDER   
4            7  application_{train|test}.csv         FLAG_OWN_CAR   
5            8  application_{train|test}.csv      FLAG_OWN_REALTY   
6            9  application_{train|test}.csv         CNT_CHILDREN   
7           10  application_{train|test}.csv     AMT_INCOME_TOTAL   
8           11  application_{train|test}.csv           AMT_CREDIT   
9           12  application_{train|test}.csv          AMT_ANNUITY   
10          13  application_{train|test}.csv      AMT_GOODS_PRICE   
11          14  application_{train|test}.csv      NAME_TYPE_SUITE   
12 

## 3. Main Training Data - Deep Dive

In [6]:
print("\n" + "="*100)
print("STEP 2: LOADING & PROFILING APPLICATION_TRAIN.CSV")
print("="*100)

# Load main training data
train = pd.read_csv(r"D:\Nithilan\SEM 4\Hackathons\Zenith\data\application_train.csv")

print(f"\nüì¶ Dataset Dimensions:")
print(f"   ‚Üí Total Applications: {train.shape[0]:,}")
print(f"   ‚Üí Total Features: {train.shape[1]}")
print(f"   ‚Üí Memory Usage: {train.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Display first few rows
print("\nüìã First 5 Applications:")
print(train.head())

# Column names
print(f"\nüìù All Feature Names ({len(train.columns)} total):")
for i, col in enumerate(train.columns, 1):
    print(f"   {i:3d}. {col}")
    if i % 20 == 0 and i < len(train.columns):
        print()


STEP 2: LOADING & PROFILING APPLICATION_TRAIN.CSV

üì¶ Dataset Dimensions:
   ‚Üí Total Applications: 307,511
   ‚Üí Total Features: 122
   ‚Üí Memory Usage: 536.69 MB

üìã First 5 Applications:
   SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \
0      100002       1         Cash loans           M            N   
1      100003       0         Cash loans           F            N   
2      100004       0    Revolving loans           M            Y   
3      100006       0         Cash loans           F            N   
4      100007       0         Cash loans           M            N   

  FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0               Y             0         202500.00   406597.50     24700.50   
1               N             0         270000.00  1293502.50     35698.50   
2               Y             0          67500.00   135000.00      6750.00   
3               Y             0         135000.00   312682.50     29686.50 

## 4. Target Variable - Comprehensive Analysis

In [7]:
print("\n" + "="*100)
print("STEP 3: TARGET VARIABLE ANALYSIS")
print("="*100)

# Target distribution
target_dist = train['TARGET'].value_counts().sort_index()
default_rate = (target_dist[1] / len(train)) * 100
repayment_rate = (target_dist[0] / len(train)) * 100

print("\nüéØ Target Distribution:")
print(f"   ‚Üí Repaid (0):    {target_dist[0]:,} ({repayment_rate:.2f}%)")
print(f"   ‚Üí Defaulted (1): {target_dist[1]:,} ({default_rate:.2f}%)")
print(f"\nüìä Class Imbalance Ratio: 1:{int(target_dist[0]/target_dist[1])} (Default:Repaid)")

# Statistical summary
print("\nüìà Target Statistics:")
print(f"   ‚Üí Mean (avg default rate): {train['TARGET'].mean():.4f}")
print(f"   ‚Üí Std Dev: {train['TARGET'].std():.4f}")
print(f"   ‚Üí Variance: {train['TARGET'].var():.4f}")

# Save detailed target analysis
target_stats = pd.DataFrame({
    'Category': ['Repaid (0)', 'Defaulted (1)'],
    'Count': target_dist.values,
    'Percentage': [repayment_rate, default_rate],
    'Proportion': [target_dist[0]/len(train), target_dist[1]/len(train)]
})
target_stats.to_csv(r"D:\Nithilan\SEM 4\Hackathons\Zenith\reports\target_distribution.csv", index=False)

print("\n‚úì Target analysis saved to reports/target_distribution.csv")


STEP 3: TARGET VARIABLE ANALYSIS

üéØ Target Distribution:
   ‚Üí Repaid (0):    282,686 (91.93%)
   ‚Üí Defaulted (1): 24,825 (8.07%)

üìä Class Imbalance Ratio: 1:11 (Default:Repaid)

üìà Target Statistics:
   ‚Üí Mean (avg default rate): 0.0807
   ‚Üí Std Dev: 0.2724
   ‚Üí Variance: 0.0742

‚úì Target analysis saved to reports/target_distribution.csv


## 5. Data Types & Structure Analysis

In [8]:
print("\n" + "="*100)
print("STEP 4: DATA TYPES & STRUCTURE ANALYSIS")
print("="*100)

# Data type distribution
dtype_counts = train.dtypes.value_counts()
print("\nüîç Data Type Distribution:")
for dtype, count in dtype_counts.items():
    print(f"   ‚Üí {str(dtype):15s}: {count:3d} columns ({count/len(train.columns)*100:.1f}%)")

# Separate columns by type
numeric_cols = train.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = train.select_dtypes(include=['object']).columns.tolist()

print(f"\nüìä Feature Categories:")
print(f"   ‚Üí Numeric features: {len(numeric_cols)}")
print(f"   ‚Üí Categorical features: {len(categorical_cols)}")

# Categorical feature details
if categorical_cols:
    print("\nüìã Categorical Features:")
    cat_summary = []
    for col in categorical_cols:
        unique_count = train[col].nunique()
        cat_summary.append({
            'Feature': col,
            'Unique_Values': unique_count,
            'Sample_Values': ', '.join(train[col].dropna().unique()[:3].astype(str))
        })
    
    cat_df = pd.DataFrame(cat_summary)
    print(cat_df.to_string(index=False))
    cat_df.to_csv(r"D:\Nithilan\SEM 4\Hackathons\Zenith\reports\categorical_features_summary.csv", index=False)

# Numeric feature statistics
print("\nüìä Numeric Features - Quick Stats:")
numeric_stats = train[numeric_cols].describe().T
numeric_stats['missing_%'] = (train[numeric_cols].isnull().sum() / len(train) * 100).values
print(numeric_stats.head(10))
numeric_stats.to_csv('reports/numeric_features_statistics.csv')

print("\n‚úì Data type analysis saved")


STEP 4: DATA TYPES & STRUCTURE ANALYSIS

üîç Data Type Distribution:
   ‚Üí float64        :  65 columns (53.3%)
   ‚Üí int64          :  41 columns (33.6%)
   ‚Üí object         :  16 columns (13.1%)

üìä Feature Categories:
   ‚Üí Numeric features: 106
   ‚Üí Categorical features: 16

üìã Categorical Features:
                   Feature  Unique_Values                                                      Sample_Values
        NAME_CONTRACT_TYPE              2                                        Cash loans, Revolving loans
               CODE_GENDER              3                                                          M, F, XNA
              FLAG_OWN_CAR              2                                                               N, Y
           FLAG_OWN_REALTY              2                                                               Y, N
           NAME_TYPE_SUITE              7                             Unaccompanied, Family, Spouse, partner
          NAME_INCOME_TYPE  

## 6. Missing Values - Detailed Analysis

In [9]:
print("\n" + "="*100)
print("STEP 5: COMPREHENSIVE MISSING VALUES ANALYSIS")
print("="*100)

# Calculate missing values
missing_count = train.isnull().sum()
missing_pct = (missing_count / len(train)) * 100

# Create detailed missing value report
missing_df = pd.DataFrame({
    'Feature': train.columns,
    'Missing_Count': missing_count.values,
    'Missing_Percentage': missing_pct.values,
    'Data_Type': train.dtypes.values,
    'Non_Missing_Count': len(train) - missing_count.values
})

# Filter and sort
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Percentage', ascending=False)

print(f"\nüîç Missing Value Summary:")
print(f"   ‚Üí Columns with missing values: {len(missing_df)}/{len(train.columns)}")
print(f"   ‚Üí Columns with >50% missing: {len(missing_df[missing_df['Missing_Percentage'] > 50])}")
print(f"   ‚Üí Columns with >80% missing: {len(missing_df[missing_df['Missing_Percentage'] > 80])}")
print(f"   ‚Üí Average missing % per column: {missing_pct.mean():.2f}%")

# Categorize missing value severity
missing_df['Severity'] = pd.cut(
    missing_df['Missing_Percentage'],
    bins=[0, 5, 20, 50, 80, 100],
    labels=['Low (<5%)', 'Moderate (5-20%)', 'High (20-50%)', 'Very High (50-80%)', 'Critical (>80%)']
)

print("\nüìä Missing Value Severity Distribution:")
print(missing_df['Severity'].value_counts())

# Top 20 features with missing values
print("\n‚ö†Ô∏è  Top 20 Features with Highest Missing Values:")
print(missing_df.head(20).to_string(index=False))

# Save detailed report
missing_df.to_csv(r"D:\Nithilan\SEM 4\Hackathons\Zenith\reports\missing_values_analysis.csv", index=False)

# Create missing value heatmap data for later visualization
missing_matrix = train[missing_df.head(30)['Feature'].tolist()].isnull().astype(int)
missing_matrix.to_csv(r"D:\Nithilan\SEM 4\Hackathons\Zenith\reports\missing_values_matrix.csv", index=False)

print("\n‚úì Missing value analysis saved to reports/missing_values_analysis.csv")


STEP 5: COMPREHENSIVE MISSING VALUES ANALYSIS

üîç Missing Value Summary:
   ‚Üí Columns with missing values: 67/122
   ‚Üí Columns with >50% missing: 41
   ‚Üí Columns with >80% missing: 0
   ‚Üí Average missing % per column: 24.40%

üìä Missing Value Severity Distribution:
Severity
Very High (50-80%)    41
Low (<5%)             10
High (20-50%)          9
Moderate (5-20%)       7
Critical (>80%)        0
Name: count, dtype: int64

‚ö†Ô∏è  Top 20 Features with Highest Missing Values:
                 Feature  Missing_Count  Missing_Percentage Data_Type  Non_Missing_Count           Severity
         COMMONAREA_MEDI         214865               69.87   float64              92646 Very High (50-80%)
         COMMONAREA_MODE         214865               69.87   float64              92646 Very High (50-80%)
          COMMONAREA_AVG         214865               69.87   float64              92646 Very High (50-80%)
NONLIVINGAPARTMENTS_MODE         213514               69.43   float64      

## 7. Load All Supplementary Datasets

In [11]:
print("\n" + "="*100)
print("STEP 6: LOADING ALL SUPPLEMENTARY DATASETS")
print("="*100)

print("\nüì• Loading datasets...")

# Load all supplementary files with progress indication
datasets = {}

print("   [1/6] Loading bureau.csv...")
datasets['bureau'] = pd.read_csv(r"D:\Nithilan\SEM 4\Hackathons\Zenith\data\bureau.csv")

print("   [2/6] Loading bureau_balance.csv...")
datasets['bureau_balance'] = pd.read_csv(r"D:\Nithilan\SEM 4\Hackathons\Zenith\data\bureau_balance.csv")

print("   [3/6] Loading previous_application.csv...")
datasets['previous_application'] = pd.read_csv(r"D:\Nithilan\SEM 4\Hackathons\Zenith\data\previous_application.csv")

print("   [4/6] Loading POS_CASH_balance.csv...")
datasets['POS_CASH_balance'] = pd.read_csv(r"D:\Nithilan\SEM 4\Hackathons\Zenith\data\POS_CASH_balance.csv")

print("   [5/6] Loading credit_card_balance.csv...")
datasets['credit_card_balance'] = pd.read_csv(r"D:\Nithilan\SEM 4\Hackathons\Zenith\data\credit_card_balance.csv")

print("   [6/6] Loading installments_payments.csv...")
datasets['installments_payments'] = pd.read_csv(r"D:\Nithilan\SEM 4\Hackathons\Zenith\data\installments_payments.csv")

print("\n‚úì All datasets loaded successfully")

# Store for easy access
bureau = datasets['bureau']
bureau_balance = datasets['bureau_balance']
previous_app = datasets['previous_application']
pos_cash = datasets['POS_CASH_balance']
credit_card = datasets['credit_card_balance']
installments = datasets['installments_payments']


STEP 6: LOADING ALL SUPPLEMENTARY DATASETS

üì• Loading datasets...
   [1/6] Loading bureau.csv...
   [2/6] Loading bureau_balance.csv...
   [3/6] Loading previous_application.csv...
   [4/6] Loading POS_CASH_balance.csv...
   [5/6] Loading credit_card_balance.csv...
   [6/6] Loading installments_payments.csv...

‚úì All datasets loaded successfully


## 8. Dataset Summary & Profiling

In [12]:
print("\n" + "="*100)
print("STEP 7: COMPREHENSIVE DATASET PROFILING")
print("="*100)

# Create comprehensive dataset summary
datasets_info = pd.DataFrame({
    'Dataset': [
        'application_train',
        'bureau',
        'bureau_balance',
        'previous_application',
        'POS_CASH_balance',
        'credit_card_balance',
        'installments_payments'
    ],
    'Rows': [
        len(train),
        len(bureau),
        len(bureau_balance),
        len(previous_app),
        len(pos_cash),
        len(credit_card),
        len(installments)
    ],
    'Columns': [
        train.shape[1],
        bureau.shape[1],
        bureau_balance.shape[1],
        previous_app.shape[1],
        pos_cash.shape[1],
        credit_card.shape[1],
        installments.shape[1]
    ]
})

# Add memory usage
datasets_info['Memory_MB'] = [
    train.memory_usage(deep=True).sum() / 1024**2,
    bureau.memory_usage(deep=True).sum() / 1024**2,
    bureau_balance.memory_usage(deep=True).sum() / 1024**2,
    previous_app.memory_usage(deep=True).sum() / 1024**2,
    pos_cash.memory_usage(deep=True).sum() / 1024**2,
    credit_card.memory_usage(deep=True).sum() / 1024**2,
    installments.memory_usage(deep=True).sum() / 1024**2
]

# Add key columns
datasets_info['Primary_Key'] = [
    'SK_ID_CURR',
    'SK_ID_CURR, SK_ID_BUREAU',
    'SK_ID_BUREAU',
    'SK_ID_CURR, SK_ID_PREV',
    'SK_ID_PREV',
    'SK_ID_PREV',
    'SK_ID_PREV'
]

print("\nüìä Complete Dataset Overview:")
print(datasets_info.to_string(index=False))

# Total statistics
print(f"\nüìà Aggregate Statistics:")
print(f"   ‚Üí Total Records Across All Tables: {datasets_info['Rows'].sum():,}")
print(f"   ‚Üí Total Features: {datasets_info['Columns'].sum()}")
print(f"   ‚Üí Total Memory Usage: {datasets_info['Memory_MB'].sum():.2f} MB")

# Save summary
datasets_info.to_csv(r"D:\Nithilan\SEM 4\Hackathons\Zenith\reports\datasets_summary.csv", index=False)
print("\n‚úì Dataset summary saved to reports/datasets_summary.csv")


STEP 7: COMPREHENSIVE DATASET PROFILING

üìä Complete Dataset Overview:
              Dataset     Rows  Columns  Memory_MB              Primary_Key
    application_train   307511      122     536.69               SK_ID_CURR
               bureau  1716428       17     512.11 SK_ID_CURR, SK_ID_BUREAU
       bureau_balance 27299925        3    1926.61             SK_ID_BUREAU
 previous_application  1670214       37    1900.63   SK_ID_CURR, SK_ID_PREV
     POS_CASH_balance 10001358        8    1137.25               SK_ID_PREV
  credit_card_balance  3840312       23     875.69               SK_ID_PREV
installments_payments 13605401        8     830.41               SK_ID_PREV

üìà Aggregate Statistics:
   ‚Üí Total Records Across All Tables: 58,441,149
   ‚Üí Total Features: 218
   ‚Üí Total Memory Usage: 7719.39 MB

‚úì Dataset summary saved to reports/datasets_summary.csv


## 9. Data Relationship & Coverage Analysis

In [13]:
print("\n" + "="*100)
print("STEP 8: DATA RELATIONSHIP & COVERAGE ANALYSIS")
print("="*100)

# Analyze how many applications have data in each table
total_apps = len(train)

# Bureau coverage
apps_in_bureau = bureau['SK_ID_CURR'].nunique()
bureau_coverage = (apps_in_bureau / total_apps) * 100

# Previous application coverage
apps_in_prev = previous_app['SK_ID_CURR'].nunique()
prev_coverage = (apps_in_prev / total_apps) * 100

# POS/Cash coverage (through previous applications)
prev_ids_in_pos = pos_cash['SK_ID_PREV'].nunique()
total_prev_ids = previous_app['SK_ID_PREV'].nunique()
pos_coverage_of_prev = (prev_ids_in_pos / total_prev_ids) * 100 if total_prev_ids > 0 else 0

# Credit card coverage
prev_ids_in_cc = credit_card['SK_ID_PREV'].nunique()
cc_coverage_of_prev = (prev_ids_in_cc / total_prev_ids) * 100 if total_prev_ids > 0 else 0

# Installments coverage
prev_ids_in_inst = installments['SK_ID_PREV'].nunique()
inst_coverage_of_prev = (prev_ids_in_inst / total_prev_ids) * 100 if total_prev_ids > 0 else 0

coverage_analysis = pd.DataFrame({
    'Dataset': [
        'Bureau History',
        'Previous Applications',
        'POS/Cash Balance',
        'Credit Card Balance',
        'Installments Payments'
    ],
    'Unique_IDs': [
        apps_in_bureau,
        apps_in_prev,
        prev_ids_in_pos,
        prev_ids_in_cc,
        prev_ids_in_inst
    ],
    'Total_Records': [
        len(bureau),
        len(previous_app),
        len(pos_cash),
        len(credit_card),
        len(installments)
    ],
    'Coverage_Percentage': [
        bureau_coverage,
        prev_coverage,
        (prev_ids_in_pos / total_apps) * 100,
        (prev_ids_in_cc / total_apps) * 100,
        (prev_ids_in_inst / total_apps) * 100
    ],
    'Avg_Records_Per_ID': [
        len(bureau) / apps_in_bureau if apps_in_bureau > 0 else 0,
        len(previous_app) / apps_in_prev if apps_in_prev > 0 else 0,
        len(pos_cash) / prev_ids_in_pos if prev_ids_in_pos > 0 else 0,
        len(credit_card) / prev_ids_in_cc if prev_ids_in_cc > 0 else 0,
        len(installments) / prev_ids_in_inst if prev_ids_in_inst > 0 else 0
    ]
})

print("\nüìä Alternative Data Coverage Analysis:")
print(coverage_analysis.to_string(index=False))

# Check overlap
has_bureau = train['SK_ID_CURR'].isin(bureau['SK_ID_CURR'])
has_prev = train['SK_ID_CURR'].isin(previous_app['SK_ID_CURR'])

overlap_analysis = pd.DataFrame({
    'Scenario': [
        'No Alternative Data',
        'Only Bureau',
        'Only Previous Apps',
        'Both Bureau & Previous Apps'
    ],
    'Count': [
        (~has_bureau & ~has_prev).sum(),
        (has_bureau & ~has_prev).sum(),
        (~has_bureau & has_prev).sum(),
        (has_bureau & has_prev).sum()
    ]
})
overlap_analysis['Percentage'] = (overlap_analysis['Count'] / total_apps) * 100

print("\nüîó Alternative Data Overlap:")
print(overlap_analysis.to_string(index=False))

# Save coverage analysis
coverage_analysis.to_csv(r"D:\Nithilan\SEM 4\Hackathons\Zenith\reports\data_coverage.csv", index=False)
overlap_analysis.to_csv(r"D:\Nithilan\SEM 4\Hackathons\Zenith\reports\data_overlap_analysis.csv", index=False)

print("\n‚úì Coverage analysis saved")


STEP 8: DATA RELATIONSHIP & COVERAGE ANALYSIS

üìä Alternative Data Coverage Analysis:
              Dataset  Unique_IDs  Total_Records  Coverage_Percentage  Avg_Records_Per_ID
       Bureau History      305811        1716428                99.45                5.61
Previous Applications      338857        1670214               110.19                4.93
     POS/Cash Balance      936325       10001358               304.49               10.68
  Credit Card Balance      104307        3840312                33.92               36.82
Installments Payments      997752       13605401               324.46               13.64

üîó Alternative Data Overlap:
                   Scenario  Count  Percentage
        No Alternative Data   2470        0.80
                Only Bureau  13984        4.55
         Only Previous Apps  41550       13.51
Both Bureau & Previous Apps 249507       81.14

‚úì Coverage analysis saved


## 10. Key Insights & Preliminary Findings

In [14]:
print("\n" + "="*100)
print("STEP 9: KEY INSIGHTS & PRELIMINARY FINDINGS")
print("="*100)

insights = []

# Insight 1: Default rate
insights.append(f"1. DEFAULT RATE: {default_rate:.2f}% - Indicates class imbalance requiring SMOTE/balancing")

# Insight 2: Missing data
high_missing_cols = len(missing_df[missing_df['Missing_Percentage'] > 50])
insights.append(f"2. MISSING DATA: {high_missing_cols} columns have >50% missing values - need robust imputation")

# Insight 3: Alternative data coverage
insights.append(f"3. ALTERNATIVE DATA: {bureau_coverage:.1f}% have bureau history, {prev_coverage:.1f}% have previous applications")

# Insight 4: No alternative data segment
no_alt_data = (~has_bureau & ~has_prev).sum()
no_alt_data_pct = (no_alt_data / total_apps) * 100
insights.append(f"4. TARGET SEGMENT: {no_alt_data:,} applications ({no_alt_data_pct:.1f}%) have NO alternative data")

# Insight 5: Data richness
total_records = datasets_info['Rows'].sum()
insights.append(f"5. DATA RICHNESS: {total_records:,} total records across all tables for feature engineering")

# Insight 6: POS/Cash as utility proxy
pos_records = len(pos_cash)
insights.append(f"6. UTILITY PROXY: {pos_records:,} POS/Cash balance records can proxy utility payment behavior")

# Print insights
print("\nüí° KEY INSIGHTS:")
for insight in insights:
    print(f"   {insight}")

# Save insights
with open(r"D:\Nithilan\SEM 4\Hackathons\Zenith\reports\preliminary_insights.txt", 'w') as f:
    f.write("PRELIMINARY INSIGHTS - DATA EXPLORATION\n")
    f.write("=" * 80 + "\n\n")
    for insight in insights:
        f.write(insight + "\n")

print("\n‚úì Insights saved to reports/preliminary_insights.txt")


STEP 9: KEY INSIGHTS & PRELIMINARY FINDINGS

üí° KEY INSIGHTS:
   1. DEFAULT RATE: 8.07% - Indicates class imbalance requiring SMOTE/balancing
   2. MISSING DATA: 41 columns have >50% missing values - need robust imputation
   3. ALTERNATIVE DATA: 99.4% have bureau history, 110.2% have previous applications
   4. TARGET SEGMENT: 2,470 applications (0.8%) have NO alternative data
   5. DATA RICHNESS: 58,441,149 total records across all tables for feature engineering
   6. UTILITY PROXY: 10,001,358 POS/Cash balance records can proxy utility payment behavior

‚úì Insights saved to reports/preliminary_insights.txt


## 11. Generate Comprehensive Summary Report

In [15]:
print("\n" + "="*100)
print("STEP 10: GENERATING COMPREHENSIVE SUMMARY REPORT")
print("="*100)

# Create comprehensive statistical summary
summary_stats = pd.DataFrame({
    'Metric': [
        'Total Applications',
        'Total Features (Main)',
        'Default Rate (%)',
        'Repayment Rate (%)',
        'Class Imbalance Ratio',
        'Numeric Features',
        'Categorical Features',
        'Columns with Missing Data',
        'Columns with >50% Missing',
        'Applications with Bureau Data (%)',
        'Applications with Previous Loans (%)',
        'Applications with NO Alternative Data (%)',
        'Total Records (All Tables)',
        'Total Memory Usage (MB)',
        'POS/Cash Balance Records',
        'Installment Payment Records'
    ],
    'Value': [
        f"{len(train):,}",
        f"{train.shape[1]}",
        f"{default_rate:.2f}",
        f"{repayment_rate:.2f}",
        f"1:{int(target_dist[0]/target_dist[1])}",
        f"{len(numeric_cols)}",
        f"{len(categorical_cols)}",
        f"{len(missing_df)}",
        f"{len(missing_df[missing_df['Missing_Percentage'] > 50])}",
        f"{bureau_coverage:.1f}",
        f"{prev_coverage:.1f}",
        f"{no_alt_data_pct:.1f}",
        f"{datasets_info['Rows'].sum():,}",
        f"{datasets_info['Memory_MB'].sum():.2f}",
        f"{len(pos_cash):,}",
        f"{len(installments):,}"
    ]
})

print("\nüìä COMPREHENSIVE STATISTICAL SUMMARY:")
print(summary_stats.to_string(index=False))

# Save summary
summary_stats.to_csv(r"D:\Nithilan\SEM 4\Hackathons\Zenith\reports\statistical_summary.csv", index=False)

print("\n‚úì Statistical summary saved to reports/statistical_summary.csv")


STEP 10: GENERATING COMPREHENSIVE SUMMARY REPORT

üìä COMPREHENSIVE STATISTICAL SUMMARY:
                                   Metric      Value
                       Total Applications    307,511
                    Total Features (Main)        122
                         Default Rate (%)       8.07
                       Repayment Rate (%)      91.93
                    Class Imbalance Ratio       1:11
                         Numeric Features        106
                     Categorical Features         16
                Columns with Missing Data         67
                Columns with >50% Missing         41
        Applications with Bureau Data (%)       99.4
     Applications with Previous Loans (%)      110.2
Applications with NO Alternative Data (%)        0.8
               Total Records (All Tables) 58,441,149
                  Total Memory Usage (MB)    7719.39
                 POS/Cash Balance Records 10,001,358
              Installment Payment Records 13,605,401

‚úì Sta

## 12. Analysis Complete - Next Steps

In [16]:
print("\n" + "="*100)
print("üìã DAY 1 DATA EXPLORATION COMPLETE")
print("="*100)

print("\n‚úÖ COMPLETED TASKS:")
print("   1. ‚úì Loaded and profiled 7 datasets")
print("   2. ‚úì Analyzed target variable distribution")
print("   3. ‚úì Comprehensive missing value analysis")
print("   4. ‚úì Data type and structure analysis")
print("   5. ‚úì Alternative data coverage analysis")
print("   6. ‚úì Generated 8 detailed reports")

print("\nüìÅ GENERATED REPORTS:")
reports = [
    'column_descriptions.csv',
    'target_distribution.csv',
    'categorical_features_summary.csv',
    'numeric_features_statistics.csv',
    'missing_values_analysis.csv',
    'datasets_summary.csv',
    'data_coverage.csv',
    'data_overlap_analysis.csv',
    'preliminary_insights.txt',
    'statistical_summary.csv'
]
for i, report in enumerate(reports, 1):
    print(f"   {i:2d}. reports/{report}")

print("\nüéØ NEXT STEPS (Day 2):")
print("   ‚Üí Run 02_eda_analysis.ipynb for visualizations")
print("   ‚Üí Create demographic analysis charts")
print("   ‚Üí Financial feature distributions")
print("   ‚Üí Correlation analysis")
print("   ‚Üí Alternative data insights")

print(f"\n‚è±Ô∏è  Analysis completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*100)


üìã DAY 1 DATA EXPLORATION COMPLETE

‚úÖ COMPLETED TASKS:
   1. ‚úì Loaded and profiled 7 datasets
   2. ‚úì Analyzed target variable distribution
   3. ‚úì Comprehensive missing value analysis
   4. ‚úì Data type and structure analysis
   5. ‚úì Alternative data coverage analysis
   6. ‚úì Generated 8 detailed reports

üìÅ GENERATED REPORTS:
    1. reports/column_descriptions.csv
    2. reports/target_distribution.csv
    3. reports/categorical_features_summary.csv
    4. reports/numeric_features_statistics.csv
    5. reports/missing_values_analysis.csv
    6. reports/datasets_summary.csv
    7. reports/data_coverage.csv
    8. reports/data_overlap_analysis.csv
    9. reports/preliminary_insights.txt
   10. reports/statistical_summary.csv

üéØ NEXT STEPS (Day 2):
   ‚Üí Run 02_eda_analysis.ipynb for visualizations
   ‚Üí Create demographic analysis charts
   ‚Üí Financial feature distributions
   ‚Üí Correlation analysis
   ‚Üí Alternative data insights

‚è±Ô∏è  Analysis completed