In [3]:
# !pip install missingno

## Libraries

In [32]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import missingno as msno
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

### Directory for saving visuals

In [31]:
# Create a directory for saving visuals
output_dir = "missingness_analysis_visuals"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created directory: {output_dir}")
else:
    print(f"Directory already exists: {output_dir}")

# Create subdirectories for better organization
subdirs = ['matrix', 'bar_charts', 'heatmaps', 'dendrograms', 'individual_level', 'temporal']
for subdir in subdirs:
    subdir_path = os.path.join(output_dir, subdir)
    if not os.path.exists(subdir_path):
        os.makedirs(subdir_path)

# Set higher quality parameters for all plots
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['savefig.bbox'] = 'tight'
plt.rcParams['savefig.pad_inches'] = 0.1
plt.rcParams['figure.figsize'] = (12, 8)

Created directory: missingness_analysis_visuals


## Load the data

In [13]:
df = pd.read_csv("cleaned_data/df_processed.csv")
df_copy = df.copy()

### Subset the dataframe

In [14]:
df = df[[
    'individual_id','age', 'bmi'
    , 'hiv_status_derived', 'hiv_status_derived_age', 'hpt_status_derived', 'hpt_status_derived_age', 'diab_status_derived', 'diab_status_derived_age',
    'obese_status_derived', 'tb_status_derived'
    ,'stroke_status_derived', 'stroke_status_derived_age'
]]

In [43]:
# # Basic information
# print("Dataset Shape:", df.shape)
# print("\nFirst 5 rows:")
# print(df.head())

# print("\nData Types:")
# print(df.dtypes)

# print("\nBasic Statistics:")
# print(df.describe())

## Comprehensive Missingness Analysis

In [23]:
def comprehensive_missingness_analysis(df):
    """Comprehensive missing value analysis"""
    
    print("="*50)
    print("COMPREHENSIVE MISSINGNESS ANALYSIS")
    print("="*50)
    
    # 1. Overall missingness
    total_missing = df.isnull().sum().sum()
    total_cells = df.size
    missing_percentage = (total_missing / total_cells) * 100
    
    print(f"Total missing values: {total_missing}")
    print(f"Percentage of missing data: {missing_percentage:.2f}%")
    
    # 2. Missing values by column
    missing_by_column = df.isnull().sum()
    missing_percentage_by_column = (missing_by_column / len(df)) * 100
    
    print("\nMissing values by column:")
    missing_info = pd.DataFrame({
        'Missing_Count': missing_by_column,
        'Missing_Percentage': missing_percentage_by_column,
        'Data_Type': df.dtypes
    })
    missing_info = missing_info[missing_info['Missing_Count'] > 0]
    
    if len(missing_info) > 0:
        print(missing_info.sort_values('Missing_Percentage', ascending=False))
    else:
        print("No missing values found in any column!")
    
    return missing_info

# Analysis
missing_info = comprehensive_missingness_analysis(df)

COMPREHENSIVE MISSINGNESS ANALYSIS
Total missing values: 52082
Percentage of missing data: 8.57%

Missing values by column:
                      Missing_Count  Missing_Percentage Data_Type
tb_status_derived             24072           51.469991   float64
bmi                           17310           37.011696   float64
obese_status_derived          10700           22.878402   float64


### Visualize Missing Data Patterns

In [33]:
def visualize_missingness_with_saving(df, prefix=""):
    """Create visualizations for missing data patterns and save them"""
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # 1. Matrix plot
    plt.figure(figsize=(14, 8))
    msno.matrix(df)
    plt.title('Missing Data Matrix Pattern', fontsize=16, pad=20)
    plt.tight_layout()
    filename = f"missingness_matrix_{prefix}_{timestamp}.png"
    plt.savefig(os.path.join(output_dir, 'matrix', filename), 
                dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved matrix plot: {filename}")
    
    # 2. Bar plot of missing values
    plt.figure(figsize=(14, 8))
    missing_counts = df.isnull().sum().sort_values(ascending=False)
    missing_counts = missing_counts[missing_counts > 0]
    
    if len(missing_counts) > 0:
        ax = missing_counts.plot(kind='bar', color='skyblue', edgecolor='black')
        plt.title('Missing Values by Column', fontsize=16, pad=20)
        plt.xlabel('Columns', fontsize=12)
        plt.ylabel('Number of Missing Values', fontsize=12)
        plt.xticks(rotation=45, ha='right')
        
        # Add value labels on bars
        for i, v in enumerate(missing_counts):
            ax.text(i, v + 0.01 * max(missing_counts), str(v), 
                   ha='center', va='bottom', fontsize=10)
        
        plt.tight_layout()
        filename = f"missingness_barchart_{prefix}_{timestamp}.png"
        plt.savefig(os.path.join(output_dir, 'bar_charts', filename), 
                    dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved bar chart: {filename}")
    
    # 3. Heatmap of missing data correlation
    plt.figure(figsize=(12, 10))
    msno.heatmap(df, cmap='RdYlBu_r')
    plt.title('Correlation of Missingness Between Variables', fontsize=16, pad=20)
    plt.tight_layout()
    filename = f"missingness_heatmap_{prefix}_{timestamp}.png"
    plt.savefig(os.path.join(output_dir, 'heatmaps', filename), 
                dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved heatmap: {filename}")
    
    # 4. Dendrogram for missing data patterns
    plt.figure(figsize=(14, 8))
    msno.dendrogram(df)
    plt.title('Dendrogram of Missing Data Patterns', fontsize=16, pad=20)
    plt.tight_layout()
    filename = f"missingness_dendrogram_{prefix}_{timestamp}.png"
    plt.savefig(os.path.join(output_dir, 'dendrograms', filename), 
                dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved dendrogram: {filename}")

# Run the enhanced visualization
visualize_missingness_with_saving(df, prefix="full_dataset")

Saved matrix plot: missingness_matrix_full_dataset_20250826_105342.png
Saved bar chart: missingness_barchart_full_dataset_20250826_105342.png
Saved heatmap: missingness_heatmap_full_dataset_20250826_105342.png
Saved dendrogram: missingness_dendrogram_full_dataset_20250826_105342.png


<Figure size 4200x2400 with 0 Axes>

<Figure size 3600x3000 with 0 Axes>

<Figure size 4200x2400 with 0 Axes>

In [35]:
df.columns

Index(['individual_id', 'age', 'bmi', 'hiv_status_derived',
       'hiv_status_derived_age', 'hpt_status_derived',
       'hpt_status_derived_age', 'diab_status_derived',
       'diab_status_derived_age', 'obese_status_derived', 'tb_status_derived',
       'stroke_status_derived', 'stroke_status_derived_age'],
      dtype='object')

## Individual Level missingness

### Create individual profiles

In [36]:
def create_individual_missingness_profile(df, patient_id_col='patient_id'):
    """
    Create a dataframe with missingness metrics for each individual patient
    """
    # Group by patient and calculate missingness metrics
    individual_profiles = df.groupby(patient_id_col).apply(
        lambda x: pd.Series({
            'total_records': len(x),
            'total_missing_values': x.isnull().sum().sum(),
            'missing_percentage': (x.isnull().sum().sum() / (len(x) * len(x.columns))) * 100,
            'has_stroke': x['stroke'].max() if 'stroke' in x.columns else 0,
            'first_record_date': x['date'].min() if 'date' in x.columns else None,
            'last_record_date': x['date'].max() if 'date' in x.columns else None,
            'records_with_complete_data': len(x[x.isnull().sum(axis=1) == 0]),
            'records_with_any_missing': len(x[x.isnull().sum(axis=1) > 0])
        })
    ).reset_index()
    
    # Add column-specific missingness (example for key variables)
    key_columns = ['age', 'bmi', 'hypertension', 'avg_glucose_level', 'smoking_status']
    for col in key_columns:
        if col in df.columns:
            col_missing = df.groupby(patient_id_col)[col].apply(
                lambda x: (x.isnull().sum() / len(x)) * 100
            )
            individual_profiles[f'{col}_missing_pct'] = individual_profiles[patient_id_col].map(
                col_missing.to_dict()
            )
    
    return individual_profiles

# Create individual profiles - ADJUST patient_id_col IF NEEDED!
individual_profiles = create_individual_missingness_profile(df, patient_id_col='individual_id')

In [37]:
def analyze_by_record_count_with_saving(individual_profiles, prefix=""):
    """Analyze how missingness relates to number of records per patient and save visuals"""
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Categorize patients by number of records
    individual_profiles['record_category'] = pd.cut(
        individual_profiles['total_records'],
        bins=[0, 1, 2, 5, 10, np.inf],
        labels=['1 record', '2 records', '3-5 records', '6-10 records', '10+ records']
    )
    
    # Create visualization
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
    
    # Missingness by record category
    sns.boxplot(data=individual_profiles, x='record_category', y='missing_percentage', 
                ax=ax1, palette='viridis')
    ax1.set_title('Missing Data Percentage by Number of Records', fontsize=14, pad=15)
    ax1.set_xlabel('Number of Records', fontsize=12)
    ax1.set_ylabel('Missing Percentage', fontsize=12)
    ax1.tick_params(axis='x', rotation=45)
    
    # Stroke prevalence by record category
    stroke_by_records = individual_profiles.groupby('record_category')['has_stroke'].mean()
    bars = ax2.bar(range(len(stroke_by_records)), stroke_by_records.values, 
                   color='lightcoral', edgecolor='darkred', alpha=0.8)
    ax2.set_title('Stroke Prevalence by Number of Records', fontsize=14, pad=15)
    ax2.set_xlabel('Number of Records', fontsize=12)
    ax2.set_ylabel('Proportion with Stroke', fontsize=12)
    ax2.set_xticks(range(len(stroke_by_records)))
    ax2.set_xticklabels(stroke_by_records.index, rotation=45)
    
    # Add value labels on bars
    for i, v in enumerate(stroke_by_records.values):
        ax2.text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom', fontsize=10)
    
    plt.tight_layout()
    filename = f"record_count_analysis_{prefix}_{timestamp}.png"
    plt.savefig(os.path.join(output_dir, 'individual_level', filename), 
                dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved record count analysis: {filename}")
    
    return individual_profiles

# Run enhanced analysis
analyze_by_record_count_with_saving(individual_profiles, prefix="stroke_data")

posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values


Saved record count analysis: record_count_analysis_stroke_data_20250826_112456.png


Unnamed: 0,individual_id,total_records,total_missing_values,missing_percentage,has_stroke,first_record_date,last_record_date,records_with_complete_data,records_with_any_missing,age_missing_pct,bmi_missing_pct,record_category
0,000025C5-5811-4942-8957-8A1A4FF1460F,2.0,3.0,11.538462,0.0,,,0.0,2.0,0.0,50.0,2 records
1,0004A623-93CA-4D53-B10C-805B27EFD98F,1.0,1.0,7.692308,0.0,,,0.0,1.0,0.0,0.0,1 record
2,000D523D-F549-40B3-B8E1-2FF517084EB0,2.0,3.0,11.538462,0.0,,,0.0,2.0,0.0,50.0,2 records
3,000E6165-4313-43DE-BBFE-87581AB48FD4,3.0,1.0,2.564103,0.0,,,2.0,1.0,0.0,0.0,3-5 records
4,000EE8FA-5B5B-4917-9CF9-D15A41AF1765,2.0,3.0,11.538462,0.0,,,0.0,2.0,0.0,50.0,2 records
...,...,...,...,...,...,...,...,...,...,...,...,...
25481,LZYHV,1.0,2.0,15.384615,0.0,,,0.0,1.0,0.0,100.0,1 record
25482,LZYMB,2.0,0.0,0.000000,0.0,,,2.0,0.0,0.0,0.0,2 records
25483,LZZHL,1.0,3.0,23.076923,0.0,,,0.0,1.0,0.0,100.0,1 record
25484,LZZOV,2.0,1.0,3.846154,0.0,,,1.0,1.0,0.0,0.0,2 records


### Comparison visuals

In [38]:
def compare_single_vs_multi_record_with_saving(individual_profiles, prefix=""):
    """Compare data completeness and save visuals"""
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    single_record = individual_profiles[individual_profiles['total_records'] == 1]
    multi_record = individual_profiles[individual_profiles['total_records'] > 1]
    
    # Create comparison visualization
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 6))
    
    # Missingness comparison
    comparison_data = pd.DataFrame({
        'Single Record': single_record['missing_percentage'],
        'Multiple Records': multi_record['missing_percentage']
    })
    
    sns.boxplot(data=comparison_data, ax=ax1, palette='pastel')
    ax1.set_title('Missing Data Percentage Comparison', fontsize=14, pad=15)
    ax1.set_ylabel('Missing Percentage', fontsize=12)
    
    # Stroke prevalence comparison
    stroke_single = single_record['has_stroke'].mean()
    stroke_multi = multi_record['has_stroke'].mean()
    
    stroke_comparison = pd.DataFrame({
        'Group': ['Single Record', 'Multiple Records'],
        'Stroke Prevalence': [stroke_single, stroke_multi]
    })
    
    bars = sns.barplot(data=stroke_comparison, x='Group', y='Stroke Prevalence', 
                       ax=ax2, palette='coolwarm', alpha=0.8)
    ax2.set_title('Stroke Prevalence Comparison', fontsize=14, pad=15)
    ax2.set_ylabel('Proportion with Stroke', fontsize=12)
    
    # Add value labels on bars
    for i, v in enumerate([stroke_single, stroke_multi]):
        ax2.text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom', fontsize=12)
    
    # Record count distribution
    record_counts = [len(single_record), len(multi_record)]
    ax3.pie(record_counts, labels=['Single Record', 'Multiple Records'], 
            autopct='%1.1f%%', colors=['lightblue', 'lightcoral'])
    ax3.set_title('Distribution of Patients by Record Count', fontsize=14, pad=15)
    
    plt.tight_layout()
    filename = f"single_vs_multi_comparison_{prefix}_{timestamp}.png"
    plt.savefig(os.path.join(output_dir, 'individual_level', filename), 
                dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved single vs multi comparison: {filename}")
    
    return single_record, multi_record

# Run enhanced comparison
single_rec_patients, multi_rec_patients = compare_single_vs_multi_record_with_saving(individual_profiles)

Saved single vs multi comparison: single_vs_multi_comparison__20250826_112658.png


### Comprehensive summary (dashboard)

In [39]:
def create_summary_dashboard(df, individual_profiles, prefix=""):
    """Create a comprehensive summary dashboard and save it"""
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    fig = plt.figure(figsize=(20, 16))
    
    # Create subplot grid
    gs = fig.add_gridspec(3, 2, hspace=0.3, wspace=0.3)
    
    # Plot 1: Overall missingness by column
    ax1 = fig.add_subplot(gs[0, 0])
    missing_counts = df.isnull().sum().sort_values(ascending=False)
    missing_counts = missing_counts[missing_counts > 0]
    missing_counts.plot(kind='bar', ax=ax1, color='lightsteelblue', edgecolor='black')
    ax1.set_title('A. Missing Values by Column', fontsize=14, pad=15)
    ax1.tick_params(axis='x', rotation=45)
    
    # Plot 2: Missingness correlation heatmap
    ax2 = fig.add_subplot(gs[0, 1])
    msno.heatmap(df, cmap='coolwarm', ax=ax2)
    ax2.set_title('B. Missingness Correlation Heatmap', fontsize=14, pad=15)
    
    # Plot 3: Record count distribution
    ax3 = fig.add_subplot(gs[1, 0])
    record_counts = individual_profiles['total_records'].value_counts().sort_index()
    record_counts.plot(kind='bar', ax=ax3, color='lightgreen', edgecolor='black')
    ax3.set_title('C. Distribution of Records per Patient', fontsize=14, pad=15)
    ax3.set_xlabel('Number of Records')
    ax3.set_ylabel('Number of Patients')
    
    # Plot 4: Missingness by record count
    ax4 = fig.add_subplot(gs[1, 1])
    sns.boxplot(data=individual_profiles, x='record_category', y='missing_percentage', 
                ax=ax4, palette='viridis')
    ax4.set_title('D. Missingness by Record Count Category', fontsize=14, pad=15)
    ax4.tick_params(axis='x', rotation=45)
    
    # Plot 5: Stroke prevalence comparison
    ax5 = fig.add_subplot(gs[2, 0])
    single_record = individual_profiles[individual_profiles['total_records'] == 1]
    multi_record = individual_profiles[individual_profiles['total_records'] > 1]
    
    stroke_data = [single_record['has_stroke'].mean(), multi_record['has_stroke'].mean()]
    bars = ax5.bar(['Single Record', 'Multiple Records'], stroke_data, 
                   color=['lightblue', 'lightcoral'], edgecolor='black', alpha=0.8)
    ax5.set_title('E. Stroke Prevalence by Record Count', fontsize=14, pad=15)
    ax5.set_ylabel('Proportion with Stroke')
    
    # Add value labels
    for i, v in enumerate(stroke_data):
        ax5.text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom', fontsize=12)
    
    # Plot 6: Text summary
    ax6 = fig.add_subplot(gs[2, 1])
    ax6.axis('off')
    
    # Create summary text
    summary_text = [
        "SUMMARY STATISTICS:",
        f"Total patients: {len(individual_profiles):,}",
        f"Total records: {df.shape[0]:,}",
        f"Overall missingness: {df.isnull().sum().sum() / df.size * 100:.1f}%",
        f"Single-record patients: {len(single_record):,} ({len(single_record)/len(individual_profiles)*100:.1f}%)",
        f"Multi-record patients: {len(multi_record):,} ({len(multi_record)/len(individual_profiles)*100:.1f}%)",
        f"Stroke prevalence (single): {stroke_data[0]:.3f}",
        f"Stroke prevalence (multi): {stroke_data[1]:.3f}"
    ]
    
    ax6.text(0.1, 0.9, "\n".join(summary_text), transform=ax6.transAxes, 
             fontsize=12, va='top', linespacing=1.5,
             bbox=dict(boxstyle="round,pad=0.5", facecolor="lightgray", alpha=0.7))
    
    plt.suptitle(f'Missingness Analysis Dashboard - Stroke Dataset\n{timestamp}', 
                 fontsize=16, y=0.98)
    plt.tight_layout()
    
    filename = f"summary_dashboard_{prefix}_{timestamp}.png"
    plt.savefig(os.path.join(output_dir, filename), dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved summary dashboard: {filename}")

# Create and save the dashboard
create_summary_dashboard(df, individual_profiles, prefix="stroke_analysis")

Saved summary dashboard: summary_dashboard_stroke_analysis_20250826_112923.png


### Exporting the data in Excel

In [40]:
def export_analysis_to_excel(df, individual_profiles, missing_info):
    """Export all analysis results to an Excel file with multiple sheets"""
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    excel_filename = f"missingness_analysis_report_{timestamp}.xlsx"
    excel_path = os.path.join(output_dir, excel_filename)
    
    with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
        # Raw data with missingness indicators
        df_with_missing = df.copy()
        for col in df.columns:
            df_with_missing[f'{col}_missing'] = df[col].isnull().astype(int)
        
        df_with_missing.to_excel(writer, sheet_name='Raw_Data_With_Missing_Flags', index=False)
        
        # Individual profiles
        individual_profiles.to_excel(writer, sheet_name='Individual_Profiles', index=False)
        
        # Missingness by column
        missing_by_column = pd.DataFrame({
            'column': df.columns,
            'missing_count': df.isnull().sum(),
            'missing_percentage': (df.isnull().sum() / len(df)) * 100,
            'data_type': df.dtypes
        }).sort_values('missing_percentage', ascending=False)
        
        missing_by_column.to_excel(writer, sheet_name='Missingness_By_Column', index=False)
        
        # Summary statistics
        summary_data = {
            'Metric': [
                'Total Patients',
                'Total Records', 
                'Average Records per Patient',
                'Total Missing Values',
                'Overall Missing Percentage',
                'Single-record Patients',
                'Multi-record Patients',
                'Stroke Prevalence (Overall)',
                'Stroke Prevalence (Single-record)',
                'Stroke Prevalence (Multi-record)'
            ],
            'Value': [
                len(individual_profiles),
                len(df),
                len(df) / len(individual_profiles),
                df.isnull().sum().sum(),
                (df.isnull().sum().sum() / df.size) * 100,
                len(individual_profiles[individual_profiles['total_records'] == 1]),
                len(individual_profiles[individual_profiles['total_records'] > 1]),
                individual_profiles['has_stroke'].mean(),
                individual_profiles[individual_profiles['total_records'] == 1]['has_stroke'].mean(),
                individual_profiles[individual_profiles['total_records'] > 1]['has_stroke'].mean()
            ]
        }
        
        pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary_Statistics', index=False)
    
    print(f"Exported comprehensive analysis to: {excel_path}")
    return excel_path

# Export to Excel
excel_file = export_analysis_to_excel(df, individual_profiles, missing_info)

Exported comprehensive analysis to: missingness_analysis_visuals/missingness_analysis_report_20250826_113056.xlsx


In [26]:
print(df.stroke_status_derived.value_counts())
df.obese_status_derived.value_counts()

stroke_status_derived
0    44748
1     2021
Name: count, dtype: int64


obese_status_derived
0.0    25706
1.0    10363
Name: count, dtype: int64

## Analyze Missing Data Mechanisms

In [18]:
def analyze_missing_mechanisms(df, target_column='stroke'):
    """Is missingness related to other variables"""
    
    print("="*50)
    print("MISSING DATA MECHANISM ANALYSIS")
    print("="*50)
    
    # Check if missingness is related to target variable (stroke)
    for column in df.columns:
        if df[column].isnull().sum() > 0:
            # Indicator for missing values
            missing_indicator = df[column].isnull().astype(int)
            
            if target_column in df.columns:
                # Comparing target distribution between missing and non-missing
                non_missing_target = df.loc[~df[column].isnull(), target_column]
                missing_target = df.loc[df[column].isnull(), target_column]
                
                if len(missing_target) > 0 and len(non_missing_target) > 0:
                    # T-test for continuous target, chi-square for categorical
                    if df[target_column].dtype in ['int64', 'float64']:
                        t_stat, p_value = stats.ttest_ind(
                            non_missing_target.dropna(),
                            missing_target.dropna(),
                            equal_var=False
                        )
                        print(f"{column}: t-stat={t_stat:.3f}, p-value={p_value:.3f}")
                    else:
                        # Chi-square test (for categorical target) 
                        contingency_table = pd.crosstab(missing_indicator, df[target_column])
                        chi2, p_value, _, _ = stats.chi2_contingency(contingency_table)
                        print(f"{column}: chi2={chi2:.3f}, p-value={p_value:.3f}")
            
            # Correlation with other variables
            print(f"\nCorrelation of missingness in {column} with other variables:")
            for other_col in df.columns:
                if other_col != column and df[other_col].dtype in ['int64', 'float64']:
                    correlation = df[other_col].corr(missing_indicator)
                    if not pd.isna(correlation) and abs(correlation) > 0.1:
                        print(f"  {other_col}: correlation={correlation:.3f}")

analyze_missing_mechanisms(df)

MISSING DATA MECHANISM ANALYSIS

Correlation of missingness in bmi with other variables:
  age: correlation=-0.211
  hiv_status_derived: correlation=-0.200
  hiv_status_derived_age: correlation=-0.194
  hpt_status_derived_age: correlation=-0.159
  diab_status_derived_age: correlation=-0.204
  obese_status_derived: correlation=-0.197
  stroke_status_derived_age: correlation=-0.209

Correlation of missingness in obese_status_derived with other variables:
  age: correlation=0.110
  bmi: correlation=-0.897
  hiv_status_derived: correlation=0.100
  hiv_status_derived_age: correlation=0.112
  hpt_status_derived: correlation=0.229
  hpt_status_derived_age: correlation=0.124
  diab_status_derived: correlation=0.231
  diab_status_derived_age: correlation=0.112
  tb_status_derived: correlation=0.129
  stroke_status_derived_age: correlation=0.110

Correlation of missingness in tb_status_derived with other variables:
  age: correlation=-0.341
  hiv_status_derived: correlation=-0.120
  hiv_status_d

### Advanced Missingness Analysis for Stroke Data

In [19]:
def stroke_specific_missingness_analysis(df):
    """Stroke-specific missingness analysis"""
    
    # Analyze missingness patterns by stroke status
    if 'stroke' in df.columns:
        print("Missingness patterns by stroke status:")
        
        stroke_missing = df[df['stroke'] == 1].isnull().mean() * 100
        no_stroke_missing = df[df['stroke'] == 0].isnull().mean() * 100
        
        missing_comparison = pd.DataFrame({
            'Stroke_Patients_Missing': stroke_missing,
            'Non_Stroke_Patients_Missing': no_stroke_missing,
            'Difference': stroke_missing - no_stroke_missing
        })
        
        # Filtering columns with meaningful differences
        meaningful_diff = missing_comparison[
            (abs(missing_comparison['Difference']) > 2) & 
            ((stroke_missing > 5) | (no_stroke_missing > 5))
        ]
        
        if len(meaningful_diff) > 0:
            print("Significant differences in missingness patterns:")
            print(meaningful_diff.sort_values('Difference', ascending=False))
        else:
            print("No significant differences in missingness patterns between stroke and non-stroke patients")

stroke_specific_missingness_analysis(df)

### Generate Missingness Report

In [28]:
def generate_missingness_report(df):
    """Comprehensive missingness report"""
    
    report = {
        'total_rows': len(df),
        'total_columns': len(df.columns),
        'total_missing_values': df.isnull().sum().sum(),
        'overall_missing_percentage': (df.isnull().sum().sum() / df.size) * 100,
        'columns_with_missing': [],
        'missingness_patterns': {}
    }
    
    # Analyzing columns with missing values
    for column in df.columns:
        missing_count = df[column].isnull().sum()
        if missing_count > 0:
            column_info = {
                'column_name': column,
                'data_type': str(df[column].dtype),
                'missing_count': missing_count,
                'missing_percentage': (missing_count / len(df)) * 100,
                'unique_values': df[column].nunique() if df[column].dtype == 'object' else None
            }
            report['columns_with_missing'].append(column_info)
    
    # Saving the report
    report_df = pd.DataFrame(report['columns_with_missing'])
    if len(report_df) > 0:
        report_df.to_csv('stroke_output/missingness_report.csv', index=False)
        print("Missingness report saved to 'missingness_report.csv'")
    
    return report

# Generating & displaying results
missingness_report = generate_missingness_report(df)
print("\nMissingness Report Summary:")
print(f"Total rows: {missingness_report['total_rows']}")
print(f"Total missing values: {missingness_report['total_missing_values']}")
print(f"Overall missing percentage: {missingness_report['overall_missing_percentage']:.2f}%")

Missingness report saved to 'missingness_report.csv'

Missingness Report Summary:
Total rows: 46769
Total missing values: 52082
Overall missing percentage: 8.57%
