In [3]:
# ==========================================
# STEP 1: IMPORT LIBRARIES
# ==========================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

# Display configuration: never truncate the column list when printing wide
# frames, and render every float with two decimal places.
pd.options.display.max_columns = None
pd.set_option('display.float_format', '{:.2f}'.format)


# ==========================================
# STEP 2: LOAD DATASETS
# ==========================================
# All four CSVs are read with the same options, in the order the names are
# listed on the left. low_memory=False disables pandas' chunked parsing so
# each column's dtype is inferred from the whole file.
fac, prof1, enr1, nfhs_5 = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        'UDISE Education Dataset/UDISE 2023-24/facility_data_All State_2023-24/100_fac.csv',
        'UDISE Education Dataset/UDISE 2023-24/profile_data_1_All State_2023-24/100_prof1.csv',
        'UDISE Education Dataset/UDISE 2023-24/enrolment_data_1_All State_2023-24/100_enr1.csv',
        'UDISE Education Dataset/UDISE 2023-24/NFHS_5_India_Districts_Factsheet_Data.csv',
    )
)

# Confirm the load and show each frame's (rows, columns) shape.
print("✅ Datasets Loaded")
print(f"fac: {fac.shape}, prof1: {prof1.shape}, enr1: {enr1.shape}, nfhs_5: {nfhs_5.shape}")


# Registry of every loaded DataFrame, keyed by a human-readable label that is
# used as the heading of its section in the report below.
datasets = {
    'Facility Data (fac)': fac,
    'Profile Data 1 (prof1)': prof1,
    'Enrollment Data 1 (enr1)': enr1,
    'nfhs 5 (nfhs_5)': nfhs_5,
}

# Print a metadata report for each dataset: shape, memory footprint, numbered
# column names, dtype counts, per-column missing-value counts, and the
# DataFrame.info() summary.
for name, df in datasets.items():
    print(f"\n{'='*80}")
    print(f"Dataset: {name}")
    print(f"{'='*80}")

    # Basic metadata
    print(f"\nShape: {df.shape[0]} rows × {df.shape[1]} columns")
    # deep=True inspects the contents of object columns instead of counting
    # only the pointers; the byte total is converted to MB.
    print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Column names, numbered from 1
    print(f"\nColumn Names ({len(df.columns)} total):")
    print("-" * 80)
    for i, col in enumerate(df.columns, 1):
        print(f"{i:3d}. {col}")

    # Data types summary: how many columns there are of each dtype
    print("\nData Types Summary:")
    print(df.dtypes.value_counts())

    # Missing values: list only the columns that actually contain nulls
    missing = df.isnull().sum()
    if missing.sum() > 0:
        print("\nColumns with Missing Values:")
        print(missing[missing > 0])
    else:
        print("\nNo missing values found.")

    # Basic info
    print("\nDetailed Info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so it is called directly; passing it to print() would add a stray
    # "None" line after every dataset.
    df.info()

print(f"\n{'='*80}")
print("Metadata extraction complete!")
print(f"{'='*80}")

✅ Datasets Loaded
fac: (1471891, 70), prof1: (1471891, 38), enr1: (8234734, 29), nfhs_5: (706, 109)

Dataset: Facility Data (fac)

Shape: 1471891 rows × 70 columns
Memory Usage: 786.07 MB

Column Names (70 total):
--------------------------------------------------------------------------------
  1. pseudocode
  2. building_status
  3. no_building_blocks
  4. pucca_building_blocks
  5. boundary_wall
  6. total_class_rooms
  7. other_rooms
  8. classrooms_in_good_condition
  9. classrooms_needs_minor_repair
 10. classrooms_needs_major_repair
 11. separate_room_for_hm
 12. total_boys_toilet
 13. total_boys_func_toilet
 14. total_girls_toilet
 15. total_girls_func_toilet
 16. total_boys_cwsn_toilet
 17. func_boys_cwsn_friendly
 18. total_girls_cwsn_toilet
 19. func_girls_cwsn_friendly
 20. urinal_boys
 21. urinal_girls
 22. handwash_near_toilet
 23. hand_pump_yn
 24. well_prot_yn
 25. tap_yn
 26. othsrc_yn
 27. well_unprot_yn
 28. pack_water_yn
 29. hand_pump_fun_yn
 30. well_prot_fun_yn
 