<a href="https://colab.research.google.com/github/evildead23151/UIDAI-Hackathon-Analysis/blob/main/04_feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports & Load Base Data

In [2]:
import pandas as pd
import numpy as np

BASE_IN = "/content/cleaned_updates"

enrol_df = pd.read_csv(f"{BASE_IN}/clean_enrolment.csv")
demo_df = pd.read_csv(f"{BASE_IN}/clean_demographic_updates.csv")
bio_df = pd.read_csv(f"{BASE_IN}/clean_biometric_updates.csv")

enrol_df.shape, demo_df.shape, bio_df.shape


  enrol_df = pd.read_csv(f"{BASE_IN}/clean_enrolment.csv")


((654553, 7), (169063, 6), (188367, 6))

In [3]:
import os
os.listdir("/content")
os.listdir("/content/cleaned_updates")


['clean_enrolment.csv',
 'demographic_missingness_summary.csv',
 'clean_demographic_updates.csv',
 'enrolment_missingness_summary.csv',
 'clean_biometric_updates.csv',
 'biometric_missingness_summary.csv']

# Fix Demographic Schema

In [4]:
demo_df = pd.read_csv(
    f"{BASE_IN}/clean_demographic_updates.csv",
    header=None,
    skiprows=1,
    names=["date", "state", "district", "pincode", "demo_val_1", "demo_val_2"]
)


# Parse Date Explicitly (time re-introduced here)

In [5]:
enrol_df["date"] = pd.to_datetime(enrol_df["date"], errors="coerce")
demo_df["date"] = pd.to_datetime(demo_df["date"], errors="coerce")
bio_df["date"] = pd.to_datetime(bio_df["date"], errors="coerce")


# Resolve Geography Columns (robust, reused)

In [6]:
def resolve_geo_columns(df):
    state = [c for c in df.columns if "state" in c.lower()][0]
    district = [c for c in df.columns if "district" in c.lower()][0]
    return state, district

enrol_state, enrol_district = resolve_geo_columns(enrol_df)
demo_state, demo_district = resolve_geo_columns(demo_df)
bio_state, bio_district = resolve_geo_columns(bio_df)


# Monthly Aggregation (Time Backbone)

In [9]:
def monthly_aggregate(df, state_col, district_col):
    df = df.copy()
    df = df[df["date"].notna()]

    num_cols = df.select_dtypes("number").columns

    out = (
        df
        .set_index("date")
        .groupby([state_col, district_col])
        .resample("M")[num_cols]
        .sum()
        .reset_index()
    )

    out["total_activity"] = out[num_cols].sum(axis=1)
    return out


In [10]:
enrol_m = monthly_aggregate(enrol_df, enrol_state, enrol_district)
demo_m = monthly_aggregate(demo_df, demo_state, demo_district)
bio_m = monthly_aggregate(bio_df, bio_state, bio_district)


  .resample("M")[num_cols]
  .resample("M")[num_cols]
  .resample("M")[num_cols]


In [11]:
enrol_m.head()
demo_m.head()
bio_m.head()


Unnamed: 0,state,district,date,pincode,bio_age_5_17,bio_age_17_,total_activity
0,ANDAMAN & NICOBAR ISLANDS,ANDAMANS,2025-01-31,8929344.0,89.0,904.0,8930337.0
1,ANDAMAN & NICOBAR ISLANDS,ANDAMANS,2025-02-28,4464632.0,3.0,21.0,4464656.0
2,ANDAMAN & NICOBAR ISLANDS,ANDAMANS,2025-03-31,4464634.0,3.0,19.0,4464656.0
3,ANDAMAN & NICOBAR ISLANDS,ANDAMANS,2025-04-30,1488207.0,3.0,8.0,1488218.0
4,ANDAMAN & NICOBAR ISLANDS,NICOBARS,2025-01-31,744301.0,1.0,0.0,744302.0


# Merge Monthly Streams

In [12]:
monthly = (
    enrol_m
    .merge(
        demo_m,
        on=["date", enrol_state, enrol_district],
        how="left",
        suffixes=("_enrol", "_demo")
    )
    .merge(
        bio_m,
        on=["date", enrol_state, enrol_district],
        how="left"
    )
)

monthly = monthly.fillna(0)
monthly.head()


Unnamed: 0,state,district,date,pincode_enrol,age_0_5,age_5_17,age_18_greater,total_activity_enrol,pincode_demo,demo_val_1,demo_val_2,total_activity_demo,pincode,bio_age_5_17,bio_age_17_,total_activity
0,100000,100000,2025-02-28,100000,0.0,0.0,3.0,100003.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100000,100000,2025-03-31,100000,0.0,0.0,1.0,100001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100000,100000,2025-04-30,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100000,100000,2025-05-31,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100000,100000,2025-06-30,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Temporal Growth Features (High Signal)

In [14]:
monthly = monthly.sort_values(["state", "district", "date"])

monthly["enrol_mom_growth"] = (
    monthly
    .groupby(["state", "district"])["total_activity_enrol"]
    .pct_change()
)

monthly["demo_mom_growth"] = (
    monthly
    .groupby(["state", "district"])["total_activity_demo"]
    .pct_change()
)

monthly["bio_mom_growth"] = (
    monthly
    .groupby(["state", "district"])["total_activity"]
    .pct_change()
)


# Volatility / Stability Features

In [15]:
volatility = (
    monthly
    .groupby(["state", "district"])[
        ["enrol_mom_growth", "demo_mom_growth", "bio_mom_growth"]
    ]
    .std()
    .reset_index()
)

volatility.columns = [
    "state", "district",
    "enrol_growth_volatility",
    "demo_growth_volatility",
    "bio_growth_volatility"
]

volatility.head()


Unnamed: 0,state,district,enrol_growth_volatility,demo_growth_volatility,bio_growth_volatility
0,100000,100000,,,
1,ANDAMAN & NICOBAR ISLANDS,ANDAMANS,,,0.416665
2,ANDAMAN & NICOBAR ISLANDS,NICOBARS,,,
3,ANDAMAN & NICOBAR ISLANDS,SOUTH ANDAMAN,,,1.455405
4,ANDAMAN AND NICOBAR ISLANDS,NICOBAR,,,0.442241


# Intensity & Lifecycle Ratios (Time-Averaged)

In [16]:
intensity = (
    monthly
    .groupby(["state", "district"])[
        ["total_activity_enrol", "total_activity_demo", "total_activity"]
    ]
    .mean()
    .reset_index()
)

intensity["demo_to_enrol_intensity"] = (
    intensity["total_activity_demo"] /
    intensity["total_activity_enrol"].replace(0, np.nan)
)

intensity["bio_to_enrol_intensity"] = (
    intensity["total_activity"] /
    intensity["total_activity_enrol"].replace(0, np.nan)
)

intensity = intensity.replace([np.inf, -np.inf], np.nan).fillna(0)
intensity.head()


Unnamed: 0,state,district,total_activity_enrol,total_activity_demo,total_activity,demo_to_enrol_intensity,bio_to_enrol_intensity
0,100000,100000,54546.363636,0.0,0.0,0.0,0.0
1,ANDAMAN & NICOBAR ISLANDS,ANDAMANS,682096.833333,558088.5,1612322.0,0.818195,2.363773
2,ANDAMAN & NICOBAR ISLANDS,NICOBARS,744302.0,0.0,0.0,0.0,0.0
3,ANDAMAN & NICOBAR ISLANDS,SOUTH ANDAMAN,682131.25,434070.5,682114.6,0.636345,0.999976
4,ANDAMAN AND NICOBAR ISLANDS,NICOBAR,682279.333333,372152.5,1736796.0,0.545455,2.54558


# Final Feature Matrix

In [17]:
geo_features = (
    intensity
    .merge(volatility, on=["state", "district"], how="left")
)

geo_features.head()


Unnamed: 0,state,district,total_activity_enrol,total_activity_demo,total_activity,demo_to_enrol_intensity,bio_to_enrol_intensity,enrol_growth_volatility,demo_growth_volatility,bio_growth_volatility
0,100000,100000,54546.363636,0.0,0.0,0.0,0.0,,,
1,ANDAMAN & NICOBAR ISLANDS,ANDAMANS,682096.833333,558088.5,1612322.0,0.818195,2.363773,,,0.416665
2,ANDAMAN & NICOBAR ISLANDS,NICOBARS,744302.0,0.0,0.0,0.0,0.0,,,
3,ANDAMAN & NICOBAR ISLANDS,SOUTH ANDAMAN,682131.25,434070.5,682114.6,0.636345,0.999976,,,1.455405
4,ANDAMAN AND NICOBAR ISLANDS,NICOBAR,682279.333333,372152.5,1736796.0,0.545455,2.54558,,,0.442241


# Save Engineered Features

In [18]:
monthly.to_csv("/content/geo_time_features.csv", index=False)
geo_features.to_csv("/content/geo_final_features.csv", index=False)
