In [1]:
import pandas as pd
# Load the synthetic social-protection dataset. The path is relative, so it
# resolves against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="../data/synthetic_social_protection.csv")

# Preview the first five rows as the cell's rich output.
df.head(n=5)

Unnamed: 0,person_id,age,gender,district,urban_rural,education_level,income_level,scheme,enrolled,benefit_amount
0,1,51,Male,Huye,Rural,Tertiary,Upper-Middle,LongTermSavings,True,1704.082096
1,2,14,Female,Ruhango,Rural,Secondary,Upper-Middle,CommunityHealth,True,18670.720206
2,3,71,Male,Gakenke,Rural,Secondary,Lower-Middle,Pension,False,0.0
3,4,60,Female,Kirehe,Rural,Primary,Low,CommunityHealth,False,0.0
4,5,20,Male,Nyamasheke,Rural,Secondary,Upper-Middle,Pension,True,18146.590625


In [2]:
# Headline figures: number of rows and the mean of the `enrolled` column
# (the mean of a boolean column is the fraction of True values).
n_people = len(df.index)
enroll_rate = df["enrolled"].mean()
print(f"People: {n_people:,} | Enrollment rate: {enroll_rate:.1%}")

# Ten districts with the highest mean of `enrolled`, in descending order.
# The bare expression is last so the notebook displays it.
(
    df.groupby("district")["enrolled"]
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

People: 100,000 | Enrollment rate: 47.1%


district
Nyagatare     0.493527
Bugesera      0.485252
Muhanga       0.484343
Gakenke       0.482415
Burera        0.480798
Gicumbi       0.480000
Nyarugenge    0.478792
Kicukiro      0.478504
Gisagara      0.478001
Musanze       0.475714
Name: enrolled, dtype: float64

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
import pandas as pd

# Baseline classifier: predict `enrolled` from demographic and scheme columns.
feats = [
    "age",
    "gender",
    "urban_rural",
    "education_level",
    "income_level",
    "scheme",
]

# One-hot encode the selected columns; drop_first=True removes the first
# level of each encoded column so the dummies are not perfectly collinear.
X = pd.get_dummies(df[feats], drop_first=True)
# Cast the target to integers for the classifier and the metrics below.
y = df["enrolled"].astype(int)

# 75/25 split, stratified on the target so both parts keep the same class
# balance; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the second class in
# model.classes_ (class 1 here, since y is cast to int above).
probs = model.predict_proba(X_test)[:, 1]

# Threshold-free ranking quality first, then per-class metrics at a 0.5 cut-off.
print("ROC AUC:", round(roc_auc_score(y_test, probs), 3))
y_pred = (probs >= 0.5).astype(int)
print(classification_report(y_test, y_pred))

ROC AUC: 0.578
              precision    recall  f1-score   support

           0       0.57      0.74      0.65     13220
           1       0.57      0.38      0.46     11780

    accuracy                           0.57     25000
   macro avg       0.57      0.56      0.55     25000
weighted avg       0.57      0.57      0.56     25000

