In [9]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Seed both the stdlib and NumPy global RNGs; the same value is reused as the
# random_state of the train/test split further down.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Input dataset (read below) and the destinations of the train/test CSVs.
CSV_PATH = "education_policies100_cleaned.csv"
TRAIN_PATH = "train_policies.csv"
TEST_PATH = "test_policies.csv"

def generate_synthetic_policies(n=500, seed=None):
    """Build a DataFrame of ``n`` randomly generated education-policy records.

    Args:
        n: Number of records to generate. Values <= 0 yield an empty frame
            that still carries the full column schema.
        seed: Optional seed for a private ``random.Random`` instance. When
            ``None`` (the default) the global ``random`` module state is used,
            so a prior ``random.seed(...)`` call keeps controlling the output.

    Returns:
        pandas.DataFrame with one row per policy and the columns listed in
        ``columns`` below, in that order.
    """
    # A private generator keeps seeded calls from disturbing the global RNG.
    rng = random if seed is None else random.Random(seed)

    columns = [
        "policy_id", "title", "sector", "region", "year", "target_group",
        "status", "funding_million_usd", "stakeholders", "impact_score",
        "summary", "goals", "full_text",
    ]

    states = ["Karnataka","Maharashtra","Tamil Nadu","Uttar Pradesh","Delhi","Kerala","West Bengal","Gujarat","Rajasthan","Punjab"]
    sectors = ["Primary","Secondary","Higher Education","Vocational","Early Childhood"]
    target_groups = ["Students","Teachers","Rural Students","Urban Students","Women","Disadvantaged Groups","All"]
    statuses = ["Proposed","Implemented","Under Review","Pilot"]
    years = list(range(2015, 2026))
    stakeholders_list = [
        "Ministry of Education, Local NGOs",
        "State Education Department, Private Partners",
        "Teachers' Unions, Community Leaders",
        "Central Government, Donors",
        "EdTech Companies, Universities"
    ]
    # (low, high) funding bands; one band is picked, then a value inside it.
    funding_ranges = [(0.5,5),(5,20),(20,100),(0.1,0.5)]
    aspects = ["learning outcomes","infrastructure","teacher quality","digital access","early childhood development","vocational skills"]
    interventions = ["grants to schools","teacher training programs","digital device distribution","curriculum reform","scholarship schemes","public-private partnerships"]

    records = []
    for i in range(1, n+1):
        policy_id = f"P{1000+i}"
        # NOTE(review): the sector named in the title is drawn independently of
        # the ``sector`` column, so the two can disagree. Left as-is because
        # changing the draw order would alter previously seeded output.
        title = f"{rng.choice(['National','State','District'])} {rng.choice(sectors)} Education Reform {rng.randint(1,99)}"
        sector = rng.choice(sectors)
        region = rng.choice(states)
        year = rng.choice(years)
        target_group = rng.choice(target_groups)
        status = rng.choice(statuses)
        funding = round(rng.uniform(*rng.choice(funding_ranges)), 2)
        stakeholders = rng.choice(stakeholders_list)
        impact_score = round(rng.uniform(0.1, 0.99), 3)
        summary = f"This policy aims to improve {rng.choice(aspects)} in {sector} through {rng.choice(interventions)} in {region}."
        goals = f"Increase reach by {rng.randint(5,40)}% in {rng.randint(1,5)} years."
        full_text = f"{summary} Goals: {goals}"

        records.append({
            "policy_id": policy_id,
            "title": title,
            "sector": sector,
            "region": region,
            "year": year,
            "target_group": target_group,
            "status": status,
            "funding_million_usd": funding,
            "stakeholders": stakeholders,
            "impact_score": impact_score,
            "summary": summary,
            "goals": goals,
            "full_text": full_text
        })

    # Passing the schema explicitly keeps the columns present when n <= 0.
    return pd.DataFrame(records, columns=columns)

def preprocess(df):
    """Return a copy of ``df`` with an added lower-cased ``text_for_nlp`` column.

    The new column is the space-joined text of whichever of the ``title`` and
    ``goals`` columns are present. Missing values contribute an empty string
    and non-string values are converted with ``str`` so that numeric cells do
    not break the join. When neither column exists every row gets ``""``.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame with the extra ``text_for_nlp`` column.
    """
    df = df.copy()
    # Only use columns that exist
    text_cols = [c for c in ["title", "goals"] if c in df.columns]

    def _as_text(value):
        # Missing cells become empty text instead of the literal "nan".
        return "" if pd.isna(value) else str(value)

    combined = pd.Series("", index=df.index, dtype=object)
    for position, col in enumerate(text_cols):
        part = df[col].map(_as_text)
        combined = part if position == 0 else combined + " " + part

    df["text_for_nlp"] = combined.str.lower()
    return df

# Load data -- the cleaned CSV is required, so stop immediately when it is absent
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"{CSV_PATH} not found. Please provide the cleaned dataset.")
df = pd.read_csv(CSV_PATH)

# Add the NLP text column, then hold out 20% of the rows for testing
df = preprocess(df)
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=RANDOM_SEED,
    shuffle=True,
)

# Persist both partitions without the pandas index column
train_df.to_csv(TRAIN_PATH, index=False)
test_df.to_csv(TEST_PATH, index=False)

print(f"Data prepared: {len(train_df)} train, {len(test_df)} test.")

Data prepared: 400 train, 100 test.


In [None]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Destinations for the fitted vectorizer and for the bundle that holds the
# TF-IDF matrix together with its DataFrame (both written at the end of this cell).
MODEL_PATH = "policy_vectorizer.pkl"
MATRIX_PATH = "policy_tfidf_matrix.pkl"

# Load the cleaned dataset (no summary, stakeholders, etc.)
full_df = pd.read_csv("education_policies100_cleaned.csv")

def preprocess(df):
    """Return a copy of ``df`` with an added lower-cased ``text_for_nlp`` column.

    The new column is the space-joined text of whichever of the ``title`` and
    ``goals`` columns are present. Missing values contribute an empty string
    and non-string values are converted with ``str`` so that numeric cells do
    not break the join. When neither column exists every row gets ``""``.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame with the extra ``text_for_nlp`` column.
    """
    df = df.copy()
    # Only use columns that exist
    text_cols = [c for c in ["title", "goals"] if c in df.columns]

    def _as_text(value):
        # Missing cells become empty text instead of the literal "nan".
        return "" if pd.isna(value) else str(value)

    combined = pd.Series("", index=df.index, dtype=object)
    for position, col in enumerate(text_cols):
        part = df[col].map(_as_text)
        combined = part if position == 0 else combined + " " + part

    df["text_for_nlp"] = combined.str.lower()
    return df

full_df = preprocess(full_df)

# Train TF-IDF over unigrams and bigrams, keeping at most 5000 features
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

# fit_transform learns the vocabulary and vectorizes the corpus in a single
# pass instead of tokenizing it twice; toarray() stores the result densely.
tfidf_matrix = vectorizer.fit_transform(full_df["text_for_nlp"]).toarray()

# Save model + matrix (the DataFrame is bundled so row lookups stay aligned)
joblib.dump(vectorizer, MODEL_PATH)
joblib.dump({"tfidf_matrix": tfidf_matrix, "df": full_df}, MATRIX_PATH)

print(f"✅ Model trained and saved to {MODEL_PATH} and {MATRIX_PATH}")

✅ Model trained and saved to policy_vectorizer.pkl and policy_tfidf_matrix.pkl


In [11]:
import textwrap
import joblib
from sklearn.metrics.pairwise import cosine_similarity

MODEL_PATH = "policy_vectorizer.pkl"
MATRIX_PATH = "policy_tfidf_matrix.pkl"

# NOTE: joblib files are pickles -- only load artifacts produced by a trusted run.
vectorizer = joblib.load(MODEL_PATH)
data = joblib.load(MATRIX_PATH)

# Unpack the bundle: the TF-IDF matrix and the DataFrame saved alongside it
tfidf_matrix, df = data["tfidf_matrix"], data["df"]

def answer_query(query, top_k=3):
    """Print the policies most similar to a free-text ``query``.

    The query is lower-cased, vectorized with the module-level ``vectorizer``
    and compared by cosine similarity against every row of the module-level
    ``tfidf_matrix``. The best-scoring rows of the module-level ``df`` are
    printed, highest score first.

    Args:
        query: Free-text search string.
        top_k: Maximum number of policies to print. Negative values are
            treated as zero; ``None`` places no limit.

    Returns:
        None. All results are written to stdout.
    """
    query_vec = vectorizer.transform([query.lower()])
    sims = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # A negative limit would make the slice mean "all but the last k".
    limit = top_k if top_k is None else max(top_k, 0)
    ranked = sims.argsort()[::-1][:limit]
    # A score of zero means no shared vocabulary with the query, so such rows
    # are arbitrary picks rather than matches.
    hits = [idx for idx in ranked if sims[idx] > 0]

    print(f"\n🔎 Query: {query}")
    if len(ranked) and not hits:
        print("No matching policies found.")
        return
    for idx in hits:
        row = df.iloc[idx]
        # The printed "Summary" is the title plus goals, truncated to 250 chars.
        snippet = textwrap.shorten(str(row["title"]) + ". " + str(row["goals"]), width=250, placeholder="...")
        print(f"\n📌 {row['title']} ({row['policy_id']}) | Score={sims[idx]:.3f}")
        print(f"Region: {row['region']} | Year: {row['year']}")
        print(f"Summary: {snippet}")

# Example query: print the three best-matching policies for a sample phrase
answer_query("teacher training and capacity building initiatives", top_k=3)


🔎 Query: teacher training and capacity building initiatives

📌 National Secondary Education Reform 61 (P1453) | Score=0.250
Region: Kerala | Year: 2022
Summary: National Secondary Education Reform 61. Increase teacher quality score by 5% within 4 years and reduce disparities between rural and urban areas.

📌 National Primary Education Reform 60 (P1354) | Score=0.247
Region: Delhi | Year: 2025
Summary: National Primary Education Reform 60. Increase teacher quality score by 7% within 2 years and reduce disparities between rural and urban areas.

📌 District Primary Education Reform 22 (P1427) | Score=0.247
Region: Karnataka | Year: 2023
Summary: District Primary Education Reform 22. Increase teacher quality score by 5% within 3 years and reduce disparities between rural and urban areas.
