# 📈 Predicting Project Outcomes from Evaluation Text

In this third notebook, I implement a machine learning model using textual data extracted from project evaluations as predictors. The main objectives are:

1. To compare the predictive performance of different sections of the evaluation text in estimating project outcome ratings.
2. To identify the most relevant words or phrases (n-grams) that contribute to the model's predictions.

## 1. Setup and Data Loading

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import re
import string
import optuna
import numpy as np
import pickle
import seaborn as sns
import requests
import pdfplumber 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from mord import OrdinalRidge  
from sklearn.model_selection import train_test_split
from io import BytesIO
from tqdm import tqdm

# determine working directories (notebook CWD)
script_dir = os.getcwd()
parent_dir = os.path.dirname(script_dir)
data_dir = parent_dir + '\\1_Data'

# Read data
excel_file = os.path.join(data_dir, 'IEG_ICRR_PPAR_Ratings_2025-03-12.xlsx')
df = pd.read_excel(excel_file)

# Output directory:
output = os.path.join(os.path.dirname(script_dir), '3_Outputs')
output = "C:\\Users\\wb592581\\Documents\\"
data_dir = r"C:\Users\wb592581\Documents"

# Filter the dataset to include only projects approved between 2015 to 2025 that are mapped to the Human Development Practice Group. 
df2 = df[
    (df['Approval FY'] >= 2015) &
    (df['Approval FY'] <= 2025) &
    (df['Practice Group'] == 'HD')
]

## 2. Text Extraction

In [3]:
# Extract full text from links
tqdm.pandas()

def extract_text_from_pdf(url):
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            text = ''.join(page.extract_text() or '' for page in pdf.pages)
        return text.strip()
    except Exception as e:
        return f"[ERROR] {e}"

df2['Full Text'] = df2['PDF'].progress_apply(extract_text_from_pdf)

100%|██████████| 328/328 [28:09<00:00,  5.15s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Full Text'] = df2['PDF'].progress_apply(extract_text_from_pdf)


In [4]:
# ----------------------------
# Function to extract sections from full project evaluations
# ----------------------------
def extract_section(text, start_pattern, end_pattern, label):
    # Compile regex patterns (case insensitive)
    start_regex = re.compile(rf"{start_pattern}", re.IGNORECASE)
    end_regex = re.compile(rf"{end_pattern}", re.IGNORECASE)

    # Split text into lines
    lines = text.splitlines()
    start_idx = end_idx = None

    # Identify the index range between start and end patterns
    for i, line in enumerate(lines):
        if start_idx is None and start_regex.search(line.strip()):
            start_idx = i
        elif start_idx is not None and end_regex.search(line.strip()):
            end_idx = i
            break

    # Return the section if both indices are found and valid
    if start_idx is not None and end_idx is not None and end_idx > start_idx:
        return "\n".join(lines[start_idx:end_idx+1])
    else:
        return ""

# ----------------------------
# Apply section extraction to each row
# ----------------------------
def extract_sections(row):
    text = row['Full Text']
    me_section = extract_section(
        text,
        start_pattern=r"\bM&E\s+Design.*Utilization\b", 
        end_pattern=r"\bM&E\s+Quality\s+Rating\b", 
        label="M&E Section"
    )
    me_design = extract_section(
        text,
        start_pattern=r"\bM&E\s+Design\b", 
        end_pattern=r"\bM&E\s+Implementation\b", 
        label="M&E: Design"
    )
    me_implementation = extract_section(
        text,
        start_pattern=r"\bM&E\s+Implementation\b", 
        end_pattern=r"\bM&E\s+Utilization\b", 
        label="M&E: Implementation"
    )
    me_utilization = extract_section(
        text,
        start_pattern=r"\bM&E\s+Utilization\b", 
        end_pattern=r"\bM&E\s+Quality\s+Rating\b", 
        label="M&E: Utilization"
    )
    outcome_section = extract_section(
        text,
        start_pattern=r'[567][\.\)]?\s*Outcome\b.*',
        end_pattern=r'(IEG\s+)?Outcome\s+Rating\b.*',
        label="Outcome Section"
    )
    return pd.Series({
        'M&E Section': me_section,
        'M&E: Design': me_design,
        'M&E: Implementation': me_implementation,
        'M&E: Utilization': me_utilization,
        'Outcome_section': outcome_section
    })

# Apply extraction to the DataFrame
df2[['M&E Section', 'M&E: Design', 'M&E: Implementation', 'M&E: Utilization', 'Outcome_section']] = df2.apply(extract_sections, axis=1)

# ----------------------------
# Convert outcome ratings to numeric scores
# ----------------------------
rating_mapping = {
    'Highly Unsatisfactory': 1, 'Unsatisfactory': 2, 'Moderately Unsatisfactory': 3,
    'Moderately Satisfactory': 4, 'Satisfactory': 5, 'Highly Satisfactory': 6
}
df2['outcome_score'] = df2['IEG Outcome Ratings'].map(rating_mapping)
df2.drop(columns=['IEG Outcome Ratings'], inplace=True, errors='ignore')

# ----------------------------
# Custom words to remove from text
# ----------------------------
custom_stopwords = set([
    "pbf", "ndc", "eur", "rating", "icr", "outcome",
    "unsatisfactory", "satisfactory", "moderately", "highly",
    "monitoring", "design", "implementation", "utilization",
    "lesson", "evaluation", "modest", "bme", "cme", "dlis", "high", 
    "efficiency", "negligible", "substantial", "lessons", "ppmp", "piu",
    "indicators", "independent", "group"
])

# ----------------------------
# Function to clean text for modeling
# ----------------------------
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize spaces

    # Remove full-word stopwords
    pattern = r'\b(?:' + '|'.join(re.escape(word) for word in custom_stopwords) + r')\b'
    text = re.sub(pattern, '', text)

    # Final cleanup
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# ----------------------------
# Apply text cleaning to selected text fields
# ----------------------------
text_vars = ['Full Text', 'M&E Section', 'M&E: Design', 'M&E: Implementation', 'M&E: Utilization', 'Lesson']
df2_cleaned = df2.copy()
for var in text_vars:
    df2_cleaned[var] = df2_cleaned[var].fillna("").apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[['M&E Section', 'M&E: Design', 'M&E: Implementation', 'M&E: Utilization', 'Outcome_section']] = df2.apply(extract_sections, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[['M&E Section', 'M&E: Design', 'M&E: Implementation', 'M&E: Utilization', 'Outcome_section']] = df2.apply(extract_sections, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-doc

## 3. ML Model

In [None]:
# Dictionary to store results
results = {}
N_TRIALS = 15  # Number of optimization trials

# Loop over each text variable
for var in text_vars:
    print(f"\nProcessing text variable: {var}")
    
    # TF-IDF vectorization with 1- to 3-grams
    tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=5000, stop_words='english')
    X_tfidf = tfidf.fit_transform(df2_cleaned[var])
    y = df2_cleaned["outcome_score"].astype(int)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

    model_results = {}

    # ------------------ LightGBM ------------------
    def objective_lgbm(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.2, log=True)
        }
        model = LGBMRegressor(**params)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        return mean_squared_error(y_test, preds, squared=False)  # RMSE

    study_lgbm = optuna.create_study(direction='minimize')
    study_lgbm.optimize(objective_lgbm, n_trials=N_TRIALS)

    best_lgbm = LGBMRegressor(**study_lgbm.best_params)
    best_lgbm.fit(X_train, y_train)
    model_results['lgbm'] = (best_lgbm, tfidf)

    # ------------------ Random Forest ------------------
    def objective_rf(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 15)
        }
        model = RandomForestRegressor(**params)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        return mean_squared_error(y_test, preds, squared=False)  # RMSE

    study_rf = optuna.create_study(direction='minimize')
    study_rf.optimize(objective_rf, n_trials=N_TRIALS)

    best_rf = RandomForestRegressor(**study_rf.best_params)
    best_rf.fit(X_train, y_train)
    model_results['rf'] = (best_rf, tfidf)

    # ------------------ Ordinal Regression ------------------
    model_ordinal = OrdinalRidge()
    model_ordinal.fit(X_train, y_train)
    model_results['ordinal'] = (model_ordinal, tfidf)

    # Save models and vectorizer to pickle files
    for model_name, (model, vec) in model_results.items():
        filename = os.path.join(data_dir, f"model_{model_name}_{var}.pkl")
        with open(filename, 'wb') as f:
            pickle.dump({'model': model, 'vectorizer': vec}, f)

    # Store results in dictionary
    results[var] = model_results



Processing text variable: Full Text


## 3. Evaluation Metrics and Variable Importance

In [None]:
metrics_dict = {'lgbm': [], 'rf': [], 'ordinal': []}
labels = ['Full Text', 'M&E Section', 'M&E: Design', 'M&E: Implementation', 'M&E: Utilization', 'Lesson']

for model_type in ['lgbm', 'rf', 'ordinal']:
    for var in text_vars:
        filename = os.path.join(data_dir, f"model_{model_type}_{var}.pkl")
        with open(filename, 'rb') as f:
            data = pickle.load(f)

        model = data['model']
        vec = data['vectorizer']

        X_all = vec.transform(df2_cleaned[var])
        y_all = df2_cleaned["outcome_score"].astype(int)
        _, X_test, _, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

        y_pred = model.predict(X_test)

        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)

        metrics_dict[model_type].append({'RMSE': rmse, 'MSE': mse, 'MAE': mae})

# Graph
for model_type in ['lgbm', 'rf', 'ordinal']:
    df_metric = pd.DataFrame(metrics_dict[model_type], index=labels)
    df_metric = df_metric[['RMSE', 'MSE', 'MAE']]  # Ordenar columnas
    df_metric.plot(kind='bar', figsize=(10, 5), title=f'{model_type.upper()} - Test Set Prediction Errors')
    plt.xticks(rotation=45)
    plt.ylabel("Error")
    plt.tight_layout()
    plt.show()

In [None]:
# Variable Importance
for model_type in ['lgbm', 'rf', 'ordinal']:
    fig, axes = plt.subplots(1, 6, figsize=(20, 8))
    fig.suptitle(f"Top 10 n-grams por modelo - {model_type.upper()}")
    
    for i, var in enumerate(text_vars):
        filename = os.path.join(dir2, f"model_{model_type}_{var}.pkl")
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        model, vec = data['model'], data['vectorizer']
        
        if model_type == 'ordinal':
            importances = np.abs(model.coef_).flatten()
        else:
            importances = model.feature_importances_

        features = np.array(vec.get_feature_names_out())
        idx = np.argsort(importances)[-10:][::-1]
        
        axes[i].barh(features[idx], importances[idx])
        axes[i].set_title(var)
        axes[i].invert_yaxis()

    plt.tight_layout()
    plt.show()
