# Propensity Score Matching

**Author:** Daniel Cavalli <br>
**Last Update:** 2024-11-08

In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

In [34]:
def prepare_data_for_psm(df):
    """
    Prepare data by handling missing values and creating features

    Climate columns are grouped by a keyword contained in their name
    ('precipitacao', 'pressao', 'temperatura', 'umidade', 'vento'). Missing
    values in those columns are replaced by the column mean, and one row-wise
    average per group is appended ('avg_precip', 'avg_pressao', 'avg_temp',
    'avg_umidade', 'avg_vento').

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with imputed climate columns plus the five average
        columns. A group with no matching column gets an all-NaN average.
    """
    # Output feature name -> keyword identifying the source columns
    climate_groups = {
        'avg_precip': 'precipitacao',
        'avg_pressao': 'pressao',
        'avg_temp': 'temperatura',
        'avg_umidade': 'umidade',
        'avg_vento': 'vento',
    }

    # Work on a copy so the caller's dataframe is left untouched
    df_clean = df.copy()

    for avg_name, keyword in climate_groups.items():
        cols = [col for col in df.columns if keyword in col]

        if cols:  # Nothing to impute when the group has no columns
            group = df_clean[cols]
            # Column-mean imputation done in pandas: a column that is entirely
            # missing has no mean and is simply left as NaN, rather than being
            # dropped by the imputer and breaking the assignment back into
            # the dataframe.
            df_clean[cols] = group.fillna(group.mean()).astype(float)

        # Row-wise average over the group (NaN values are skipped)
        df_clean[avg_name] = df_clean[cols].mean(axis=1)

    return df_clean

In [35]:
def get_treatment_events(df):
    """
    Identify all treatment events (when a region first receives treatment)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'id_microrregiao', 'ano' and 'treatment'.

    Returns
    -------
    pandas.DataFrame
        One row per treated region with the columns 'region' and
        'treatment_year' (the smallest 'ano' at which 'treatment' equals 1).
        Regions appear in the order they first occur among the treated rows.
        The two columns are present even when no region is treated.
    """
    # Keep only treated observations and the two columns that are needed
    treated = df.loc[df['treatment'] == 1, ['id_microrregiao', 'ano']]

    # One pass over the treated rows instead of re-filtering the whole frame
    # for every region; sort=False keeps the order of first appearance.
    first_years = treated.groupby('id_microrregiao', sort=False)['ano'].min()

    return (
        first_years
        .rename('treatment_year')
        .rename_axis('region')
        .reset_index()
    )

In [36]:
def match_for_treatment_event(df, treated_region, treatment_year, features, caliper=0.2, n_matches=5):
    """
    Perform matching for a single treatment event

    A logistic regression on the standardized `features` separates the single
    treated observation (region `treated_region` in year `treatment_year`)
    from all observations of other regions in earlier years. Controls are
    ranked by absolute difference in propensity score to the treated
    observation, filtered by a caliper, and the closest ones are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'id_microrregiao', 'ano' and every column in `features`.
    treated_region
        Value of 'id_microrregiao' identifying the treated region.
    treatment_year
        Year ('ano') in which the region is treated.
    features : list of str
        Columns used to estimate the propensity score.
    caliper : float, default 0.2
        Maximum allowed score distance, expressed as a multiple of the
        standard deviation of the propensity scores in the matching sample.
    n_matches : int, default 5
        Maximum number of controls kept for the treated observation.

    Returns
    -------
    pandas.DataFrame or None
        The treated observation followed by its matched controls, with the
        extra columns 'current_treatment', 'propensity_score', 'distance'
        (controls only), 'treated_region' and 'treatment_year'. None when
        there is no treated observation or no potential control. The result
        may contain only the treated row if no control passes the caliper.
    """
    # Get data up to treatment year
    df_period = df[df['ano'] <= treatment_year].copy()

    # Create treatment indicator for this specific matching
    in_treated_region = df_period['id_microrregiao'] == treated_region
    in_treatment_year = df_period['ano'] == treatment_year
    df_period['current_treatment'] = (in_treated_region & in_treatment_year).astype(int)

    # Get treated observation
    treated_obs = df_period[df_period['current_treatment'] == 1]

    # Get potential controls (all observations from other regions before treatment year)
    # NOTE(review): controls are not filtered on any treatment status column,
    # so observations of regions treated earlier can be selected — confirm
    # that this is intended.
    potential_controls = df_period[
        ~in_treated_region &
        (df_period['ano'] < treatment_year)
    ]

    if treated_obs.empty or potential_controls.empty:
        return None

    # Prepare data for propensity score calculation
    matching_data = pd.concat([treated_obs, potential_controls])

    # Standardize features
    scaler = StandardScaler()
    X = scaler.fit_transform(matching_data[features])

    # Calculate propensity scores
    model = LogisticRegression(random_state=42)
    model.fit(X, matching_data['current_treatment'])
    matching_data['propensity_score'] = model.predict_proba(X)[:, 1]

    # Take the treated row from the scored data so that its propensity score
    # is carried into the result instead of being left missing
    is_treated = matching_data['current_treatment'] == 1
    scored_treated = matching_data[is_treated]
    treated_score = scored_treated['propensity_score'].iloc[0]

    # Calculate distances for all potential controls
    candidates = matching_data[~is_treated].copy()
    candidates['distance'] = (candidates['propensity_score'] - treated_score).abs()

    # Apply caliper
    caliper_threshold = caliper * np.std(matching_data['propensity_score'])
    candidates = candidates[candidates['distance'] <= caliper_threshold]

    # Select matches (can select multiple controls per treated unit)
    matches = candidates.nsmallest(n=n_matches, columns='distance')

    # Combine treated and matched controls
    matched_data = pd.concat([scored_treated, matches])
    matched_data['treated_region'] = treated_region
    matched_data['treatment_year'] = treatment_year

    return matched_data

In [37]:
def perform_dynamic_psm(df):
    """
    Perform PSM analysis allowing for dynamic treatment timing and multiple control matches

    The data is prepared with `prepare_data_for_psm`, the treatment events
    are listed with `get_treatment_events`, and `match_for_treatment_event`
    is run once per event on the five averaged climate features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input data passed to `prepare_data_for_psm`.

    Returns
    -------
    pandas.DataFrame or None
        Concatenation (with a fresh index) of the matched data of every
        event that produced a result, or None when no event did.
    """
    # Prepare data
    df_clean = prepare_data_for_psm(df)

    # Get list of treatment events
    treatment_events = get_treatment_events(df_clean)

    # Features for matching
    features = [
        'avg_precip',
        'avg_pressao',
        'avg_temp',
        'avg_umidade',
        'avg_vento'
    ]

    # Perform matching for each treatment event.
    # itertuples keeps every column's own dtype; iterrows packs each row into
    # a single Series, which can turn an integer region id into a float when
    # the year column is floating point.
    all_matches = []

    for event in treatment_events.itertuples(index=False):
        matches = match_for_treatment_event(
            df_clean,
            event.region,
            event.treatment_year,
            features
        )

        if matches is not None:
            all_matches.append(matches)

    # Combine all matches
    if not all_matches:
        return None

    return pd.concat(all_matches, ignore_index=True)

In [None]:
def perform_psm_matching(df):
    """
    Perform PSM matching for DiD analysis

    Runs the same matching pipeline as `perform_dynamic_psm` and tags every
    row with the treatment event it belongs to.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input data passed to `perform_dynamic_psm`.

    Returns
    -------
    pandas.DataFrame or None
        Matched data with an extra 'match_id' column built as
        '<treated_region>_<treatment_year>', or None when no event produced
        a match.
    """
    # Reuse the shared pipeline instead of duplicating it here
    matched_df = perform_dynamic_psm(df)

    if matched_df is None:
        return None

    # Add matching information
    matched_df['match_id'] = (matched_df['treated_region'].astype(str) + '_' +
                              matched_df['treatment_year'].astype(str))

    return matched_df

In [39]:
# Load the input CSV
df = pd.read_csv('../data/PAM_MET_pivoted.csv')

# Run the matching with dynamic treatment timing
matched_df = perform_dynamic_psm(df)

if matched_df is None:
    # Nothing was matched, so there is nothing to summarise or save
    print("No valid matches found.")
else:
    # Estimate the effect of each treatment event from the matched sample
    effects_df = calculate_treatment_effects(matched_df)

    # Report headline figures
    print("\nMatching Summary:")
    print(f"Total treatment events analyzed: {len(effects_df)}")
    print(f"Average number of controls per treatment: {effects_df['n_controls'].mean():.2f}")
    print(f"Average treatment effect: {effects_df['effect'].mean():.2f}")

    # Persist both tables next to the notebook
    matched_df.to_csv('matched_data.csv', index=False)
    effects_df.to_csv('treatment_effects.csv', index=False)


Matching Summary:
Total treatment events analyzed: 370
Average number of controls per treatment: 2.63
Average treatment effect: -22328.08


In [33]:
# Print a short summary of the matched sample.
# NOTE(review): `treated_production` and `control_production` are not assigned
# in any of the cells visible here — presumably they come from a cell that is
# not shown; confirm before re-running this cell.
# NOTE(review): len(matched_df) // 2 is the number of pairs only under 1:1
# matching, while the matching function keeps up to 5 controls per treated
# unit — confirm that this figure is still meaningful.
print(f"Number of matched pairs: {len(matched_df) // 2}")
print(f"Average production value (treated): {treated_production:.2f}")
print(f"Average production value (control): {control_production:.2f}")

Number of matched pairs: 336
Average production value (treated): 196766.20
Average production value (control): 271633.04
