# Propensity Score Matching

**Author:** Daniel Cavalli <br>
**Last Update:** 2024-11-08

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

In [39]:
def prepare_data_for_psm(df):
    """
    Enhanced data preparation with additional features and better handling of outliers
    """
    # Create lists of column names by type
    precip_cols = [col for col in df.columns if 'precipitacao' in col]
    pressao_cols = [col for col in df.columns if 'pressao' in col]
    temp_cols = [col for col in df.columns if 'temp' in col]
    umidade_cols = [col for col in df.columns if 'umidade' in col]
    vento_cols = [col for col in df.columns if 'vento' in col]
    
    # Create copy of dataframe to prevent unintended changes
    df_clean = df.copy()
    
    # Calculate pre-treatment averages for production
    df_clean['avg_pre_production'] = df_clean.groupby('id_microrregiao')['total_valor_producao'].transform(
        lambda x: x.expanding().mean()
    )
    
    # Create imputer with median strategy (more robust to outliers)
    imputer = SimpleImputer(strategy='median')
    
    # Impute missing values for each group of climate variables
    for cols in [precip_cols, pressao_cols, temp_cols, umidade_cols, vento_cols]:
        if cols:  # Check if list is not empty
            df_clean[cols] = imputer.fit_transform(df_clean[cols])
    
    # Calculate yearly averages with outlier capping
    for prefix, cols in [
        ('avg_precip', precip_cols),
        ('avg_pressao', pressao_cols),
        ('avg_temp', temp_cols),
        ('avg_umidade', umidade_cols),
        ('avg_vento', vento_cols)
    ]:
        if cols:
            # Calculate mean
            df_clean[prefix] = df_clean[cols].mean(axis=1)
            
            # Cap outliers at 1st and 99th percentiles
            lower = np.percentile(df_clean[prefix], 1)
            upper = np.percentile(df_clean[prefix], 99)
            df_clean[prefix] = df_clean[prefix].clip(lower, upper)
    
    # Add year fixed effects
    df_clean['year_effect'] = df_clean.groupby('ano')['total_valor_producao'].transform('mean')
    
    # Add region-specific trends
    df_clean['region_trend'] = df_clean.groupby('id_microrregiao')['total_valor_producao'].transform(
        lambda x: np.arange(len(x))
    )
    
    df_clean.dropna(inplace=True)
    return df_clean

In [40]:
def get_treatment_events(df):
    """
    Identify all treatment events (when a region first receives treatment)
    """
    treatment_events = []
    
    # Get all treated regions
    treated_regions = df[df['treatment'] == 1]['id_microrregiao'].unique()
    
    for region in treated_regions:
        region_data = df[df['id_microrregiao'] == region].sort_values('ano')
        # Find first year of treatment
        treatment_year = region_data[region_data['treatment'] == 1]['ano'].min()
        
        treatment_events.append({
            'region': region,
            'treatment_year': treatment_year
        })
    
    return pd.DataFrame(treatment_events)

In [None]:
def match_for_treatment_event(df, treated_region, treatment_year, features, caliper=0.2):  # Reduced caliper
    """
    Enhanced matching with stricter criteria and exact matching on year
    """
    # Get data up to treatment year
    df_period = df[df['ano'] <= treatment_year].copy()
    
    # Create treatment indicator for this specific matching
    df_period['current_treatment'] = ((df_period['id_microrregiao'] == treated_region) & 
                                    (df_period['ano'] == treatment_year)).astype(int)
    
    # Get potential controls from the same year or one year before/after
    potential_controls = df_period[
        (df_period['id_microrregiao'] != treated_region) & 
        (df_period['ano'].between(treatment_year - 4, treatment_year)) &
        (df_period['treatment'] == 0)
    ]
    
    # Get treated observation
    treated_obs = df_period[df_period['current_treatment'] == 1]
    
    if len(treated_obs) == 0 or len(potential_controls) == 0:
        return None
    
    # Prepare data for propensity score calculation
    matching_data = pd.concat([treated_obs, potential_controls])
    
    # Standardize features
    scaler = StandardScaler()
    X = scaler.fit_transform(matching_data[features])
    
    # Calculate propensity scores with balanced class weights
    model = LogisticRegression(random_state=42, class_weight='balanced')
    model.fit(X, matching_data['current_treatment'])
    propensity_scores = model.predict_proba(X)[:, 1]
    
    # Add propensity scores to the data
    matching_data['propensity_score'] = propensity_scores
    
    # Get treated unit's propensity score
    treated_score = matching_data[matching_data['current_treatment'] == 1]['propensity_score'].iloc[0]
    
    # Calculate distances for all potential controls
    potential_controls = matching_data[matching_data['current_treatment'] == 0].copy()
    potential_controls['distance'] = abs(potential_controls['propensity_score'] - treated_score)
    
    # Apply stricter caliper
    caliper_threshold = caliper * np.std(matching_data['propensity_score'])
    potential_controls = potential_controls[potential_controls['distance'] <= caliper_threshold]
    
    # Select fewer, better matches
    matches = potential_controls.nsmallest(n=7, columns='distance')  # Reduced from 5 to 3
    
    if len(matches) == 0:
        return None
        
    # Add matching information
    matches['treated_region'] = treated_region
    matches['treatment_year'] = treatment_year
    matches['matched_control'] = 1
    
    treated_obs['treated_region'] = treated_region
    treated_obs['treatment_year'] = treatment_year
    treated_obs['matched_control'] = 0
    
    # Combine treated and matched controls
    matched_data = pd.concat([treated_obs, matches])
    
    return matched_data

In [42]:
def perform_psm_matching(df):
    """
    Enhanced PSM matching with additional features
    """
    # Preparing data
    df_clean = prepare_data_for_psm(df)
    
    # Features for matching
    features = [
        'avg_precip', 
        'avg_pressao', 
        'avg_temp', 
        'avg_umidade', 
        'avg_vento',
        'avg_pre_production',
        'year_effect',
        'region_trend'
    ]
    
    # Treatment events
    treatment_events = get_treatment_events(df_clean)
    
    # Performing matching
    all_matches = []
    for _, event in treatment_events.iterrows():
        matches = match_for_treatment_event(
            df_clean,
            event['region'],
            event['treatment_year'],
            features
        )
        if matches is not None:
            all_matches.append(matches)
    
    if all_matches:
        matched_df = pd.concat(all_matches, ignore_index=True)
        matched_df['match_id'] = (matched_df['treated_region'].astype(str) + '_' + 
                                 matched_df['treatment_year'].astype(str))
        return matched_df
    else:
        return None

In [None]:
# Read data
df = pd.read_csv('../data/csv/PAM_MET_PROD_pivoted.csv')

# Perform matching
matched_df = perform_psm_matching(df)

if matched_df is not None:
    print("\nMatching Summary:")
    print(f"Total treatment events: {matched_df['treated_region'].nunique()}")
    print(f"Total matched pairs: {len(matched_df[matched_df['matched_control'] == 1])}")
    
    # Save matched dataset for DiD analysis
    matched_df.to_csv('../data/matched_data_for_did.csv', index=False)
    
    # Print matching structure
    print("\nMatching Structure:")
    matching_summary = matched_df.groupby('match_id').agg({
        'matched_control': 'sum',
        'id_microrregiao': 'count'
    }).reset_index()
    print(matching_summary.head())
else:
    print("No valid matches found.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treated_obs['treated_region'] = treated_region
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treated_obs['treatment_year'] = treatment_year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treated_obs['matched_control'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using


Matching Summary:
Total treatment events: 197
Total matched pairs: 485

Matching Structure:
     match_id  matched_control  id_microrregiao
0  11001_2008                6                7
1  11003_2009                2                3
2  11006_2009                3                4
3  11007_2009                2                3
4  12002_2010                1                2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treated_obs['treated_region'] = treated_region
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treated_obs['treatment_year'] = treatment_year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treated_obs['matched_control'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [20]:
treated_production = matched_df[matched_df['treatment'] == 1]['total_valor_producao'].mean()
control_production = matched_df[matched_df['treatment'] == 0]['total_valor_producao'].mean()

print(f"Number of matched pairs: {len(matched_df) // 2}")
print(f"Average production value (treated): {treated_production:.4f}")
print(f"Average production value (control): {control_production:.4f}")

Number of matched pairs: 113
Average production value (treated): 0.9867
Average production value (control): 0.9631


In [17]:
matched_df[matched_df['treatment'] == 1]

Unnamed: 0,id_microrregiao,ano,total_valor_producao,id_microrregiao_1,year,avg_precipitacao_jan,avg_precipitacao_feb,avg_precipitacao_mar,avg_precipitacao_apr,avg_precipitacao_may,...,avg_vento,year_effect,region_trend,current_treatment,treated_region,treatment_year,matched_control,propensity_score,distance,match_id
0,11006,2009,1.000000,11006,2009,0.443935,0.562985,0.351613,0.355833,0.047376,...,1.323533,0.980141,1,1,11006,2009,0,,,11006_2009
3,11007,2009,1.000000,11007,2009,0.336474,0.381194,0.404301,0.224722,0.191935,...,1.803722,0.980141,1,1,11007,2009,0,,,11007_2009
7,12005,2010,1.000000,12005,2010,0.688926,0.419174,0.262857,0.208181,0.034677,...,1.063206,0.990764,1,1,12005,2010,0,,,12005_2010
10,13004,2013,0.857143,13004,2013,0.325510,0.483821,0.565594,0.263504,0.327014,...,0.507789,0.985307,1,1,13004,2013,0,,,13004_2013
12,13011,2009,1.000000,13011,2009,0.218182,0.431970,0.388537,0.426202,0.209942,...,1.354927,0.980141,1,1,13011,2009,0,,,13011_2009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212,52003,2008,1.000000,52003,2008,0.514516,0.278902,0.344461,0.287222,0.005384,...,1.132927,0.984947,1,1,52003,2008,0,,,52003_2008
214,52005,2008,1.000000,52005,2008,0.307796,0.330058,0.315470,0.288472,0.002153,...,2.007400,0.984947,1,1,52005,2008,0,,,52005_2008
216,52014,2009,1.000000,52014,2009,0.258950,0.349851,0.277957,0.188611,0.024462,...,1.392400,0.980141,1,1,52014,2009,0,,,52014_2009
220,52016,2008,1.000000,52016,2008,0.400000,0.513873,0.419822,0.293889,0.007537,...,1.277346,0.984947,1,1,52016,2008,0,,,52016_2008


In [19]:
un_treated_production = df[df['treatment'] == 1]['total_valor_producao'].mean()
un_control_production = df[df['treatment'] == 0]['total_valor_producao'].mean()

print(f"Number of matched pairs: {len(df) // 2}")
print(f"Average production value (treated): {un_treated_production:.4f}")
print(f"Average production value (control): {un_control_production:.4f}")

Number of matched pairs: 2978
Average production value (treated): 0.9856
Average production value (control): 0.9923
