# Propensity Score Matching

**Author:** Daniel Cavalli <br>
**Last Update:** 2024-11-08

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

In [None]:
def prepare_data_for_psm(df):
    """
    Enhanced data preparation with additional features and better handling of outliers
    """
    # Create lists of column names by type
    precip_cols = [col for col in df.columns if 'precipitacao' in col]
    pressao_cols = [col for col in df.columns if 'pressao' in col]
    temp_cols = [col for col in df.columns if 'temperatura' in col]
    umidade_cols = [col for col in df.columns if 'umidade' in col]
    vento_cols = [col for col in df.columns if 'vento' in col]
    
    # Create copy of dataframe to prevent unintended changes
    df_clean = df.copy()
    
    # Calculate pre-treatment averages for production
    df_clean['avg_pre_production'] = df_clean.groupby('id_microrregiao')['total_valor_producao'].transform(
        lambda x: x.expanding().mean()
    )
    
    # Create imputer with median strategy (more robust to outliers)
    imputer = SimpleImputer(strategy='median')
    
    # Impute missing values for each group of climate variables
    for cols in [precip_cols, pressao_cols, temp_cols, umidade_cols, vento_cols]:
        if cols:  # Check if list is not empty
            df_clean[cols] = imputer.fit_transform(df_clean[cols])
    
    # Calculate yearly averages with outlier capping
    for prefix, cols in [
        ('avg_precip', precip_cols),
        ('avg_pressao', pressao_cols),
        ('avg_temp', temp_cols),
        ('avg_umidade', umidade_cols),
        ('avg_vento', vento_cols)
    ]:
        if cols:
            # Calculate mean
            df_clean[prefix] = df_clean[cols].mean(axis=1)
            
            # Cap outliers at 1st and 99th percentiles
            lower = np.percentile(df_clean[prefix], 1)
            upper = np.percentile(df_clean[prefix], 99)
            df_clean[prefix] = df_clean[prefix].clip(lower, upper)
    
    # Add year fixed effects
    df_clean['year_effect'] = df_clean.groupby('ano')['total_valor_producao'].transform('mean')
    
    # Add region-specific trends
    df_clean['region_trend'] = df_clean.groupby('id_microrregiao')['total_valor_producao'].transform(
        lambda x: np.arange(len(x))
    )
    
    df_clean.dropna(inplace=True)
    return df_clean

In [None]:
def get_treatment_events(df):
    """
    Identify all treatment events (when a region first receives treatment)
    """
    treatment_events = []
    
    # Get all treated regions
    treated_regions = df[df['treatment'] == 1]['id_microrregiao'].unique()
    
    for region in treated_regions:
        region_data = df[df['id_microrregiao'] == region].sort_values('ano')
        # Find first year of treatment
        treatment_year = region_data[region_data['treatment'] == 1]['ano'].min()
        
        treatment_events.append({
            'region': region,
            'treatment_year': treatment_year
        })
    
    return pd.DataFrame(treatment_events)

In [87]:
def match_for_treatment_event(df, treated_region, treatment_year, features, caliper=0.2):  # Reduced caliper
    """
    Enhanced matching with stricter criteria and exact matching on year
    """
    # Get data up to treatment year
    df_period = df[df['ano'] <= treatment_year].copy()
    
    # Create treatment indicator for this specific matching
    df_period['current_treatment'] = ((df_period['id_microrregiao'] == treated_region) & 
                                    (df_period['ano'] == treatment_year)).astype(int)
    
    # Get potential controls from the same year or one year before/after
    potential_controls = df_period[
        (df_period['id_microrregiao'] != treated_region) & 
        (df_period['ano'].between(treatment_year - 4, treatment_year)) &
        (df_period['treatment'] == 0)
    ]
    
    # Get treated observation
    treated_obs = df_period[df_period['current_treatment'] == 1]
    
    if len(treated_obs) == 0 or len(potential_controls) == 0:
        return None
    
    # Prepare data for propensity score calculation
    matching_data = pd.concat([treated_obs, potential_controls])
    
    # Standardize features
    scaler = StandardScaler()
    X = scaler.fit_transform(matching_data[features])
    
    # Calculate propensity scores with balanced class weights
    model = LogisticRegression(random_state=42, class_weight='balanced')
    model.fit(X, matching_data['current_treatment'])
    propensity_scores = model.predict_proba(X)[:, 1]
    
    # Add propensity scores to the data
    matching_data['propensity_score'] = propensity_scores
    
    # Get treated unit's propensity score
    treated_score = matching_data[matching_data['current_treatment'] == 1]['propensity_score'].iloc[0]
    
    # Calculate distances for all potential controls
    potential_controls = matching_data[matching_data['current_treatment'] == 0].copy()
    potential_controls['distance'] = abs(potential_controls['propensity_score'] - treated_score)
    
    # Apply stricter caliper
    caliper_threshold = caliper * np.std(matching_data['propensity_score'])
    potential_controls = potential_controls[potential_controls['distance'] <= caliper_threshold]
    
    # Select fewer, better matches
    matches = potential_controls.nsmallest(n=7, columns='distance')  # Reduced from 5 to 3
    
    if len(matches) == 0:
        return None
        
    # Add matching information
    matches['treated_region'] = treated_region
    matches['treatment_year'] = treatment_year
    matches['matched_control'] = 1
    
    treated_obs['treated_region'] = treated_region
    treated_obs['treatment_year'] = treatment_year
    treated_obs['matched_control'] = 0
    
    # Combine treated and matched controls
    matched_data = pd.concat([treated_obs, matches])
    
    return matched_data

In [88]:
def perform_psm_matching(df):
    """
    Enhanced PSM matching with additional features
    """
    # Preparing data
    df_clean = prepare_data_for_psm(df)
    
    # Features for matching
    features = [
        'avg_precip', 
        'avg_pressao', 
        'avg_temp', 
        'avg_umidade', 
        'avg_vento',
        'avg_pre_production',
        'year_effect',
        'region_trend'
    ]
    
    # Treatment events
    treatment_events = get_treatment_events(df_clean)
    
    # Performing matching
    all_matches = []
    for _, event in treatment_events.iterrows():
        matches = match_for_treatment_event(
            df_clean,
            event['region'],
            event['treatment_year'],
            features
        )
        if matches is not None:
            all_matches.append(matches)
    
    if all_matches:
        matched_df = pd.concat(all_matches, ignore_index=True)
        matched_df['match_id'] = (matched_df['treated_region'].astype(str) + '_' + 
                                 matched_df['treatment_year'].astype(str))
        return matched_df
    else:
        return None

In [93]:
# Read data
df = pd.read_csv('../data/csv/PAM_MET_PROD_pivoted.csv')

# Perform matching
matched_df = perform_psm_matching(df)

if matched_df is not None:
    print("\nMatching Summary:")
    print(f"Total treatment events: {matched_df['treated_region'].nunique()}")
    print(f"Total matched pairs: {len(matched_df[matched_df['matched_control'] == 1])}")
    
    # Save matched dataset for DiD analysis
    matched_df.to_csv('../data/matched_data_for_did.csv', index=False)
    
    # Print matching structure
    print("\nMatching Structure:")
    matching_summary = matched_df.groupby('match_id').agg({
        'matched_control': 'sum',
        'id_microrregiao': 'count'
    }).reset_index()
    print(matching_summary.head())
else:
    print("No valid matches found.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treated_obs['treated_region'] = treated_region
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treated_obs['treatment_year'] = treatment_year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treated_obs['matched_control'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using


Matching Summary:
Total treatment events: 203
Total matched pairs: 640

Matching Structure:
     match_id  matched_control  id_microrregiao
0  11001_2008                7                8
1  11003_2009                4                5
2  11006_2009                7                8
3  11007_2009                3                4
4  12004_2009                4                5


In [94]:
treated_production = matched_df[matched_df['treatment'] == 1]['total_valor_producao'].mean()
control_production = matched_df[matched_df['treatment'] == 0]['total_valor_producao'].mean()

print(f"Number of matched pairs: {len(matched_df) // 2}")
print(f"Average production value (treated): {treated_production:.2f}")
print(f"Average production value (control): {control_production:.2f}")

Number of matched pairs: 421
Average production value (treated): 2.99
Average production value (control): 3.10


In [98]:
matched_df[matched_df['treatment'] == 1]

Unnamed: 0,id_microrregiao,ano,total_valor_producao,treatment,avg_precipitacao_total_month_1,avg_precipitacao_total_month_2,avg_precipitacao_total_month_3,avg_precipitacao_total_month_4,avg_precipitacao_total_month_5,avg_precipitacao_total_month_6,...,avg_vento,year_effect,region_trend,current_treatment,treated_region,treatment_year,matched_control,propensity_score,distance,match_id
0,11001,2008,2.493684,1,0.525784,0.352975,0.464840,0.358098,0.257568,0.009065,...,1.534274,2.514360,1,1,11001,2008,0,,,11001_2008
8,11003,2009,2.028147,1,0.357790,0.649196,0.147581,0.393557,0.193540,0.115278,...,1.364851,2.405788,1,1,11003,2009,0,,,11003_2009
13,11006,2009,1.477289,1,0.443935,0.562985,0.351613,0.355833,0.047376,0.022778,...,1.323533,2.405788,1,1,11006,2009,0,,,11006_2009
21,11007,2009,1.651693,1,0.336474,0.381194,0.404301,0.224722,0.191935,0.006667,...,1.803722,2.405788,1,1,11007,2009,0,,,11007_2009
25,12004,2009,2.042874,1,0.302019,0.288525,0.387097,0.461111,0.125806,0.099167,...,1.431423,2.405788,1,1,12004,2009,0,,,12004_2009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
826,43032,2007,1.506724,1,0.221526,0.354858,0.295007,0.108762,0.103903,0.239233,...,4.090408,2.094058,1,1,43032,2007,0,,,43032_2007
828,50001,2007,1.953165,1,0.331152,0.235593,0.055640,0.060779,0.110027,0.000278,...,2.145307,2.094058,1,1,50001,2007,0,,,50001_2007
831,50002,2007,2.238757,1,0.243876,0.247605,0.128129,0.117385,0.110961,0.000278,...,1.477211,2.094058,1,1,50002,2007,0,,,50002_2007
833,50003,2007,1.244832,1,0.329926,0.280807,0.141184,0.087900,0.074014,0.002225,...,1.965625,2.094058,1,1,50003,2007,0,,,50003_2007


In [95]:
un_treated_production = df[df['treatment'] == 1]['total_valor_producao'].mean()
un_control_production = df[df['treatment'] == 0]['total_valor_producao'].mean()

print(f"Number of matched pairs: {len(df) // 2}")
print(f"Average production value (treated): {un_treated_production:.2f}")
print(f"Average production value (control): {un_control_production:.2f}")

Number of matched pairs: 2641
Average production value (treated): 3.39
Average production value (control): 5.15


In [97]:
df

Unnamed: 0,id_microrregiao,ano,total_valor_producao,treatment,avg_precipitacao_total_month_1,avg_precipitacao_total_month_2,avg_precipitacao_total_month_3,avg_precipitacao_total_month_4,avg_precipitacao_total_month_5,avg_precipitacao_total_month_6,...,avg_vento_velocidade_month_3,avg_vento_velocidade_month_4,avg_vento_velocidade_month_5,avg_vento_velocidade_month_6,avg_vento_velocidade_month_7,avg_vento_velocidade_month_8,avg_vento_velocidade_month_9,avg_vento_velocidade_month_10,avg_vento_velocidade_month_11,avg_vento_velocidade_month_12
0,11001,2007,2.012189,0,,,,,,,...,,,,,1.368657,1.446844,1.336827,1.384595,1.427630,1.522463
1,11001,2008,2.493684,1,0.525784,0.352975,0.464840,0.358098,0.257568,0.009065,...,,,,,,1.445631,1.421448,1.524595,1.300279,1.377358
2,11001,2009,3.025559,0,0.400812,0.611298,0.429363,0.471831,0.230041,0.109577,...,1.217175,1.250141,1.074864,1.251835,1.755925,1.082177,1.505307,1.419559,1.380972,1.340726
3,11001,2010,3.138326,0,0.024831,0.220238,0.692867,0.093333,0.115860,0.040056,...,,,,,,,,,,1.364830
4,11001,2011,4.747424,0,0.419892,0.486145,0.455586,0.295000,0.090685,,...,,,,,,,,1.966667,1.390420,1.281830
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5278,50007,2014,2.809208,0,0.168991,0.171577,0.259812,0.069632,0.057392,0.011528,...,1.190390,1.082557,0.958132,1.044097,1.099395,1.102554,1.525903,1.378629,1.283819,1.515524
5279,50007,2015,2.718658,0,0.142165,0.283036,0.174966,0.044409,0.145296,0.063333,...,0.980699,0.890251,0.935215,0.926944,1.079167,1.126882,1.063194,1.100269,1.069236,0.889247
5280,50007,2016,3.511356,0,0.362903,0.201149,0.178763,0.112778,0.244116,0.055139,...,0.916465,0.637847,0.902690,0.744583,0.897648,1.066056,1.111111,1.450063,1.552639,1.597581
5281,50007,2017,2.937802,0,0.357661,0.210714,0.186022,0.202500,0.164785,0.014892,...,1.213306,1.232778,0.982796,1.195964,1.373410,1.235198,1.325105,1.585517,1.407708,1.301411
