In [17]:
import pandas as pd
import numpy as np
from linearmodels import PanelOLS
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

In [25]:
def prepare_did_data(matched_df):
    """
    Build the frame used by the simple DiD regression.

    Two integer indicator columns are added to a copy of ``matched_df``
    (the caller's frame is not modified):

    * ``post`` -- 1 where ``ano >= treatment_year``, otherwise 0.
    * ``treat_effect`` -- ``post`` multiplied by 1 where
      ``matched_control == 0``, otherwise 0.

    The returned frame is indexed by ``['id_microrregiao', 'ano']``.
    """
    # Comparisons against NaN evaluate to False, so a row with a missing
    # treatment_year ends up with post = 0.
    after_treatment = (matched_df['ano'] >= matched_df['treatment_year']).astype(int)

    # Rows with matched_control == 0 are the ones that can carry the effect.
    not_control = (matched_df['matched_control'] == 0).astype(int)

    # assign() works on a copy, so the input frame keeps its original columns.
    panel = matched_df.assign(
        post=after_treatment,
        treat_effect=after_treatment * not_control,
    )

    # (entity, time) MultiIndex for the panel estimator.
    return panel.set_index(['id_microrregiao', 'ano'])

In [28]:
def run_did(df, outcome='total_valor_producao'):
    """
    Run simple DiD regression

    Fits a PanelOLS model of ``outcome`` on a constant and ``treat_effect``
    with both entity and time effects, using a clustered covariance
    estimator with clustering by entity.

    Parameters
    ----------
    df : DataFrame
        Must contain the ``outcome`` and ``treat_effect`` columns. PanelOLS
        expects an (entity, time) MultiIndex such as the one set by
        ``prepare_did_data``.
    outcome : str, default 'total_valor_producao'
        Name of the dependent-variable column. The default reproduces the
        original hard-coded behaviour, so existing ``run_did(df)`` calls are
        unaffected.

    Returns
    -------
    The fitted results object returned by ``PanelOLS.fit``.

    Raises
    ------
    KeyError
        If ``outcome`` or ``treat_effect`` is not a column of ``df``.
    """
    # Fail early with a message that names every missing column instead of
    # surfacing a bare pandas lookup error for only the first one.
    missing = [col for col in (outcome, 'treat_effect') if col not in df.columns]
    if missing:
        raise KeyError(f"run_did: missing required column(s): {missing}")

    model = PanelOLS(
        dependent=df[outcome],
        exog=sm.add_constant(df[['treat_effect']]),
        entity_effects=True,
        time_effects=True
    )

    # Cluster standard errors at the entity level.
    return model.fit(cov_type='clustered', cluster_entity=True)

In [29]:
# Read the CSV input, add the DiD indicator columns and panel index with
# prepare_did_data, then fit the regression with run_did.
# NOTE: the path is relative, so it is resolved against the kernel's current
# working directory.
matched_df = pd.read_csv('../data/matched_data_for_did.csv')
did_data = prepare_did_data(matched_df)
results = run_did(did_data)

In [30]:
# Print a heading followed by the estimation summary table.
# `summary` is read as an attribute (no call parentheses); the cell output
# shows that this yields the PanelOLS summary table.
print("\nDiD Results:")
print(results.summary)


DiD Results:
                           PanelOLS Estimation Summary                            
Dep. Variable:     total_valor_producao   R-squared:                        0.0065
Estimator:                     PanelOLS   R-squared (Between):             -0.0113
No. Observations:                   682   R-squared (Within):              -0.0046
Date:                  Fri, Nov 22 2024   R-squared (Overall):              0.0008
Time:                          21:15:47   Log-likelihood                   -7451.7
Cov. Estimator:               Clustered                                           
                                          F-statistic:                      2.4246
Entities:                           293   P-value                           0.1203
Avg Obs:                         2.3276   Distribution:                   F(1,370)
Min Obs:                         1.0000                                           
Max Obs:                         16.000   F-statistic (robust):          

In [40]:
# Load the second CSV and display every row whose id_microrregiao equals
# 50010. The bare last expression is what the cell renders as its output.
# NOTE(review): this file is separate from the matched CSV used for the DiD
# fit above — confirm how the two datasets relate.
data = pd.read_csv('../data/csv/dados_fixed.csv')
data.loc[data['id_microrregiao']==50010]

Unnamed: 0,id_microrregiao,ano,produtividade,total_area_plantada,total_precipitacao,precipitacao_por_area_plantada,qtd_estacoes_ativas,tratado,primeiro_ano_tratamento,pos_tratamento
0,50010,2018,950083.0,335984,13908.8,0.041397,12.0,1.0,2008.0,1
1,50010,2019,963675.0,332128,12168.2,0.036637,12.0,1.0,2008.0,1
2,50010,2020,861352.0,295244,8423.0,0.028529,12.0,1.0,2008.0,1
3,50010,2021,774928.0,276945,4793.8,0.01731,12.0,1.0,2008.0,1
9,50010,2008,810000.0,107063,5349.8,0.049969,6.0,1.0,2008.0,1
10,50010,2009,1152096.0,134286,9238.2,0.068795,6.0,1.0,2008.0,1
11,50010,2010,1318490.0,182418,6811.2,0.037338,6.0,1.0,2008.0,1
12,50010,2011,920403.0,249893,8158.2,0.032647,6.0,1.0,2008.0,1
13,50010,2012,859553.0,300702,7672.8,0.025516,6.0,1.0,2008.0,1
14,50010,2013,765325.0,349940,8132.6,0.02324,6.0,1.0,2008.0,1
