# Análise Exploratória dos Dados
**Author:** Daniel Cavalli <br>
**Last Update:** 2024-11-08

In [None]:
# Imports
import pandas as pd

Unnamed: 0,id_microrregiao,ano,total_valor_producao,year,month,avg_precipitacao_total,avg_pressao_atm_hora,avg_temperatura_bulbo,avg_umidade_rel,avg_vento_velocidade,treatment
0,11001,2007,73459.0,2007,7,0.040512,1002.59339,25.182516,73.292111,1.368657,0
1,11001,2007,73459.0,2007,8,0.00561,1001.498878,26.640533,64.57784,1.446844,0
2,11001,2007,73459.0,2007,9,0.057507,1001.452408,27.319263,70.61898,1.336827,0
3,11001,2007,73459.0,2007,10,0.243514,999.734865,26.342432,78.744595,1.384595,0
4,11001,2007,73459.0,2007,11,0.310364,998.619188,26.094258,82.717087,1.42763,0


In [38]:
# Read the source CSV (path is relative to the notebook's working directory)
file_path = '../data/PAM_MET_SOJA.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to inspect the columns and value formats
data.head(n=5)

Unnamed: 0,id_microrregiao,ano,total_valor_producao,year,month,avg_precipitacao_total,avg_pressao_atm_hora,avg_temperatura_bulbo,avg_umidade_rel,avg_vento_velocidade,treatment
0,11001,2007,2.490141,2007,7,0.040512,1002.59339,25.182516,73.292111,1.368657,0
1,11001,2007,2.490141,2007,8,0.00561,1001.498878,26.640533,64.57784,1.446844,0
2,11001,2007,2.490141,2007,9,0.057507,1001.452408,27.319263,70.61898,1.336827,0
3,11001,2007,2.490141,2007,10,0.243514,999.734865,26.342432,78.744595,1.384595,0
4,11001,2007,2.490141,2007,11,0.310364,998.619188,26.094258,82.717087,1.42763,0


## Pivot dos dados mensais das estações

In [39]:
# Reshape from long format (one row per month) to wide format: each climate
# variable gets one column per month value found in the `month` column.
key_columns = ['id_microrregiao', 'ano', 'total_valor_producao', 'treatment']
climate_columns = [
    'avg_precipitacao_total',
    'avg_pressao_atm_hora',
    'avg_temperatura_bulbo',
    'avg_umidade_rel',
    'avg_vento_velocidade',
]
# NOTE: DataFrame.pivot raises if a (key_columns, month) combination is duplicated.
wide_climate = data.pivot(index=key_columns, columns='month', values=climate_columns)

# The pivot yields a two-level (variable, month) column index; collapse each
# pair into a single name such as `avg_umidade_rel_month_7`.
wide_climate.columns = [
    f'{variable}_month_{month_number}'
    for variable, month_number in wide_climate.columns
]

# Move the key columns out of the index, back into ordinary columns
pivoted_data = wide_climate.reset_index()

# Preview the reshaped frame
pivoted_data.head()


Unnamed: 0,id_microrregiao,ano,total_valor_producao,treatment,avg_precipitacao_total_month_1,avg_precipitacao_total_month_2,avg_precipitacao_total_month_3,avg_precipitacao_total_month_4,avg_precipitacao_total_month_5,avg_precipitacao_total_month_6,...,avg_vento_velocidade_month_3,avg_vento_velocidade_month_4,avg_vento_velocidade_month_5,avg_vento_velocidade_month_6,avg_vento_velocidade_month_7,avg_vento_velocidade_month_8,avg_vento_velocidade_month_9,avg_vento_velocidade_month_10,avg_vento_velocidade_month_11,avg_vento_velocidade_month_12
0,11001,2007,2.490141,0,,,,,,,...,,,,,1.368657,1.446844,1.336827,1.384595,1.42763,1.522463
1,11001,2008,2.589474,1,0.525784,0.352975,0.46484,0.358098,0.257568,0.009065,...,,,,,,1.445631,1.421448,1.524595,1.300279,1.377358
2,11001,2009,2.4,0,0.400812,0.611298,0.429363,0.471831,0.230041,0.109577,...,1.217175,1.250141,1.074864,1.251835,1.755925,1.082177,1.505307,1.419559,1.380972,1.340726
3,11001,2010,2.481081,0,0.024831,0.220238,0.692867,0.093333,0.11586,0.040056,...,,,,,,,,,,1.36483
4,11001,2011,2.481081,0,0.419892,0.486145,0.455586,0.295,0.090685,,...,,,,,,,,1.966667,1.39042,1.28183


In [40]:
# Persist the full wide-format table; index=False keeps the row index out of the file
pivoted_output_path = '../data/PAM_MET_SOJA_pivoted.csv'
pivoted_data.to_csv(pivoted_output_path, index=False)

In [36]:
# Column-wise means for each value of `treatment`.
# NOTE(review): this also averages `id_microrregiao` and `ano`; confirm whether
# those two columns are meant to be part of the comparison.
by_treatment = pivoted_data.groupby('treatment')
by_treatment.mean()

Unnamed: 0_level_0,id_microrregiao,ano,total_valor_producao,avg_precipitacao_total_month_1,avg_precipitacao_total_month_2,avg_precipitacao_total_month_3,avg_precipitacao_total_month_4,avg_precipitacao_total_month_5,avg_precipitacao_total_month_6,avg_precipitacao_total_month_7,...,avg_vento_velocidade_month_3,avg_vento_velocidade_month_4,avg_vento_velocidade_month_5,avg_vento_velocidade_month_6,avg_vento_velocidade_month_7,avg_vento_velocidade_month_8,avg_vento_velocidade_month_9,avg_vento_velocidade_month_10,avg_vento_velocidade_month_11,avg_vento_velocidade_month_12
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,30088.05497,2014.811562,5.151864,0.23099,0.233464,0.22417,0.177144,0.135259,0.103958,0.080687,...,1.818342,1.731842,1.747831,1.841315,2.009808,2.258315,2.41859,2.424667,2.306013,2.147647
1,29631.410765,2008.915014,3.386058,0.288037,0.252083,0.239465,0.214223,0.171005,0.100518,0.08668,...,1.915381,1.83507,1.875095,1.939185,2.116074,2.380899,2.571381,2.564885,2.542059,2.316455


In [32]:
# Number of distinct microregions that have at least one row with treatment == 0
treatment_is_zero = pivoted_data['treatment'] == 0
pivoted_data.loc[treatment_is_zero, 'id_microrregiao'].nunique()

374

In [33]:
# Export the subset of rows with 2003 <= ano <= 2007 and id_microrregiao <= 20000.
# A single combined mask replaces three chained `.loc[...]` calls whose masks were
# built from the unfiltered frame (that only worked through implicit index
# alignment and created an intermediate frame at every step).
# Series.between is inclusive on both ends by default, matching `>=` / `<=`.
subset_mask = (
    pivoted_data['ano'].between(2003, 2007)
    & (pivoted_data['id_microrregiao'] <= 20000)
)
pivoted_data.loc[subset_mask].to_csv('../data/pivoted_data.csv', index=False)