In [34]:
# Programa que faz a leitura do arquivo historico_diario.csv na camada REFINED do Data Lake e 
# converte para dados médios mensais (1981 até 2022) salvando na camada TRUSTED para serem consumidos pelo 
# modelo de Machine Learning


# Nome do programa: historico_mensal.csv
# Dado de entrada: Arquivo historico_diario.csv com dados tratados diários da radiação solar entre 1981 e 2022
# Dados de saída: Arquivo historico_mensal.csv com dados tratados de média mensal da radiação solar entre 1985 e 2022


# Importação de bibliotecas
import pandas as pd

pd.set_option('display.max_columns', 50)

pd.set_option('display.max_rows', 600)

In [35]:
# Endereço base de entrada do Data Lake
endereco_base_entrada = "SILVER/"

# Nome do arquivo que será lido com dados de entrada
nome_arquivo_entrada = f"historico_diario.csv"

#Diretório de leitura do arquivo
leitura_arquivo_entrada = endereco_base_entrada + nome_arquivo_entrada



# Endereço base de saída do Data Lake
endereco_base_saida = "GOLD/"

# Nome do arquivo que será escrito com dados de saída
nome_arquivo_saida = "historico_mensal.csv"

#Diretório de leitura do arquivo
leitura_arquivo_saida = endereco_base_saida + nome_arquivo_saida

In [36]:
# Leitura de dados de entrada
df = pd.read_csv(leitura_arquivo_entrada, sep = ",")

df.head(300)

Unnamed: 0,YEAR,MO,DY,ALLSKY_SFC_SW_DWN,CLRSKY_SFC_SW_DWN,ALLSKY_KT,ALLSKY_SFC_LW_DWN,ALLSKY_SFC_PAR_TOT,CLRSKY_SFC_PAR_TOT,ALLSKY_SFC_UVA,ALLSKY_SFC_UVB,ALLSKY_SFC_UV_INDEX,T2M,T2MDEW,T2MWET,TS,T2M_RANGE,T2M_MAX,T2M_MIN
0,1981,1,1,-999.0,-999.0,-999.0,-999.0,20.28,16.48,18.38,21.19,-260.95,26.86,14.65,,,,,
1,1981,1,2,-999.0,-999.0,-999.0,-999.0,20.89,18.26,19.58,21.72,-262.0,27.13,15.99,,,,,
2,1981,1,3,-999.0,-999.0,-999.0,-999.0,22.48,19.33,20.9,22.85,-264.93,26.8,18.58,,,,,
3,1981,1,4,-999.0,-999.0,-999.0,-999.0,22.96,19.65,21.3,23.51,-264.6,27.5,18.96,,,,,
4,1981,1,5,-999.0,-999.0,-999.0,-999.0,22.3,19.83,21.07,22.51,-267.77,25.01,19.64,,,,,
5,1981,1,6,-999.0,-999.0,-999.0,-999.0,22.77,19.94,21.35,22.8,-265.81,26.83,19.49,,,,,
6,1981,1,7,-999.0,-999.0,-999.0,-999.0,22.46,19.85,21.15,22.81,-264.73,27.29,18.87,,,,,
7,1981,1,8,-999.0,-999.0,-999.0,-999.0,21.26,19.31,20.29,21.76,-265.31,26.1,18.26,,,,,
8,1981,1,9,-999.0,-999.0,-999.0,-999.0,21.15,18.85,20.0,21.75,-264.81,26.34,18.01,,,,,
9,1981,1,10,-999.0,-999.0,-999.0,-999.0,21.87,19.21,20.54,22.51,-264.52,27.14,18.51,,,,,


In [37]:
#Tratamento de dados para converter dados -999.0 para 0.0
df = df.replace(-999.0, 0.0)

In [38]:
#Cálculo da média mensal dos dados
df_2 = df.groupby(['YEAR', 'MO'])['ALLSKY_SFC_LW_DWN'].mean().reset_index()

In [39]:
df_2.head(504)

Unnamed: 0,YEAR,MO,ALLSKY_SFC_LW_DWN
0,1981,1,0.0
1,1981,2,0.0
2,1981,3,0.0
3,1981,4,0.0
4,1981,5,0.0
5,1981,6,0.0
6,1981,7,0.0
7,1981,8,0.0
8,1981,9,0.0
9,1981,10,0.0


In [40]:
df_2['date'] = pd.to_datetime(df_2['YEAR'].astype(str) + '-' + df['MO'].astype(str) + '-1')
df_2['timestamp'] = df_2['date'].dt.strftime('%Y-%m-%d')

In [41]:
df_2.head(10)

Unnamed: 0,YEAR,MO,ALLSKY_SFC_LW_DWN,date,timestamp
0,1981,1,0.0,1981-01-01,1981-01-01
1,1981,2,0.0,1981-01-01,1981-01-01
2,1981,3,0.0,1981-01-01,1981-01-01
3,1981,4,0.0,1981-01-01,1981-01-01
4,1981,5,0.0,1981-01-01,1981-01-01
5,1981,6,0.0,1981-01-01,1981-01-01
6,1981,7,0.0,1981-01-01,1981-01-01
7,1981,8,0.0,1981-01-01,1981-01-01
8,1981,9,0.0,1981-01-01,1981-01-01
9,1981,10,0.0,1981-01-01,1981-01-01


In [42]:
# Reordenar as colunas
df_3 = df_2[['timestamp', 'ALLSKY_SFC_LW_DWN']]

df_3.head(50)


Unnamed: 0,timestamp,ALLSKY_SFC_LW_DWN
0,1981-01-01,0.0
1,1981-01-01,0.0
2,1981-01-01,0.0
3,1981-01-01,0.0
4,1981-01-01,0.0
5,1981-01-01,0.0
6,1981-01-01,0.0
7,1981-01-01,0.0
8,1981-01-01,0.0
9,1981-01-01,0.0


In [43]:
# Converter a coluna "timestamp" para o tipo de dados datetime
df_3['timestamp'] = pd.to_datetime(df_3['timestamp'])

# Filtra as linhas com ano maior que 1985
df_filtered = df_3[df_3['timestamp'].dt.year > 1985]

df_filtered.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_3['timestamp'] = pd.to_datetime(df_3['timestamp'])


Unnamed: 0,timestamp,ALLSKY_SFC_LW_DWN
60,1986-03-01,390.777419
61,1986-03-01,371.8
62,1986-03-01,377.367742
63,1986-03-01,368.12
64,1986-03-01,357.258065
65,1986-03-01,338.64
66,1986-03-01,328.735484
67,1986-03-01,349.848387
68,1986-03-01,353.806667
69,1986-03-01,354.603226


In [44]:
# Salvando arquivo tratado na camada TRUSTED
df_filtered.to_csv(leitura_arquivo_saida, encoding = "utf-8-sig", sep = ";", index="False")