In [3]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
from datetime import datetime as dt

In [4]:
years = [i for i in range(2001, 2008)]
location = "A801"

filenames = []
for year in years:
    filenames.extend(
        [
            f"{year}/{file.name}"
            for file in os.scandir(f"inmet-data/{year}")
            if location in file.name
        ]
    )
dfs = [
    pd.read_csv(f"inmet-data/{f}", delimiter=";", header=8, encoding="iso-8859-1")
    for f in filenames
]

raw_data = pd.concat(dfs).reset_index()
raw_data

Unnamed: 0,index,DATA (YYYY-MM-DD),HORA (UTC),"PRECIPITAÇÃO TOTAL, HORÁRIO (mm)","PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA (mB)",PRESSÃO ATMOSFERICA MAX.NA HORA ANT. (AUT) (mB),PRESSÃO ATMOSFERICA MIN. NA HORA ANT. (AUT) (mB),RADIACAO GLOBAL (KJ/m²),"TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)",TEMPERATURA DO PONTO DE ORVALHO (°C),...,TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C),TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT) (°C),TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT) (°C),UMIDADE REL. MAX. NA HORA ANT. (AUT) (%),UMIDADE REL. MIN. NA HORA ANT. (AUT) (%),"UMIDADE RELATIVA DO AR, HORARIA (%)","VENTO, DIREÇÃO HORARIA (gr) (° (gr))","VENTO, RAJADA MAXIMA (m/s)","VENTO, VELOCIDADE HORARIA (m/s)",Unnamed: 19
0,0,2001-01-01,00:00,0,1009,1009,10085,-9999,21,158,...,209,161,156,72,69,72,90,58,21,
1,1,2001-01-01,01:00,0,10094,10094,1009,-9999,205,156,...,205,158,155,73,71,73,109,53,27,
2,2,2001-01-01,02:00,0,10091,10093,10091,-9999,20,155,...,20,156,155,75,73,75,96,42,18,
3,3,2001-01-01,03:00,0,10088,10091,10088,-9999,199,155,...,198,156,155,76,75,76,110,43,16,
4,4,2001-01-01,04:00,0,10087,10089,10087,-9999,194,156,...,194,156,155,78,76,78,136,42,22,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61339,8755,2007-12-31,19:00,0,10049,10056,10049,27651,314,179,...,309,209,179,54,45,45,152,6,21,
61340,8756,2007-12-31,20:00,0,10047,1005,10047,20127,313,185,...,305,19,162,48,40,47,153,71,25,
61341,8757,2007-12-31,21:00,0,10047,10047,10045,13076,301,183,...,301,191,176,52,44,49,141,64,21,
61342,8758,2007-12-31,22:00,0,10052,10052,10047,3291,277,197,...,277,199,184,62,49,62,134,62,28,


In [34]:
cols = raw_data.columns
raw_data["datetime"] = pd.to_datetime(
    raw_data[cols[1]] + " " + raw_data[cols[2]], format="%Y-%m-%d %H:%M"
)


def parseToFloat(col):
    return (
        raw_data[col]
        .apply(lambda x: float(x.replace(",", ".")) if (isinstance(x, str)) else x)
        .apply(lambda x: np.nan if x == -9999.0 else x)
    )


# global-radiation
raw_data["global-radiation"] = parseToFloat("RADIACAO GLOBAL (KJ/m²)")

# precipitation
raw_data["total-precipitation"] = parseToFloat("PRECIPITAÇÃO TOTAL, HORÁRIO (mm)")

raw_data["relative-humidity"] = parseToFloat("UMIDADE RELATIVA DO AR, HORARIA (%)")

# temperature
raw_data["temperature-last-hour-max"] = parseToFloat(
    "TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)"
)
raw_data["temperature-last-hour-min"] = parseToFloat(
    "TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)"
)

# wind
raw_data["wind-speed"] = parseToFloat("VENTO, VELOCIDADE HORARIA (m/s)")
raw_data["wind-speed-max-gust"] = parseToFloat("VENTO, RAJADA MAXIMA (m/s)")

raw_data
# removing unused columns and renaming the remainder to more friendly names
renamed_data = raw_data.drop(columns=cols[:21])
renamed_data

Unnamed: 0,datetime,global-radiation,total-precipitation,temperature-last-hour-max,temperature-last-hour-min,wind-speed,relative-humidity,wind-speed-max-gust
0,2001-01-01 00:00:00,,0.0,21.9,20.9,2.1,72.0,5.8
1,2001-01-01 01:00:00,,0.0,20.9,20.5,2.7,73.0,5.3
2,2001-01-01 02:00:00,,0.0,20.5,20.0,1.8,75.0,4.2
3,2001-01-01 03:00:00,,0.0,20.0,19.8,1.6,76.0,4.3
4,2001-01-01 04:00:00,,0.0,19.8,19.4,2.2,78.0,4.2
...,...,...,...,...,...,...,...,...
61339,2007-12-31 19:00:00,2765.1,0.0,31.8,30.9,2.1,45.0,6.0
61340,2007-12-31 20:00:00,2012.7,0.0,31.4,30.5,2.5,47.0,7.1
61341,2007-12-31 21:00:00,1307.6,0.0,31.3,30.1,2.1,49.0,6.4
61342,2007-12-31 22:00:00,329.1,0.0,30.1,27.7,2.8,62.0,6.2


In [15]:
train = renamed_data[renamed_data.datetime < pd.to_datetime("2007", format="%Y")]
test = renamed_data[renamed_data.datetime >= pd.to_datetime("2007", format="%Y")]

# plt.plot(train, color='blue')
# plt.plot(test, color='orange')

In [7]:
by_month = renamed_data
by_month["month"] = by_month["date"].apply(
    lambda d: dt.strftime(dt.strptime(d, "%Y/%m/%d"), "%b-%y")
)
monthly_radiation = by_month.groupby("month")["global-radiation"].transform("mean")
monthly_precipitation = by_month.groupby("month")["total-precipitation"].transform(
    "mean"
)
monthly_wind = by_month.groupby("month")["wind-speed"].transform("mean")
months = by_month["month"]

fig, ax1 = plt.subplots()


ax1.set_xlabel("month")
ax1.tick_params()
ax1.set_ylabel("radiation", color="orange")
ax1.plot(months, monthly_radiation, color="orange")
ax1.tick_params(axis="y", color="orange")

# ax2= ax1.twinx()
# ax2.set_ylabel('precipitation')
# ax2.plot(months, monthly_precipitation)
# ax2.tick_params(axis='y')

# ax3 = ax1.twinx()
# ax3.set_ylabel('wind', color='green')
# ax3.plot(months, monthly_wind, color='green')
# ax3.tick_params(axis='y', color='green')

plt.show()

KeyError: 'date'

In [16]:
print(train.size)
train.value_counts()

736176


PRECIPITAÇÃO TOTAL, HORÁRIO (mm)  RADIACAO GLOBAL (KJ/m²)  TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)  TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)  relative-humidity  wind-direction  wind-speed-max-gust  VENTO, VELOCIDADE HORARIA (m/s)  datetime             global-radiation  total-precipitation  temperature-last-hour-max  temperature-last-hour-min  wind-speed
9,8                               34                       23,7                                        22,1                                        94                  56             7,1                  1,2                              2003-12-08 20:00:00   34.0             9.8                   23.7                       22.1                      1.2           1
,2                                -9999                    -9999                                       -9999                                       55                 -9999           -9999                6,1                              2001-06-05 15:00:00  -9999.0         