In [64]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
from datetime import datetime as dt

In [84]:
years = [i.name for i in os.scandir(f"inmet-data") if i.is_dir()]
location = "A801"

filenames = []
for year in years[1:]:
    filenames.extend(
        [
            f"{year}/{file.name}"
            for file in os.scandir(f"inmet-data/{year}")
            if location in file.name
        ]
    )
dfs = []
for f in filenames:
    df = pd.read_csv(f"inmet-data/{f}", delimiter=";", header=8, encoding="iso-8859-1")
    dfs.append(
        df.rename(
            columns={
                "HORA (UTC)": "Hora UTC",
                "DATA (YYYY-MM-DD)": "Data",
                "RADIACAO GLOBAL (Kj/m²)": "RADIACAO GLOBAL (KJ/m²)",
            }
        )
    )

raw_data = pd.concat(dfs).reset_index()
raw_data

Unnamed: 0,index,Data,Hora UTC,"PRECIPITAÇÃO TOTAL, HORÁRIO (mm)","PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA (mB)",PRESSÃO ATMOSFERICA MAX.NA HORA ANT. (AUT) (mB),PRESSÃO ATMOSFERICA MIN. NA HORA ANT. (AUT) (mB),RADIACAO GLOBAL (KJ/m²),"TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)",TEMPERATURA DO PONTO DE ORVALHO (°C),...,TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C),TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT) (°C),TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT) (°C),UMIDADE REL. MAX. NA HORA ANT. (AUT) (%),UMIDADE REL. MIN. NA HORA ANT. (AUT) (%),"UMIDADE RELATIVA DO AR, HORARIA (%)","VENTO, DIREÇÃO HORARIA (gr) (° (gr))","VENTO, RAJADA MAXIMA (m/s)","VENTO, VELOCIDADE HORARIA (m/s)",Unnamed: 19
0,0,2001-01-01,00:00,0,1009,1009,10085,-9999,21,158,...,209,161,156,72.0,69.0,72.0,90.0,58,21,
1,1,2001-01-01,01:00,0,10094,10094,1009,-9999,205,156,...,205,158,155,73.0,71.0,73.0,109.0,53,27,
2,2,2001-01-01,02:00,0,10091,10093,10091,-9999,20,155,...,20,156,155,75.0,73.0,75.0,96.0,42,18,
3,3,2001-01-01,03:00,0,10088,10091,10088,-9999,199,155,...,198,156,155,76.0,75.0,76.0,110.0,43,16,
4,4,2001-01-01,04:00,0,10087,10089,10087,-9999,194,156,...,194,156,155,78.0,76.0,78.0,136.0,42,22,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200371,8779,2004-12-31,19:00,0,10094,10097,10094,2788,297,177,...,292,183,167,51.0,45.0,49.0,146.0,74,24,
200372,8780,2004-12-31,20:00,0,10092,10094,10092,2089,282,186,...,281,189,177,56.0,49.0,56.0,140.0,85,34,
200373,8781,2004-12-31,21:00,0,10093,10093,10091,1293,271,174,...,271,188,174,58.0,55.0,55.0,131.0,87,37,
200374,8782,2004-12-31,22:00,0,10097,10097,10093,415,254,182,...,254,184,171,64.0,55.0,64.0,133.0,76,33,


In [93]:
cols = raw_data.columns

# treating hour data bc it's not super consistent
raw_data[cols[2]] = raw_data[cols[2]].apply(
    lambda s: ":".join([s[:2], s[2:4]]) if isinstance(s, str) and "UTC" in s else s
)
# same for dates
raw_data[cols[1]] = raw_data[cols[1]].apply(
    lambda s: s.replace("/", "-") if isinstance(s, str) else s
)


raw_data["datetime"] = pd.to_datetime(
    raw_data[cols[1]] + " " + raw_data[cols[2]], format="%Y-%m-%d %H:%M"
)


def parseToFloat(col):
    return (
        raw_data[col]
        .apply(lambda x: float(x.replace(",", ".")) if (isinstance(x, str)) else x)
        .apply(lambda x: np.nan if x == -9999.0 else x)
    )


# global-radiation
raw_data["global-radiation"] = parseToFloat(cols[7])

# precipitation
raw_data["total-precipitation"] = parseToFloat(cols[3])

raw_data["relative-humidity"] = parseToFloat("UMIDADE RELATIVA DO AR, HORARIA (%)")

# temperature
raw_data["temperature-last-hour-max"] = parseToFloat(
    "TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)"
)
raw_data["temperature-last-hour-min"] = parseToFloat(
    "TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)"
)

# wind
raw_data["wind-speed"] = parseToFloat("VENTO, VELOCIDADE HORARIA (m/s)")
raw_data["wind-speed-max-gust"] = parseToFloat("VENTO, RAJADA MAXIMA (m/s)")

raw_data
# removing unused columns and renaming the remainder to more friendly names
renamed_data = raw_data.drop(columns=cols[:21])
renamed_data

Unnamed: 0,datetime,global-radiation,total-precipitation,relative-humidity,temperature-last-hour-max,temperature-last-hour-min,wind-speed,wind-speed-max-gust
0,2001-01-01 00:00:00,,0.0,72.0,21.9,20.9,2.1,5.8
1,2001-01-01 01:00:00,,0.0,73.0,20.9,20.5,2.7,5.3
2,2001-01-01 02:00:00,,0.0,75.0,20.5,20.0,1.8,4.2
3,2001-01-01 03:00:00,,0.0,76.0,20.0,19.8,1.6,4.3
4,2001-01-01 04:00:00,,0.0,78.0,19.8,19.4,2.2,4.2
...,...,...,...,...,...,...,...,...
200371,2004-12-31 19:00:00,2788.0,0.0,49.0,29.8,29.2,2.4,7.4
200372,2004-12-31 20:00:00,2089.0,0.0,56.0,29.7,28.1,3.4,8.5
200373,2004-12-31 21:00:00,1293.0,0.0,55.0,28.4,27.1,3.7,8.7
200374,2004-12-31 22:00:00,415.0,0.0,64.0,27.0,25.4,3.3,7.6


In [None]:
train = renamed_data[renamed_data.datetime < pd.to_datetime("2007", format="%Y")]
test = renamed_data[renamed_data.datetime >= pd.to_datetime("2007", format="%Y")]

# plt.plot(train, color='blue')
# plt.plot(test, color='orange')

In [None]:
by_month = renamed_data
by_month["month"] = by_month["date"].apply(
    lambda d: dt.strftime(dt.strptime(d, "%Y/%m/%d"), "%b-%y")
)
monthly_radiation = by_month.groupby("month")["global-radiation"].transform("mean")
monthly_precipitation = by_month.groupby("month")["total-precipitation"].transform(
    "mean"
)
monthly_wind = by_month.groupby("month")["wind-speed"].transform("mean")
months = by_month["month"]

fig, ax1 = plt.subplots()


ax1.set_xlabel("month")
ax1.tick_params()
ax1.set_ylabel("radiation", color="orange")
ax1.plot(months, monthly_radiation, color="orange")
ax1.tick_params(axis="y", color="orange")

# ax2= ax1.twinx()
# ax2.set_ylabel('precipitation')
# ax2.plot(months, monthly_precipitation)
# ax2.tick_params(axis='y')

# ax3 = ax1.twinx()
# ax3.set_ylabel('wind', color='green')
# ax3.plot(months, monthly_wind, color='green')
# ax3.tick_params(axis='y', color='green')

plt.show()

KeyError: 'date'

In [None]:
print(train.size)
train.value_counts()

736176


PRECIPITAÇÃO TOTAL, HORÁRIO (mm)  RADIACAO GLOBAL (KJ/m²)  TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)  TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)  relative-humidity  wind-direction  wind-speed-max-gust  VENTO, VELOCIDADE HORARIA (m/s)  datetime             global-radiation  total-precipitation  temperature-last-hour-max  temperature-last-hour-min  wind-speed
9,8                               34                       23,7                                        22,1                                        94                  56             7,1                  1,2                              2003-12-08 20:00:00   34.0             9.8                   23.7                       22.1                      1.2           1
,2                                -9999                    -9999                                       -9999                                       55                 -9999           -9999                6,1                              2001-06-05 15:00:00  -9999.0         