In [28]:
import os

from google.colab import drive

# Mount Google Drive so the project files are reachable from the Colab runtime
drive.mount('/content/drive')

# Use the project's source folder as the working directory, so relative
# paths resolve against it
workdir = "/content/drive/MyDrive/Colab Notebooks/TG/src"
os.chdir(workdir)

# Show the resulting working directory as the cell output
os.getcwd()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content/drive/MyDrive/Colab Notebooks/TG/src'

In [29]:
!pip install pandas seaborn plotly folium sidetable prophet statsmodels



# IMPORTANDO AS BIBLIOTECAS

In [30]:
import datetime
import os
from pathlib import Path
from inspect import stack

import pandas as pd
import sidetable
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# DEFININDO OPÇÕES GLOBAIS

In [31]:
# Do not truncate columns when a DataFrame is displayed
pd.options.display.max_columns = None

In [32]:
# Default size (width, height in inches) for every matplotlib figure created
# from now on. plt.figure(figsize=...) would only open one empty figure and
# leave the defaults unchanged.
plt.rcParams["figure.figsize"] = (12, 8)

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

In [33]:
# Display limits used when pandas renders a DataFrame: rows, columns and
# total line width (in characters)
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# FUNÇÕES ÚTEIS

In [34]:
def get_files_directory(path_dir, specific_type=None):

    """

        List the entries of a directory, optionally filtered by extension.

        Extensions may be given with or without the leading dot and are
        matched case-insensitively, so "csv", ".csv" and ".CSV" all select
        both "data.csv" and "DATA.CSV".

        # Arguments
            path_dir                   - Required : Directory to inspect (String | Path)
            specific_type              - Optional : Extension, or list/tuple of
                                                    extensions, to keep (String | List | Tuple)

        # Returns
            list_files                 - Required : Paths (path_dir joined with each entry
                                                    name). If path_dir is not a directory,
                                                    it is returned as the single element
                                                    of the list, unfiltered (List)

        Any exception is caught and printed; the list collected so far
        (possibly empty) is returned in that case.

    """

    # RESULT ACCUMULATOR - STAYS EMPTY IF LISTING FAILS
    list_files = []

    try:

        if os.path.isdir(path_dir):

            # EVERY ENTRY RETURNED BY os.listdir (SUB-DIRECTORIES INCLUDED)
            list_files = [os.path.join(path_dir, name) for name in os.listdir(path_dir)]

            if specific_type:

                # ACCEPT A SINGLE EXTENSION AS WELL AS A LIST/TUPLE OF THEM
                if not isinstance(specific_type, (tuple, list)):
                    specific_type = [specific_type]

                print("FILTRANDO PARA AS EXTENSÕES: {}".format(specific_type))

                # NORMALISE ONCE: LOWER CASE, NO LEADING DOT
                wanted_extensions = {
                    str(extension).lower().lstrip(".") for extension in specific_type
                }

                # KEEP ONLY THE ENTRIES WHOSE (NORMALISED) SUFFIX WAS REQUESTED
                list_files = [
                    arq
                    for arq in list_files
                    if Path(arq).suffix.lower().lstrip(".") in wanted_extensions
                ]

        else:
            # NOT A DIRECTORY: HAND THE PATH BACK AS A ONE-ELEMENT LIST
            list_files = [path_dir]

    except Exception as ex:
        print("ERRO NA FUNÇÃO: {} - {}".format(stack()[0][3], ex))

    return list_files

In [35]:
def read_file(dir_name_file, csv_separator=",", **reader_kwargs):

    """

      Read a CSV or Excel (XLS / XLSX) file into a DataFrame.

      The file type is decided from the file name extension, compared
      case-insensitively. Both plain strings and path-like objects
      (e.g. pathlib.Path) are accepted.

      # Arguments
          dir_name_file          - Required : File to read, including its path (Path | String)
          csv_separator          - Optional : Field separator used for CSV files (String)
          **reader_kwargs        - Optional : Extra keyword arguments forwarded unchanged to
                                              pd.read_csv / pd.read_excel
                                              (e.g. low_memory=False, dtype=..., nrows=...)
      # Returns
          data                   - Required : Data read from the file (DataFrame).
                                              An empty DataFrame is returned when reading
                                              raises an exception (the error is printed);
                                              None is returned when the extension is
                                              neither CSV nor Excel.

    """

    # RETURN VALUE USED WHEN READING FAILS
    data = pd.DataFrame()

    try:
        # os.fspath TURNS Path OBJECTS INTO STRINGS, SO endswith WORKS FOR BOTH;
        # THE LOWER-CASED COPY IS USED ONLY FOR THE EXTENSION CHECK
        file_path = os.fspath(dir_name_file)
        file_path_lower = file_path.lower()

        # CSV FILE
        if file_path_lower.endswith('.csv'):
            data = pd.read_csv(file_path, sep=csv_separator, **reader_kwargs)

        # EXCEL FILE
        elif file_path_lower.endswith(('.xls', '.xlsx')):
            data = pd.read_excel(file_path, **reader_kwargs)

        # NEITHER CSV NOR EXCEL: REPORT AND RETURN NONE
        else:
            print("Formato de arquivo não suportado.")
            return None
    except Exception as ex:
        print("ERRO NA FUNÇÃO: {} - {}".format(stack()[0][3], ex))

    return data

# VARIÁVEIS GLOBAIS

In [36]:
# PATH TO THE DATA (A RELATIVE PATH; NOTE IT POINTS TO A CSV FILE, NOT A FOLDER)
data_dir = 'data/4_RESULTS_DATASET_GHCN_DAILY/DATASET_GHCN_PROCESING_FREQ_DAILY.csv'

# DESIRED FILE FORMAT
specific_type = "csv"

In [37]:
# NAME OF THE COLUMN THAT HOLDS THE WEATHER STATION INFORMATION
name_column_location = "name"

# OBTENDO OS DADOS DO GHCN-DAILY - RESULTADOS DO PRÉ PROCESSAMENTO

> Global Historical Climatology Network - Daily, Version 4
(NOAA GHCN v4)

Esses dados são obtidos pela pipeline construída em:

https://github.com/emersonrafaels/tg_series_temporais_otimizacao_gerenciamento_energia/blob/main/src/1_PROCESSING_GHCN_Daily.ipynb

In [38]:
# Load the pre-processed dataset (default "," separator) into a DataFrame
df = read_file(data_dir)


Columns (15) have mixed types. Specify dtype option on import or set low_memory=False.



In [39]:
# Display the loaded DataFrame (pandas renders a truncated preview)
df

Unnamed: 0,measurement date,stn,year,month,day,name,country,lat,lon,begin,end,prcp,wdsp,temp,state,city
0,1942-10-01,821930.0,1942.0,10.0,1.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,99.99,5.8,26.055556,Pará,
1,1942-10-02,821930.0,1942.0,10.0,2.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,0.00,6.2,26.333333,Pará,
2,1942-10-03,821930.0,1942.0,10.0,3.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,0.00,9.2,26.444444,Pará,
3,1942-10-04,821930.0,1942.0,10.0,4.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,0.00,7.0,26.222222,Pará,
4,1942-10-05,821930.0,1942.0,10.0,5.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,99.99,5.3,25.777778,Pará,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4830511,2023-07-21,831030.0,2023.0,7.0,21.0,CAMPOS DOS GOITACAZES,BR,-21.700,-41.300,20201204.0,20210917.0,0.00,5.6,21.888889,Rio de Janeiro,
4830512,2023-07-22,831030.0,2023.0,7.0,22.0,CAMPOS DOS GOITACAZES,BR,-21.700,-41.300,20201204.0,20210917.0,99.99,4.8,21.666667,Rio de Janeiro,
4830513,2023-07-23,831030.0,2023.0,7.0,23.0,CAMPOS DOS GOITACAZES,BR,-21.700,-41.300,20201204.0,20210917.0,99.99,10.8,22.055556,Rio de Janeiro,
4830514,2023-07-24,831030.0,2023.0,7.0,24.0,CAMPOS DOS GOITACAZES,BR,-21.700,-41.300,20201204.0,20210917.0,0.00,6.8,23.944444,Rio de Janeiro,


In [40]:
# Preview the first rows of the dataset
df.head()

Unnamed: 0,measurement date,stn,year,month,day,name,country,lat,lon,begin,end,prcp,wdsp,temp,state,city
0,1942-10-01,821930.0,1942.0,10.0,1.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,99.99,5.8,26.055556,Pará,
1,1942-10-02,821930.0,1942.0,10.0,2.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,0.0,6.2,26.333333,Pará,
2,1942-10-03,821930.0,1942.0,10.0,3.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,0.0,9.2,26.444444,Pará,
3,1942-10-04,821930.0,1942.0,10.0,4.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,0.0,7.0,26.222222,Pará,
4,1942-10-05,821930.0,1942.0,10.0,5.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,99.99,5.3,25.777778,Pará,


In [41]:
# Summary of the index, column dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4830516 entries, 0 to 4830515
Data columns (total 16 columns):
 #   Column            Dtype  
---  ------            -----  
 0   measurement date  object 
 1   stn               float64
 2   year              float64
 3   month             float64
 4   day               float64
 5   name              object 
 6   country           object 
 7   lat               float64
 8   lon               float64
 9   begin             float64
 10  end               float64
 11  prcp              float64
 12  wdsp              float64
 13  temp              float64
 14  state             object 
 15  city              object 
dtypes: float64(11), object(5)
memory usage: 589.7+ MB


In [42]:
# Descriptive statistics of the numeric columns
df.describe()

Unnamed: 0,stn,year,month,day,lat,lon,begin,end,prcp,wdsp,temp
count,4830516.0,4830516.0,4830516.0,4830516.0,4830516.0,4830516.0,4830516.0,4830516.0,4830516.0,4802335.0,4830516.0
mean,836635.8,1999.223,6.664875,15.54989,-14.97086,-48.07132,19749280.0,20191720.0,7.429678,4.874844,24.5253
std,15014.94,17.16395,3.375045,9.11439,8.929591,8.00661,294507.0,59035.58,26.0829,3.45084,4.277547
min,749024.0,1942.0,1.0,1.0,-33.75,-72.787,19310100.0,19450920.0,0.0,0.0,-7.833333
25%,825950.0,1988.0,4.0,8.0,-22.345,-52.35,19430800.0,20210510.0,0.0,2.5,22.16667
50%,834280.0,1997.0,7.0,15.0,-15.653,-47.767,19730100.0,20210920.0,0.0,4.2,25.27778
75%,838310.0,2017.0,9.0,24.0,-7.1,-42.093,19950400.0,20210920.0,0.0,6.7,27.5
max,869980.0,2023.0,12.0,31.0,3.867,-29.317,20201200.0,20210920.0,99.99,52.4,41.38889


In [43]:
# Number of missing values in each column
df.isna().sum()

measurement date          0
stn                       0
year                      0
month                     0
day                       0
name                      0
country                   0
lat                       0
lon                       0
begin                     0
end                       0
prcp                      0
wdsp                  28181
temp                      0
state                 17327
city                2677344
dtype: int64

In [44]:
# Number of records for each distinct value of the "name" column
df["name"].value_counts()

VAL DE CANS INTL                 29619
PINTO MARTINS INTL               29421
MARECHAL CUNHA MACHADO INTL      29407
PARNAIBA PREFEITO DR JOAO SIL    29396
AUGUSTO SEVERO                   29315
                                 ...  
PADRE RICARDO REMETTER               1
ITAPERUNA                            1
TRIUNFO                              1
GUARAPUAVA                           1
CANARANA                             1
Name: name, Length: 652, dtype: int64

# ANALISANDO A SÉRIE TEMPORAL

## ANALISANDO UMA ESTAÇÃO

In [45]:
# Keep only the rows whose "name" is the "VAL DE CANS INTL" station
df_unique_station = df.loc[df["name"].eq("VAL DE CANS INTL")]

In [46]:
# Display the rows selected for the single station
df_unique_station

Unnamed: 0,measurement date,stn,year,month,day,name,country,lat,lon,begin,end,prcp,wdsp,temp,state,city
0,1942-10-01,821930.0,1942.0,10.0,1.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,99.99,5.8,26.055556,Pará,
1,1942-10-02,821930.0,1942.0,10.0,2.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,0.00,6.2,26.333333,Pará,
2,1942-10-03,821930.0,1942.0,10.0,3.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,0.00,9.2,26.444444,Pará,
3,1942-10-04,821930.0,1942.0,10.0,4.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,0.00,7.0,26.222222,Pará,
4,1942-10-05,821930.0,1942.0,10.0,5.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,99.99,5.3,25.777778,Pará,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29614,2023-10-30,821930.0,2023.0,10.0,30.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,0.00,7.5,28.555556,Pará,
29615,2023-10-31,821930.0,2023.0,10.0,31.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,0.00,5.9,28.500000,Pará,
29616,2023-11-01,821930.0,2023.0,11.0,1.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,0.00,8.5,28.666667,Pará,
29617,2023-11-02,821930.0,2023.0,11.0,2.0,VAL DE CANS INTL,BR,-1.379,-48.476,19421001.0,20210920.0,0.00,7.6,29.000000,Pará,


## ANALISANDO A QUANTIDADE DE NULOS DO CONJUNTO DE DADOS DA ESTAÇÃO

In [47]:
# Number of missing values in each column of the single-station subset
df_unique_station.isna().sum()

measurement date        0
stn                     0
year                    0
month                   0
day                     0
name                    0
country                 0
lat                     0
lon                     0
begin                   0
end                     0
prcp                    0
wdsp                    0
temp                    0
state                   0
city                29619
dtype: int64

## PLOTANDO A SÉRIE TEMPORAL

In [48]:
# Line chart of the "temp" column over "measurement date" for the selected station
fig = px.line(df_unique_station, x="measurement date", y="temp")

# Title (with the first distinct station name of the subset), axis labels,
# legend title and font. The title spelling is "Meteorológica".
fig.update_layout(
    title="Série temporal - Estação Meteorológica: {}".format(df_unique_station["name"].unique()[0]),
    xaxis_title="Data (measurement date)",
    yaxis_title="Temperatura (temp)",
    legend_title="Legenda",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    )
)

# Show a range slider on the x axis
fig.update_xaxes(rangeslider_visible=True)

# Render the figure
fig.show()

In [49]:
# Line chart of the "prcp" column over "measurement date" for the selected station
fig = px.line(df_unique_station, x="measurement date", y="prcp")

# Title (with the first distinct station name of the subset), axis labels,
# legend title and font. The title spelling is "Meteorológica".
fig.update_layout(
    title="Série temporal - Estação Meteorológica: {}".format(df_unique_station["name"].unique()[0]),
    xaxis_title="Data (measurement date)",
    yaxis_title="Precipitação (prcp)",
    legend_title="Legenda",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    )
)

# Show a range slider on the x axis
fig.update_xaxes(rangeslider_visible=True)

# Render the figure
fig.show()

In [50]:
# Line chart of the "wdsp" column over "measurement date" for the selected station
fig = px.line(df_unique_station, x="measurement date", y="wdsp")

# Title (with the first distinct station name of the subset), axis labels,
# legend title and font. The title spelling is "Meteorológica".
fig.update_layout(
    title="Série temporal - Estação Meteorológica: {}".format(df_unique_station["name"].unique()[0]),
    xaxis_title="Data (measurement date)",
    yaxis_title="Velocidade do vento (wdsp)",
    legend_title="Legenda",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    )
)

# Show a range slider on the x axis
fig.update_xaxes(rangeslider_visible=True)

# Render the figure
fig.show()

# WHITE NOISE

In [51]:
# VERIFICANDO SE HÁ ALGUM CASO DE WHITE NOISE