In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('./dados_estação.csv', low_memory=False)

## Fazendo a primeira visualização dos dados
---

In [3]:
df

Unnamed: 0.1,Unnamed: 0,Estação,Mês,Mês.1,Dia,Hora,Ano,"Vel, Vento Máx, (Km/h)","Vel, Vento Média (Km/h)","Vel, Vento Vetor (Km/h)","Dir, Vento Média (graus)","Dir, Vento Desvio (graus)",Precipitação Total (mm),Temp. Ar Média (ºC),"Temp. Ar Máx, (ºC)",Temp. Ar Mínima (ºC),Um. Rel. Média (%),Um. Rel. Máxima (%),Um. Rel. Mínima (%)
0,0,Cerro do Roque,1,Janeiro,1.0,00:00:00,2020,177,95,92,663,150,0000,2003,204,1976,867,888,839
1,1,Cerro do Roque,1,Janeiro,1.0,00:30:00,2020,164,96,92,220,149,0000,0,0,0,0,0,0
2,2,Cerro do Roque,1,Janeiro,1.0,01:00:00,2020,173,112,108,150,147,0000,2048,2098,1996,815,866,7681
3,3,Cerro do Roque,1,Janeiro,1.0,01:30:00,2020,170,90,88,40,136,0000,0,0,0,0,0,0
4,4,Cerro do Roque,1,Janeiro,1.0,02:00:00,2020,136,89,84,90,175,0000,2015,2053,1982,833,853,809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
470290,470290,Aroeira,12,dezembro,4.0,14:30:00,2023,3276,1423,1291,12290,2473,000,1954,1984,1913,5230,5617,4815
470291,470291,Aroeira,12,dezembro,4.0,15:00:00,2023,3337,1380,1246,12530,2528,000,1950,1990,1906,5188,5476,4902
470292,470292,Aroeira,12,dezembro,4.0,15:30:00,2023,3272,1444,1366,12860,1889,000,1952,1987,1916,5145,5486,4892
470293,470293,Aroeira,12,dezembro,4.0,16:00:00,2023,2862,1287,1198,12690,2131,000,1918,1960,1876,5208,5500,4945


In [4]:
print(len(df.columns))

19



## Visualizando os dados de cada coluna isoladamente, visando encontrar erros/outliers
---

In [5]:
for i in range(17):
    print(df.columns[i])
    print(df.iloc[:, i].unique())
    print()

Unnamed: 0
[     0      1      2 ... 470292 470293 470294]

Estação
['Cerro do Roque' 'Ramos' 'Terra Dura' 'Viveiro' 'São Sepé' 'São Manoel'
 'Aroeira']

Mês
[   1    2    3    4    7    8    9   10   11   12    5    6   13   14
   15   16   17   18   19   20   21   22   23   24   25   26   27   28
   29   30   31 2023]

Mês.1
['Janeiro' 'Fevereiro' 'Março' 'Abril' 'Julho' 'Agosto' 'Setembro'
 'Outubro' 'Novembro' 'Dezembro' 'Maio' 'Junho ' 'abril' 'maio' 'junho'
 'julho' 'agosto' 'setembro' 'outubro' 'novembro' 'dezembro' 'janeiro'
 'fevereiro' 'março' 'Junho']

Dia
[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. 18.
 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. nan]

Hora
['00:00:00' '00:30:00' '01:00:00' '01:30:00' '02:00:00' '02:30:00'
 '03:00:00' '03:30:00' '04:00:00' '04:30:00' '05:00:00' '05:30:00'
 '06:00:00' '06:30:00' '07:00:00' '07:30:00' '08:00:00' '08:30:00'
 '09:00:00' '09:30:00' '10:00:00' '10:30:00' '11:00:00' '11:30:00'
 '12:00:00' '12:30:00

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470295 entries, 0 to 470294
Data columns (total 19 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Unnamed: 0                 470295 non-null  int64  
 1   Estação                    470295 non-null  object 
 2   Mês                        470295 non-null  int64  
 3   Mês.1                      470295 non-null  object 
 4   Dia                        470284 non-null  float64
 5   Hora                       470295 non-null  object 
 6   Ano                        470273 non-null  object 
 7   Vel, Vento Máx, (Km/h)     464082 non-null  object 
 8   Vel, Vento Média (Km/h)    466038 non-null  object 
 9   Vel, Vento Vetor (Km/h)    457185 non-null  object 
 10  Dir, Vento Média (graus)   465590 non-null  object 
 11  Dir, Vento Desvio (graus)  459505 non-null  object 
 12  Precipitação Total (mm)    466224 non-null  object 
 13  Temp. Ar Média (ºC)        44

<br> <br>

## Corrigindo os dados da coluna "Mês"
---

In [7]:
df.drop(columns=['Unnamed: 0', 'Mês'], inplace=True)

In [8]:
df.loc[:,'Mês.1'].replace(to_replace='janeiro', value='Janeiro', inplace=True)
df.loc[:,'Mês.1'].replace(to_replace='fevereiro', value='Fevereiro', inplace=True)
df.loc[:,'Mês.1'].replace(to_replace='março', value='Março', inplace=True)
df.loc[:,'Mês.1'].replace(to_replace='abril', value='Abril', inplace=True)
df.loc[:,'Mês.1'].replace(to_replace='maio', value='Maio', inplace=True)
df.loc[:,'Mês.1'].replace(to_replace='junho', value='Junho', inplace=True)
df.loc[:,'Mês.1'].replace(to_replace='julho', value='Julho', inplace=True)
df.loc[:,'Mês.1'].replace(to_replace='agosto', value='Agosto', inplace=True)
df.loc[:,'Mês.1'].replace(to_replace='setembro', value='Setembro', inplace=True)
df.loc[:,'Mês.1'].replace(to_replace='outubro', value='Outubro', inplace=True)
df.loc[:,'Mês.1'].replace(to_replace='novembro', value='Novembro', inplace=True)
df.loc[:,'Mês.1'].replace(to_replace='dezembro', value='Dezembro', inplace=True)

<br> <br>

## Corrigindo os problemas da coluna "Dia"
---

In [9]:
df = df.loc[~df['Dia'].isna(), :]

In [10]:
df['Dia'].unique()

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
       14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
       27., 28., 29., 30., 31.])

In [11]:
corretos = ['00:00:00','00:30:00','01:00:00','01:30:00','02:00:00','02:30:00',
            '03:00:00','03:30:00','04:00:00','04:30:00','05:00:00','05:30:00',
            '06:00:00','06:30:00','07:00:00','07:30:00','08:00:00','08:30:00',
            '09:00:00','09:30:00','10:00:00','10:30:00','11:00:00','11:30:00',
            '12:00:00','12:30:00','13:00:00','13:30:00','14:00:00','14:30:00',
            '15:00:00','15:30:00','16:00:00','16:30:00','17:00:00','17:30:00',
            '18:00:00','18:30:00','19:00:00','19:30:00','20:00:00','20:30:00',
            '21:00:00','21:30:00','22:00:00','22:30:00','23:00:00','23:30:00',
            '00:00:00','00:30:00','01:00:00','01:30:00','02:00:00','02:30:00',
            '03:00:00','03:30:00','04:00:00','04:30:00','05:00:00','05:30:00',
            '06:00:00','06:30:00','07:00:00','07:30:00','08:00:00','08:30:00',
            '09:00:00','09:30:00']

incorretos = ['00:00','00:30','01:00','01:30','02:00','02:30',
              '03:00','03:30','04:00','04:30','05:00','05:30',
              '06:00','06:30','07:00','07:30','08:00','08:30',
              '09:00','09:30','10:00','10:30','11:00','11:30',
              '12:00','12:30','13:00','13:30','14:00','14:30',
              '15:00','15:30','16:00','16:30','17:00','17:30',
              '18:00','18:30','19:00','19:30','20:00','20:30',
              '21:00','21:30','22:00','22:30','23:00','23:30',
              '0:00:00','0:30:00','1:00:00','1:30:00','2:00:00',
              '2:30:00','3:00:00','3:30:00','4:00:00','4:30:00',
              '5:00:00','5:30:00','6:00:00','6:30:00','7:00:00',
              '7:30:00','8:00:00','8:30:00','9:00:00','9:30:00']

In [12]:
for i in range(len(incorretos)):
    df.loc[:, 'Hora'].replace(to_replace=incorretos[i], value=corretos[i], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'Hora'].replace(to_replace=incorretos[i], value=corretos[i], inplace=True)


<br> <br>

## Corrigindo dados da coluna "Ano"
---

In [13]:
df.loc[:, 'Ano'].unique()

array(['2020', '2021', '2022', '2023', nan, '#REF!'], dtype=object)

In [14]:
df.replace(to_replace='#REF!', value='2020', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace(to_replace='#REF!', value='2020', inplace=True)


In [15]:
df[df['Ano'].isna()]

Unnamed: 0,Estação,Mês.1,Dia,Hora,Ano,"Vel, Vento Máx, (Km/h)","Vel, Vento Média (Km/h)","Vel, Vento Vetor (Km/h)","Dir, Vento Média (graus)","Dir, Vento Desvio (graus)",Precipitação Total (mm),Temp. Ar Média (ºC),"Temp. Ar Máx, (ºC)",Temp. Ar Mínima (ºC),Um. Rel. Média (%),Um. Rel. Máxima (%),Um. Rel. Mínima (%)
109261,Ramos,Maio,28.0,23:30:00,,,,,,,,,,,,,
109262,Ramos,Maio,29.0,00:00:00,,,,,,,,,,,,,
109263,Ramos,Maio,29.0,00:30:00,,,,,,,,,,,,,
109264,Ramos,Maio,29.0,01:00:00,,,,,,,,,,,,,
109265,Ramos,Maio,29.0,01:30:00,,,,,,,,,,,,,
109266,Ramos,Maio,29.0,02:00:00,,,,,,,,,,,,,
109267,Ramos,Maio,29.0,02:30:00,,,,,,,,,,,,,
109268,Ramos,Maio,29.0,03:00:00,,,,,,,,,,,,,
109269,Ramos,Maio,29.0,03:30:00,,,,,,,,,,,,,
109270,Ramos,Maio,29.0,04:00:00,,,,,,,,,,,,,


In [16]:
df = df.loc[~df['Ano'].isna(), :]

In [17]:
df

Unnamed: 0,Estação,Mês.1,Dia,Hora,Ano,"Vel, Vento Máx, (Km/h)","Vel, Vento Média (Km/h)","Vel, Vento Vetor (Km/h)","Dir, Vento Média (graus)","Dir, Vento Desvio (graus)",Precipitação Total (mm),Temp. Ar Média (ºC),"Temp. Ar Máx, (ºC)",Temp. Ar Mínima (ºC),Um. Rel. Média (%),Um. Rel. Máxima (%),Um. Rel. Mínima (%)
0,Cerro do Roque,Janeiro,1.0,00:00:00,2020,177,95,92,663,150,0000,2003,204,1976,867,888,839
1,Cerro do Roque,Janeiro,1.0,00:30:00,2020,164,96,92,220,149,0000,0,0,0,0,0,0
2,Cerro do Roque,Janeiro,1.0,01:00:00,2020,173,112,108,150,147,0000,2048,2098,1996,815,866,7681
3,Cerro do Roque,Janeiro,1.0,01:30:00,2020,170,90,88,40,136,0000,0,0,0,0,0,0
4,Cerro do Roque,Janeiro,1.0,02:00:00,2020,136,89,84,90,175,0000,2015,2053,1982,833,853,809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
470290,Aroeira,Dezembro,4.0,14:30:00,2023,3276,1423,1291,12290,2473,000,1954,1984,1913,5230,5617,4815
470291,Aroeira,Dezembro,4.0,15:00:00,2023,3337,1380,1246,12530,2528,000,1950,1990,1906,5188,5476,4902
470292,Aroeira,Dezembro,4.0,15:30:00,2023,3272,1444,1366,12860,1889,000,1952,1987,1916,5145,5486,4892
470293,Aroeira,Dezembro,4.0,16:00:00,2023,2862,1287,1198,12690,2131,000,1918,1960,1876,5208,5500,4945


<br> <br>

## Corrigindo os problemas da coluna "Precipitação Total (mm)"
---

In [18]:
df['Precipitação Total (mm)'].unique()

array(['0,000', '0,254', '5,080', '9,400', '1,016', '2,794', '6,350',
       '16,510', '1,270', '1,524', '4,318', '3,302', '2,032', '0,508',
       '0,762', '8,130', '16,760', '13,210', '1,778', '2,286', '10,920',
       '2,540', '3,810', '4,826', '4,064', '10,160', '3,048', '4,572',
       '22,610', '8,890', '6,096', '7,874', '3,556', '8,380', '17,020',
       '6,604', '11,940', '35,050', '9,140', '10,410', '5,334', '19,050',
       '7,620', '12,190', '5,588', '9,650', '6,858', '9,910', '13,460',
       '5,842', '7,366', '8,640', '11,180', '0', '0,0', '6,4', '3,3',
       '1,3', '0,3', '0,5', '21,3', '0,8', '4,6', '1,5', '1,0', '7,6',
       '16,5', '3,8', '2,5', '2,8', '5,8', '3,0', '3,6', '1,8', '4,1',
       '9,7', '10,7', '5,1', '2,3', nan, '15,0', '2,0', '8,1', '5,3',
       '10,9', '5,6', '9,9', '7,9', '4,8', '8,4', '13,2', '6,1', '7,4',
       '4,3', '12,2', '6,6', '12,7', '10,4', '8,6', '11,9', '11,4',
       '10,2', '13,0', '11,2', '11,7', '17,8', '33,5', '9,1', '7,1',
      

In [19]:
df = df.loc[~df['Precipitação Total (mm)'].isna(), :]

In [20]:
df['Precipitação Total (mm)'].unique()

array(['0,000', '0,254', '5,080', '9,400', '1,016', '2,794', '6,350',
       '16,510', '1,270', '1,524', '4,318', '3,302', '2,032', '0,508',
       '0,762', '8,130', '16,760', '13,210', '1,778', '2,286', '10,920',
       '2,540', '3,810', '4,826', '4,064', '10,160', '3,048', '4,572',
       '22,610', '8,890', '6,096', '7,874', '3,556', '8,380', '17,020',
       '6,604', '11,940', '35,050', '9,140', '10,410', '5,334', '19,050',
       '7,620', '12,190', '5,588', '9,650', '6,858', '9,910', '13,460',
       '5,842', '7,366', '8,640', '11,180', '0', '0,0', '6,4', '3,3',
       '1,3', '0,3', '0,5', '21,3', '0,8', '4,6', '1,5', '1,0', '7,6',
       '16,5', '3,8', '2,5', '2,8', '5,8', '3,0', '3,6', '1,8', '4,1',
       '9,7', '10,7', '5,1', '2,3', '15,0', '2,0', '8,1', '5,3', '10,9',
       '5,6', '9,9', '7,9', '4,8', '8,4', '13,2', '6,1', '7,4', '4,3',
       '12,2', '6,6', '12,7', '10,4', '8,6', '11,9', '11,4', '10,2',
       '13,0', '11,2', '11,7', '17,8', '33,5', '9,1', '7,1', '0,25',
   

<br> <br>

## Corrigindo os problemas da coluna "Temp. Ar Média (ºC)"
---
<br>

In [21]:
df['Temp. Ar Média (ºC)'].dropna()

0         20,03
1             0
2         20,48
3             0
4         20,15
          ...  
470290    19,54
470291    19,50
470292    19,52
470293    19,18
470294    18,74
Name: Temp. Ar Média (ºC), Length: 443194, dtype: object

In [22]:
df = df.dropna()

<br> <br>
## Corrigindo os problemas da coluna "Vel, Vento Máx, Média e Vetor (Km/h)"
---
<br>

In [23]:
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 20)

In [24]:
df.loc[:, "Vel, Vento Máx, (Km/h)"].replace(to_replace=",",value=".", regex=True, inplace=True)

In [25]:
df["Vel, Vento Máx, (Km/h)"] = df["Vel, Vento Máx, (Km/h)"].astype(float)

In [26]:
df["Vel, Vento Máx, (Km/h)"] 

0         17.70
1         16.40
2         17.30
3         17.00
4         13.60
          ...  
470290    32.76
470291    33.37
470292    32.72
470293    28.62
470294    29.34
Name: Vel, Vento Máx, (Km/h), Length: 432358, dtype: float64

In [27]:
df.loc[:, "Vel, Vento Média (Km/h)"].replace(to_replace=",",value=".", regex=True, inplace=True)

In [28]:
df["Vel, Vento Média (Km/h)"] = df["Vel, Vento Média (Km/h)"].astype(float)

In [29]:
df["Vel, Vento Média (Km/h)"]

0          9.50
1          9.60
2         11.20
3          9.00
4          8.90
          ...  
470290    14.23
470291    13.80
470292    14.44
470293    12.87
470294    13.80
Name: Vel, Vento Média (Km/h), Length: 432358, dtype: float64

In [30]:
df.loc[:, "Vel, Vento Vetor (Km/h)"].replace(to_replace=",",value=".", regex=True, inplace=True)

In [31]:
df["Vel, Vento Vetor (Km/h)"] = df["Vel, Vento Vetor (Km/h)"].astype(float)

In [32]:
df["Vel, Vento Vetor (Km/h)"]

0          9.20
1          9.20
2         10.80
3          8.80
4          8.40
          ...  
470290    12.91
470291    12.46
470292    13.66
470293    11.98
470294    13.09
Name: Vel, Vento Vetor (Km/h), Length: 432358, dtype: float64

<br> <br>
## Corrigindo os problemas da coluna "Dir, Vento Média e Desvios (graus)"
---
<br>

In [33]:
df.loc[:, "Dir, Vento Média (graus)"].replace(to_replace=",",value=".", regex=True, inplace=True)
df["Dir, Vento Média (graus)"] = df["Dir, Vento Média (graus)"].astype(float)

In [34]:
df["Vel, Vento Vetor (Km/h)"]

0          9.20
1          9.20
2         10.80
3          8.80
4          8.40
          ...  
470290    12.91
470291    12.46
470292    13.66
470293    11.98
470294    13.09
Name: Vel, Vento Vetor (Km/h), Length: 432358, dtype: float64

In [35]:
df.loc[:, "Dir, Vento Desvio (graus)"].replace(to_replace=",",value=".", regex=True, inplace=True)
df["Dir, Vento Desvio (graus)"] = df["Dir, Vento Desvio (graus)"].astype(float)

In [36]:
df["Dir, Vento Desvio (graus)"]

0         15.00
1         14.90
2         14.70
3         13.60
4         17.50
          ...  
470290    24.73
470291    25.28
470292    18.89
470293    21.31
470294    18.45
Name: Dir, Vento Desvio (graus), Length: 432358, dtype: float64

<br> <br>
## Corrigindo os problemas da coluna "Temp. Ar Média, Máxima e Mínima (ºC)"
---
<br>

In [37]:
df.loc[:, "Temp. Ar Média (ºC)"].replace(to_replace=",",value=".", regex=True, inplace=True)
df["Temp. Ar Média (ºC)"] = df["Temp. Ar Média (ºC)"].astype(float)

In [38]:
df["Temp. Ar Média (ºC)"]

0         20.03
1          0.00
2         20.48
3          0.00
4         20.15
          ...  
470290    19.54
470291    19.50
470292    19.52
470293    19.18
470294    18.74
Name: Temp. Ar Média (ºC), Length: 432358, dtype: float64

In [39]:
df.loc[:, "Temp. Ar Máx, (ºC)"].replace(to_replace=",",value=".", regex=True, inplace=True)
df["Temp. Ar Máx, (ºC)"] = df["Temp. Ar Máx, (ºC)"].astype(float)

In [40]:
df["Temp. Ar Máx, (ºC)"]

0         20.40
1          0.00
2         20.98
3          0.00
4         20.53
          ...  
470290    19.84
470291    19.90
470292    19.87
470293    19.60
470294    19.23
Name: Temp. Ar Máx, (ºC), Length: 432358, dtype: float64

In [41]:
df.loc[:, "Temp. Ar  Mínima (ºC)"].replace(to_replace=",",value=".", regex=True, inplace=True)
df["Temp. Ar  Mínima (ºC)"] = df["Temp. Ar  Mínima (ºC)"].astype(float)

In [42]:
df["Temp. Ar  Mínima (ºC)"]

0         19.76
1          0.00
2         19.96
3          0.00
4         19.82
          ...  
470290    19.13
470291    19.06
470292    19.16
470293    18.76
470294    18.36
Name: Temp. Ar  Mínima (ºC), Length: 432358, dtype: float64

<br> <br>
## Corrigindo os problemas da coluna "Um. Rel. Média, Máxima e Mínima (%)"
---
<br>

In [43]:
df.loc[:, "Um. Rel. Média (%)"].replace(to_replace=",",value=".", regex=True, inplace=True)
df["Um. Rel. Média (%)"] = df["Um. Rel. Média (%)"].astype(float)

In [44]:
df["Um. Rel. Média (%)"]

0         86.70
1          0.00
2         81.50
3          0.00
4         83.30
          ...  
470290    52.30
470291    51.88
470292    51.45
470293    52.08
470294    53.81
Name: Um. Rel. Média (%), Length: 432358, dtype: float64

In [45]:
df.loc[:, "Um. Rel. Máxima (%)"].replace(to_replace=",",value=".", regex=True, inplace=True)
df["Um. Rel. Máxima (%)"] = df["Um. Rel. Máxima (%)"].astype(float)

In [46]:
df["Um. Rel. Máxima (%)"]

0         88.80
1          0.00
2         86.60
3          0.00
4         85.30
          ...  
470290    56.17
470291    54.76
470292    54.86
470293    55.00
470294    56.27
Name: Um. Rel. Máxima (%), Length: 432358, dtype: float64

In [47]:
df.loc[:, "Um. Rel. Mínima (%)"].replace(to_replace=",",value=".", regex=True, inplace=True)
df["Um. Rel. Mínima (%)"] = df["Um. Rel. Mínima (%)"].astype(float)

In [48]:
df["Um. Rel. Mínima (%)"]

0         83.90
1          0.00
2         76.81
3          0.00
4         80.90
          ...  
470290    48.15
470291    49.02
470292    48.92
470293    49.45
470294    51.07
Name: Um. Rel. Mínima (%), Length: 432358, dtype: float64

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 432358 entries, 0 to 470294
Data columns (total 17 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Estação                    432358 non-null  object 
 1   Mês.1                      432358 non-null  object 
 2   Dia                        432358 non-null  float64
 3   Hora                       432358 non-null  object 
 4   Ano                        432358 non-null  object 
 5   Vel, Vento Máx, (Km/h)     432358 non-null  float64
 6   Vel, Vento Média (Km/h)    432189 non-null  float64
 7   Vel, Vento Vetor (Km/h)    432096 non-null  float64
 8   Dir, Vento Média (graus)   432358 non-null  float64
 9   Dir, Vento Desvio (graus)  432358 non-null  float64
 10  Precipitação Total (mm)    432358 non-null  object 
 11  Temp. Ar Média (ºC)        431686 non-null  float64
 12  Temp. Ar Máx, (ºC)         431687 non-null  float64
 13  Temp. Ar  Mínima (ºC)      43

In [50]:
df.describe().iloc[0,:]

Dia                          432358.0
Vel, Vento Máx, (Km/h)       432358.0
Vel, Vento Média (Km/h)      432189.0
Vel, Vento Vetor (Km/h)      432096.0
Dir, Vento Média (graus)     432358.0
Dir, Vento Desvio (graus)    432358.0
Temp. Ar Média (ºC)          431686.0
Temp. Ar Máx, (ºC)           431687.0
Temp. Ar  Mínima (ºC)        431687.0
Um. Rel. Média (%)           432357.0
Um. Rel. Máxima (%)          432358.0
Um. Rel. Mínima (%)          432358.0
Name: count, dtype: float64

In [51]:
df.describe().iloc[0,:].max() - df.describe().iloc[0,:].min()

672.0

In [52]:
df.head(15)

Unnamed: 0,Estação,Mês.1,Dia,Hora,Ano,"Vel, Vento Máx, (Km/h)","Vel, Vento Média (Km/h)","Vel, Vento Vetor (Km/h)","Dir, Vento Média (graus)","Dir, Vento Desvio (graus)",Precipitação Total (mm),Temp. Ar Média (ºC),"Temp. Ar Máx, (ºC)",Temp. Ar Mínima (ºC),Um. Rel. Média (%),Um. Rel. Máxima (%),Um. Rel. Mínima (%)
0,Cerro do Roque,Janeiro,1.0,00:00:00,2020,17.7,9.5,9.2,66.3,15.0,0,20.03,20.4,19.76,86.7,88.8,83.9
1,Cerro do Roque,Janeiro,1.0,00:30:00,2020,16.4,9.6,9.2,22.0,14.9,0,0.0,0.0,0.0,0.0,0.0,0.0
2,Cerro do Roque,Janeiro,1.0,01:00:00,2020,17.3,11.2,10.8,15.0,14.7,0,20.48,20.98,19.96,81.5,86.6,76.81
3,Cerro do Roque,Janeiro,1.0,01:30:00,2020,17.0,9.0,8.8,4.0,13.6,0,0.0,0.0,0.0,0.0,0.0,0.0
4,Cerro do Roque,Janeiro,1.0,02:00:00,2020,13.6,8.9,8.4,9.0,17.5,0,20.15,20.53,19.82,83.3,85.3,80.9
5,Cerro do Roque,Janeiro,1.0,02:30:00,2020,12.4,6.7,6.4,334.7,17.2,0,0.0,0.0,0.0,0.0,0.0,0.0
6,Cerro do Roque,Janeiro,1.0,03:00:00,2020,10.9,6.3,6.2,323.6,13.2,0,20.24,20.47,20.0,83.7,85.1,81.9
7,Cerro do Roque,Janeiro,1.0,03:30:00,2020,12.0,7.3,7.2,312.9,12.5,0,0.0,0.0,0.0,0.0,0.0,0.0
8,Cerro do Roque,Janeiro,1.0,04:00:00,2020,12.1,8.3,8.0,297.8,15.6,0,20.43,20.81,20.13,83.7,84.8,81.8
9,Cerro do Roque,Janeiro,1.0,04:30:00,2020,11.8,7.9,7.7,286.9,10.2,0,0.0,0.0,0.0,0.0,0.0,0.0


<br> <br>
### Removendo as colunas de direção do vento (graus), temperaturas extremas (máximo e mínimo), velocidade do vento extremas (máximo e mínimo) e umidades relativas extremas (máximo e mínimo).
---

In [53]:
df.drop(columns=["Dir, Vento Média (graus)", "Dir, Vento Desvio (graus)", "Vel, Vento Máx, (Km/h)", "Vel, Vento Vetor (Km/h)", "Temp. Ar  Mínima (ºC)", "Temp. Ar Máx, (ºC)", "Um. Rel. Máxima (%)", "Um. Rel. Mínima (%)"], inplace=True)

<br> <br>
### Removendo todos os registros (instâncias) das horas intermediárias (00:30:00, 01:30:00, 02:30:00, 03:30:00, ..., 23:30:00) pois nesses horários as bases não registraram dados de temperatura, precipitação e umidade relativa.
---

In [54]:
verif = df['Hora'].str.contains('30')
df = df[~verif]
df.reset_index(drop=True, inplace=True)

In [55]:
df

Unnamed: 0,Estação,Mês.1,Dia,Hora,Ano,"Vel, Vento Média (Km/h)",Precipitação Total (mm),Temp. Ar Média (ºC),Um. Rel. Média (%)
0,Cerro do Roque,Janeiro,1.0,00:00:00,2020,9.50,0000,20.03,86.70
1,Cerro do Roque,Janeiro,1.0,01:00:00,2020,11.20,0000,20.48,81.50
2,Cerro do Roque,Janeiro,1.0,02:00:00,2020,8.90,0000,20.15,83.30
3,Cerro do Roque,Janeiro,1.0,03:00:00,2020,6.30,0000,20.24,83.70
4,Cerro do Roque,Janeiro,1.0,04:00:00,2020,8.30,0000,20.43,83.70
...,...,...,...,...,...,...,...,...,...
218547,Aroeira,Dezembro,4.0,12:00:00,2023,15.54,000,18.31,58.94
218548,Aroeira,Dezembro,4.0,13:00:00,2023,15.42,000,18.77,56.25
218549,Aroeira,Dezembro,4.0,14:00:00,2023,14.56,000,19.25,52.84
218550,Aroeira,Dezembro,4.0,15:00:00,2023,13.80,000,19.50,51.88


<br> <br>
# Definindo as Condições para Aplicação
---
### 1. Temperatura Média do Ar (°C):

* Ideal: **30 >= X >= 20**
* Possível: **5 <= X < 20**
* Impróprio: **5 > X > 30**

### 2. Velocidade Média do Vento (km/h):

* Ideal: **X <= 5**
* Possível: **5 < X <= 10**
* Impróprio: **X > 10**

### 3. Umidade Relativa do Ar (%):

* Ideal: **X >= 60**
* Possível: **50 <= X < 60**
* Impróprio: **X < 50**
<br>

In [56]:
def avaliador_temp(registro):
  if registro <= 30 and registro >= 20:
    return "Ideal"
  elif registro >= 5:
    return "Possível"
  else:
    return "Impróprio"

def avaliador_vento(registro):
  if registro <= 5:
    return "Ideal"
  elif registro <= 10:
    return "Possível"
  else:
    return "Impróprio"

def avaliador_umidade(registro):
  if registro >= 60:
    return "Ideal"
  elif registro >= 50:
    return "Possível"
  else:
    return "Impróprio"

def print_full(x):
  pd.set_option('display.max_rows', len(x))
  print(x)
  pd.reset_option('display.max_rows')

In [57]:
df

Unnamed: 0,Estação,Mês.1,Dia,Hora,Ano,"Vel, Vento Média (Km/h)",Precipitação Total (mm),Temp. Ar Média (ºC),Um. Rel. Média (%)
0,Cerro do Roque,Janeiro,1.0,00:00:00,2020,9.50,0000,20.03,86.70
1,Cerro do Roque,Janeiro,1.0,01:00:00,2020,11.20,0000,20.48,81.50
2,Cerro do Roque,Janeiro,1.0,02:00:00,2020,8.90,0000,20.15,83.30
3,Cerro do Roque,Janeiro,1.0,03:00:00,2020,6.30,0000,20.24,83.70
4,Cerro do Roque,Janeiro,1.0,04:00:00,2020,8.30,0000,20.43,83.70
...,...,...,...,...,...,...,...,...,...
218547,Aroeira,Dezembro,4.0,12:00:00,2023,15.54,000,18.31,58.94
218548,Aroeira,Dezembro,4.0,13:00:00,2023,15.42,000,18.77,56.25
218549,Aroeira,Dezembro,4.0,14:00:00,2023,14.56,000,19.25,52.84
218550,Aroeira,Dezembro,4.0,15:00:00,2023,13.80,000,19.50,51.88


In [58]:
df.columns

Index(['Estação', 'Mês.1', 'Dia', 'Hora', 'Ano', 'Vel, Vento Média (Km/h)',
       'Precipitação Total (mm)', 'Temp. Ar Média (ºC)', 'Um. Rel. Média (%)'],
      dtype='object')

In [59]:
df.rename(columns={'Mês.1':'Mês'}, inplace=True)

In [60]:
df

Unnamed: 0,Estação,Mês,Dia,Hora,Ano,"Vel, Vento Média (Km/h)",Precipitação Total (mm),Temp. Ar Média (ºC),Um. Rel. Média (%)
0,Cerro do Roque,Janeiro,1.0,00:00:00,2020,9.50,0000,20.03,86.70
1,Cerro do Roque,Janeiro,1.0,01:00:00,2020,11.20,0000,20.48,81.50
2,Cerro do Roque,Janeiro,1.0,02:00:00,2020,8.90,0000,20.15,83.30
3,Cerro do Roque,Janeiro,1.0,03:00:00,2020,6.30,0000,20.24,83.70
4,Cerro do Roque,Janeiro,1.0,04:00:00,2020,8.30,0000,20.43,83.70
...,...,...,...,...,...,...,...,...,...
218547,Aroeira,Dezembro,4.0,12:00:00,2023,15.54,000,18.31,58.94
218548,Aroeira,Dezembro,4.0,13:00:00,2023,15.42,000,18.77,56.25
218549,Aroeira,Dezembro,4.0,14:00:00,2023,14.56,000,19.25,52.84
218550,Aroeira,Dezembro,4.0,15:00:00,2023,13.80,000,19.50,51.88


In [66]:
df['Precipitação Total (mm)'] = df['Precipitação Total (mm)'].replace(to_replace=',', value='.', regex=True).astype(float)

In [67]:
df.loc[:, 'nota_temperatura'] = df.apply(lambda x: avaliador_temp(x['Temp. Ar Média (ºC)']), axis=1)
df.loc[:, 'nota_umidade'] = df.apply(lambda x: avaliador_temp(x['Um. Rel. Média (%)']), axis=1)
df.loc[:, 'nota_vento'] = df.apply(lambda x: avaliador_temp(x['Vel, Vento Média (Km/h)']), axis=1)
df.loc[:, 'nota_precipitação'] = df.apply(lambda x: 'Impróprio' if x['Precipitação Total (mm)'] > 0 else 'Ideal', axis=1)

In [None]:
df

<br> <br>

## Criando função para dar notas aos dados:


> 1 = Pode operar;<br>
> 0 = Não pode operar.

In [68]:
def avaliador(vento, temperatura, umidade, chuva):
  if (vento != "Impróprio" and temperatura != "Impróprio" and umidade != "Impróprio" and chuva == 0):
    return 1
  else:
    return 0

In [69]:
df.loc[:, 'Nota'] = df.apply(lambda x: avaliador(x['nota_vento'],x['nota_temperatura'],
                                                 x['nota_umidade'],
                                                 x['Precipitação Total (mm)']), axis=1)

In [70]:
df.Nota.unique()

array([1, 0], dtype=int64)

In [None]:
df_only1 = df[df['Nota'] == 1]

In [None]:
df_only1