# IMPORTAÇÃO DAS BIBLIOTECAS

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer

from imblearn import over_sampling

import warnings
warnings.filterwarnings('ignore')

# IMPORTAÇÃO DOS DADOS

In [8]:
weather = pd.read_csv('https://raw.githubusercontent.com/guilourenzo/master_degree_sin5007/refs/heads/main/DATA/weatherAUS.csv')
weather = weather.sort_values(by=['Date', 'Location'], ascending=True).drop(columns= ['RainToday', 'Pressure9am', 'Temp9am', 'Temp3pm', 'WindDir9am', 'Location']).dropna()
weather.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure3pm,Cloud9am,Cloud3pm,RainTomorrow
45587,2007-11-01,8.0,24.3,0.0,3.4,6.3,NW,30.0,NW,6.0,20.0,68.0,29.0,1015.0,7.0,7.0,Yes
45588,2007-11-02,14.0,26.9,3.6,4.4,9.7,ENE,39.0,W,4.0,17.0,80.0,36.0,1008.4,5.0,3.0,Yes
45589,2007-11-03,13.7,23.4,3.6,5.8,3.3,NW,85.0,NNE,6.0,6.0,82.0,69.0,1007.2,8.0,7.0,Yes
45590,2007-11-04,13.3,15.5,39.8,7.2,9.1,NW,54.0,W,30.0,24.0,62.0,56.0,1007.0,2.0,7.0,Yes
45591,2007-11-05,7.6,16.1,2.8,5.6,10.6,SSE,50.0,ESE,20.0,28.0,68.0,49.0,1018.5,7.0,7.0,No


# PROCESSAMENTO DE DADOS

## FORMATO DE DADOS

In [None]:
weather.loc[:, 'Date'] = pd.to_datetime(weather.Date, yearfirst=True)
weather.loc[:, 'RainTomorrow'] = weather.RainTomorrow.map({'Yes': 1, 'No': 0})

## VARIÁVEIS CATEGÓRICAS

In [9]:
categorical_columns = weather.select_dtypes(include=["object", "datetime64"])
numerical_columns = weather.select_dtypes('number')

In [10]:
print('WindGustDir contains', len(categorical_columns['WindGustDir'].unique()), 'labels')
print('WindDir3pm contains', len(categorical_columns['WindDir3pm'].unique()), 'labels')

WindGustDir contains 16 labels
WindDir3pm contains 16 labels


### DUMMIES

In [14]:
categorical_encoded = pd.get_dummies(categorical_columns, columns=['WindGustDir', 'WindDir3pm'], drop_first=True, prefix=['WindGustDir', 'WindDir3pm'], dtype=int)

## VARIÁVEIS NUMÉRICAS

In [15]:
numerical_columns['RangeTemp'] = numerical_columns['MaxTemp'] - numerical_columns['MinTemp']

In [16]:
discretizer = KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="kmeans", random_state=42)
numerical_columns['Rainfall_Discretized'] = discretizer.fit_transform(numerical_columns['Rainfall'].values.reshape(-1, 1))
numerical_columns.groupby('Rainfall_Discretized')['Rainfall'].agg(['min', 'max']).reset_index()

Unnamed: 0,Rainfall_Discretized,min,max
0,0.0,0.0,4.4
1,1.0,4.5,15.2
2,2.0,15.4,33.6
3,3.0,33.8,71.8
4,4.0,74.6,206.2


## VARIÁVEIS TEMPORAIS

In [31]:
weather.Date.dt.month

AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
weather['month'] = weather.Date.dt.month

AttributeError: 'Series' object has no attribute 'month'