# Data Preparation 4

- Split date feature
- Mean/median imputation for numeric features
- Random value imputation for most categorical features
- Label encoding
- Oversampling of the target feature

#### Import Libraries

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
%matplotlib inline

#### Read Data

In [None]:
# Load the weatherAUS CSV into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv('../../../../datasets/parte1/weatherAUS.csv')

#### Split Date Feature

Instead of using the full date, extracting just the month is much more valuable due to rain seasonality

In [None]:
# Parse the date strings as UTC timestamps, keep only the month number as a
# new feature, and discard the original column.
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d", utc=True)
df['Month'] = df['Date'].dt.month
df.drop(columns=['Date'], inplace=True)

#### Handle Missing Values

##### Mean/Median imputation in numeric features

- Median imputation for features with skewed distributions, since they contain a significant percentage of outliers
- Mean imputation for the remaining ones: the mean is sensitive to extreme values, so it is only recommended for data with fewer outliers

In [None]:
# Median for the skewed, outlier-heavy features; mean for the remaining ones.
#
# Each column is assigned back explicitly. Calling fillna(..., inplace=True)
# on a selected column is chained assignment: it is deprecated in recent
# pandas releases and leaves the DataFrame unchanged when Copy-on-Write is
# enabled, so the imputation would silently not happen.
median_features = ['WindSpeed9am', 'WindSpeed3pm', 'Rainfall', 'Evaporation']
mean_features = [
    'WindGustSpeed', 'MinTemp', 'MaxTemp', 'Sunshine',
    'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
    'Temp9am', 'Temp3pm', 'Cloud9am', 'Cloud3pm',
]

for col in median_features:
    df[col] = df[col].fillna(df[col].median())

for col in mean_features:
    df[col] = df[col].fillna(df[col].mean())

##### Random value for direction features

- Replace missing values with random values drawn from the distribution of observed values in the variable
- Provides a more realistic approach than simple imputation methods because it ensures that the overall distribution of the categorical variable is preserved, maintaining its statistical properties

In [None]:
# We do this because the categorical features are well distributed, so the
# mode does not represent a large share of the data.
categorical_features = [col for col in df.columns if df[col].dtype == object]
categorical_features.remove('RainTomorrow')
categorical_features.remove('RainToday')
categorical_features.remove('Location')

# For every remaining categorical column, fill each missing entry with an
# independent draw from that column's own observed category frequencies.
# NOTE: the draws use NumPy's global random state, which is not seeded here,
# so the imputed values differ between runs.
for col in categorical_features:
    missing_mask = df[col].isna()
    n_missing = int(missing_mask.sum())
    # Categories and probabilities come from the same value_counts() result,
    # so they are guaranteed to be aligned with each other.
    frequencies = df[col].value_counts(normalize=True)
    if n_missing == 0 or frequencies.empty:
        # Nothing to fill, or no observed values to sample from.
        continue
    # One draw per missing row (size=n_missing) preserves the distribution;
    # a single draw would assign the same category to every missing entry.
    df.loc[missing_mask, col] = np.random.choice(
        a=frequencies.index.to_numpy(),
        size=n_missing,
        p=frequencies.to_numpy(),
    )

##### Drop rows with missing values in the remaining categorical columns

In [None]:
# Remove, in place, every row that still contains at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

#### Label Encoding

Converts categorical data into numerical format

In [None]:
# A single encoder instance is re-fitted for each column in turn, so after the
# loop it only holds the mapping of the last column ('RainTomorrow').
label_encoder = LabelEncoder()

for col in ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm',
            'RainToday', 'RainTomorrow']:
    # Replace the column's values with the integer codes from the encoder.
    df[col] = label_encoder.fit_transform(df[col])

#### Over Sampling

- Balances the label distribution by increasing the number of instances in the minority class ("Yes")
- Synthetic samples generation

In [None]:
# Separate the target from the predictors.
y = df['RainTomorrow']
X = df.drop(columns=['RainTomorrow'])

# Oversample with a fixed seed so the resampled dataset is reproducible.
# NOTE(review): the label-encoded columns are passed to SMOTE as ordinary
# numeric features; confirm that synthetic values for them are acceptable
# (imbalanced-learn offers SMOTENC for categorical features).
smote = SMOTE(random_state=2023)
X_resampled, y_resampled = smote.fit_resample(X=X, y=y)

# Rebuild a single DataFrame with the target as the last column.
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['RainTomorrow'] = y_resampled

#### Write Prepared Data

In [None]:
# Persist the oversampled dataset as CSV; index=False keeps the row index out
# of the output file.
df_resampled.to_csv('../../../../datasets/parte1/dataset_cleaned.csv', index=False)