# Data Preparation 4

- Split date feature
- Imputation of missing values using API data
- Label encoding
- Undersampling in target feature

#### Import Libraries

In [5]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
%matplotlib inline

#### Read Data

In [6]:
# Input files: the raw weather observations and the API extract that is used
# further down to fill gaps in the weather data.
WEATHER_CSV = '../../../../datasets/parte1/weatherAUS.csv'
API_CSV = '../../../../datasets/parte1/api.csv'

df = pd.read_csv(WEATHER_CSV)
df_api = pd.read_csv(API_CSV)

#### Handle Missing Values

##### Merge API data by Date and Location

In [7]:
# Left-merge the API data onto the weather data by Location and Date. API columns
# whose names collide with weather columns receive the '_df2' suffix.
# NOTE(review): assumes df_api has at most one row per (Location, Date); duplicate
# keys would multiply the matching weather rows — TODO confirm.
merged_df = pd.merge(df, df_api, on=['Location', 'Date'], how='left', suffixes=('', '_df2'))

# For every column present in both sources (other than the merge keys), fill the
# gaps in the weather value with the API value. The result is assigned back to
# the column instead of calling `merged_df[col].fillna(..., inplace=True)`: that
# form mutates an intermediate Series, emits a FutureWarning on pandas 2.x and
# leaves merged_df unchanged once Copy-on-Write is enabled.
for col in df.columns:
    if col in df_api and col not in ["Location", "Date"]:
        merged_df[col] = merged_df[col].fillna(merged_df[col + '_df2'])

# Drop the helper columns whose names end with '_df2' (anchored match, so a
# column that merely contains '_df2' somewhere else in its name is kept)
merged_df.drop(columns=merged_df.filter(regex=r'_df2$').columns, inplace=True)

df = merged_df

##### Drop rows that still have missing values

In [8]:
# Remove, in place, every row that still contains a missing value in any column
df.dropna(axis=0, how='any', inplace=True)

#### Split Date Feature

Rather than keeping the full date, we extract only the month, which is more informative because rainfall is seasonal.

In [9]:
# Parse the 'YYYY-MM-DD' date strings (as UTC timestamps) and keep only the
# month number as a new 'Month' column
df['Month'] = pd.to_datetime(df['Date'], format="%Y-%m-%d", utc=True).dt.month

# The full date is not kept as a feature
df.drop(columns=['Date'], inplace=True)

#### Label Encoding

Converts categorical data into numerical format

In [10]:
# A single shared encoder is refitted on each column in turn and the column is
# replaced by its integer codes. Because the encoder is reused, after the loop
# it only holds the classes of the last column listed.
label_encoder = LabelEncoder()

categorical_columns = (
    'Location',
    'WindGustDir',
    'WindDir9am',
    'WindDir3pm',
    'RainToday',
    'RainTomorrow',
)

for column in categorical_columns:
    df[column] = label_encoder.fit_transform(df[column])

#### Under Sampling

- Balances the label distribution by reducing the number of instances in the majority class ("No")
- Instances to remove are chosen at random

In [11]:
# Separate the predictors from the target column
y = df['RainTomorrow']
X = df.drop(columns=['RainTomorrow'])

# Randomly undersample with a fixed seed so the result is reproducible.
# NOTE(review): no train/test split is visible before this step, so the whole
# dataset is resampled — confirm this is intended for the downstream evaluation.
undersampler = RandomUnderSampler(random_state=2023)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# Put predictors and target back together in one frame, target column last
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['RainTomorrow'] = y_resampled

#### Write Prepared Data

In [12]:
# Save the balanced, encoded dataset without the index column
OUTPUT_CSV = '../../../../datasets/parte1/dataset_cleaned.csv'
df_resampled.to_csv(OUTPUT_CSV, index=False)
