In [66]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
%matplotlib inline

In [67]:
# Load the weatherAUS CSV into `df`, the main working frame of this notebook.
df = pd.read_csv(filepath_or_buffer='../../../datasets/parte1/weatherAUS.csv')

## Missing Values

In [68]:
# Load the API CSV into `df_api`; it is merged with `df` on Location/Date
# to fill in missing values.
df_api = pd.read_csv(filepath_or_buffer='../../../datasets/parte1/api.csv')

In [69]:
# Back-fill gaps in the weather data from the API extract: left-merge on
# (Location, Date) so every row of `df` is kept, then, for each column present
# in both frames, take the API value wherever the original value is missing.
merge_keys = ['Location', 'Date']
merged_df = pd.merge(df, df_api, on=merge_keys, how='left', suffixes=('', '_df2'))

# Overlapping non-key columns are the ones pandas duplicated with the '_df2' suffix.
shared_cols = [col for col in df.columns if col in df_api.columns and col not in merge_keys]

for col in shared_cols:
    # Assign the result back instead of calling fillna(inplace=True) on
    # merged_df[col]: the in-place form operates on an intermediate object, is
    # deprecated in pandas 2.x and leaves merged_df unchanged under Copy-on-Write.
    merged_df[col] = merged_df[col].fillna(merged_df[col + '_df2'])

# Drop exactly the helper '_df2' columns created by the merge
merged_df = merged_df.drop(columns=[col + '_df2' for col in shared_cols])

In [70]:
# Report how many missing values remain in each column of the merged frame.
print(merged_df.isnull().sum(axis=0))

Date                0
Location            0
MinTemp             0
MaxTemp             0
Rainfall            0
Evaporation         0
Sunshine            0
WindGustDir         0
WindGustSpeed       0
WindDir9am          0
WindDir3pm          0
WindSpeed9am        0
WindSpeed3pm        0
Humidity9am         0
Humidity3pm         0
Pressure9am         0
Pressure3pm         0
Cloud9am            0
Cloud3pm            0
Temp9am             0
Temp3pm             0
RainToday        3261
RainTomorrow     3267
dtype: int64


#### Split Date (extract Month)

In [None]:
# Continue from the API-filled frame rather than the raw one: `merged_df` holds
# the values back-filled in the "Missing Values" section, which would otherwise
# never reach the cleaned dataset. Working on a copy also keeps this cell
# re-runnable, because `merged_df` still has its 'Date' column afterwards.
df = merged_df.copy()

# Parse the ISO-formatted date strings and keep only the month as a feature.
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d", utc=True)
df['Month'] = df['Date'].dt.month
df = df.drop(columns=['Date'])

df.head()

## Label encoding

In [None]:
# Integer-encode the categorical columns. Each column gets its own fitted
# encoder, stored in `label_encoders`, so codes can be mapped back to the
# original labels with `label_encoders[col].inverse_transform(...)`.
# Re-fitting a single shared encoder would keep only the last column's mapping.
# NOTE(review): the missing-value check earlier in the notebook still reported
# NaNs in RainToday / RainTomorrow, and nothing in this cell removes or imputes
# them -- confirm how those rows should be handled before encoding.
categorical_columns = [
    'Location',
    'WindGustDir',
    'WindDir9am',
    'WindDir3pm',
    'RainToday',
    'RainTomorrow',
]
label_encoders = {}

for col in categorical_columns:
    # After the loop `label_encoder` is the encoder fitted on 'RainTomorrow',
    # matching what the name referred to before this change.
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

## Weighted Loss

In [None]:
# Derive one 'balanced' weight per RainTomorrow class and attach it to every
# row, so each sample carries the weight of its own class.
rain_tomorrow = df['RainTomorrow']
rain_tomorrow_classes = np.unique(rain_tomorrow)

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=rain_tomorrow_classes,
    y=rain_tomorrow,
)
# Lookup table: class label -> weight
class_weights_dict = dict(zip(rain_tomorrow_classes, class_weights))

# Per-row sample weight stored in a new 'weight' column
df['weight'] = rain_tomorrow.map(class_weights_dict)

In [None]:
# Preview the first five rows of the working frame.
df.head(n=5)

In [None]:
# Persist the processed frame (without the index column) for later stages.
df.to_csv(path_or_buf='../../../datasets/parte1/dataset_cleaned.csv', index=False)


In [None]:
# Bar chart of how often each RainTomorrow class occurs.
raintomorrow_count = df['RainTomorrow'].value_counts()

sns.set_style('darkgrid')
ax = sns.barplot(x=raintomorrow_count.index, y=raintomorrow_count.values)
ax.set_title('Frequency Distribution of RainTomorrow')
ax.set_ylabel('Number of Occurrences', fontsize=12)
ax.set_xlabel('RainTomorrow', fontsize=12)
plt.show()