In [23]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
%matplotlib inline

In [24]:
# Load the weatherAUS CSV into `df`; the path is relative to the notebook's directory.
df = pd.read_csv(filepath_or_buffer='../../../datasets/parte1/weatherAUS.csv')

## Missing Values

In [25]:
# Load api.csv, the secondary source that is merged on Location/Date to fill gaps in `df`.
df_api = pd.read_csv(filepath_or_buffer='../../../datasets/parte1/api.csv')

In [26]:
# Merge API daily based on Location and Date using left merge.
# Every row of `df` is kept; columns that exist in both frames get the
# '_df2' suffix on the API side so they can be used as a fallback below.
merge_keys = ['Location', 'Date']
merged_df = pd.merge(df, df_api, on=merge_keys, how='left', suffixes=('', '_df2'))

# Columns present in both sources (other than the join keys): these are the
# ones for which the merge produced a '<name>_df2' companion column.
shared_columns = [
    col for col in df.columns
    if col in df_api.columns and col not in merge_keys
]

for col in shared_columns:
    # Assign the result back instead of calling fillna(..., inplace=True) on
    # `merged_df[col]`: the in-place form operates on a temporary Series and
    # leaves `merged_df` unchanged under pandas Copy-on-Write.
    merged_df[col] = merged_df[col].fillna(merged_df[col + '_df2'])

# Drop exactly the '_df2' helper columns created by the merge (and nothing
# else that merely happens to contain that substring).
merged_df = merged_df.drop(columns=[col + '_df2' for col in shared_columns])

In [27]:
# Report how many missing values remain in each column after the API backfill.
print(merged_df.isnull().sum())

Date                0
Location            0
MinTemp             0
MaxTemp             0
Rainfall            0
Evaporation         0
Sunshine            0
WindGustDir         0
WindGustSpeed       0
WindDir9am          0
WindDir3pm          0
WindSpeed9am        0
WindSpeed3pm        0
Humidity9am         0
Humidity3pm         0
Pressure9am         0
Pressure3pm         0
Cloud9am            0
Cloud3pm            0
Temp9am             0
Temp3pm             0
RainToday        3261
RainTomorrow     3267
dtype: int64


In [28]:
# Remove, in place, every row that still contains at least one missing value
# (in the recorded output above only RainToday / RainTomorrow had gaps left).
merged_df.dropna(axis=0, how='any', inplace=True)

In [29]:
# Rebind `df` to the merged frame for the remaining cells. This is a plain
# assignment, so `df` and `merged_df` refer to the same object (no copy is made).
df = merged_df

#### Split Date (extract Month)

In [30]:
# Replace the full date with a single calendar feature: parse 'Date'
# (YYYY-MM-DD, interpreted as UTC) and keep only its month number.
df['Month'] = pd.to_datetime(df['Date'], format="%Y-%m-%d", utc=True).dt.month

# The original 'Date' column is no longer needed once 'Month' exists.
df.drop(columns=['Date'], inplace=True)

df.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Month
0,Albury,13.4,22.9,0.6,5.55,12.8,W,44.0,W,WNW,...,22.0,1007.7,1007.1,8.0,0.9,16.9,21.8,No,No,12
1,Albury,7.4,25.1,0.0,5.99,13.5,WNW,44.0,NNW,WSW,...,25.0,1010.6,1007.8,1.0,3.0,17.2,24.3,No,No,12
2,Albury,12.9,25.7,0.0,5.7,13.5,WSW,46.0,W,WSW,...,30.0,1007.6,1008.7,0.8,2.0,21.0,23.2,No,No,12
3,Albury,9.2,28.0,0.0,6.17,11.8,NE,24.0,SE,E,...,16.0,1017.6,1012.8,0.0,2.4,18.1,26.5,No,No,12
4,Albury,17.5,32.3,1.0,6.17,13.1,W,41.0,ENE,NW,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No,12


## Label encoding

In [31]:
# Convert each categorical column to integer codes with scikit-learn's
# LabelEncoder.
#
# A separate encoder is fitted per column and stored in `label_encoders`
# (keyed by column name) so the integer codes can later be mapped back to the
# original labels via `label_encoders[column].inverse_transform(...)`.
# Refitting one shared encoder for every column would keep only the mapping
# of the last column.
categorical_columns = [
    'Location',
    'WindGustDir',
    'WindDir9am',
    'WindDir3pm',
    'RainToday',
    'RainTomorrow',
]
label_encoders = {}

for column in categorical_columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

# After the loop `label_encoder` is the encoder fitted on 'RainTomorrow',
# the last column in the list.

In [32]:
# Preview the first five rows to check the result of the label encoding.
df.head(n=5)

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Month
0,2,13.4,22.9,0.6,5.55,12.8,13,44.0,13,14,...,22.0,1007.7,1007.1,8.0,0.9,16.9,21.8,0,0,12
1,2,7.4,25.1,0.0,5.99,13.5,14,44.0,6,15,...,25.0,1010.6,1007.8,1.0,3.0,17.2,24.3,0,0,12
2,2,12.9,25.7,0.0,5.7,13.5,15,46.0,13,15,...,30.0,1007.6,1008.7,0.8,2.0,21.0,23.2,0,0,12
3,2,9.2,28.0,0.0,6.17,11.8,4,24.0,9,0,...,16.0,1017.6,1012.8,0.0,2.4,18.1,26.5,0,0,12
4,2,17.5,32.3,1.0,6.17,13.1,13,41.0,1,7,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0,0,12


In [33]:
# Persist the cleaned, encoded frame as CSV; the row index is not written.
df.to_csv(path_or_buf='../../../datasets/parte1/dataset_cleaned.csv', index=False)