In [118]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
%matplotlib inline

In [119]:
# Load the raw weather observations; the path is relative to this notebook.
raw_csv_path = '../../../datasets/parte1/weatherAUS.csv'
df = pd.read_csv(raw_csv_path)

#### Split Date into Year, Month and Day

In [120]:
# Parse the ISO-formatted date strings once, then expose the calendar parts as
# separate columns. The original 'Date' column is removed afterwards.
parsed_dates = pd.to_datetime(df['Date'], format="%Y-%m-%d", utc=True)
for part in ('year', 'month', 'day'):
    # Adds 'Year', 'Month' and 'Day' (in that order) at the end of the frame.
    df[part.capitalize()] = getattr(parsed_dates.dt, part)
df.drop(columns='Date', inplace=True)

df.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
0,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,...,1007.1,8.0,,16.9,21.8,No,No,2008,12,1
1,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,...,1007.8,,,17.2,24.3,No,No,2008,12,2
2,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,...,1008.7,,2.0,21.0,23.2,No,No,2008,12,3
3,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,...,1012.8,,,18.1,26.5,No,No,2008,12,4
4,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,...,1006.0,7.0,8.0,17.8,29.7,No,No,2008,12,5


# Missing Values


#### Mean imputation of NA values

In [121]:
# Remove the four columns that are excluded from the cleaned dataset
# instead of being imputed.
df.drop(columns=['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm'], inplace=True)

In [122]:
# Numeric columns whose missing values are replaced by the column mean.
mean_imputed_columns = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed',
    'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm',
]

# One mean per column (NaN values are skipped by default). Individual values
# remain available as e.g. column_means['MinTemp'].
column_means = df[mean_imputed_columns].mean()

# Assign the filled columns back explicitly instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on
# an intermediate Series, is deprecated in recent pandas, and does not update
# `df` when Copy-on-Write is enabled.
df[mean_imputed_columns] = df[mean_imputed_columns].fillna(column_means)
 

In [123]:
# Discard every row that still contains a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

# Per-column count of remaining missing values.
remaining_nulls = df.isna().sum()
print(remaining_nulls)

df.shape

Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
Year             0
Month            0
Day              0
dtype: int64


(123710, 21)

## One-Hot encoding

In [124]:
# Nominal columns that are expanded into one indicator column per category.
columns_to_encode = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
aux_df = []

lb = LabelBinarizer()
le = LabelEncoder()

for col in columns_to_encode:
    lb_results = lb.fit_transform(df[col])
    # Build the indicator frame on df's own index so its rows stay aligned with
    # their source rows in the concat below, even though df's index has gaps
    # left by the earlier dropna.
    # NOTE(review): LabelBinarizer emits a single column for a two-class input,
    # so this assumes every listed column has at least three categories.
    lb_encoded = pd.DataFrame(
        lb_results,
        index=df.index,
        columns=[f'{col}_{label}' for label in lb.classes_],
    )
    aux_df.append(lb_encoded)

# Encode 'RainToday' and 'RainTomorrow' as integer labels using LabelEncoder.
# The encoder is refit for each column, so `le` ends up holding only the
# 'RainTomorrow' classes.
df['EncodedRainToday'] = le.fit_transform(df['RainToday'])
df['EncodedRainTomorrow'] = le.fit_transform(df['RainTomorrow'])

# Replace the raw categorical columns with their encoded counterparts and
# renumber the rows 0..n-1 once, after everything has been aligned.
df = pd.concat(
    [df.drop(columns=columns_to_encode + ['RainToday', 'RainTomorrow'])] + aux_df,
    axis=1,
).reset_index(drop=True)

In [127]:
# Sanity check after encoding: per-column count of missing values, followed by
# the frame's (rows, columns) shape as the cell's displayed result.
# A bare `df.head()` that is not the last expression of a cell displays
# nothing, so it is omitted here.
print(df.isna().sum())

df.shape

MinTemp           0
MaxTemp           0
Rainfall          0
WindGustSpeed     0
WindSpeed9am      0
                 ..
WindDir3pm_SSW    0
WindDir3pm_SW     0
WindDir3pm_W      0
WindDir3pm_WNW    0
WindDir3pm_WSW    0
Length: 112, dtype: int64


(123710, 112)

In [126]:
# Persist the cleaned, encoded frame without writing the row index.
cleaned_csv_path = '../../../datasets/parte1/dataset_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)