In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
%matplotlib inline



In [2]:
# Load the raw weather observations into the notebook-wide DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='../../../datasets/parte1/weatherAUS.csv')

#### Split Date

In [3]:
# Parse the ISO-formatted date strings and keep only the calendar month
# (1-12) as a feature; the raw 'Date' column is discarded afterwards.
df['Month'] = pd.to_datetime(df['Date'], format="%Y-%m-%d", utc=True).dt.month
df.drop(columns=['Date'], inplace=True)

# Preview the first rows of the resulting frame.
df.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
0,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,...,1007.1,8.0,,16.9,21.8,No,No,2008,12,1
1,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,...,1007.8,,,17.2,24.3,No,No,2008,12,2
2,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,...,1008.7,,2.0,21.0,23.2,No,No,2008,12,3
3,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,...,1012.8,,,18.1,26.5,No,No,2008,12,4
4,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,...,1006.0,7.0,8.0,17.8,29.7,No,No,2008,12,5


### Drop columns which do not improve the model

In [4]:
# Left disabled: 'Location' is still required further down in this notebook,
# where it is removed from the categorical imputation list and label-encoded.
#df.drop(['Location'], axis=1, inplace=True)

## Missing Values


#### Mean imputation of NA values

In [None]:
# Left disabled: these four columns are kept and mean-imputed in the next
# cell, which would raise a KeyError if they were dropped here.
#df.drop(['Evaporation'], axis=1, inplace=True)
#df.drop(['Sunshine'], axis=1, inplace=True)
#df.drop(['Cloud9am'], axis=1, inplace=True)
#df.drop(['Cloud3pm'], axis=1, inplace=True)

In [6]:
# --- Numeric features: fill missing values with each column's own mean ------
numeric_features = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed',
    'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm',
    'Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm',
]

# Assign the result back through the DataFrame instead of calling
# `df[col].fillna(..., inplace=True)`: the chained in-place form operates on an
# intermediate Series, is deprecated in recent pandas releases, and leaves `df`
# untouched when Copy-on-Write is enabled. Passing a Series of means fills
# every column with the mean of that same column.
df[numeric_features] = df[numeric_features].fillna(df[numeric_features].mean())

# --- Categorical features: sample replacements from the observed values -----
# The categorical features are well distributed, so the mode does not represent
# a large share of the data. Instead of filling with the mode, every missing
# entry receives a value drawn at random with probability equal to that
# value's observed relative frequency in the same column.
#
# 'RainTomorrow', 'RainToday' and 'Location' are deliberately left out of this
# imputation step.
NON_IMPUTED_CATEGORICALS = {'RainTomorrow', 'RainToday', 'Location'}
categorical_features = [
    col for col in df.columns
    if df[col].dtype == object and col not in NON_IMPUTED_CATEGORICALS
]

# Fixed seed so that Restart Kernel -> Run All reproduces the same imputation.
IMPUTATION_SEED = 42
rng = np.random.default_rng(IMPUTATION_SEED)

for col in categorical_features:
    missing_mask = df[col].isna()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        continue

    # Labels and probabilities must come from the SAME column so they line up.
    frequencies = df[col].value_counts(normalize=True)
    if frequencies.empty:
        # Column is entirely missing: there is nothing to sample from.
        continue

    # Draw one independent sample per missing row (not one value for all rows).
    df.loc[missing_mask, col] = rng.choice(
        frequencies.index.to_numpy(),
        size=n_missing,
        p=frequencies.to_numpy(),
    )

In [7]:
# Remove every row that still has at least one missing value after imputation.
df.dropna(axis=0, how='any', inplace=True)

# Per-column count of missing values that remain in the frame.
print(df.isnull().sum())

# (rows, columns) of the frame after the row removal.
df.shape

MinTemp          0
MaxTemp          0
Rainfall         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
Year             0
Month            0
Day              0
dtype: int64


(123710, 20)

## Label encoding

In [8]:
# Replace each categorical column with integer codes.
# The same encoder instance is refit for every column, so after the loop it
# only holds the classes of the last column ('RainTomorrow').
label_encoder = LabelEncoder()

for encoded_col in (
    'Location',
    'WindGustDir',
    'WindDir9am',
    'WindDir3pm',
    'RainToday',
    'RainTomorrow',
):
    df[encoded_col] = label_encoder.fit_transform(df[encoded_col])

## Weighted Loss

In [9]:
# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(df['RainTomorrow']), y=df['RainTomorrow'])
class_weights_dict = {class_label: weight for class_label, weight in zip(np.unique(df['RainTomorrow']), class_weights)}

# Add a new column 'weight' to the DataFrame
df['weight'] = df['RainTomorrow'].map(class_weights_dict)

In [10]:
# Preview the first five rows after encoding and weighting.
df.head(n=5)

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,...,Temp3pm,Year,Month,Day,EncodedWindGustDir,EncodedWindDir9am,EncodedWindDir3pm,EncodedRainToday,EncodedRainTomorrow,weight
0,13.4,22.9,0.6,44.0,20.0,24.0,71.0,22.0,1007.7,1007.1,...,21.8,2008,12,1,13,13,14,0,0,0.642196
1,7.4,25.1,0.0,44.0,4.0,22.0,44.0,25.0,1010.6,1007.8,...,24.3,2008,12,2,14,6,15,0,0,0.642196
2,12.9,25.7,0.0,46.0,19.0,26.0,38.0,30.0,1007.6,1008.7,...,23.2,2008,12,3,15,13,15,0,0,0.642196
3,9.2,28.0,0.0,24.0,11.0,9.0,45.0,16.0,1017.6,1012.8,...,26.5,2008,12,4,4,9,0,0,0,0.642196
4,17.5,32.3,1.0,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,...,29.7,2008,12,5,13,1,7,0,0,0.642196


In [11]:
# Persist the cleaned frame (including the 'weight' column) without the index.
df.to_csv(path_or_buf='../../../datasets/parte1/dataset_cleaned.csv', index=False)
