## This notebook performs the data preparation for the common classifier dataset

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [2]:
# Raw input: the Kaggle weather dataset published at
# https://www.kaggle.com/jsphyg/weather-dataset-rattle-package
df = pd.read_csv(filepath_or_buffer='../../data/weatherAUS.csv')

In [3]:
# (rows, columns) of the frame exactly as loaded from the CSV
df.shape

(145460, 23)

In [4]:
# Peek at the first ten raw rows
df.head(n=10)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
5,2008-12-06,Albury,14.6,29.7,0.2,,,WNW,56.0,W,...,55.0,23.0,1009.2,1005.4,,,20.6,28.9,No,No
6,2008-12-07,Albury,14.3,25.0,0.0,,,W,50.0,SW,...,49.0,19.0,1009.6,1008.2,1.0,,18.1,24.6,No,No
7,2008-12-08,Albury,7.7,26.7,0.0,,,W,35.0,SSE,...,48.0,19.0,1013.4,1010.1,,,16.3,25.5,No,No
8,2008-12-09,Albury,9.7,31.9,0.0,,,NNW,80.0,SE,...,42.0,9.0,1008.9,1003.6,,,18.3,30.2,No,Yes
9,2008-12-10,Albury,13.1,30.1,1.4,,,W,28.0,S,...,58.0,27.0,1007.0,1005.7,,,20.1,28.2,Yes,No


In [5]:
# Share of missing values per column, as a percentage
df.isna().mean() * 100

Date              0.000000
Location          0.000000
MinTemp           1.020899
MaxTemp           0.866905
Rainfall          2.241853
Evaporation      43.166506
Sunshine         48.009762
WindGustDir       7.098859
WindGustSpeed     7.055548
WindDir9am        7.263853
WindDir3pm        2.906641
WindSpeed9am      1.214767
WindSpeed3pm      2.105046
Humidity9am       1.824557
Humidity3pm       3.098446
Pressure9am      10.356799
Pressure3pm      10.331363
Cloud9am         38.421559
Cloud3pm         40.807095
Temp9am           1.214767
Temp3pm           2.481094
RainToday         2.241853
RainTomorrow      2.245978
dtype: float64

In [6]:
# Roughly 38-48% of the values in these four columns are missing (see the
# percentages above), so the columns are removed instead of being imputed.
df.drop(['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm'], axis='columns', inplace=True)

In [7]:
# RainToday and RainTomorrow are treated as essential: any row missing
# either of them is removed rather than filled in.
df.dropna(axis='index', how='any', subset=['RainToday', 'RainTomorrow'], inplace=True)

In [8]:
# Location is removed as well: keeping it would mean one-hot encoding it,
# and that encoded data is not wanted here.
df.drop('Location', axis='columns', inplace=True)

In [9]:
# Mean imputation for the numeric columns
def fill_missing_numeric_values(data):
    """Replace missing values in each int/float column with that column's mean.

    ``data`` is modified in place and also returned, so the call can be used
    either as a statement or as an assignment.
    """
    numeric_columns = data.select_dtypes(['int', 'float']).columns
    for name in numeric_columns:
        column_mean = data[name].mean()
        data[name] = data[name].fillna(column_mean)
    return data

# for other datatypes, just forward fill
def fill_missing_object_values(data):
    """Forward-fill missing values in every object-dtype column of ``data``.

    Each gap takes the most recent non-missing value above it in the same
    column. Missing values at the very top of a column have nothing above
    them and are left as they are. ``data`` is modified in place and also
    returned.
    """
    for col in data.select_dtypes(['object']):
        # Series.ffill() replaces fillna(method='ffill'), whose `method`
        # keyword is deprecated in recent pandas releases.
        data[col] = data[col].ffill()
    return data

# Impute numeric gaps with the column mean first, then forward-fill the
# object columns.
# NOTE(review): Location was dropped in an earlier cell, so the forward fill
# is not grouped by station - confirm that carrying a value over from the
# previous row is acceptable at station boundaries.
df = fill_missing_object_values(fill_missing_numeric_values(df))

In [10]:
# The month is an interesting data point: keep it as the first column and
# discard the full date.
month = pd.to_datetime(df['Date']).dt.month
df.drop('Date', axis='columns', inplace=True)
df.insert(0, 'Month', month)

In [11]:
# One-hot encode these categorical features, not required for a decision tree model.
# The dtype is pinned to uint8 so the indicator columns are 0/1 on every pandas
# version; since pandas 2.0 the default is bool, which would be written to the
# exported CSV as True/False.
df = pd.get_dummies(df, columns=['WindGustDir', 'WindDir9am', 'WindDir3pm'], dtype='uint8')

In [12]:
# Rescale the continuous features with MinMaxScaler, which is fitted on these
# same rows before transforming them.
# StandardScaler (imported at the top) is the alternative that was tried here:
#   scaler = StandardScaler().fit(scaled_features.values)
col_names = [
    'Month', 'MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed',
    'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm',
]
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(df[col_names].values)
df[col_names] = scaled_features

In [13]:
# change Yes/No to 1/0, again, not required for a decision tree model
def replace_with_zero_or_one(x):
    """Return 0 when the first value of ``x`` is 'No', otherwise 1.

    ``x`` is normally the row Series handed over by
    ``DataFrame.apply(..., axis=1)``, but any plain sequence such as a list
    or tuple is accepted as well.
    """
    # Read the first element by position. Plain ``x[0]`` on a Series whose
    # index holds column labels relies on a deprecated positional fallback.
    first = x.iloc[0] if hasattr(x, 'iloc') else x[0]
    return 0 if first == 'No' else 1
    
# The same conversion for RainTomorrow is currently disabled, so that column
# keeps its 'Yes'/'No' strings.
# Vectorized form of the row-wise rule "0 when the value is 'No', otherwise 1".
df['RainToday'] = df['RainToday'].ne('No').astype(int)

In [14]:
# Re-add RainTomorrow under the name RainTomorrowN, which also makes it the
# last column. For a decision tree model, this can be skipped.
df['RainTomorrowN'] = df.pop('RainTomorrow')

In [15]:
# Peek at the first ten rows after preparation
df.head(n=10)

Unnamed: 0,Month,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,...,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW,RainTomorrowN
0,1.0,0.516509,0.523629,0.001617,0.294574,0.153846,0.275862,0.71,0.22,0.449587,...,0,0,0,0,0,0,0,1,0,No
1,1.0,0.375,0.565217,0.0,0.294574,0.030769,0.252874,0.44,0.25,0.497521,...,0,0,0,0,0,0,0,0,1,No
2,1.0,0.504717,0.57656,0.0,0.310078,0.146154,0.298851,0.38,0.3,0.447934,...,0,0,0,0,0,0,0,0,1,No
3,1.0,0.417453,0.620038,0.0,0.139535,0.084615,0.103448,0.45,0.16,0.613223,...,0,0,0,0,0,0,0,0,0,No
4,1.0,0.613208,0.701323,0.002695,0.271318,0.053846,0.229885,0.82,0.33,0.500826,...,1,0,0,0,0,0,0,0,0,No
5,1.0,0.544811,0.652174,0.000539,0.387597,0.146154,0.275862,0.55,0.23,0.47438,...,0,0,0,0,0,0,1,0,0,No
6,1.0,0.537736,0.563327,0.0,0.341085,0.153846,0.275862,0.49,0.19,0.480992,...,0,0,0,0,0,0,1,0,0,No
7,1.0,0.382075,0.595463,0.0,0.224806,0.046154,0.195402,0.48,0.19,0.543802,...,0,0,0,0,0,0,1,0,0,No
8,1.0,0.429245,0.693762,0.0,0.573643,0.053846,0.321839,0.42,0.09,0.469421,...,1,0,0,0,0,0,0,0,0,Yes
9,1.0,0.509434,0.659735,0.003774,0.170543,0.115385,0.126437,0.58,0.27,0.438017,...,0,0,0,1,0,0,0,0,0,No


In [16]:
# (rows, columns) after cleaning, encoding and scaling
df.shape

(140787, 63)

In [17]:
# Column order of the prepared frame; the CSV written at the end has no header
# row, so this listing is the record of what each position holds
df.columns

Index(['Month', 'MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm', 'RainToday',
       'WindGustDir_E', 'WindGustDir_ENE', 'WindGustDir_ESE', 'WindGustDir_N',
       'WindGustDir_NE', 'WindGustDir_NNE', 'WindGustDir_NNW',
       'WindGustDir_NW', 'WindGustDir_S', 'WindGustDir_SE', 'WindGustDir_SSE',
       'WindGustDir_SSW', 'WindGustDir_SW', 'WindGustDir_W', 'WindGustDir_WNW',
       'WindGustDir_WSW', 'WindDir9am_E', 'WindDir9am_ENE', 'WindDir9am_ESE',
       'WindDir9am_N', 'WindDir9am_NE', 'WindDir9am_NNE', 'WindDir9am_NNW',
       'WindDir9am_NW', 'WindDir9am_S', 'WindDir9am_SE', 'WindDir9am_SSE',
       'WindDir9am_SSW', 'WindDir9am_SW', 'WindDir9am_W', 'WindDir9am_WNW',
       'WindDir9am_WSW', 'WindDir3pm_E', 'WindDir3pm_ENE', 'WindDir3pm_ESE',
       'WindDir3pm_N', 'WindDir3pm_NE', 'WindDir3pm_NNE', 'WindDir3pm_NNW',
       'WindDir3pm_NW', 'WindDir3

In [18]:
# Number of columns in the prepared frame
print(df.shape[1])

63


In [20]:
# Persist the prepared data as bare values: no header row and no index column
df.to_csv(path_or_buf='../../data/cleanedWeatherAUS.csv', header=False, index=False)