# Binary Classification

In [1]:
# Notebook setup

# Imports
import math

import inflection
from IPython.display import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.subplots as subplots
from sklearn import ensemble, metrics, model_selection, preprocessing
from xgboost import XGBClassifier

# Set the random seed for reproducability
np.random.seed(1337)

### Binary classification on the rain in Australia data set

In [2]:
# Load the data set and do some very minimal feature engineering

d = pd.read_csv("../data/rain_in_australia/weatherAUS.csv", parse_dates=["Date"])

d.columns = [inflection.underscore(c) for c in d.columns]

d.rain_today = d.rain_today.map({"No": 0, "Yes": 1})
d.rain_tomorrow = d.rain_tomorrow.map({"No": 0, "Yes": 1})

print(d.info())
display(d.describe().T)
display(d.sample(10).T)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   date             145460 non-null  datetime64[ns]
 1   location         145460 non-null  object        
 2   min_temp         143975 non-null  float64       
 3   max_temp         144199 non-null  float64       
 4   rainfall         142199 non-null  float64       
 5   evaporation      82670 non-null   float64       
 6   sunshine         75625 non-null   float64       
 7   wind_gust_dir    135134 non-null  object        
 8   wind_gust_speed  135197 non-null  float64       
 9   wind_dir9am      134894 non-null  object        
 10  wind_dir3pm      141232 non-null  object        
 11  wind_speed9am    143693 non-null  float64       
 12  wind_speed3pm    142398 non-null  float64       
 13  humidity9am      142806 non-null  float64       
 14  humidity3pm      140

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
date,145460.0,2013-04-04 21:08:51.907053568,2007-11-01 00:00:00,2011-01-11 00:00:00,2013-06-02 00:00:00,2015-06-14 00:00:00,2017-06-25 00:00:00,
min_temp,143975.0,12.194034,-8.5,7.6,12.0,16.9,33.9,6.398495
max_temp,144199.0,23.221348,-4.8,17.9,22.6,28.2,48.1,7.119049
rainfall,142199.0,2.360918,0.0,0.0,0.0,0.8,371.0,8.47806
evaporation,82670.0,5.468232,0.0,2.6,4.8,7.4,145.0,4.193704
sunshine,75625.0,7.611178,0.0,4.8,8.4,10.6,14.5,3.785483
wind_gust_speed,135197.0,40.03523,6.0,31.0,39.0,48.0,135.0,13.607062
wind_speed9am,143693.0,14.043426,0.0,7.0,13.0,19.0,130.0,8.915375
wind_speed3pm,142398.0,18.662657,0.0,13.0,19.0,24.0,87.0,8.8098
humidity9am,142806.0,68.880831,0.0,57.0,70.0,83.0,100.0,19.029164


Unnamed: 0,99369,133244,143255,25238,11675,137088,30732,129294,125697,82711
date,2017-02-02 00:00:00,2009-07-02 00:00:00,2015-10-08 00:00:00,2012-01-16 00:00:00,2016-05-30 00:00:00,2011-10-14 00:00:00,2009-08-10 00:00:00,2015-12-30 00:00:00,2014-05-18 00:00:00,2013-12-08 00:00:00
location,Adelaide,Launceston,Katherine,Penrith,CoffsHarbour,AliceSprings,Sydney,Walpole,SalmonGums,Dartmoor
min_temp,14.9,4.1,23.5,17.1,4.1,22.5,6.8,15.1,4.9,14.7
max_temp,25.8,11.3,36.5,26.9,19.5,37.6,16.3,18.1,23.5,27.4
rainfall,0.0,8.6,0.0,23.2,0.0,0.0,0.0,1.6,0.0,0.0
evaporation,,,9.8,,,15.0,1.8,,,9.6
sunshine,,,,,,11.5,4.1,,,5.3
wind_gust_dir,,NNW,ENE,ENE,S,S,,SE,WNW,N
wind_gust_speed,,30.0,44.0,31.0,20.0,52.0,,39.0,37.0,48.0
wind_dir9am,SW,NW,ENE,SSE,WSW,WSW,WNW,SSE,NW,N


In [3]:
# Distribution of a selected set of numeric variables

d_plot = d.sample(2000)
col_names = d.columns
n_plots = len(col_names)
n_plot_cols = 3
n_plot_rows = math.ceil(n_plots / n_plot_cols)

fig = subplots.make_subplots(
    rows=n_plot_rows,
    cols=n_plot_cols,
    subplot_titles=[c.capitalize() for c in col_names],
    horizontal_spacing=0.1,
)
for i, val_name in enumerate(col_names):
    fig.add_trace(
        go.Histogram(
            x=None if d_plot[val_name].dtype == "object" else d_plot[val_name],
            y=d_plot[val_name] if d_plot[val_name].dtype == "object" else None,
            name="",
            nbinsx=None,
            xbins=dict(size=None, start=None, end=None),
            marker=dict(color="cornflowerblue"),
        ),
        row=1 + i // n_plot_cols,
        col=1 + i % n_plot_cols,
    )
    if d_plot[val_name].dtype == "object":
        fig.update_xaxes(
            title_text="Count", row=1 + i // n_plot_cols, col=1 + i % n_plot_cols
        )
    else:
        fig.update_yaxes(
            title_text="Count", row=1 + i // n_plot_cols, col=1 + i % n_plot_cols
        )
if d_plot[val_name].dtype == "object":
    fig.update_traces(hovertemplate="value: %{y}<br>count: %{x}")
else:
    fig.update_traces(hovertemplate="value: %{x}<br>count: %{y}")
fig.update_layout(
    template="plotly_white",
    width=1200,
    height=80 + 250 * n_plot_rows,
    title="Distributions",
    showlegend=False,
)

In [4]:
print(d.location.value_counts())

location
Canberra            3436
Sydney              3344
Darwin              3193
Melbourne           3193
Brisbane            3193
Adelaide            3193
Perth               3193
Hobart              3193
Albany              3040
MountGambier        3040
Ballarat            3040
Townsville          3040
GoldCoast           3040
Cairns              3040
Launceston          3040
AliceSprings        3040
Bendigo             3040
Albury              3040
MountGinini         3040
Wollongong          3040
Newcastle           3039
Tuggeranong         3039
Penrith             3039
Woomera             3009
Nuriootpa           3009
Cobar               3009
CoffsHarbour        3009
Moree               3009
Sale                3009
PerthAirport        3009
PearceRAAF          3009
Witchcliffe         3009
BadgerysCreek       3009
Mildura             3009
NorfolkIsland       3009
MelbourneAirport    3009
Richmond            3009
SydneyAirport       3009
WaggaWagga          3009
Williamtown     

In [5]:
X = d.copy()
del X["date"]
del X["location"]
X = pd.get_dummies(X, columns=['wind_gust_dir', 'wind_dir9am', 'wind_dir3pm'])
X.dropna(subset=['rain_tomorrow'], inplace=True)
y = X.rain_tomorrow.copy()
del X["rain_tomorrow"]
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)

In [6]:
rf_res = []
for n_estimators in [10, 100]:
    for max_depth in [2, 4, 8]:
        model = ensemble.RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth, n_jobs=4
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        rf_res.append(
            {
                "n_estimators": n_estimators,
                "max_depth": max_depth,
                "accuracy": metrics.accuracy_score(y_test, y_pred),
                "precision": metrics.precision_score(y_test, y_pred),
                "recall": metrics.recall_score(y_test, y_pred),
                "F1": metrics.f1_score(y_test, y_pred),
                "AUC": metrics.roc_auc_score(y_test, y_pred_proba),
            }
        )
rf_res = pd.DataFrame(rf_res).sort_values("AUC", ascending=False)
display(rf_res)

Unnamed: 0,n_estimators,max_depth,accuracy,precision,recall,F1,AUC
5,100,8,0.839903,0.777884,0.38684,0.516718,0.85813
2,10,8,0.837653,0.765957,0.383344,0.510963,0.849932
4,100,4,0.819614,0.826772,0.23363,0.364312,0.842672
1,10,4,0.820528,0.794935,0.25445,0.385504,0.837233
3,100,2,0.779985,0.926829,0.006039,0.012001,0.826107
0,10,2,0.803333,0.810115,0.145105,0.246125,0.808607


In [7]:
xgb_res = []
for n_estimators in [10, 100]:
    for max_depth in [2, 4, 8]:
        model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, n_jobs=4)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        xgb_res.append(
            {
                "n_estimators": n_estimators,
                "max_depth": max_depth,
                "accuracy": metrics.accuracy_score(y_test, y_pred),
                "precision": metrics.precision_score(y_test, y_pred),
                "recall": metrics.recall_score(y_test, y_pred),
                "F1": metrics.f1_score(y_test, y_pred),
                "AUC": metrics.roc_auc_score(y_test, y_pred_proba),
            }
        )
xgb_res = pd.DataFrame(xgb_res).sort_values("AUC", ascending=False)
display(xgb_res)

Unnamed: 0,n_estimators,max_depth,accuracy,precision,recall,F1,AUC
5,100,8,0.857484,0.729171,0.566116,0.63738,0.88591
4,100,4,0.854109,0.73431,0.533694,0.618132,0.885259
2,10,8,0.85119,0.735267,0.511602,0.603374,0.877732
3,100,2,0.849713,0.731635,0.506516,0.59861,0.8752
1,10,4,0.845564,0.736436,0.47028,0.574006,0.865149
0,10,2,0.839235,0.750291,0.409727,0.530016,0.854122
