In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn.model_selection import (
    cross_val_score,
    GridSearchCV,
    validation_curve,
    learning_curve,
    train_test_split
)

figure_export_path = './figures/'

np.random.seed(0)

# Read in Car Data

In [4]:
df = pd.read_csv("../datasets/weatherAUS.csv")

In [5]:
df_larger_count = df.drop(["Evaporation", "Sunshine", "Cloud9am", "Cloud3pm"], axis=1)
df_larger_count.count()

df_full = df_larger_count.dropna()
df_full.count()

df_full["day_month_date"] = df_full["Date"].str.slice(start=5)
df_full["day_month_date"].value_counts().count()

df_full["RainToday"].value_counts().count()

rain_df = df_full[
    [
        "day_month_date",
        "Location",
        
        "MinTemp",
        "MaxTemp",
        "Rainfall",
        
        "WindGustDir",
        "WindGustSpeed",
        
        "WindDir9am",
        "WindDir3pm",
        
        "WindSpeed9am",
        "WindSpeed3pm",
        
        "Humidity9am",
        "Humidity3pm",
        
        "Pressure9am",
        "Pressure3pm",
        
        "Temp9am",
        "Temp3pm",
        
        "RainToday",
        "RainTomorrow",
    ]
]
rain_df.info()

dummy_col = [
    "day_month_date",
    "Location",
    "WindGustDir",
    "WindDir9am",
    "WindDir3pm",
    "RainToday",
]

dummies = pd.get_dummies(rain_df[dummy_col])

X = rain_df.drop(["RainTomorrow"], axis=1)
# Drop Dummy Col
X = X.drop(dummy_col, axis=1)
# concat dumiies
X = pd.concat([X, dummies], axis=1)

y = rain_df["RainTomorrow"]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



<class 'pandas.core.frame.DataFrame'>
Int64Index: 112925 entries, 0 to 142192
Data columns (total 19 columns):
day_month_date    112925 non-null object
Location          112925 non-null object
MinTemp           112925 non-null float64
MaxTemp           112925 non-null float64
Rainfall          112925 non-null float64
WindGustDir       112925 non-null object
WindGustSpeed     112925 non-null float64
WindDir9am        112925 non-null object
WindDir3pm        112925 non-null object
WindSpeed9am      112925 non-null float64
WindSpeed3pm      112925 non-null float64
Humidity9am       112925 non-null float64
Humidity3pm       112925 non-null float64
Pressure9am       112925 non-null float64
Pressure3pm       112925 non-null float64
Temp9am           112925 non-null float64
Temp3pm           112925 non-null float64
RainToday         112925 non-null object
RainTomorrow      112925 non-null object
dtypes: float64(12), object(7)
memory usage: 17.2+ MB


# Define Methods

In [6]:
def plot_learning_curve(
    train_sizes,
    train_scores,
    test_scores,
    title="Learning Curve",
    y_title="Score",
    x_title="Training Examples",
    download_file_name="plot_learning_curve",
):
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    
    fig = go.Figure()
    
    fig.add_trace(
        go.Scatter(
            x=train_sizes,
            y=test_scores_mean + test_scores_std,
            mode="lines",
            line=dict(color="green", width=1),
            showlegend=False,
        )
    )

    fig.add_trace(
        go.Scatter(
            x=train_sizes,
            y=test_scores_mean - test_scores_std,
            mode="lines",
            line=dict(color="green", width=1),
            showlegend=False,
            fill="tonexty",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=train_sizes,
            y=train_scores_mean + train_scores_std,
            mode="lines",
            line=dict(color="red", width=1),
            showlegend=False,
        )
    )

    fig.add_trace(
        go.Scatter(
            x=train_sizes,
            y=train_scores_mean - train_scores_std,
            mode="lines",
            line=dict(color="red", width=1),
            showlegend=False,
            fill="tonexty",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=train_sizes,
            y=test_scores_mean,
            marker=dict(color="green"),
            name="Cross-Validation Score",
            showlegend=True,
        )
    )

    fig.add_trace(
        go.Scatter(
            x=train_sizes,
            y=train_scores_mean,
            marker=dict(color="red"),
            name="Training Score",
            showlegend=True,
        )
    )

    fig.update_layout(
        title=title,
        xaxis=dict(title=x_title, showgrid=True, gridwidth=1, gridcolor="LightPink"),
        yaxis=dict(title=y_title, showgrid=True, gridwidth=1, gridcolor="LightPink"),
    )
    fig.write_image(f"{figure_export_path}/{download_file_name}.png")
    fig.show()

def plot_validation_curve(
    param_range,
    train_scores,
    test_scores,
    title="Validation Curve",
    y_title="Score",
    x_title="Param Range",
    download_file_name="plot_validation_curve",

):
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    
    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=param_range,
            y=test_scores_mean + test_scores_std,
            mode="lines",
            line=dict(color="green", width=1),
            showlegend=False,
        )
    )

    fig.add_trace(
        go.Scatter(
            x=param_range,
            y=test_scores_mean - test_scores_std,
            mode="lines",
            line=dict(color="green", width=1),
            showlegend=False,
            fill="tonexty",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=param_range,
            y=train_scores_mean + train_scores_std,
            mode="lines",
            line=dict(color="red", width=1),
            showlegend=False,
        )
    )

    fig.add_trace(
        go.Scatter(
            x=param_range,
            y=train_scores_mean - train_scores_std,
            mode="lines",
            line=dict(color="red", width=1),
            showlegend=False,
            fill="tonexty",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=param_range,
            y=test_scores_mean,
            marker=dict(color="green"),
            name="Cross-Validation Score",
            showlegend=True,
        )
    )

    fig.add_trace(
        go.Scatter(
            x=param_range,
            y=train_scores_mean,
            marker=dict(color="red"),
            name="Training Score",
            showlegend=True,
        )
    )

    fig.update_layout(
        title=title,
        xaxis=dict(title=x_title, showgrid=True, gridwidth=1, gridcolor="LightPink"),
        yaxis=dict(title=y_title, showgrid=True, gridwidth=1, gridcolor="LightPink"),
    )
    
    fig.write_image(f"{figure_export_path}/{download_file_name}.png")
    fig.show()

## Train Test Split

In [7]:
X_trainset, X_testset, y_trainset, y_testset = train_test_split(
    X, y, test_size=0.30, random_state=2
)

# NN

In [8]:
from sklearn.neural_network import MLPClassifier

In [9]:
%%time
solver = "sgd"
hidden_layer_sizes = (2, 10)
activation = "logistic"
nn_clf = MLPClassifier(
    solver=solver,
    hidden_layer_sizes=hidden_layer_sizes,
    activation=activation,
    max_iter=1000,
)
nn_clf.fit(X_trainset, y_trainset)

CPU times: user 22.8 s, sys: 1.46 s, total: 24.3 s
Wall time: 4.56 s


MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(2, 10), learning_rate='constant',
              learning_rate_init=0.001, max_iter=1000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='sgd', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [11]:
cross_val_score(nn_clf, X_trainset, y_trainset, cv=5)

array([0.77787616, 0.77791132, 0.77791132, 0.77791132, 0.77791132])

In [12]:
nn_pred = nn_clf.predict(X_testset)
print(nn_clf.score(X_testset, y_testset))
print(classification_report(y_testset, nn_pred))

0.7797095460180649



Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.



              precision    recall  f1-score   support

          No       0.78      1.00      0.88     26415
         Yes       0.00      0.00      0.00      7463

    accuracy                           0.78     33878
   macro avg       0.39      0.50      0.44     33878
weighted avg       0.61      0.78      0.68     33878



# Plot Learning Curve

In [17]:
train_sizes = np.linspace(round(X_trainset.shape[0]* .1), X_trainset.shape[0] * .7, num=10).astype(int)
train_sizes

array([ 7905, 13174, 18444, 23714, 28984, 34253, 39523, 44793, 50063,
       55332])

In [18]:
train_sizes, train_scores, test_scores = learning_curve(
    nn_clf,
    X_trainset,
    y_trainset,
    cv=5,
    n_jobs=-1,
    train_sizes=train_sizes
)

In [20]:
plot_learning_curve(
    train_sizes,
    train_scores,
    test_scores,
    title="ANN Learning Curve (Rain)",
    x_title="Training Examples",
    y_title="Score (Accuracy %)",
    download_file_name="ann_learning_curve",
)

## Validation Curve

In [21]:
param_range = np.linspace(0.0000001, .01, num=10)
ne_train_scores, ne_test_scores = validation_curve(
    nn_clf,
    X_trainset,
    y_trainset,
    "alpha",
    param_range,
    scoring="accuracy",
    cv=5,
)

In [23]:
plot_validation_curve(
    param_range,
    ne_train_scores,
    ne_test_scores,
    "'alpha' validation curve",
    x_title='alpha',
    y_title="'alpha' validation curve",
    download_file_name="ann_alpha_vcurve",
)

## Hidden Layer Sizes

In [27]:
param_range = np.arange(5, 100, 20)
ne_train_scores, ne_test_scores = validation_curve(
    nn_clf,
    X_trainset,
    y_trainset,
    "hidden_layer_sizes",
    param_range,
    scoring="accuracy",
    cv=5,
)

In [28]:
plot_validation_curve(
    param_range,
    ne_train_scores,
    ne_test_scores,
    "'hidden_layer_sizes' validation curve",
    x_title="hidden_layer_sizes",
    y_title="Score (Accuracy %)",
    download_file_name="ann_alpha_vcurve",
)

# Grid Search

In [31]:
grid_params = {
    "alpha": [0.004, 0.005],
    "hidden_layer_sizes": np.linspace(80, 120, num=10).astype(int),
    "max_iter": np.arange(500, 6000, 500),
}
grid = GridSearchCV(
    estimator=nn_clf, param_grid=grid_params, scoring="accuracy", cv=5, n_jobs=-1
)
grid.fit(X_trainset, y_trainset)
grid.best_params_

KeyboardInterrupt: 

In [None]:
nn_clf_final = MLPClassifier(
    solver=solver,
    hidden_layer_sizes=hidden_layer_sizes,
    activation=activation,
    max_iter=1000,
    alpha=0.005
)
nn_clf_final.fit(X_trainset, y_trainset)

# Final Score

In [19]:
nn_pred = nn_clf_final.predict(X_testset)
print(nn_clf_final.score(X_testset, y_testset))
print(classification_report(y_testset, nn_pred))

0.10059338796270133
               precision    recall  f1-score   support

        Acura       0.00      0.00      0.00        85
 Aston Martin       0.00      0.00      0.00        34
         Audi       0.00      0.00      0.00        89
          BMW       0.00      0.00      0.00        91
      Bentley       0.00      0.00      0.00        31
        Buick       0.00      0.00      0.00        54
     Cadillac       0.00      0.00      0.00       117
    Chevrolet       0.10      1.00      0.18       356
     Chrysler       0.00      0.00      0.00        64
        Dodge       0.00      0.00      0.00       188
         FIAT       0.00      0.00      0.00        15
      Ferrari       0.00      0.00      0.00        16
         Ford       0.00      0.00      0.00       237
          GMC       0.00      0.00      0.00       156
       HUMMER       0.00      0.00      0.00         5
        Honda       0.00      0.00      0.00       145
      Hyundai       0.00      0.00      0.00


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.

