In [16]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
from sklearn import compose, datasets, linear_model, metrics, model_selection
from sklearn import pipeline, preprocessing, utils

# Softmax Regression with Scikit-Learn

In [None]:
datasets.fetch_covtype?

In [None]:
bunch = datasets.fetch_covtype(
    data_home="/content/sample_datasets",
)

In [None]:
print(bunch["DESCR"])

.. _covtype_dataset:

Forest covertypes
-----------------

The samples in this dataset correspond to 30×30m patches of forest in the US,
collected for the task of predicting each patch's cover type,
i.e. the dominant species of tree.
There are seven covertypes, making this a multiclass classification problem.
Each sample has 54 features, described on the
`dataset's homepage <https://archive.ics.uci.edu/ml/datasets/Covertype>`__.
Some of the features are boolean indicators,
while others are discrete or continuous measurements.

**Data Set Characteristics:**

    Classes                        7
    Samples total             581012
    Dimensionality                54
    Features                     int

:func:`sklearn.datasets.fetch_covtype` will load the covertype dataset;
it returns a dictionary-like 'Bunch' object
with the feature matrix in the ``data`` member
and the target values in ``target``. If optional argument 'as_frame' is
set to 'True', it will return ``data`` and ``target`

In [2]:
features, targets = datasets.fetch_covtype(
    data_home="/content/sample_datasets",
    return_X_y=True,
    as_frame=True
)

In [3]:
features.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_30,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39
0,2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2590.0,56.0,2.0,212.0,-6.0,390.0,220.0,235.0,151.0,6225.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2785.0,155.0,18.0,242.0,118.0,3090.0,238.0,238.0,122.0,6211.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2595.0,45.0,2.0,153.0,-1.0,391.0,220.0,234.0,150.0,6172.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581012 entries, 0 to 581011
Data columns (total 54 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Elevation                           581012 non-null  float64
 1   Aspect                              581012 non-null  float64
 2   Slope                               581012 non-null  float64
 3   Horizontal_Distance_To_Hydrology    581012 non-null  float64
 4   Vertical_Distance_To_Hydrology      581012 non-null  float64
 5   Horizontal_Distance_To_Roadways     581012 non-null  float64
 6   Hillshade_9am                       581012 non-null  float64
 7   Hillshade_Noon                      581012 non-null  float64
 8   Hillshade_3pm                       581012 non-null  float64
 9   Horizontal_Distance_To_Fire_Points  581012 non-null  float64
 10  Wilderness_Area_0                   581012 non-null  float64
 11  Wilderness_Area_1         

### Train-test split

In [8]:
prng = np.random.RandomState(42)
train_features, test_features, train_target, test_target = (
    model_selection.train_test_split(
        features,
        targets,
        random_state=prng,
        stratify=targets,
        test_size=0.1
    )
)

## Preparing the data

In [9]:
feature_preprocessor = compose.ColumnTransformer(
    transformers=[
        (
            "standard_scaler",
            preprocessing.StandardScaler(),
            [
                "Elevation",
                "Aspect",
                "Slope",
                "Horizontal_Distance_To_Hydrology",
                "Vertical_Distance_To_Hydrology",
                "Horizontal_Distance_To_Roadways",
                "Hillshade_9am",
                "Hillshade_Noon",
                "Hillshade_3pm",
                "Horizontal_Distance_To_Fire_Points"
            ]
        )
    ],
    remainder="passthrough",
    verbose_feature_names_out=False
).set_output(transform="pandas")

### Train using SGDClassifier

In [17]:
ml_pipeline = pipeline.make_pipeline(
    feature_preprocessor,
    linear_model.SGDClassifier(),
    verbose=True,
)
_ = ml_pipeline.fit(train_features, train_target)

[Pipeline] . (step 1 of 2) Processing columntransformer, total=   0.6s
[Pipeline] ..... (step 2 of 2) Processing sgdclassifier, total=  15.7s


### Assess Performance

In [18]:
train_predictions = ml_pipeline.predict(train_features)
report = metrics.classification_report(
    train_target,
    train_predictions
)
print(report)

              precision    recall  f1-score   support

           1       0.70      0.71      0.71    190656
           2       0.75      0.79      0.77    254970
           3       0.62      0.86      0.72     32179
           4       0.73      0.22      0.34      2472
           5       0.32      0.05      0.09      8544
           6       0.28      0.05      0.09     15630
           7       0.69      0.50      0.58     18459

    accuracy                           0.71    522910
   macro avg       0.59      0.45      0.47    522910
weighted avg       0.70      0.71      0.70    522910



In [19]:
test_predictions = ml_pipeline.predict(test_features)
report = metrics.classification_report(
    test_target,
    test_predictions
)
print(report)

              precision    recall  f1-score   support

           1       0.70      0.70      0.70     21184
           2       0.74      0.79      0.76     28331
           3       0.62      0.85      0.71      3575
           4       0.56      0.20      0.29       275
           5       0.28      0.05      0.08       949
           6       0.31      0.07      0.11      1737
           7       0.69      0.49      0.57      2051

    accuracy                           0.71     58102
   macro avg       0.56      0.45      0.46     58102
weighted avg       0.70      0.71      0.70     58102



### Exercise

Read through the documentation of `SGDClassifier`. Play around with the various tuning parameters and see if you can improve the performance of your classifier.

### Exercise

Analyze your model's errors using confusion matrices and interpret your results.