This is the file for Dataset 1, "Covertype."  We will be classifying the type of trees in the forest based on the other variables.

In [1]:
# Helpful Variables
mlp_filename = "./models/mlp.pkl"

In [2]:
import pandas as pd
import joblib
from pathlib import Path
from ucimlrepo import fetch_ucirepo 

# Load the Covertype dataset

covertype_features_filename = "./data/covertype_features.pkl"
covertype_targets_filename = "./data/covertype_targets.pkl"
path = Path(covertype_features_filename)

if not path.is_file():
    # download the dataset. It will take about a minute.
    print("Downloading dataset")
    covertype = fetch_ucirepo(id=31) 
    
    joblib.dump(covertype.data.features, covertype_features_filename)
    joblib.dump(covertype.data.targets, covertype_targets_filename)

# Load the covertype dataset
covertype_features = joblib.load(covertype_features_filename)
covertype_targets = joblib.load(covertype_targets_filename)

covertype_features.head()


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,0
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,0
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,0
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,0
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,0


In [3]:

# First, let's inspect the data we have to work with
print(f"Features: {list(covertype_features.columns)}")
print(f"Target: {list(covertype_targets.columns)}")

# Have a variety of pieces of information about the 30x30 meter forest cells
print("Head of data:")

# And the number of entries
print(f"We have {covertype_features.shape[0]} entries for {covertype_features.shape[1]} features")



Features: ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']
Target: ['Cover_Type']
Head of data:
We have 581012 entries for 54 features


In [4]:
# We also want to clean up our data so it is workable

# First, combine into same dataframe
df = covertype_features
df['Cover_Type'] = covertype_targets

# Note that there are no missing datapoints, so nothing needs to be dropped/fixed
print(f"Number of NaN Values: {df.isna().sum().sum()}")

# Also, all of the values are numeric so we don't need to fix that either

# Finally, separate into X and y
X = df.drop(columns=["Cover_Type"])
y = df["Cover_Type"]

Number of NaN Values: 0


In [47]:
# Now, need to rescale
from sklearn.preprocessing import StandardScaler

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert to pandas dataframe
df_scaled = pd.DataFrame(data=X_scaled, columns=X.columns)
df_scaled["Cover_Type"] = y
df_scaled.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Cover_Type
0,-1.297805,-0.935157,-1.48282,-0.053767,-0.796273,-1.180146,0.330743,0.439143,0.14296,3.246283,...,-0.057143,-0.014313,-0.022653,-0.165956,-0.156014,-0.123654,-0.232859,-0.879364,-0.260673,5
1,-1.319235,-0.89048,-1.616363,-0.270188,-0.899197,-1.257106,0.293388,0.590899,0.221342,3.205504,...,-0.057143,-0.014313,-0.022653,-0.165956,-0.156014,-0.123654,-0.232859,-0.879364,-0.260673,5
2,-0.554907,-0.148836,-0.681563,-0.006719,0.318742,0.532212,0.816364,0.742654,-0.196691,3.126965,...,-0.057143,-0.014313,-0.022653,-0.165956,-0.156014,-0.123654,-0.232859,-0.879364,-0.260673,2
3,-0.622768,-0.005869,0.520322,-0.129044,1.227908,0.474492,0.965786,0.742654,-0.536343,3.194931,...,-0.057143,-0.014313,-0.022653,-0.165956,-0.156014,-0.123654,-0.232859,-0.879364,-0.260673,2
4,-1.301377,-0.98877,-1.616363,-0.547771,-0.813427,-1.256464,0.293388,0.540313,0.195215,3.165479,...,-0.057143,-0.014313,-0.022653,-0.165956,-0.156014,-0.123654,-0.232859,-0.879364,-0.260673,5


In [66]:
# Also, let's split the data into training and testing sets for ease of work
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# These next two blocks lets us validate the stratification worked

y_train.value_counts()

Cover_Type
2    226640
1    169472
3     28603
7     16408
6     13894
5      7594
4      2198
Name: count, dtype: int64

In [68]:
y_test.value_counts()

Cover_Type
2    56661
1    42368
3     7151
7     4102
6     3473
5     1899
4      549
Name: count, dtype: int64

## Neural Network
Our task is to use classify the cover type (type of trees) based on the other features.  First, we will use a neural network.

In [61]:
# First, we'll train a neural network
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create a neural network classifier using scikit-learn's MLPClassifier
mlp = MLPClassifier(
    hidden_layer_sizes=(10,2),
    max_iter=200,
    activation="relu",
    random_state=42,
    solver="sgd",
    verbose=1,
    tol=1e-4,
    learning_rate_init=0.1,
)

# Fit the model to the training data
mlp.fit(X_train, y_train)

# Save the model to a file
joblib.dump(mlp,mlp_filename)

Iteration 1, loss = 0.69821241
Iteration 2, loss = 0.64776613
Iteration 3, loss = 0.64135935
Iteration 4, loss = 0.63892839
Iteration 5, loss = 0.63674306
Iteration 6, loss = 0.63591376
Iteration 7, loss = 0.63460498
Iteration 8, loss = 0.63402842
Iteration 9, loss = 0.63399669
Iteration 10, loss = 0.62859648
Iteration 11, loss = 0.62464987
Iteration 12, loss = 0.62321623
Iteration 13, loss = 0.62136869
Iteration 14, loss = 0.62047312
Iteration 15, loss = 0.61938512
Iteration 16, loss = 0.61891247
Iteration 17, loss = 0.61885988
Iteration 18, loss = 0.61897684
Iteration 19, loss = 0.61903078
Iteration 20, loss = 0.61898713
Iteration 21, loss = 0.61852144
Iteration 22, loss = 0.61819524
Iteration 23, loss = 0.61829538
Iteration 24, loss = 0.61758058
Iteration 25, loss = 0.61764603
Iteration 26, loss = 0.61702467
Iteration 27, loss = 0.61656109
Iteration 28, loss = 0.61649306
Iteration 29, loss = 0.61718952
Iteration 30, loss = 0.61696904
Iteration 31, loss = 0.61702121
Iteration 32, los

['./models/mlp.pkl']

In [70]:
# Now, evaluate the model
mlp = joblib.load(mlp_filename)

# Evaluate the performance on the training set
y_pred_train = mlp.predict(X_train)
accuracy = accuracy_score(y_train, y_pred_train)
print(f"Training Accuracy: {accuracy * 100:.2f}")

# Evaluate the performance on the test set
y_pred = mlp.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Testing Accuracy: {accuracy * 100:.2f}")

# Display classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

Training Accuracy: 67.89
Testing Accuracy: 67.84
Classification Report:
               precision    recall  f1-score   support

           1       0.61      0.83      0.70     42368
           2       0.78      0.62      0.69     56661
           3       0.78      0.56      0.66      7151
           4       0.62      0.21      0.31       549
           5       0.00      0.00      0.00      1899
           6       0.37      0.52      0.43      3473
           7       0.73      0.59      0.66      4102

    accuracy                           0.68    116203
   macro avg       0.56      0.48      0.49    116203
weighted avg       0.69      0.68      0.67    116203



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Here we can validate that 

import numpy as np
np.unique(y_pred, return_counts=True)

(array([1, 2, 3, 4, 6, 7], dtype=int64),
 array([57493, 45195,  5159,   186,  4853,  3317], dtype=int64))