In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.ensemble import RandomForestClassifier

In [2]:
#!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo

# fetch dataset
covertype = fetch_ucirepo(id=31)

# data (as pandas dataframes)
X = covertype.data.features
y = covertype.data.targets

df_combined = X.copy()
df_combined['Cover_Type'] = y['Cover_Type']

# metadata
print(covertype.metadata)

# variable information
print(covertype.variables)

{'uci_id': 31, 'name': 'Covertype', 'repository_url': 'https://archive.ics.uci.edu/dataset/31/covertype', 'data_url': 'https://archive.ics.uci.edu/static/public/31/data.csv', 'abstract': 'Classification of pixels into 7 forest cover types based on attributes such as elevation, aspect, slope, hillshade, soil-type, and more.', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 581012, 'num_features': 54, 'feature_types': ['Categorical', 'Integer'], 'demographics': [], 'target_col': ['Cover_Type'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1998, 'last_updated': 'Sat Mar 16 2024', 'dataset_doi': '10.24432/C50K5N', 'creators': ['Jock Blackard'], 'intro_paper': None, 'additional_info': {'summary': 'Predicting forest cover type from cartographic variables only (no remotely sensed data).  The actual forest cover type for a given observation (30 x 30 meter cell) was determined from

# Check-in Week 8/9/10

Here we will use a neural network for multiclass classification. The response variable here is the forest cover type

For this dataset, we will use a normal deep neural network



In [3]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate

# # Uses GPU runtime
# device = torch.device("cuda:0")
loss_function = nn.CrossEntropyLoss()

# Extremely simple model architecture
model = nn.Sequential()

# Input layer
model.add_module("Input", nn.Linear(in_features = X.shape[1], out_features = 1024))

# Hidden layers
model.add_module("Hidden Layer 1", nn.Linear(in_features=1024, out_features=1024))
model.add_module("Activation 1", nn.Sigmoid())
model.add_module("Hidden Layer 2", nn.Linear(in_features=1024, out_features=1024))  
model.add_module("Activation 2", nn.Sigmoid())

# Add dropout for regularization
model.add_module("Dropout", nn.Dropout(p=0.5))

# Output layer
model.add_module("Output", nn.Linear(in_features=1024, out_features = len(np.unique(y))))
# model.to(device)


In [4]:
batch_size = 100

# Shift the y labels to start from 0 (if this doesn't happen, PyTorch can't train)
y = y - 1

# Convert Trees Pandas DataFrame to Torch Tensors
# Scaled the data to minimize effect of features with larger ranges (such as distance to hydrology)
X_tensor = torch.tensor(StandardScaler().fit_transform(X.to_numpy()), dtype=torch.float32)
y_tensor = torch.tensor(y.to_numpy().flatten(), dtype=torch.long)

# Split dataset into training, validation and testing sets
# Stratify the y-data to preserve class distribution
X_train, X_temp, y_train, y_temp = train_test_split(X_tensor, y_tensor, test_size=0.3, random_state=42, stratify=y_tensor)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test, y_test)

# Convert to DataLoader for batching
train_dl = DataLoader(train_dataset, batch_size = batch_size, shuffle=True)
val_dl = DataLoader(val_dataset, batch_size = batch_size, shuffle=False)
test_dl = DataLoader(test_dataset, batch_size = batch_size, shuffle=False)

In [None]:
num_epochs = 10

# We will use the cross entropy loss function for multiclass
loss_function = nn.CrossEntropyLoss()

# Use standard optimizer, Adam optimizer for backpropagation calculations
# Fix the learning rate for now
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

# Our training function (just the same thing from the MNIST notebook)
def train(model, num_epochs, train_dl, valid_dl):
    loss_hist_train = [0] * num_epochs
    accuracy_hist_train = [0] * num_epochs
    loss_hist_valid = [0] * num_epochs
    accuracy_hist_valid = [0] * num_epochs

    # Form 
    for epoch in range(num_epochs):
        model.train()
        for x_batch, y_batch in train_dl:
            # x_batch = x_batch.to(device)
            # y_batch = y_batch.to(device)
            pred = model(x_batch)
            loss = loss_function(pred, y_batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loss_hist_train[epoch] += loss.item()*y_batch.size(0)
            is_correct = (torch.argmax(pred, dim=1) == y_batch).float()
            accuracy_hist_train[epoch] += is_correct.sum().cpu()

        loss_hist_train[epoch] /= len(train_dl.dataset)
        accuracy_hist_train[epoch] /= len(train_dl.dataset)

        model.eval()
        with torch.no_grad():
            for x_batch, y_batch in valid_dl:
                # x_batch = x_batch.to(device)
                # y_batch = y_batch.to(device)
                pred = model(x_batch)
                loss = loss_function(pred, y_batch)
                loss_hist_valid[epoch] += loss.item() * y_batch.size(0)
                is_correct = (torch.argmax(pred, dim=1) == y_batch).float()
                accuracy_hist_valid[epoch] += is_correct.sum().cpu()

        loss_hist_valid[epoch] /= len(valid_dl.dataset)
        accuracy_hist_valid[epoch] /= len(valid_dl.dataset)

        print(f'Epoch {epoch+1} accuracy: {accuracy_hist_train[epoch]:.4f} val_accuracy: {accuracy_hist_valid[epoch]:.4f}')
    return loss_hist_train, loss_hist_valid, accuracy_hist_train, accuracy_hist_valid

torch.manual_seed(1)
hist_train_validation = train(model, num_epochs, train_dl, val_dl)

Epoch 1 accuracy: 0.7828 val_accuracy: 0.8508
Epoch 2 accuracy: 0.8621 val_accuracy: 0.8769
Epoch 3 accuracy: 0.8825 val_accuracy: 0.8932
Epoch 4 accuracy: 0.8916 val_accuracy: 0.8968
Epoch 5 accuracy: 0.8968 val_accuracy: 0.9048
Epoch 6 accuracy: 0.9007 val_accuracy: 0.9073
Epoch 7 accuracy: 0.9034 val_accuracy: 0.9082
Epoch 8 accuracy: 0.9061 val_accuracy: 0.9106
Epoch 9 accuracy: 0.9065 val_accuracy: 0.9066


For our neural network training, we used crossentropyloss as the standard loss function as it combines logsoftmax to convert our logits into probabilities and measures how well the predicted probabilities match the true class labels.

Since we have 7 forest cover types, using CrossEntropyLoss penalizes confident wrong predictions more heavily than uncertain ones and we can see that the loss decreases as the epochs increases.

To preprocess our data, we normalized our features using standard scaling to ensure that all features have a mean of 0 and a standard deviation of 1. This helps the neural network converge faster and improves overall performance.

For our accuracy, we need it to detect overfitting by tracking it separately on our training and validation sets.

For our learning rate, we decided with learning rate of 0.001 being the best with fast convergence, stable training epochs, and a solid final validation accuracy.

For our hyperparameter tuning, we chose SGD from class with momentum to get convergence and was a solid choice for neural network training. For our batch size, we chose 100 as we wanted to have faster updates with generalization and 10 epochs to see model behavior after convergence.

To complete everything, we split it into a training of 70, validation of 15, and test of 15 using PyTorch's default initialization. Our forward pass computes the predictions and CrossEntropyLoss calculates the loss and our backward pass computing the gradient with SGD as our optimization.