<a href="https://colab.research.google.com/github/dcavin7/forest-cover/blob/main/Forest_Cover_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The goal of this project is to develop a neural network model that predicts forest cover type from cartographic variables. The available variables are listed below in the format *name / data type / measurement / description* (source: Codecademy):

* Elevation / quantitative / meters / Elevation in meters
* Aspect / quantitative / azimuth / Aspect in degrees azimuth
* Slope / quantitative / degrees / Slope in degrees
* Horizontal_Distance_To_Hydrology / quantitative / meters / Horz Dist to nearest surface water features
* Vertical_Distance_To_Hydrology / quantitative / meters / Vert Dist to nearest surface water features
* Horizontal_Distance_To_Roadways / quantitative / meters / Horz Dist to nearest roadway
* Hillshade_9am / quantitative / 0 to 255 index / Hillshade index at 9am, summer solstice
* Hillshade_Noon / quantitative / 0 to 255 index / Hillshade index at noon, summer solstice
* Hillshade_3pm / quantitative / 0 to 255 index / Hillshade index at 3pm, summer solstice
* Horizontal_Distance_To_Fire_Points / quantitative / meters / Horz Dist to nearest wildfire ignition points
* Wilderness_Area (4 binary columns) / qualitative / 0 (absence) or 1 (presence) / Wilderness area designation
* Soil_Type (40 binary columns) / qualitative / 0 (absence) or 1 (presence) / Soil Type designation
* Cover_Type (7 types) / integer / 1 to 7 / Forest Cover Type designation

# Setup

In [45]:
# Importing necessary packages
import tensorflow as tf
import sklearn
import pandas as pd
from google.colab import files
import io
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
import numpy as np
from imblearn.under_sampling import RandomUnderSampler

In [46]:
# Load the dataset from a file chosen in the Colab upload dialog.
# `files`, `io` and `pd` come from the imports cell at the top of the notebook.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; please select cover_data.csv.")

# Prefer the expected file name, but fall back to the first uploaded file so
# the cell does not fail with a KeyError when the file arrives under another
# name (e.g. a browser-renamed copy such as 'cover_data (1).csv').
csv_bytes = uploaded.get('cover_data.csv', next(iter(uploaded.values())))
df = pd.read_csv(io.BytesIO(csv_bytes))

Saving cover_data.csv to cover_data (1).csv


In [73]:
# Human-readable cover type names, listed in the order of the numerical
# class labels (first entry = lowest label).
y_labels = [
    "Spruce/Fir",         # 1st class
    "Lodgepole Pine",     # 2nd class
    "Ponderosa Pine",     # 3rd class
    "Cottonwood/Willow",  # 4th class
    "Aspen",              # 5th class
    "Douglas-fir",        # 6th class
    "Krummholz",          # 7th class
]

# Data exploration

In [None]:
# Column summary statistics
# Transposed so each of the 55 columns becomes a row: the wide frame would
# otherwise be truncated. Left as the cell's last expression so the notebook
# renders it as a rich table instead of plain printed text.
df.describe().T

*Output cleared for brevity.*

In [48]:
# Column attributes / data types
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581012 entries, 0 to 581011
Data columns (total 55 columns):
 #   Column                              Non-Null Count   Dtype
---  ------                              --------------   -----
 0   Elevation                           581012 non-null  int64
 1   Aspect                              581012 non-null  int64
 2   Slope                               581012 non-null  int64
 3   Horizontal_Distance_To_Hydrology    581012 non-null  int64
 4   Vertical_Distance_To_Hydrology      581012 non-null  int64
 5   Horizontal_Distance_To_Roadways     581012 non-null  int64
 6   Hillshade_9am                       581012 non-null  int64
 7   Hillshade_Noon                      581012 non-null  int64
 8   Hillshade_3pm                       581012 non-null  int64
 9   Horizontal_Distance_To_Fire_Points  581012 non-null  int64
 10  Wilderness_Area1                    581012 non-null  int64
 11  Wilderness_Area2                    581012 non-null 

# Preprocessing

In [49]:
# Separate the target from the predictors: the class label is the final column
y = df.iloc[:, -1]   # labels: last column only
X = df.iloc[:, :-1]  # features: every column before the last

In [50]:
# Hold out 10% of the rows for testing, stratified on the class label
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.1,
    stratify = y,
    random_state = 42,
)

# Correct the class imbalance by random undersampling.
# Applied to the training portion only; the test set is left as sampled.
rus = RandomUnderSampler(random_state = 10)
X_train, y_train = rus.fit_resample(X_train, y_train)

# Shuffle to prevent ordering bias in training. Features and labels are
# joined first so that every row keeps its own label while being reordered.
xy_train = (
    pd.concat([X_train, y_train], axis = 1, join = 'inner')
      .sample(frac = 1, random_state = 100)
)

# Split the shuffled frame back apart (class label is the last column)
y_train = xy_train.iloc[:, -1]
X_train = xy_train.iloc[:, :-1]

In [51]:
# Re-code the class labels as 0-6 instead of 1-7
le = LabelEncoder()

# Learn the mapping from the training labels, then reuse that same mapping
# for the test labels. Both results are wrapped in single-column dataframes.
y_train = pd.DataFrame(le.fit_transform(y_train))
y_test = pd.DataFrame(le.transform(y_test))

In [52]:
# Scaling
# Only the quantitative columns (the first ten, positions 0-9) are
# standardised. The remaining binary indicator columns are passed through as-is.
column_trans = ColumnTransformer(
    transformers = [('scaler', StandardScaler(), slice(0, 10))],
    remainder = 'passthrough',
)

# Fit the scaler on the training features, then apply it unchanged to test
X_train_scaled = column_trans.fit_transform(X_train)
X_test_scaled = column_trans.transform(X_test)

# Converting the transformed arrays back to Pandas dataframes, restoring the
# original feature column names
X_columns = X.columns
X_train_scaled, X_test_scaled = (
    pd.DataFrame(scaled, columns = X_columns)
    for scaled in (X_train_scaled, X_test_scaled)
)

# Building the Model

In [122]:
# Building the network as a single layer stack:
# input -> 64 -> 32 -> 16 (all ReLU) -> 7-unit softmax output
n_features = X_train_scaled.shape[1]  # one input per feature column

model = tf.keras.models.Sequential([
    tf.keras.Input(shape = (n_features, )),            # Input
    tf.keras.layers.Dense(64, activation = 'relu'),    # Hidden layer 1
    tf.keras.layers.Dense(32, activation = 'relu'),    # Hidden layer 2
    tf.keras.layers.Dense(16, activation = 'relu'),    # Hidden layer 3
    tf.keras.layers.Dense(7, activation = 'softmax'),  # Output: one unit per cover type
])

In [123]:
# Adam optimizer with an explicitly chosen learning rate
opt = tf.keras.optimizers.Adam(learning_rate = 5e-4)

In [124]:
# Compiling model
# The sparse variant of categorical cross-entropy is used because the labels
# are integer codes (0-6) rather than one-hot vectors.
model.compile(
    optimizer = opt,
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy'],
)

In [125]:
# Fitting model
# Stop once validation accuracy has failed to improve for 20 epochs in a row,
# and roll the weights back to the best epoch seen.
early_stop = tf.keras.callbacks.EarlyStopping(monitor = 'val_accuracy',
                                              mode = 'max',
                                              patience = 20,
                                              restore_best_weights = True)

# Keep the History object returned by fit(): it holds the per-epoch metrics
# (history.history) for later inspection or plotting, and assigning it stops
# the bare object repr from being echoed as the cell's output.
history = model.fit(X_train_scaled, y_train,
                    epochs = 100,            # upper bound; early stopping may end sooner
                    batch_size = 5,
                    verbose = 1,
                    validation_split = 0.2,  # fraction of training rows held out for validation
                    shuffle = True,
                    callbacks = [early_stop]
                    )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f2b0b6ff8d0>

In [127]:
# Scoring the model on the held-out test set
class_probs = model.predict(X_test_scaled)  # one row of class scores per sample
y_pred = np.argmax(class_probs, axis = 1)   # index of the highest-scoring class

# Per-class precision / recall / F1, labelled with the cover type names
report = classification_report(y_test, y_pred, target_names = y_labels)
print(report)

                   precision    recall  f1-score   support

       Spruce/Fir       0.79      0.68      0.73     21184
   Lodgepole Pine       0.80      0.73      0.76     28331
   Ponderosa Pine       0.75      0.76      0.75      3575
Cottonwood/Willow       0.43      0.99      0.60       275
            Aspen       0.26      0.92      0.40       949
      Douglas-fir       0.46      0.80      0.59      1737
        Krummholz       0.57      0.95      0.71      2051

         accuracy                           0.72     58102
        macro avg       0.58      0.83      0.65     58102
     weighted avg       0.76      0.72      0.73     58102



In [128]:
# Tally how many test rows were assigned to each predicted class code
y_pred_df = pd.DataFrame(y_pred)
predicted_class_counts = y_pred_df.value_counts()
print(predicted_class_counts)

1    25797
0    18189
2     3648
6     3435
4     3409
5     2986
3      638
dtype: int64
