# Cover type classifier research project
This notebook compares classical machine-learning classifiers (k-nearest neighbours, decision tree, linear SVM, Gaussian naive Bayes) with a neural network (multi-layer perceptron) on the task of classifying the forest cover type of different terrain. The data set is from UCI and consists of quantitative and qualitative measurements of the environment. Details can be found in covtype.info

## Outline of the data:
No missing values

Number of instances: 581,012

Number of Attributes: 12 measures, but 54 columns of data (10 quantitative variables, 4 binary wilderness areas and 40 binary soil type variables)

### Forest Cover Type Classes:	
1 -- Spruce/Fir

2 -- Lodgepole Pine

3 -- Ponderosa Pine

4 -- Cottonwood/Willow

5 -- Aspen

6 -- Douglas-fir

7 -- Krummholz             

## Define helper functions

In [2]:
import pandas as pd
import json
import sys
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier


# Seed passed to train_test_split and to the seeded classifiers (decision tree, MLP)
# so that repeated runs give the same split and the same fitted models.
RANDOM_STATE = 77
# Relative path of the cover-type data file that split_data() reads with pd.read_csv.
DATA_SET = "covertype/covtype.data"
# Fraction of rows held out as the test set (0.2 -> 20% test, 80% train).
TEST_SIZE = 0.2

def split_data(data_set):
    """Load the cover-type data file and split it into train/test features and labels.

    Parameters
    ----------
    data_set : str
        Path of the header-less CSV file to read, e.g. "dataset.csv".

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_test, Y_test)``. The feature frames contain
        every column except the last one; the label series are the
        "Cover_Type" column of the corresponding split.
    """
    # Column layout: 10 quantitative measures, 4 binary wilderness-area flags,
    # 40 binary soil-type flags and finally the class label.
    quantitative_columns = [
        "Elevation",
        "Aspect",
        "Slope",
        "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Hillshade_9am",
        "Hillshade_Noon",
        "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points",
    ]
    wilderness_columns = [f"Wilderness_Area{number}" for number in range(1, 5)]
    soil_columns = [f"Soil_Type{number}" for number in range(1, 41)]

    frame = pd.read_csv(data_set, header=None)
    frame.columns = quantitative_columns + wilderness_columns + soil_columns + ["Cover_Type"]

    # Shuffled, seeded split so the same rows land in each partition on every run.
    train_part, test_part = train_test_split(
        frame,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        shuffle=True,
    )

    # Features are everything but the trailing label column.
    return (
        train_part.iloc[:, :-1],
        train_part["Cover_Type"],
        test_part.iloc[:, :-1],
        test_part["Cover_Type"],
    )
    
def knn_trainer(training_features, training_labels):
    """Fit and return a 5-nearest-neighbours classifier.

    Parameters
    ----------
    training_features : array-like
        Feature matrix to fit on.
    training_labels : array-like
        Class label for each row of ``training_features``.

    Returns
    -------
    KNeighborsClassifier
        The fitted estimator.
    """
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return KNeighborsClassifier(n_neighbors=5).fit(training_features, training_labels)


def tree_trainer(training_features, training_labels):
    """Fit and return a decision tree that splits on the entropy criterion.

    Parameters
    ----------
    training_features : array-like
        Feature matrix to fit on.
    training_labels : array-like
        Class label for each row of ``training_features``.

    Returns
    -------
    DecisionTreeClassifier
        The fitted estimator, seeded with ``RANDOM_STATE``.
    """
    classifier = DecisionTreeClassifier(criterion="entropy", random_state=RANDOM_STATE)
    return classifier.fit(training_features, training_labels)

def svm_trainer(training_features, training_labels):
    """Fit and return a linear-kernel support vector classifier with C = 0.01.

    Parameters
    ----------
    training_features : array-like
        Feature matrix to fit on.
    training_labels : array-like
        Class label for each row of ``training_features``.

    Returns
    -------
    SVC
        The fitted estimator.
    """
    classifier = SVC(C=0.01, kernel="linear")
    return classifier.fit(training_features, training_labels)
    
def gnb_trainer(training_features, training_labels):
    """Fit and return a Gaussian naive Bayes classifier with default settings.

    Parameters
    ----------
    training_features : array-like
        Feature matrix to fit on.
    training_labels : array-like
        Class label for each row of ``training_features``.

    Returns
    -------
    GaussianNB
        The fitted estimator.
    """
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return GaussianNB().fit(training_features, training_labels)

def mlp_trainer(training_features, training_labels):
    """Fit and return a multi-layer perceptron limited to 500 training iterations.

    Parameters
    ----------
    training_features : array-like
        Feature matrix to fit on.
    training_labels : array-like
        Class label for each row of ``training_features``.

    Returns
    -------
    MLPClassifier
        The fitted estimator, seeded with ``RANDOM_STATE``.
    """
    network = MLPClassifier(random_state=RANDOM_STATE, max_iter=500)
    return network.fit(training_features, training_labels)

In [19]:
# Split the data and scale it
# NOTE: the feature variables are rebound to the scaler's output below, so the
# later cells index them with array syntax ([:n, :]); the label variables are
# left exactly as split_data() returned them.
df_train_features, df_train_labels, df_test_features, df_test_labels = split_data(DATA_SET)
scaler = StandardScaler()
df_train_features = scaler.fit_transform(df_train_features)    # fit_transform fits "scaler" to the training set and then transforms it.
df_test_features = scaler.transform(df_test_features)          # transform only applies the already-fitted "scaler"; the test set is never used for fitting.

print("Done")

Done


## Train the models

In [2]:
# Time the k-NN fit on the complete training split.
# perf_counter() is a monotonic high-resolution clock intended for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted.
time1 = time.perf_counter()
knn = knn_trainer(df_train_features, df_train_labels)
time2 = time.perf_counter()

print(f"knn training complete, time taken: {(time2-time1):.2f}")

knn training complete, time taken: 0.19


In [29]:
# Time the decision-tree fit on the complete training split.
# perf_counter() is a monotonic high-resolution clock intended for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted.
time1 = time.perf_counter()
tree = tree_trainer(df_train_features,df_train_labels)
time2 = time.perf_counter()

print(f"tree training complete, time taken: {(time2-time1):.2f}")

tree training complete, time taken: 8.61


In [4]:
# Recorded timings for earlier runs of this cell:
# unscaled   time taken=37.56s C=0.01 #instances=1000
# scaled     time taken=3.99s C=0.01 #instances=10000

# The SVM is fitted on the first `instances` training rows only;
# swap in the commented expression to train on the whole training split.
instances = 10000 #len(df_train_features)
# `instances` is an integer row count, so print it without decimals.
print(f"svm #instances trained on: {instances}")

# perf_counter() is a monotonic high-resolution clock intended for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted.
time1 = time.perf_counter()
# .iloc makes the positional slice of the label Series explicit, matching the
# positional slice taken from the scaled feature array.
svm = svm_trainer(df_train_features[:instances,:], df_train_labels.iloc[:instances])
time2 = time.perf_counter()

print(f"svm training complete, time taken: {(time2-time1):.2f}")

svm #instances trained on: 10000.00
svm training complete, time taken: 3.99


In [11]:
# Time the Gaussian naive Bayes fit on the complete training split.
# perf_counter() is a monotonic high-resolution clock intended for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted.
time1 = time.perf_counter()
gnb = gnb_trainer(df_train_features, df_train_labels)
time2 = time.perf_counter()

print(f"gnb training complete, time taken: {(time2-time1):.2f}")

gnb training complete, time taken: 0.79


In [25]:
# Recorded timings for earlier runs of this cell:
# time taken=2.19s  #instances=1000
# time taken=24.43s #instances=10000

# The MLP is fitted on the first `instances` training rows only;
# swap in the commented expression to train on the whole training split.
instances = 10000 #len(df_train_features)
# `instances` is an integer row count, so print it without decimals.
print(f"mlp #instances trained on: {instances}")

# perf_counter() is a monotonic high-resolution clock intended for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted.
time1 = time.perf_counter()
# .iloc makes the positional slice of the label Series explicit, matching the
# positional slice taken from the scaled feature array.
mlp = mlp_trainer(df_train_features[:instances,:], df_train_labels.iloc[:instances])
time2 = time.perf_counter()

print(f"mlp training complete, time taken: {(time2-time1):.2f}")

mlp #instances trained on: 10000.00
mlp training complete, time taken: 24.43


## Test the models

In [25]:
# Report k-NN accuracy. Only the first 1000 rows of the test split are scored,
# so the figure is an estimate from a subset, not the full test-set accuracy.
print("knn acc: ")

# for scaled data (features are an array, hence the [:n, :] indexing)
print(knn.score(df_test_features[:1000,:],df_test_labels[:1000]))

# for unscaled data (features are still a DataFrame, hence .iloc)
# print(knn.score(df_test_features.iloc[:1000,:],df_test_labels.iloc[:1000]))

# Accuracies recorded from earlier runs:
# unscaled acc   97.4%
# scaled acc     93.7%

knn acc: 
0.937


In [30]:
# Report decision-tree accuracy. Only the first 1000 rows of the test split are
# scored, so the figure is an estimate from a subset, not the full test-set accuracy.
print("tree acc: ")

# for scaled data (features are an array, hence the [:n, :] indexing)
print(tree.score(df_test_features[:1000,:],df_test_labels[:1000]))

# for unscaled data (features are still a DataFrame, hence .iloc)
# print(tree.score(df_test_features.iloc[:1000,:],df_test_labels.iloc[:1000]))

# Accuracies recorded from earlier runs:
# unscaled acc   95.1%
# scaled acc     95.2%

tree acc: 
0.952


In [7]:
# Report linear-SVM accuracy. Only the first 1000 rows of the test split are
# scored, so the figure is an estimate from a subset, not the full test-set accuracy.
print("svm acc: ")

# for scaled data (features are an array, hence the [:n, :] indexing)
print(svm.score(df_test_features[:1000,:],df_test_labels[:1000]))

# for unscaled data (features are still a DataFrame, hence .iloc)
# print(svm.score(df_test_features.iloc[:1000,:],df_test_labels.iloc[:1000]))


# Accuracies recorded from earlier runs:
# unscaled acc   72.3%
# scaled acc     72.9%

svm acc: 
0.729


In [12]:
# Report Gaussian naive Bayes accuracy. Only the first 1000 rows of the test split
# are scored, so the figure is an estimate from a subset, not the full test-set accuracy.
print("gnb acc: ")

# for scaled data (features are an array, hence the [:n, :] indexing)
print(gnb.score(df_test_features[:1000,:],df_test_labels[:1000]))

# for unscaled data (features are still a DataFrame, hence .iloc)
# print(gnb.score(df_test_features.iloc[:1000,:],df_test_labels.iloc[:1000]))

# Accuracies recorded from earlier runs:
# unscaled acc   45.5%
# scaled acc     8.2%
# NOTE(review): 8.2% is below the ~14% that uniform guessing over the 7 cover
# types would give, so the scaled result looks suspicious -- TODO investigate.

gnb acc: 
0.082


In [26]:
# Report MLP accuracy. Only the first 1000 rows of the test split are scored,
# so the figure is an estimate from a subset, not the full test-set accuracy.
print("mlp acc: ")
print(mlp.score(df_test_features[:1000,:],df_test_labels[:1000]))

# Accuracies recorded from earlier runs:
# unscaled acc   60.4%
# scaled acc     79.0%

mlp acc: 
0.79


In [21]:
# Number of rows in the test split, for comparison with the 1000 rows
# that the accuracy cells above actually score.
print(len(df_test_features))


116203


## Install packages

In [1]:
pip install pandas
pip install scikit-learn

Collecting pandas
  Downloading pandas-2.3.2-cp313-cp313-macosx_10_13_x86_64.whl.metadata (91 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.3.3-cp313-cp313-macosx_14_0_x86_64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.2-cp313-cp313-macosx_10_13_x86_64.whl (11.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.5/11.5 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading numpy-2.3.3-cp313-cp313-macosx_14_0_x86_64.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, 