In [1]:
# Importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Column labels for the raw data file, in file order: ten measurement
# columns, four wilderness-area columns, forty soil-type columns and,
# last, the Cover_Type label.

measurement_cols = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]
wilderness_cols = [f'Wilderness_Area_{i}' for i in range(1, 5)]
soil_cols = [f'Soil_Type_{i}' for i in range(1, 41)]

col_names = measurement_cols + wilderness_cols + soil_cols + ['Cover_Type']

In [3]:
# Location of the gzip-compressed data file
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"

# The file is read without a header row, so labels are supplied via col_names
dataset = pd.read_csv(
    data_url,
    names=col_names,
    header=None,
    compression='gzip',
)

In [4]:
# Preview the first five rows of the loaded frame
dataset.head(n=5)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Soil_Type_40,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [5]:
# Dimensions of the loaded frame as (rows, columns)
dataset.shape

(581012, 55)

In [6]:
# Summary of dataset: prints the index range, each column's non-null count
# and its dtype

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581012 entries, 0 to 581011
Data columns (total 55 columns):
Elevation                             581012 non-null int64
Aspect                                581012 non-null int64
Slope                                 581012 non-null int64
Horizontal_Distance_To_Hydrology      581012 non-null int64
Vertical_Distance_To_Hydrology        581012 non-null int64
Horizontal_Distance_To_Roadways       581012 non-null int64
Hillshade_9am                         581012 non-null int64
Hillshade_Noon                        581012 non-null int64
Hillshade_3pm                         581012 non-null int64
Horizontal_Distance_To_Fire_Points    581012 non-null int64
Wilderness_Area_1                     581012 non-null int64
Wilderness_Area_2                     581012 non-null int64
Wilderness_Area_3                     581012 non-null int64
Wilderness_Area_4                     581012 non-null int64
Soil_Type_1                           581012 non-

###### The summary shows that there are
- 581012 rows
- 55 columns (54 features plus the `Cover_Type` target)
- No null values
- All columns have dtype int64

In [7]:
# Separate the target column (Cover_Type) from the predictor columns

y = dataset['Cover_Type']
X = dataset.drop(columns='Cover_Type')

In [8]:
# Feature Scaling

from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()

# Build a new float frame from the scaled array instead of writing it back
# with X[:] = ...: X holds int64 columns, and assigning floats into them
# relies on an implicit dtype upcast that recent pandas releases deprecate.
# Column labels and the row index are carried over so frames derived from X
# keep their column names.
X = pd.DataFrame(sc_X.fit_transform(X), columns = X.columns, index = X.index)

# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics feed into the transform. Consider
# fitting on the training rows only and applying transform() to the test rows.

In [9]:
# Hold out 25% of the rows as a test set; the seed is fixed so the split
# is the same on every run

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [10]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [11]:
%%time
# Creating and fitting the model
# random_state is pinned so the forest, and every score derived from it, is
# reproducible across runs; the value matches the seed used for the
# train/test split. oob_score = True makes the fitted model expose oob_score_.
model_rf = RandomForestClassifier(n_estimators = 50, oob_score = True,
                              n_jobs = -1, random_state = 1)
model_rf.fit(X_train, y_train)

# Predicting test set results
y_pred = model_rf.predict(X_test)

# Predicting training set results
x_pred = model_rf.predict(X_train)

# Out-of-bag (OOB) score of this model
oob_score = model_rf.oob_score_

# Creating confusion matrix (true test labels vs. predicted test labels)
cm = confusion_matrix(y_test, y_pred)

# Training set accuracy
accuracy_x = accuracy_score(y_train, x_pred)

# Test set accuracy
accuracy_y = accuracy_score(y_test, y_pred)

# Print accuracies and OOB score
print(f'Training Set Accuracy = {accuracy_x:.3f} \tTest Set Accuracy = {accuracy_y:.3f}')

print(f'\nOOB Score = {oob_score:.3f}')

Training Set Accuracy = 1.000        	Test Set Accuracy = 0.953

OOB Score = 0.948
Wall time: 1min 7s


In [12]:
# Feature Importance
fi = model_rf.feature_importances_

# One row per feature, labelled with the training frame's column names
fi_df = pd.DataFrame({'Importance': fi}, index=X_train.columns)

In [13]:
# Rank the features from most to least important
fi_df.sort_values('Importance', ascending=False)

Unnamed: 0,Importance
Elevation,0.243212
Horizontal_Distance_To_Roadways,0.118065
Horizontal_Distance_To_Fire_Points,0.109321
Horizontal_Distance_To_Hydrology,0.060857
Vertical_Distance_To_Hydrology,0.057249
Aspect,0.047634
Hillshade_Noon,0.043833
Hillshade_9am,0.041434
Hillshade_3pm,0.041426
Slope,0.032452


In [14]:
# Display confusion matrix as a dataframe
# Row labels name the actual cover type, column labels the predicted one (1-7)
cm_df = pd.DataFrame(
    cm,
    index=[f'Actual Covertype {k}' for k in range(1, 8)],
    columns=[f'Predicted Covertype {k}' for k in range(1, 8)],
)
cm_df

Unnamed: 0,Predicted Covertype 1,Predicted Covertype 2,Predicted Covertype 3,Predicted Covertype 4,Predicted Covertype 5,Predicted Covertype 6,Predicted Covertype 7
Actual Covertype 1,49945,2840,1,0,31,3,117
Actual Covertype 2,1696,68704,167,1,88,100,25
Actual Covertype 3,1,145,8637,29,8,189,0
Actual Covertype 4,0,0,86,585,0,14,0
Actual Covertype 5,35,496,27,0,1785,9,0
Actual Covertype 6,6,121,336,22,2,3881,0
Actual Covertype 7,250,25,0,0,2,0,4844


In [15]:
# Classification Report
# Display names for the seven cover types, in label order 1-7
class_names = [f'Cover Type {k}' for k in range(1, 8)]

# output_dict=True returns the report as a dict instead of a formatted string
c_report = classification_report(
    y_test,
    y_pred,
    target_names=class_names,
    output_dict=True,
)

In [16]:
# Tabulate the classification report dict for display
report_df = pd.DataFrame(c_report)
report_df

Unnamed: 0,Cover Type 1,Cover Type 2,Cover Type 3,Cover Type 4,Cover Type 5,Cover Type 6,Cover Type 7,accuracy,macro avg,weighted avg
f1-score,0.952513,0.960143,0.945847,0.885023,0.836457,0.906352,0.958544,0.952689,0.920697,0.952444
precision,0.96172,0.949856,0.933326,0.918367,0.931628,0.924929,0.97152,0.952689,0.941621,0.952725
recall,0.94348,0.970656,0.958708,0.854015,0.758929,0.888507,0.945909,0.952689,0.902886,0.952689
support,52937.0,70781.0,9009.0,685.0,2352.0,4368.0,5121.0,0.952689,145253.0,145253.0


In [17]:
%%time

# Calculating cross validation scores

from sklearn.model_selection import cross_val_score

# 5-fold cross validation of model_rf on the training data only; the
# result is one score per fold.
accuracies = cross_val_score(estimator = model_rf, 
                             X = X_train, y = y_train, 
                             cv = 5, 
                             n_jobs = -1)

print("Cross Validation (5 fold)")
print("=" * 50)
print(f'Mean Accuracy = {accuracies.mean() * 100:.3f}%')
# accuracies.std() is the standard deviation of the fold scores, not their
# variance, so it is labelled accordingly.
print(f'Standard Deviation = {accuracies.std() * 100:.3f}%')

Cross Validation (5 fold)
Mean Accuracy = 94.689%
Variance = 0.064%
Wall time: 3min 24s


###### After evaluating the model's OOB score, confusion matrix, classification report and cross-validation scores, we can conclude that the model is performing well.