In [2]:
# importing needed libraries
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Step 1: read the raw dataset from disk and preview its first five rows.
# (The separation into features X and target y is done in a later cell.)
dfPath = r'data.csv'
assignmentData = pd.read_csv(filepath_or_buffer=dfPath)
first_rows = assignmentData.head(n=5)
print(first_rows)

   Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
0       2596      51      3                               258   
1       2590      56      2                               212   
2       2804     139      9                               268   
3       2785     155     18                               242   
4       2595      45      2                               153   

   Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
0                               0                              510   
1                              -6                              390   
2                              65                             3180   
3                             118                             3090   
4                              -1                              391   

   Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \
0            221             232            148   
1            220             235            151   
2            234             238   

In [4]:
# Flag every row that contains at least one missing value, then count the flagged rows.
missing_mask = assignmentData.isnull()
null_rows = missing_mask.any(axis=1)
# Summing a boolean Series counts its True entries.
print('Number of rows with null values:', null_rows.sum())  # there were no null values

Number of rows with null values: 0


In [6]:
# Look for rows that are exact repeats of an earlier row.
is_duplicate = assignmentData.duplicated()
duplicate_rows = assignmentData.loc[is_duplicate]
print("Number of duplicate rows: ", len(duplicate_rows))  # none were found

Number of duplicate rows:  0


In [7]:
# Separate the target column from the predictor columns.
target_column = 'Cover_Type'
y = assignmentData[target_column]
X = assignmentData.drop(columns=[target_column])

# Preview both parts to confirm the separation.
print(X.head())
print(y.head())

   Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
0       2596      51      3                               258   
1       2590      56      2                               212   
2       2804     139      9                               268   
3       2785     155     18                               242   
4       2595      45      2                               153   

   Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
0                               0                              510   
1                              -6                              390   
2                              65                             3180   
3                             118                             3090   
4                              -1                              391   

   Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \
0            221             232            148   
1            220             235            151   
2            234             238   

In [8]:
# Second step: hold out 30% of the rows as the test set (a 70-30 division).
# Rows are shuffled with a fixed seed so the same split is produced on every run.
split_options = dict(test_size=0.3, random_state=10, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [10]:
# Third step: preprocessing.
# As checked in the earlier cells there are no missing values and no duplicate rows,
# and all of the data is numerical, so no categorical columns need to be converted.
# The only remaining question is whether scaling is needed.

# The head of the data shows that the columns are not on the same range,
# so the large-valued columns are standardised.
cols_to_scale = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
                 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
                 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
                 'Horizontal_Distance_To_Fire_Points']

# Rebuild both frames from the untouched feature table X, selecting rows through the
# row labels produced by the split. This keeps the cell idempotent: re-running it
# standardises the raw values again instead of the already standardised ones.
# The explicit copies also make sure the assignments below write into independent
# frames, which avoids pandas' SettingWithCopyWarning.
X_train = X.loc[X_train.index].copy()
X_test = X.loc[X_test.index].copy()

Scaler = StandardScaler()  # define our scaler

# The scaling parameters (mean and standard deviation) are learned from the training
# data only, because the test data must be treated as unseen. The same parameters
# are then reused to scale the test data.
X_train[cols_to_scale] = Scaler.fit_transform(X_train[cols_to_scale])
X_test[cols_to_scale] = Scaler.transform(X_test[cols_to_scale])

print(X_train.head())

        Elevation    Aspect     Slope  Horizontal_Distance_To_Hydrology  \
183380   0.702466 -0.784220  0.386543                         -0.683720   
561389  -1.431609  1.772207 -0.681631                         -1.069421   
370958   0.003004 -0.962991 -0.414587                          3.126252   
429888  -0.264648 -1.231147 -0.815152                         -1.125865   
170657  -0.821363 -0.650141  1.187673                         -0.561425   

        Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
183380                       -0.315721                         1.946292   
561389                       -0.779104                        -0.152020   
370958                        2.533231                         0.192671   
429888                       -0.779104                        -0.348435   
170657                       -0.092610                        -1.409466   

        Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \
183380       0.853218       -0.977752     

In [13]:
# Step 4: hyper-parameter selection with 5-fold cross-validation.
#
# The grid search takes a lot of time (the recorded answers were obtained on a server),
# so it is skipped by default and the best parameters from that run are set manually.
# Set RUN_GRID_SEARCH = True to repeat the search instead of using the recorded values.
# In the documentation, I will share the screenshots and all the data.
RUN_GRID_SEARCH = False

five_fold = KFold(n_splits=5, shuffle=True, random_state=10)


def run_grid_search(estimator, param_grid, X_fit, y_fit, cv, label, **grid_kwargs):
    """Run an exhaustive grid search and print a ranking of every candidate.

    Parameters
    ----------
    estimator : scikit-learn estimator to tune.
    param_grid : dict mapping parameter names to the lists of values to try.
    X_fit, y_fit : training features and labels passed to ``GridSearchCV.fit``.
    cv : cross-validation splitter used for every candidate.
    label : human-readable model name used in the printed summary.
    **grid_kwargs : extra keyword arguments forwarded to ``GridSearchCV``.

    Returns
    -------
    dict
        The ``best_params_`` found by the search.
    """
    grid = GridSearchCV(estimator, param_grid, cv=cv, **grid_kwargs)
    grid.fit(X_fit, y_fit)
    print(f"Best parameters for {label}: ", grid.best_params_)

    # Print all the other options, best mean score first.
    results_df = pd.DataFrame(grid.cv_results_)
    results_df = results_df[['params', 'mean_test_score', 'std_test_score']]
    results_df = results_df.sort_values(by=['mean_test_score'], ascending=False)
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None,
                           'display.precision', 3,
                           'display.max_colwidth', None
                           ):
        print(results_df)
    return grid.best_params_


# NOTE(review): per the scikit-learn documentation 'entropy' and 'log_loss' both select
# the Shannon information gain, so this grid presumably evaluates every combination
# twice - confirm before re-running the search.
tree_parameters ={'max_depth': [20, 25],
                    'criterion' : ['entropy', 'log_loss'],
                    'min_samples_split': [5, 7],
                    'min_samples_leaf' : [1, 2]}

# Same thing for random forest
forest_parameters =  {'n_estimators': [100, 150],
                    'criterion' : ['entropy'],
                    'min_samples_split': [5, 7],
                    'min_samples_leaf' : [1, 2],
                    }

if RUN_GRID_SEARCH:
    # The estimators are seeded so that repeated searches rank the candidates identically.
    Best_tree = run_grid_search(DecisionTreeClassifier(random_state=10), tree_parameters,
                                X_train, y_train, five_fold, "Decision Tree",
                                error_score="raise")
    Best_forest = run_grid_search(RandomForestClassifier(random_state=10), forest_parameters,
                                  X_train, y_train, five_fold, "Random Forest")
else:
    # Best parameters recorded from the server run.
    # NOTE(review): the tree grid searches max_depth, yet the recorded result has no
    # max_depth entry - confirm against the server results.
    Best_tree = {'criterion': 'log_loss', 'min_samples_leaf': 1, 'min_samples_split': 5}
    Best_forest = {'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 150}

print(Best_tree)
print(Best_forest)

{'criterion': 'log_loss', 'min_samples_leaf': 1, 'min_samples_split': 5}
{'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 150}


In [15]:
# Step 5a: train the decision tree with the selected hyper-parameters and evaluate it
# on the held-out test set.
# The parameters come from Best_tree so the model cannot drift from the recorded search
# result. random_state is fixed (same seed as the split and the CV folds) because the
# tree permutes the features randomly at each split; without a seed the scores below
# change from run to run.
tree_model = DecisionTreeClassifier(**Best_tree, random_state=10)
tree_model.fit(X_train, y_train)
y_pred = tree_model.predict(X_test)

# Overall accuracy, per-class precision/recall/F1, and the confusion matrix.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.9391694969708096
              precision    recall  f1-score   support

           1       0.94      0.94      0.94     63756
           2       0.95      0.95      0.95     84864
           3       0.93      0.94      0.93     10718
           4       0.85      0.83      0.84       782
           5       0.84      0.82      0.83      2875
           6       0.89      0.88      0.88      5182
           7       0.95      0.94      0.95      6127

    accuracy                           0.94    174304
   macro avg       0.91      0.90      0.90    174304
weighted avg       0.94      0.94      0.94    174304

[[59969  3482     3     0    51     7   244]
 [ 3730 80362   232     0   346   145    49]
 [    3   192 10049    79    28   367     0]
 [    0     0    95   652     0    35     0]
 [   52   423    30     0  2357    12     1]
 [    8   173   400    36     9  4556     0]
 [  335    34     1     0     1     0  5756]]


In [16]:
# Step 5b: train the random forest with the selected hyper-parameters and evaluate it
# on the held-out test set.
# The parameters come from Best_forest so the model cannot drift from the recorded search
# result. random_state is fixed (same seed as the split and the CV folds) because the
# forest bootstraps rows and samples features randomly; without a seed the scores below
# change from run to run. n_jobs=-1 builds the trees on all available cores and does
# not affect the fitted model.
forest_model = RandomForestClassifier(**Best_forest, random_state=10, n_jobs=-1)
forest_model.fit(X_train, y_train)
y_pred = forest_model.predict(X_test)

# Overall accuracy, per-class precision/recall/F1, and the confusion matrix.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.9533171929502479
              precision    recall  f1-score   support

           1       0.97      0.94      0.95     63756
           2       0.95      0.97      0.96     84864
           3       0.94      0.96      0.95     10718
           4       0.91      0.86      0.89       782
           5       0.94      0.77      0.85      2875
           6       0.93      0.89      0.91      5182
           7       0.97      0.94      0.96      6127

    accuracy                           0.95    174304
   macro avg       0.94      0.91      0.92    174304
weighted avg       0.95      0.95      0.95    174304

[[59901  3704     1     0    14     4   132]
 [ 1774 82664   182     1   112   103    28]
 [    2   152 10294    40    16   214     0]
 [    0     0    90   674     0    18     0]
 [   37   554    41     0  2227    16     0]
 [    6   163   362    23     3  4625     0]
 [  320    24     0     0     1     0  5782]]


# Thank you.