In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# First look at the raw file. No header option is passed here, so pandas takes
# the first line of the file as the column names (visible as numeric "headers"
# in the printed preview).
data = pd.read_csv('covtype.data')
print(data.head())

   2596   51   3  258    0   510  221  232  148  6279  ...  0.34  0.35  0.36  \
0  2590   56   2  212   -6   390  220  235  151  6225  ...     0     0     0   
1  2804  139   9  268   65  3180  234  238  135  6121  ...     0     0     0   
2  2785  155  18  242  118  3090  238  238  122  6211  ...     0     0     0   
3  2595   45   2  153   -1   391  220  234  150  6172  ...     0     0     0   
4  2579  132   6  300  -15    67  230  237  140  6031  ...     0     0     0   

   0.37  0.38  0.39  0.40  0.41  0.42  5  
0     0     0     0     0     0     0  5  
1     0     0     0     0     0     0  2  
2     0     0     0     0     0     0  2  
3     0     0     0     0     0     0  5  
4     0     0     0     0     0     0  2  

[5 rows x 55 columns]


In [3]:
# Column names for covtype.data, listed in file order:
#   10 terrain measurements, 4 binary Wilderness_Area flags,
#   40 binary Soil_Type flags, and finally the Cover_Type target.
# The flag suffixes are zero-based (Wilderness_Area_0..3, Soil_Type_0..39).
cols = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    *(f'Wilderness_Area_{i}' for i in range(4)),
    *(f'Soil_Type_{i}' for i in range(40)),
    'Cover_Type',
]
# Reload the file with header=None so the first record is kept as data,
# and label the columns with the names collected in `cols`.
data = pd.read_csv(
    'covtype.data',
    names=cols,
    header=None,
)

# Preview the first rows again, now with the named columns.
print(data.head())

   Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
0       2596      51      3                               258   
1       2590      56      2                               212   
2       2804     139      9                               268   
3       2785     155     18                               242   
4       2595      45      2                               153   

   Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
0                               0                              510   
1                              -6                              390   
2                              65                             3180   
3                             118                             3090   
4                              -1                              391   

   Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \
0            221             232            148   
1            220             235            151   
2            234             238   

In [4]:
# Print the standard deviation of every soil-type column: a value of 0 would
# mean the column is constant and can be dropped as useless.
for soil_col in (f'Soil_Type_{idx}' for idx in range(40)):
    print(data[soil_col].std())

0.07203855962387314
0.11306555332495184
0.09073114340132485
0.1444992500947708
0.05235553437275834
0.10577532120169254
0.013441979413645559
0.017549608330626867
0.04438747374806503
0.23024512189619914
0.14457904566141022
0.22118610713758266
0.17059040277907075
0.03209203434127658
0.002272308686918377
0.06980444151674238
0.0765182637225768
0.057076772986421095
0.08290234734021186
0.12522802739079178
0.03795040474102824
0.2326805491523966
0.2991972202713802
0.18783283689812347
0.028550897741019286
0.06660457685361668
0.04319330879380841
0.040318052398237984
0.398761766442427
0.22187912770254212
0.20548313855478378
0.28674319188827785
0.26772457957475143
0.052583884085586986
0.05695681459945466
0.014309907410240248
0.02264146643453979
0.16150809446237013
0.15230691241928612
0.12179143630018043


In [5]:
# Shift the target labels from 1-based to 0-based.
# The shift is guarded on the current minimum label so that re-running this
# cell does not subtract 1 a second time and silently corrupt the labels.
if data['Cover_Type'].min() == 1:
    data['Cover_Type'] = data['Cover_Type'] - 1

print(f"New unique labels: {data['Cover_Type'].unique()}")

New unique labels: [4 1 0 6 2 5 3]


In [6]:
# Pick out the binary flag columns by name: anything containing
# 'Wilderness_Area' or 'Soil_Type'.
binary_cols = [
    name
    for name in data.columns
    if any(tag in name for tag in ('Wilderness_Area', 'Soil_Type'))
]
# Store the flags as int8 (1 byte per entry) to reduce memory usage;
# presumably they were read as 8-byte integers.
data[binary_cols] = data[binary_cols].astype('int8')

In [7]:
import numpy as np

# Engineered feature: straight-line distance to hydrology, combining the
# horizontal and vertical distances as sqrt(h^2 + v^2).
data['Distance_To_Hydrology_Euclidean'] = np.sqrt(
    np.square(data['Horizontal_Distance_To_Hydrology'])
    + np.square(data['Vertical_Distance_To_Hydrology'])
)

print(data[['Distance_To_Hydrology_Euclidean']].head())

   Distance_To_Hydrology_Euclidean
0                       258.000000
1                       212.084889
2                       275.769832
3                       269.235956
4                       153.003268


In [8]:
from sklearn.model_selection import train_test_split

# Target is the Cover_Type column; every other column is a feature.
y = data['Cover_Type']
X = data.drop(columns=['Cover_Type'])

# Hold out 20% of the rows for testing. stratify=y keeps the class
# proportions the same in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training shapes: {X_train.shape}, {y_train.shape}")
print(f"Testing shapes: {X_test.shape}, {y_test.shape}")

Training shapes: (464809, 55), (464809,)
Testing shapes: (116203, 55), (116203,)


In [9]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with 100 trees.
#   max_depth=20 caps how deep each tree may grow, limiting complexity/overfitting.
#   n_jobs=-1    trains the trees in parallel.
#   verbose=1    prints progress while training.
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=20,
    n_jobs=-1,
    verbose=1,
)

# Fit on the training split.
rf_model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   56.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.0min finished


In [11]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
# Score the baseline forest on the held-out test split.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Overall Accuracy: {accuracy:.4f}")

# Per-class precision, recall and F1-score.
print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    4.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    7.9s finished


Overall Accuracy: 0.8914

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.85      0.88     42368
           1       0.87      0.94      0.90     56661
           2       0.92      0.93      0.92      7151
           3       0.91      0.84      0.87       549
           4       0.97      0.39      0.56      1899
           5       0.91      0.79      0.85      3473
           6       0.98      0.87      0.92      4102

    accuracy                           0.89    116203
   macro avg       0.92      0.80      0.84    116203
weighted avg       0.89      0.89      0.89    116203



In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random forest with class_weight='balanced', which gives more "importance"
# to the rare classes during training.
rf_balanced = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_estimators=100,
    max_depth=20,
    n_jobs=-1,
    verbose=1,
)

# Fit on the training split, then predict the held-out test split.
rf_balanced.fit(X_train, y_train)
y_pred_balanced = rf_balanced.predict(X_test)

# Per-class metrics for the class-weighted model.
print("--- Balanced Model Report ---")
print(classification_report(y_test, y_pred_balanced))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.2min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.2s


--- Balanced Model Report ---
              precision    recall  f1-score   support

           0       0.91      0.89      0.90     42368
           1       0.93      0.89      0.91     56661
           2       0.90      0.93      0.92      7151
           3       0.83      0.88      0.85       549
           4       0.52      0.93      0.67      1899
           5       0.76      0.91      0.83      3473
           6       0.92      0.97      0.94      4102

    accuracy                           0.90    116203
   macro avg       0.82      0.92      0.86    116203
weighted avg       0.91      0.90      0.90    116203



[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    2.6s finished
