In [1]:
from collections import Counter

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [2]:
# Read the preprocessed training split, then separate the label column
# ("Cover_Type") from the remaining feature columns.
preprocessed_train = pd.read_csv("preprocessed_train.csv.gz")
y_train = preprocessed_train["Cover_Type"]
X_train = preprocessed_train.drop(columns=["Cover_Type"])


In [3]:
# Read the preprocessed test split, then separate the label column
# ("Cover_Type") from the remaining feature columns.
preprocessed_test = pd.read_csv("preprocessed_test.csv.gz")
y_test = preprocessed_test["Cover_Type"]
X_test = preprocessed_test.drop(columns=["Cover_Type"])


In [4]:
# Preview the first five feature rows as a quick sanity check of the load.
X_train.head(n=5)


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_30,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39
0,0.180063,-1.152978,1.522711,-0.314378,-0.077716,-0.426551,-0.473557,-2.353343,-1.027684,0.205501,...,-0.142691,-0.209188,-0.202152,-0.037851,-0.080474,-0.019854,-0.042082,-0.219848,-0.207335,-0.164284
1,1.373291,1.640701,-0.1629,1.867964,0.417458,-0.624808,-0.898336,-0.021423,0.819762,-0.313124,...,-0.142691,-0.209188,-0.202152,-0.037851,-0.080474,-0.019854,-0.042082,-0.219848,-0.207335,-0.164284
2,0.177677,1.741456,-0.283301,-0.572813,-0.506866,0.532443,-0.669609,-0.065421,0.602415,0.372945,...,-0.142691,-0.209188,-0.202152,-0.037851,-0.080474,-0.019854,-0.042082,-0.219848,-0.207335,-0.164284
3,0.218247,-0.026347,-1.126107,-1.089684,-0.836981,1.714475,0.539376,0.990543,0.211192,1.668149,...,-0.142691,-0.209188,-0.202152,-0.037851,-0.080474,-0.019854,-0.042082,-0.219848,-0.207335,-0.164284
4,1.012936,0.50491,0.559505,0.030202,0.483481,1.078401,-0.473557,1.562523,1.080578,-0.490524,...,-0.142691,-0.209188,4.946771,-0.037851,-0.080474,-0.019854,-0.042082,-0.219848,-0.207335,-0.164284


In [5]:
# Baseline model: ordinary least squares fitted directly on the Cover_Type
# label. n_jobs=-1 is passed through unchanged from the original cell.
#
# NOTE(review): the target is an integer class label (the following cells
# round predictions into classes 1-7 and score them with classification
# metrics), so this fit treats the classes as ordered numeric values.
# Keep that in mind when reading the scores below.
linear_model = LinearRegression(n_jobs=-1)
linear_model.fit(X_train, y_train)

# Report the fit quality on the data the model was trained on.
print("Linear Regression model trained successfully!")
print(f"Training R² score: {linear_model.score(X_train, y_train):.4f}")


Linear Regression model trained successfully!
Training R² score: 0.3930


In [6]:
# The regressor emits real-valued outputs for the test features.
y_pred_continuous = linear_model.predict(X_test)

# Map them onto class labels: nearest integer first, then clamp into the
# valid label range [1, 7].
y_pred = np.rint(y_pred_continuous).clip(1, 7).astype(int)

# Print the first ten raw outputs, rounded classes and true labels together
# so the effect of rounding is visible.
print("Sample raw predictions (continuous):", y_pred_continuous[:10])
print("Sample rounded predictions (classes):", y_pred[:10])
print("Actual classes:                     ", y_test.values[:10])


Sample raw predictions (continuous): [2.87337049 2.37538272 2.82827765 3.47068764 2.51174296 1.49056414
 2.20247643 2.00585075 2.74904621 5.77019411]
Sample rounded predictions (classes): [3 2 3 3 3 1 2 2 3 6]
Actual classes:                      [2 2 1 1 1 1 1 2 1 7]


In [7]:
# Score the rounded predictions against the test labels. Precision, recall
# and F1 are computed with average='weighted'.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Each section is introduced by a blank line and a title framed by two rules.
separator = "=" * 50

print("\n" + separator, "TEST SET VALIDATION RESULTS", separator, sep="\n")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")

# Per-class breakdown of precision / recall / F1 / support.
print("\n" + separator, "CLASSIFICATION REPORT", separator, sep="\n")
print(classification_report(y_test, y_pred))

# Rows are true classes, columns are predicted classes.
print("\n" + separator, "CONFUSION MATRIX", separator, sep="\n")
print(confusion_matrix(y_test, y_pred))



TEST SET VALIDATION RESULTS
Accuracy:  0.2341
Precision: 0.5609
Recall:    0.2341
F1-Score:  0.2988

CLASSIFICATION REPORT
              precision    recall  f1-score   support

           1       0.67      0.09      0.16     63552
           2       0.59      0.36      0.45     84991
           3       0.01      0.06      0.02     10726
           4       0.02      0.83      0.04       824
           5       0.03      0.11      0.04      2848
           6       0.00      0.00      0.00      5210
           7       0.81      0.36      0.50      6153

    accuracy                           0.23    174304
   macro avg       0.30      0.26      0.17    174304
weighted avg       0.56      0.23      0.30    174304


CONFUSION MATRIX
[[ 5934 21285 25288  4054  2765  3776   450]
 [ 2896 30994 32645 13970  4091   328    67]
 [    0     1   682  7820  2221     1     1]
 [    0     0    77   680    67     0     0]
 [    3   235   813  1495   302     0     0]
 [    0    15   387  2681  2125     