In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from collections import Counter

In [2]:
# Column names for the header-less Covertype file, in file order:
# 10 quantitative features, 4 wilderness-area indicators, 40 soil-type
# indicators, and finally the target label (55 names in total).
columns = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
    # Indicator columns are numbered from 0 here.
    *(f"Wilderness_Area_{idx}" for idx in range(4)),
    *(f"Soil_Type_{idx}" for idx in range(40)),
    "Cover_Type",
]

# The archive has no header row, so the names defined above are supplied
# explicitly; pandas infers gzip decompression from the ".gz" suffix.
data = pd.read_csv(
    "covtype.data.gz",
    names=columns,
    header=None,
)

In [3]:
# Separate the label from the features: y is the "Cover_Type" column,
# X is every remaining column.
y = data["Cover_Type"]
X = data.drop(columns=["Cover_Type"])

In [4]:
# 70/30 hold-out split. Stratifying on y keeps the class proportions the same
# in both partitions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Per-class counts in the training partition, before any oversampling.
print("Class distribution before SMOTE:", Counter(y_train))

Class distribution before SMOTE: Counter({2: 198310, 1: 148288, 3: 25028, 7: 14357, 6: 12157, 5: 6645, 4: 1923})


In [5]:
# Oversample with SMOTE on the training partition only. The test partition is
# deliberately left at its original class distribution: oversampling is a
# training-time aid and must not alter the data used for evaluation.
#
# NOTE(review): SMOTE synthesises rows by interpolating between neighbours, so
# the 0/1 Wilderness_Area_* / Soil_Type_* indicators may come out fractional in
# synthetic rows -- confirm this is acceptable (or consider SMOTENC).
# NOTE(review): resampling runs on the unscaled features here, so SMOTE's
# neighbour search is presumably dominated by the large-magnitude columns --
# confirm this ordering is intended.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(
    X_train,
    y_train,
)

# Per-class counts after resampling.
print("Class distribution after SMOTE:", Counter(y_train_res))

Class distribution after SMOTE: Counter({2: 198310, 1: 198310, 3: 198310, 4: 198310, 6: 198310, 7: 198310, 5: 198310})


In [6]:
# Standardise the features so that distance computations are not dominated by
# columns with large numeric ranges.
# The scaler's statistics are learned from the resampled training data only...
scaler = StandardScaler().fit(X_train_res)

# ...and then the same fitted transform is applied to both partitions. The test
# partition is transformed but never used for fitting.
X_train_res_scaled = scaler.transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Wrap the scaled arrays back into DataFrames with the original feature names
# and append the label as the last column.
# The labels are attached as plain arrays rather than Series so they are placed
# positionally instead of being aligned on the Series index (y_test presumably
# still carries the shuffled row labels from the split).
preprocessed_train = pd.DataFrame(
    X_train_res_scaled, columns=X.columns
).assign(Cover_Type=y_train_res.to_numpy())

preprocessed_test = pd.DataFrame(
    X_test_scaled, columns=X.columns
).assign(Cover_Type=y_test.to_numpy())

In [8]:
# Persist both partitions as gzip-compressed CSV files, without the row index.
preprocessed_train.to_csv(
    path_or_buf="preprocessed_train.csv.gz",
    compression="gzip",
    index=False,
)
preprocessed_test.to_csv(
    path_or_buf="preprocessed_test.csv.gz",
    compression="gzip",
    index=False,
)