# Train the model

In [None]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn_evaluation import plot

In [None]:
# Load the combined training samples. The columns listed here are removed
# up front so they never reach the model as features; every remaining column
# except the "cc_id" label is used for training further down.
unused_columns = ["x", "y", "spatial_ref"]
training_data = pd.read_csv(
    "training-data/combined-unmasked-training.csv"
).drop(columns=unused_columns)

# Preview the first rows as a quick sanity check of the loaded table
training_data.head()

In [None]:
# Hold out 20% of the samples for evaluation; the seed keeps the split stable
train, test = train_test_split(training_data, test_size=0.2, random_state=42)

# Sort the columns by name in BOTH splits. The model is fitted on name-sorted
# features, and scikit-learn checks that a DataFrame passed to predict() has
# the same feature names in the same order as at fit time, so the test split
# must follow the identical ordering.
train = train.reindex(sorted(train.columns), axis=1)
test = test.reindex(sorted(test.columns), axis=1)

# Keep the dataframes as labelled dataframes, which makes things neater later
classes = train["cc_id"]
observations = train.drop(columns=["cc_id"])

# Create a model... emphasize class 4, which is seagrass.
# Every class id from 1 to 13 gets weight 1, except seagrass which gets 3.
seagrass_class = 4
seagrass_weight = 3
class_weight = {
    class_id: seagrass_weight if class_id == seagrass_class else 1
    for class_id in range(1, 14)
}
classifier = RandomForestClassifier(
    n_estimators=500,
    class_weight=class_weight,
    # Seed the forest as well as the split so a re-run reproduces the model
    random_state=42,
    # Build the trees on all available cores; does not affect the result
    n_jobs=-1,
)

# ...and fit it to the data
model = classifier.fit(observations, classes)

In [None]:
# Evaluate the performance
y_pred = model.predict(test.drop(columns=["cc_id"]))
print(classification_report(test["cc_id"], y_pred))

In [None]:
# Plot the confusion matrix for the held-out split. The features are selected
# in the same column order the model was fitted on (`observations.columns`),
# because scikit-learn checks feature names and their order at predict time.
ConfusionMatrixDisplay.from_estimator(model, test[observations.columns], test["cc_id"])

In [None]:
# Show the feature importances
plot.feature_importances(model, feature_names=observations.columns)

In [None]:
# Save the model
file_path = "models/20250902c-alex.model"
joblib.dump(model, file_path)