# CMLRE Marine Data Analysis — Training Notebook
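This notebook trains a `RandomForestClassifier` and saves it, together with the list of training feature columns, to `model.pkl`. To use your own data, replace the CSV with your real dataset (it must contain the `species`, `count` and `length_cm` columns) and re-run all cells from the top.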

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Read the sample survey table; point DATA_CSV at your own file to retrain on real data.
DATA_CSV = 'cmlre_biodiversity_data_sample.csv'
df = pd.read_csv(DATA_CSV)
# Preview the first five rows as the cell output.
df.head(5)


In [2]:
# Simple preprocessing
# The target is the size class obtained by binning `length_cm`, so `length_cm` itself
# must NOT be a feature: keeping it lets the forest read the label straight off the
# input (target leakage) and makes the classification report below meaningless.
LENGTH_BINS = [-1, 15, 25, 100]   # class edges: (-1, 15], (15, 25], (25, 100]
LENGTH_LABELS = [0, 1, 2]

size_class = pd.cut(df['length_cm'], bins=LENGTH_BINS, labels=LENGTH_LABELS)

# pd.cut yields NaN for missing or out-of-range lengths, and .astype(int) raises on NaN;
# keep only the rows that actually received a class and say how many were dropped.
labelled = size_class.notna()
if not labelled.all():
    print(f'Dropping {int((~labelled).sum())} rows with missing or out-of-range length_cm')

# Features: one-hot encoded species (first level dropped) plus the raw count.
X = pd.get_dummies(df.loc[labelled, ['species', 'count']], columns=['species'], drop_first=True)
y = size_class[labelled].astype(int)

# Hold out 20% of the rows for evaluation; fixed seeds keep re-runs reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))

# Persist the feature column order next to the model so inference code can rebuild
# an input frame with exactly the columns the model was trained on.
joblib.dump({'model': model, 'columns': list(X.columns)}, 'model.pkl')
print('Saved model.pkl')


### How to use the saved model in Python
```python
import joblib
import pandas as pd
m = joblib.load('model.pkl')
model = m['model']
cols = m['columns']
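# Encode new samples the same way as the training data, then align them to the saved
# training columns (columns absent from the new data are filled with 0):
# X_new = pd.get_dummies(new_df, columns=['species']).reindex(columns=cols, fill_value=0)
# preds = model.predict(X_new)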
```