<a href="https://colab.research.google.com/github/chapmangj/Protolith-classification/blob/main/notebooks/train_protolith_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Protolith classification model training script.

Model uses major element geochemistry and a balanced random forrest algorithm to discriminate sedimentary from ignous protoliths.

Scripts to create and train final model pipeline on complete dataset.


In [122]:
import numpy as np
import pandas as pd
import sklearn

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline

from imblearn.ensemble import BalancedRandomForestClassifier

import joblib

from datetime import datetime
from pathlib import Path

In [123]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [124]:
import sklearn
import imblearn
print(f"scikit-learn version: {sklearn.__version__}")
print(f"imbalanced-learn version: {imblearn.__version__}")

scikit-learn version: 1.6.1
imbalanced-learn version: 0.14.0


## Enter Paths to training dataset and model output location

In [125]:
#Path to complete training dataset
in_path = r'/content/classifier_data_2019-02-26_ilr_training.csv'

#Path to location to save model
out_path = r'/content/drive/MyDrive/protolith'

## Training functions

In [126]:
def load_training_data(path: str) -> pd.DataFrame:
    """
    Function to load the protolith model training data from file. Data is already in Atchison simplex form.
    Parameters:
        path: path to training data as string
    returns:
        training dataset X_train, y_train
    """
    p = Path(path)
    try:
        if p.exists and p.suffix == '.csv':
            df = pd.read_csv(p, encoding='latin-1') #training set
            print("Columns in the loaded DataFrame:", df.columns.tolist()) # Print column names
        else:
             raise FileNotFoundError("File does not exist or is not a CSV.")
    except Exception as e:
        print(f'could not read training data file: {e}')
        raise e # Re-raise the exception after printing

    #create raw X training set and binary labeled y array
    X_train, y_train = df[['sio2','tio2','al2o3','feo_tot','mgo','cao','na2o','k2o','p2o5']], df[['rock_group']].replace(['igneous','sedimentary'],[0,1])

    return X_train, y_train

def train_model(X: pd.DataFrame, y: pd.Series) -> sklearn.pipeline.Pipeline:
    """
    Function to train the protolith classification model.
    """
    cv = StratifiedKFold(n_splits=5, random_state=101, shuffle=True)

    clsf_pipe = Pipeline([
        ('sc', StandardScaler()),
        ('classifier', CalibratedClassifierCV(
            estimator=BalancedRandomForestClassifier(  # Changed from base_estimator to estimator
                n_estimators=50,
                max_depth=15,
                min_samples_leaf=1,
                min_samples_split=2,
                max_features='sqrt',
                sampling_strategy='not minority',
                n_jobs=-1,
                random_state=101
            ),
            method='sigmoid',
            cv=cv
        ))
    ])

    model_pipe = clsf_pipe.fit(X, y)
    return model_pipe

def save_model(model, name: str, out_path: str):
    """
    Function to save trained protolith classification model.
    Paramaters:
        model: trained sklearn model pipe
        name: str. File name (will be appended with date and time)
        out_path: str. path to export file to
    returns
        joblib file
    """
    now = datetime.now().strftime("%Y-%M-%d-%H-%M")
    file_name = f'{name}_{now}.joblib'
    p = Path(out_path) / file_name
    joblib.dump(model, p)

## Script to load data, train model and output saved model

In [None]:
X_train, y_train = load_training_data(in_path)

model = train_model(X_train, y_train)

save_model(model, 'Model50_15_full', out_path)

  df = pd.read_csv(p, encoding='latin-1') #training set


Columns in the loaded DataFrame: ['rock_group', 'rock_type', 'sample_id', 'author', 'title', 'journal', 'year', 'doi', 'bibtex', 'sio2', 'tio2', 'al2o3', 'feo_tot', 'mgo', 'cao', 'na2o', 'k2o', 'p2o5', 'index', 'coord_1', 'coord_2', 'coord_3', 'coord_4', 'coord_5', 'coord_6', 'coord_7', 'coord_8']


  X_train, y_train = df[['sio2','tio2','al2o3','feo_tot','mgo','cao','na2o','k2o','p2o5']], df[['rock_group']].replace(['igneous','sedimentary'],[0,1])
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
