In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# (except those excluded via %aimport) before each cell is executed.
# Keep comments on their own lines: text after a line magic is parsed as its argument.
%load_ext autoreload
%autoreload 2

In [None]:
import mlflow
import pandas as pd
from lib.full_flow_dataloader import load_full_flow_data
from lib.reproduction import major_oxides
from sklearn.base import clone
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler, PowerTransformer
from sklearn.svm import SVR

In [None]:
def split_and_preprocess_data(train: pd.DataFrame, test: pd.DataFrame, preprocesser_pipeline: Pipeline):
    """Separate features from targets and run the features through a preprocessing pipeline.

    The target columns listed in ``major_oxides`` plus the ``"ID"`` and
    ``"Sample Name"`` columns are removed from both frames to obtain the features.
    The pipeline is fitted on the training features only and then applied,
    already fitted, to the test features.

    Args:
        train: Training frame containing the target, ``"ID"`` and ``"Sample Name"`` columns.
        test: Test frame with the same columns as ``train``.
        preprocesser_pipeline: Pipeline to fit on the training features. It is
            fitted in place, so the caller's object is modified.

    Returns:
        A 4-tuple ``(X_train, y_train, X_test, y_test)``: the transformed training
        features, the training targets (``train[major_oxides]``), the transformed
        test features and the test targets (``test[major_oxides]``).
    """
    non_feature_columns = major_oxides + ["ID", "Sample Name"]

    # Features first (train, then test), then targets — same order as a step-by-step split.
    features_train, features_test = (frame.drop(columns=non_feature_columns) for frame in (train, test))
    targets_train, targets_test = (frame[major_oxides] for frame in (train, test))

    # Tuple elements are evaluated left to right, so the pipeline is fitted on the
    # training features before it is used to transform the test features.
    return (
        preprocesser_pipeline.fit_transform(features_train),
        targets_train,
        preprocesser_pipeline.transform(features_test),
        targets_test,
    )

In [None]:
# Possible idea: store each preprocessing pipeline together with its related model pipeline in a dict
# keyed by target, so that the corresponding pipelines can be looked up and applied per target.

# Feature preprocessing: MaxAbsScaler first, then PowerTransformer, both with
# their default settings.
preprocessor_pipeline = Pipeline(
    steps=[
        ("scaler", MaxAbsScaler()),
        ("transformer", PowerTransformer()),
    ]
)

In [None]:
# Load the train/test frames, then split them into features and targets.
# split_and_preprocess_data fits preprocessor_pipeline on the training features
# and applies the fitted pipeline to the test features.
train, test = load_full_flow_data()

(X_train, y_train, X_test, y_test) = split_and_preprocess_data(
    train=train,
    test=test,
    preprocesser_pipeline=preprocessor_pipeline,
)

In [None]:
# Instantiating models

# ---- SVR ----
# Hyperparameters are bound to notebook-level names and passed on to the estimator below.
kernel, degree, coef0 = "poly", 2, 1.0
C, eps = 100, 0.1
gamma = "scale"

svr = SVR(
    kernel=kernel,
    degree=degree,
    coef0=coef0,
    C=C,
    epsilon=eps,
    gamma=gamma,
)

# ---- ExtraTreesRegressor ----
n_estimators = 100
max_depth = None
min_samples_split = 2
min_samples_leaf = 1
# Consider all features at every split. This is what the former 'auto' setting
# meant for regressors; 'auto' was deprecated in scikit-learn 1.1 and removed in
# 1.3, where passing it raises a parameter validation error.
# Must stay a float: the integer 1 would mean "a single feature per split".
max_features = 1.0

# random_state is fixed so the randomised tree construction is reproducible.
etr = ExtraTreesRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    max_features=max_features,
    random_state=42,
)


In [None]:
# Constructing pipelines for each oxide
# Pipeline keeps references to the estimator objects it is given and fits them in
# place. Each pipeline therefore receives its own unfitted clone of the configured
# estimators; sharing the single `svr` instance would make every fit overwrite the
# model held by all other pipelines.
# NOTE(review): SVR does not implement `transform`, so Pipeline rejects it as an
# intermediate step ahead of ETR when the SiO2 pipeline is fitted. Confirm how SVR
# and ETR are meant to be combined for SiO2 (e.g. stacking) before training.
sio2_pipeline = Pipeline([('SVR', clone(svr)), ('ETR', clone(etr))])
tio2_pipeline = Pipeline([('SVR', clone(svr))])
al2o3_pipeline = Pipeline([('SVR', clone(svr))])
feot_pipeline = Pipeline([('SVR', clone(svr))])
mgO_pipeline = Pipeline([('SVR', clone(svr))])
cao_pipeline = Pipeline([('SVR', clone(svr))])
na2O_pipeline = Pipeline([('SVR', clone(svr))])
k2O_pipeline = Pipeline([('SVR', clone(svr))])

In [None]:
# target: 'SiO2', 'TiO2', 'Al2O3', 'FeOT', 'MgO', 'CaO', 'Na2O', 'K2O'

for target in y_train.columns:
    print(f"Training model for {target}")