In [43]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
from lib.reproduction import major_oxides
from sklearn.metrics import mean_squared_error
from lib import full_flow_dataloader
import mlflow
import mlflow.keras
import numpy as np
import datetime
import os
from pathlib import Path

os.environ["KERAS_BACKEND"] = "torch"

import torch
import keras


torch.manual_seed(42)
np.random.seed(42)

In [45]:

import torch.nn as nn
import torch.optim as optim

# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [52]:
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader

edr_path = Path("/home/christian/projects/p9/baseline/data/calib_edr/calib_2015")

class CSVDataset(Dataset):
    def __init__(self, directory_path, composition_data):
        self.csv_files = []
        self.targets = []
        for directory in directory_path.iterdir():
            if directory.is_dir():
                files = [f for f in directory.glob('*.csv')]
                # Assuming get_composition_for_sample returns a DataFrame
                target = composition_data.get_composition_for_sample(directory.name)[major_oxides].values.flatten().astype(np.float32)
                if target.size > 0:  # Check if target array is not empty
                    self.csv_files.extend(files)
                    # Extend the targets list with the same target for each file in the directory
                    self.targets.extend([target] * len(files))

    def __len__(self):
        return len(self.csv_files)

    def __getitem__(self, idx):
        file_path = self.csv_files[idx]
        data = np.loadtxt(file_path, skiprows=29, delimiter=',')
        target = self.targets[idx]
        return torch.tensor(data, dtype=torch.float32), torch.tensor(target, dtype=torch.float32)

# Usage example:
# Assuming 'composition_data' is an instance of CompositionData
# dataset = CSVDataset(edr_path, composition_data)
# dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
# for data, targets in dataloader:
#     # Here you would integrate with your PyTorch model training loop
#     optimizer.zero_grad()
#     outputs = model(data)
#     loss = loss_function(outputs, targets)
#     loss.backward()
#     optimizer.step()


In [53]:
from lib.config import AppConfig
from lib.data_handling import CompositionData

config = AppConfig()

composition_data = CompositionData(config.composition_data_path)
dataset = CSVDataset(edr_path, composition_data)

In [54]:
X, y = zip(*[dataset[i] for i in range(len(dataset))])

X.shape, y.shape

In [49]:
from keras.models import Model
from keras.layers import Input, Conv1D, BatchNormalization, ReLU, Subtract, Dense, Flatten

D=20

def create_spectral_cnn(input_shape, num_chemical_elements):
    # Module 1
    input_layer = Input(shape=input_shape)
    x = input_layer
    for _ in range(D):  # Assuming D is the number of convolutional layers in Module 1
        x = Conv1D(filters=64, kernel_size=3, padding='same')(x)
        x = BatchNormalization()(x)
        x = ReLU()(x)

    # Module 2 (Residual subtraction)
    # Note: Module 2 could be more complex than a single Conv layer, but this is not clear from the annotations.
    residual = Conv1D(filters=64, kernel_size=3, padding='same')(x)
    x = Subtract()([x, residual])

    # Flattening the output of Module 2 to feed into a dense network
    x = Flatten()(x)

    # Module 3 (Regression for chemical content)
    outputs = []
    for _ in range(num_chemical_elements):
        outputs.append(Dense(units=1)(x))

    model = Model(inputs=input_layer, outputs=outputs)
    return model

# Assuming we have 5500 features in the input and we want to predict 8 chemical elements
input_shape = (5500, 1)  # Features, Channels
num_chemical_elements = 8
model = create_spectral_cnn(input_shape, num_chemical_elements)

model.compile(optimizer='adam', loss='mse')  # Using mean squared error for regression tasks
model.summary()

In [51]:
from keras.callbacks import EarlyStopping

mlflow.keras.autolog()

# Assuming 'dataset' is your CSVDataset and has been suitably converted to a format Keras can handle
# For demonstration, let's assume you have X_train, y_train prepared as NumPy arrays

# Define callbacks for early stopping and model checkpointing
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min'),
]

# Train the model
history = model.fit(
    dataset,
    # validation_split=0.2,  # using 20% of data for validation
    epochs=50,
    batch_size=32,
    callbacks=callbacks,
)

# Load the best model saved by ModelCheckpoint
model.load_weights('best_model.h5')

# Optionally, evaluate the model performance on a test set
# test_loss = model.evaluate(X_test, y_test)
# print(f"Test Loss: {test_loss}")



2024/04/22 08:01:50 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '42cf5404a14946baa509161e5d786cf3', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current keras workflow
  self._warn_if_super_not_called()


Epoch 1/50


ValueError: Input 0 of layer "functional_5" is incompatible with the layer: expected shape=(None, 5500, 1), found shape=(6444, 50)