## 1. Data Description 
<ul> 
<li>there are 809 training examples</li>
<li>each of them is a dataframe with time, mass, and intensity columns</li>
<li>dataframe lengths are variable</li>
</ul>


In [2]:
# for processing numbers and files csv
import pandas as pd
import numpy as np

# for visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# preprocessing 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.utils import class_weight


# model related
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

import tensorflow as tf


In [3]:
# Path prefixes for a Kaggle session (input dataset / writable output directory).
kag_path_in, kag_path_out = "/kaggle/input/mars-spec-train/", "/kaggle/working/"
# Local run: both prefixes are overwritten with the empty string, so every file
# is resolved relative to the current working directory.
kag_path_in = kag_path_out = ""

In [4]:
# Both tables are read from the configurable input prefix (`kag_path_in`)
# instead of a hard-coded Kaggle path.
# Missing values in the metadata table are replaced by 0.
metadata = pd.read_csv(f"{kag_path_in}metadata.csv").fillna(0)
train_labels = pd.read_csv(f"{kag_path_in}train_labels.csv")

## 2. Preprocessing and Feature extraction
<ul>
<li>mass values are grouped into bins separated by 0.50 </li>
<li>time values are grouped into bins separated by 0.02 </li>
<li>Time × Mass matrix of shape (2650, 1350) holding intensity values </li>
</ul>

In [5]:
# Grid dimensions of the per-sample matrix:
# rows are discretised time bins, columns are discretised mass bins.
num_time_values, num_mass_values = 2650, 1350
matrix_shape = num_time_values, num_mass_values

In [6]:

# Function to create the Time X Mass matrix for a single data sample
def create_matrix(id):
    """Build the time x mass intensity matrix for one sample.

    Parameters
    ----------
    id : int
        Positional row index into ``metadata`` (and, for training samples,
        into ``train_labels``).

    Returns
    -------
    matrix : numpy.ndarray
        Array of shape ``matrix_shape``. Cell ``[t, m]`` holds the min-max
        scaled intensity of the measurement whose discretised time is ``t``
        and discretised mass is ``m``; cells without a measurement stay 0.
    label : numpy.ndarray or None
        Every value of the matching ``train_labels`` row except the first
        column when the sample's split is ``'train'``, otherwise ``None``.
    """
    # Only the split and the feature-file path are needed from the metadata row.
    _, split, _, path, _ = metadata.iloc[id]

    # Resolve the feature file against the same input prefix that is used for
    # metadata.csv / train_labels.csv, so the notebook works with either the
    # local or the Kaggle path configuration.
    data = pd.read_csv(kag_path_in + path)

    label = None
    if split == 'train':
        label = train_labels.iloc[id].values[1:]

    # discretization of time and mass: time in steps of 1/50, mass in steps of 1/2
    time_idx = (data['time'] * 50).round().astype(int).to_numpy()
    mass_idx = (data['mass'] * 2).round().astype(int).to_numpy()

    # scaling intensity to the [0, 1] range of this sample
    intensity = MinMaxScaler().fit_transform(
        data['intensity'].values.reshape(-1, 1)
    ).ravel()

    # filling matrix
    # NOTE: if several rows land in the same (time, mass) cell, only one of
    # their intensities is kept.
    matrix = np.zeros(matrix_shape)
    matrix[time_idx, mass_idx] = intensity

    return matrix, label


## 3. Data Generation
<ul>
<li>Storing data matrix is costly</li>
<li>data is generated in batches</li>
</ul>


In [7]:
# Positional indices of the 809 training samples; each index is later passed
# to `create_matrix` through the data generator.
training_data = range(809)

# Hold out 20% of the indices for validation; the fixed seed keeps the split
# reproducible between runs.
train_data, val_data = train_test_split(
    training_data,
    test_size=0.2,
    random_state=28,
)


In [8]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that builds batches of sample matrices on demand.

    Parameters
    ----------
    data : sequence of int
        Sample indices; each one is passed to ``create_matrix``. Must support
        ``len()`` and slicing.
    batch_size : int
        Maximum number of samples per batch.
    **kwargs
        Forwarded unchanged to the ``Sequence`` base-class initialiser.
    """

    def __init__(self, data, batch_size, **kwargs):
        # Let the base class initialise itself before storing our own state.
        super().__init__(**kwargs)
        self.data = data
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        # Ceiling division: __getitem__ already clamps the upper bound, so the
        # last, shorter batch is valid and must be included in the count.
        return (len(self.data) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(np.array(matrices), np.array(labels))``."""
        low = idx * self.batch_size
        high = min(low + self.batch_size, len(self.data))
        batch_x = []
        batch_y = []

        for data_sample in self.data[low:high]:
            processed_sample, processed_label = self.preprocess_data(data_sample)
            batch_x.append(processed_sample)
            batch_y.append(processed_label)
        return np.array(batch_x), np.array(batch_y)

    def preprocess_data(self, data_sample):
        """Create the matrix for one sample and cast its label to ``int``.

        The label is returned untouched when it is not a NumPy array
        (``create_matrix`` returns ``None`` for non-training samples).
        """
        # Perform the feature extraction steps and create the matrix
        matrix, label = create_matrix(data_sample)
        if isinstance(label, np.ndarray):
            label = label.astype(int)
        return matrix, label


In [10]:
# Smoke test: a generator over eight sample indices with two samples per batch.
# Index 1 builds the second batch, i.e. the samples at indices 32 and 13.
a = DataGenerator(data=[2, 1, 32, 13, 132, 122, 90, 91], batch_size=2)
a[1]

(array([[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]]),
 array([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1]]))

## 4. Models
<ul>
<li>convolutional layers </li>
<li>pooling layers (max pooling) </li>
<li>fully connected layers</li>
<li>sigmoid layer</li>
<li>output positive label probability</li>
</ul>

In [11]:
# Names of the label columns: every column of `train_labels` except the first.
label_names = list(train_labels.columns[1:])

### Aromatic (model 1)  

In [12]:
# constants
label_number = 1  # not referenced in the cells shown here; presumably a label column index
batch_size = 16   # samples per batch handed to the data generators
num_epochs = 32   # upper bound on training epochs
patience = 5      # presumably the early-stopping patience, in epochs

# Placeholders, replaced once the model is built and trained.
model = history = None

In [20]:

# Small CNN over the time x mass matrix. The input shape and the number of
# output units are derived from `matrix_shape` and `label_names`, so the model
# cannot drift out of sync with the matrices and labels the generator yields.
model = Sequential([
    # A single trailing channel axis is appended to the matrix shape.
    Conv2D(4, kernel_size=(5, 4), activation='relu', input_shape=(*matrix_shape, 1)),
    MaxPooling2D(pool_size=(40, 5)),
    Flatten(),
    Dense(16, activation='relu'),
    # One sigmoid unit per label column: independent per-label probabilities.
    Dense(len(label_names), activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Create instances of the custom data generator for training and validation
train_data_generator = DataGenerator(train_data, batch_size)
val_data_generator = DataGenerator(val_data, batch_size)

model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_5 (Conv2D)           (None, 2646, 1347, 4)     84        
                                                                 
 max_pooling2d_5 (MaxPoolin  (None, 66, 269, 4)        0         
 g2D)                                                            
                                                                 
 flatten_5 (Flatten)         (None, 71016)             0         
                                                                 
 dense_10 (Dense)            (None, 16)                1136272   
                                                                 
 dense_11 (Dense)            (None, 9)                 153       
                                                                 
Total params: 1136509 (4.34 MB)
Trainable params: 1136509 (4.34 MB)
Non-trainable params: 0 (0.00 Byte)
________________

In [15]:
# Stop once the validation loss has not improved for `patience` epochs and
# roll the model back to the best weights seen so far.
es = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
history = model.fit(
    train_data_generator,
    epochs=num_epochs,
    validation_data=val_data_generator,
    callbacks=[es],
    verbose=2,
)

In [1]:
# `save_model` takes the model object first and the target path second.
# The file is written under the configured output prefix.
tf.keras.models.save_model(model, kag_path_out + "model.keras")

NameError: name 'tf' is not defined

## 5. Predictions
<ul>
<li>predictions for a total of 312 samples</li>
<li>merging predictions from all models</li>
<li>summarizing each model's loss</li>
</ul>