## 1. Data Description 
<ul> 
<li>There are 809 training examples.</li>
<li>Each example is a dataframe with time, mass, and intensity columns.</li>
<li>Dataframe lengths vary from example to example.</li>
</ul>


In [None]:
# for processing numbers and files csv
import pandas as pd
import numpy as np

# for visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# preprocessing 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.utils import class_weight


# model related
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

import tensorflow as tf


In [None]:
# Prefixes used when the notebook runs on Kaggle: the competition input
# folder and the writable working directory.
kag_path_in, kag_path_out = "/kaggle/input/mars-spec-train/", "/kaggle/working/"
# Local override: with empty prefixes every file name is used as-is, i.e.
# relative to the current working directory.
kag_path_in = kag_path_out = ""

In [None]:
# Read the per-sample index and the label table from the input prefix.
# Missing cells in the metadata are replaced by 0; the labels are kept as read.
# (On Kaggle the prefix is '/kaggle/input/mars-spec-train/'.)
metadata = pd.read_csv(f"{kag_path_in}metadata.csv").fillna(0)
train_labels = pd.read_csv(f"{kag_path_in}train_labels.csv")

## 2. Preprocessing and Feature extraction
<ul>
<li>mass values are binned in steps of 0.50</li>
<li>time values are binned in steps of 0.02</li>
<li>each sample becomes a Time × Mass matrix of shape (2650, 1350) holding the intensity values</li>
</ul>

In [None]:
# Size of the per-sample grid: one row per time bin, one column per mass bin.
num_time_values, num_mass_values = 2650, 1350
# Shape handed to np.zeros when a sample matrix is allocated.
matrix_shape = (num_time_values, num_mass_values)

In [None]:
# Function to create the Time X Mass matrix and label for a single data sample

def create_matrix(id, lab=None):
    """Build the time x mass intensity matrix and the label of one sample.

    Parameters
    ----------
    id : int
        Positional row index into ``metadata`` (and into ``train_labels``
        for training samples). The name shadows the builtin but is kept so
        existing callers keep working.
    lab : int, optional
        Position inside the ``train_labels`` row of the value to return.
        NOTE(review): with the default ``None`` NumPy interprets the index
        as ``np.newaxis``, so the whole label row (first column included)
        is returned with an extra leading axis. Pass an explicit position.

    Returns
    -------
    matrix : numpy.ndarray
        Array of shape ``matrix_shape``. Cell ``[round(time * 50),
        round(mass * 2)]`` holds the min-max scaled intensity of the
        matching reading; all other cells are 0.
    label : object or None
        The selected label when the metadata split is ``'train'``,
        otherwise ``None``.
    """
    # A metadata row is unpacked into exactly five fields; only the split
    # name and the relative path of the sample file are used here.
    _sample_id, split, _derivate, path, _ = metadata.iloc[id]
    data = pd.read_csv(kag_path_in + path)

    label = None
    if split == 'train':
        label = train_labels.iloc[id].values[lab]

    # Map readings onto the integer grid: 50 cells per time unit (0.02 step)
    # and 2 cells per mass unit (0.5 step).
    time_idx = (data['time'] * 50).round().astype(int).to_numpy()
    mass_idx = (data['mass'] * 2).round().astype(int).to_numpy()

    # Scale the intensities of this sample to the [0, 1] range.
    scaled = MinMaxScaler().fit_transform(
        data['intensity'].to_numpy().reshape(-1, 1)
    ).ravel()

    # Readings outside the grid used to raise IndexError (too large) or wrap
    # around to the opposite edge (negative); they are dropped instead.
    n_time, n_mass = matrix_shape
    inside = (
        (time_idx >= 0) & (time_idx < n_time)
        & (mass_idx >= 0) & (mass_idx < n_mass)
    )

    matrix = np.zeros(matrix_shape)
    # When several readings share a cell, the last one in file order wins.
    matrix[time_idx[inside], mass_idx[inside]] = scaled[inside]

    return matrix, label


## 3. Data Generation
<ul>
<li>Storing every data matrix in memory is costly</li>
<li>so the data is generated in batches, on demand</li>
</ul>


In [None]:
# Row positions of the 809 training samples; the split works on these
# indices, not on the sample data itself.
training_data = range(809)
# 80 % of the indices for fitting, 20 % held out for validation; the fixed
# seed keeps the split reproducible.
train_data, val_data = train_test_split(
    training_data,
    test_size=0.2,
    random_state=28,
)


In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that builds (matrix, label) batches on demand.

    Matrices are created lazily per batch through ``create_matrix`` so the
    full set of sample matrices never has to be held in memory at once.

    Parameters
    ----------
    data : sequence of int
        Sample indices handed to ``create_matrix``; must support ``len``
        and slicing.
    batch_size : int
        Maximum number of samples per batch.
    lab : int
        Label position forwarded to ``create_matrix``.
    **kwargs
        Forwarded to the Keras base class.
    """

    def __init__(self, data, batch_size, lab, **kwargs):
        # Let the base class set up its own state; Keras 3 emits a warning
        # when a subclass skips this call.
        super().__init__(**kwargs)
        self.data = data
        self.batch_size = batch_size
        self.lab = lab

    def __len__(self):
        """Return the number of batches, counting a trailing partial one."""
        # Ceiling division: flooring silently dropped the last
        # len(data) % batch_size samples (and yielded zero batches when
        # len(data) < batch_size), although __getitem__ handles short batches.
        return (len(self.data) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(matrices, labels)`` NumPy arrays."""
        low = idx * self.batch_size
        # The last batch may be shorter than batch_size.
        high = min(low + self.batch_size, len(self.data))

        batch_x = []
        batch_y = []
        for data_sample in self.data[low:high]:
            processed_sample, processed_label = self.preprocess_data(data_sample)
            batch_x.append(processed_sample)
            batch_y.append(processed_label)
        return np.array(batch_x), np.array(batch_y)

    def preprocess_data(self, data_sample):
        """Return the ``(matrix, label)`` pair of one sample index."""
        return create_matrix(data_sample, self.lab)

## 4. Models
<ul>
<li>9 separate models, one per class</li>
<li>each model uses balanced class weights</li>

<li>convolutional layers </li>
<li>pooling layers (max pooling) </li>
<li>fully connected layers</li>
<li>sigmoid layer</li>
<li>output positive label probability</li>
</ul>

In [None]:
# Names of the label columns: every column of train_labels except the first.
label_names = list(train_labels.columns[1:])

### Aromatic (model 1)  

In [None]:
# constants for model 1
label_number = 1     # position of this model's label column in train_labels
batch_size = 16      # samples per generated batch
num_epochs = 32      # upper bound on training epochs
patience = 5         # early-stopping patience, in epochs
model_1_saved = 0    # 0: build and train the model, otherwise load it from disk
model_1 = None
history_1 = None
# The training cell stores its result under this name, so the placeholder
# must exist under the same name for later cells to find it.
history_model_1 = None

In [None]:
if model_1_saved != 0:
    # A model file was written by an earlier run: load it instead of rebuilding.
    model_1 = tf.keras.models.load_model('model_{}.keras'.format(label_number))
else:
    # One strided convolution, one large max-pooling step, then a small
    # dense head that ends in a single sigmoid unit.
    model_1 = Sequential([
        Conv2D(
            2,
            kernel_size=(50, 50),
            strides=(2, 1),
            activation='relu',
            input_shape=(num_time_values, num_mass_values, 1),
        ),
        MaxPooling2D(pool_size=(40, 20)),
        Flatten(),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    model_1.compile(loss='binary_crossentropy', optimizer='adam', metrics=["AUC"])
    model_1.summary()

    # Batch generators for the training and validation index splits.
    train_data_generator = DataGenerator(train_data, batch_size, label_number)
    val_data_generator = DataGenerator(val_data, batch_size, label_number)

In [None]:
if model_1_saved == 0:
    # Stop once val_loss has not improved for `patience` epochs and roll
    # back to the best weights seen.
    es = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

    # Balanced class weights: n_samples / (2 * class_count).
    # value_counts() is ordered by frequency, not by label, so each weight
    # is stored under its own class key instead of being unpacked by position
    # (which swapped the weights whenever class 1 was the majority).
    label_counts = train_labels.iloc[:, label_number].value_counts()
    cw = {0: 1, 1: 1}
    for cls, count in label_counts.items():
        cw[int(cls)] = float(len(train_labels) / (2 * count))

    history_model_1 = model_1.fit(
        train_data_generator,
        epochs=num_epochs,
        validation_data=val_data_generator,
        callbacks=[es],
        verbose=1,
        class_weight=cw,
    )
    model_1.save('model_{}.keras'.format(label_number))
    # Flip the flag the build/train cells actually test (it was written to
    # `m1_saved`, so it never changed).
    model_1_saved = 1

In [None]:
# Training history only exists when the model was fitted in this session;
# a model loaded from disk has none, so look the name up defensively
# instead of failing with NameError.
fit_history = globals().get('history_model_1')
if fit_history is None:
    print(f'no training history available for model {label_number}')
else:
    # Training vs. validation loss per epoch.
    plt.figure(figsize=(5, 5))
    plt.plot(fit_history.history['loss'])
    plt.plot(fit_history.history['val_loss'])
    plt.legend(['train', 'val'])
    plt.title(f'model loss {label_number}')
    plt.show()

In [None]:
# for i in range (10):
#     sample_matrix, sample_label= create_matrix(np.random.randint(809)) ## 0.2 second # max = 808
#     print (f"{i+1}th prediction","prediction : ",
#            (model_1.predict(sample_matrix.reshape(-1, 2650,1350,1),verbose = 0)).squeeze(),
#            " true label : ",np.array(sample_label))

## 5. Predictions
<ul>
<li>prediction for a total of 312 samples</li>
<li>merging the predictions from all models</li>
<li>summarizing the loss of each model</li>
</ul>