### 1. Loading data and libraries 

In [None]:
# for processing numbers and files csv
import pandas as pd
import numpy as np

# for visualization
# import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline

# preprocessing 
# from scipy.fft import fft
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# model related
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

import tensorflow as tf
# other
import copy
# from scipy.signal import find_peaks


# A. Data description
###  input data 
<ul> 
<li>there are 1000 training examples</li>
<li>each of them is a dataframe with time, mass, and intensity columns, sorted by time and then by mass</li>
<li>dataframe lengths are variable</li>
</ul>

### output binary classification
<ul> 
<li>positive class probability</li>
</ul>


# B. Feature Extraction Steps
<ul>
<li>group mass values to the nearest 0.5 (0.0 to 650.0, diff = 0.5)<br>-- total (1350): multiply by 2 and take round()</li>
<li>place time values on a 0.02 grid (0.00 to 53.00, diff = 0.02); grid points with no reading are left as zero<br>-- total (2650): multiply by 50 and take round()</li>
<li>create a Time X Mass matrix (2650,1350) with intensity values</li>

<li>normalize intensity values (MinMaxScaler)
</li>
<li>total 2650x1350 values = 3,577,500</li> 
</ul>

# C. Model architecture
<ul>
<li>feed this matrix into convolutional layers, which group neighbouring time and mass values</li>
<li>feed this to max pooling layers ( for capturing data peaks ) </li>
<li>further data reduction steps</li>
<li>then fully connected layers</li>
<li>sigmoid activation layer </li>
<li>output positive label probability</li>
</ul>


In [None]:
# Sample index table; missing entries are replaced with 0.
metadata = pd.read_csv('metadata.csv')
metadata = metadata.fillna(0)

# Label table used for the samples whose split is 'train'.
train_labels = pd.read_csv('train_labels.csv')

In [None]:
# Preview the first two rows of the sample metadata table.
metadata.head(2)

In [None]:
# Preview the first two rows of the training label table.
train_labels.head(2)

### 2. Preprocessing, Feature extraction

In [None]:
# constants
# Grid size of the Time X Mass matrix: rows index time * 50, columns index mass * 2
# (see the discretization in create_matrix).
num_time_values, num_mass_values = 2650, 1350
matrix_shape = (num_time_values, num_mass_values)


In [None]:

# Function to create the Time X Mass matrix for a single data sample
def create_matrix(id ):
    """Build the normalised Time X Mass intensity matrix for one sample.

    Parameters
    ----------
    id : int
        Positional row index into ``metadata``. For rows whose split is
        ``'train'`` the same position is also used to look up ``train_labels``.

    Returns
    -------
    matrix : numpy.ndarray
        Array of shape ``matrix_shape``. Cell ``[round(time * 50), round(mass * 2)]``
        holds the min-max scaled intensity of a reading; all other cells are 0.
    label : numpy.ndarray or None
        Row ``id`` of ``train_labels`` without its first column when the sample
        belongs to the ``'train'`` split, otherwise ``None``.

    Notes
    -----
    Readings whose discretised time or mass index falls outside ``matrix_shape``
    are skipped. Previously such a reading raised ``IndexError`` (index too
    large) or was silently written to the wrong cell (negative index wrapping
    around).
    """
    # A metadata row must unpack into exactly five fields; only `split` and `path` are used here.
    sample_id, split, derivate, path, _ = metadata.iloc[id]
    data = pd.read_csv(path)

    label = None
    if split == 'train':
        label = train_labels.iloc[id].values[1:]

    # discretization of time and mass
    time_idx = (data['time'] * 50).round().astype(int).to_numpy()
    mass_idx = (data['mass'] * 2).round().astype(int).to_numpy()

    # scaling intensity (over every reading of the sample, before any filtering)
    intensity = MinMaxScaler().fit_transform(
        data['intensity'].values.reshape(-1, 1)
    ).ravel()

    # keep only readings that land inside the grid
    n_time, n_mass = matrix_shape
    in_grid = (
        (time_idx >= 0) & (time_idx < n_time)
        & (mass_idx >= 0) & (mass_idx < n_mass)
    )

    # filling matrix; if several readings round to the same cell, the last one assigned remains
    matrix = np.zeros(matrix_shape)
    matrix[time_idx[in_grid], mass_idx[in_grid]] = intensity[in_grid]

    return matrix , label


In [None]:
# Build the matrix for one random sample index below 809, then show a 100x100 window and its label.
random_sample_index = np.random.randint(809)  ## 0.2 second # max = 1120
sample_matrix, sample_label = create_matrix(random_sample_index)
sample_matrix[:100, 100:200], sample_label
# print(sample_matrix[500:1000,np.random.randint(0,100)])

## 3. Model Architecture

In [None]:
# Pool of positional sample indices (0..808); each index is resolved to a file by create_matrix.
# NOTE(review): 809 is hard-coded — presumably the number of 'train' rows in metadata; confirm.
training_data = range(809)  # Load your training data


In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that builds batches of Time X Mass matrices on demand.

    Parameters
    ----------
    data : sliceable sequence
        Items accepted by ``create_matrix`` (sample indices).
    batch_size : int
        Maximum number of samples per batch.
    **kwargs
        Forwarded to the ``Sequence`` base-class constructor.
    """

    def __init__(self, data, batch_size, **kwargs):
        # Initialise the base class so newer Keras versions do not warn about a skipped constructor.
        super().__init__(**kwargs)
        self.data = data
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        # Ceiling division: __getitem__ already clamps the final batch, so it must be
        # counted here; otherwise the last len(data) % batch_size samples are never served.
        return (len(self.data) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        """Return ``(batch_x, batch_y)`` arrays for batch number ``idx``."""
        low = idx * self.batch_size
        high = min(low + self.batch_size, len(self.data))
        batch_x = []
        batch_y = []

        for data_sample in self.data[low:high]:
            processed_sample, processed_label = self.preprocess_data(data_sample)
            batch_x.append(processed_sample)
            batch_y.append(processed_label)
        return np.array(batch_x), np.array(batch_y)

    def preprocess_data(self, data_sample):
        """Create the matrix for one sample and cast an array label to ``int``."""
        # Perform the feature extraction steps and create the matrix
        matrix, label = create_matrix(data_sample)
        # create_matrix returns None as label for samples outside the train split; leave that untouched
        if isinstance(label, np.ndarray):
            label = np.array(label).astype(int)
        return matrix , label


In [None]:
# Hold out 20% of the sample indices for validation; the fixed seed keeps the split repeatable.
train_data, val_data = train_test_split(
    training_data,
    test_size=0.2,
    random_state=42,
)
# Spot-check: every column after the first of three label rows, cast to int.
train_labels.iloc[[3, 334, 534]].values[:, 1:].astype(int)

In [None]:
# Sequential, Conv2D, MaxPooling2D, Flatten and Dense are imported in the notebook's first cell.
# The input is one Time X Mass matrix with a single channel; its size follows matrix_shape
# so the network stays in sync with the feature-extraction constants.
model = Sequential([
    # strided convolution over neighbouring (time, mass) cells
    Conv2D(2, kernel_size=(5, 4), strides=(4, 2), activation='relu', input_shape=(*matrix_shape, 1)),
    # pooling window is much taller along the time axis (100) than along the mass axis (2)
    MaxPooling2D(pool_size=(100, 2), strides=(40, 1)),
    Flatten(),
    Dense(16, activation='relu'),
    # nine independent sigmoid outputs, one probability per label column
    Dense(9, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define the batch size and number of epochs
batch_size = 16
num_epochs = 32

# Create instances of the custom data generator for training and validation
train_data_generator = DataGenerator(train_data, batch_size)
val_data_generator = DataGenerator(val_data, batch_size)

model.summary()

## 4. Train & Test

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop when the validation loss has not improved for 5 epochs and restore the best weights seen.
es = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

history = model.fit(
    train_data_generator,
    epochs=num_epochs,
    validation_data=val_data_generator,
    callbacks=[es],
)

Epoch 2/32


In [None]:
# saving model: writes the trained model to 'combined.keras' (path relative to the working directory)
model.save('combined.keras')

Evaluations

In [None]:
from sklearn.metrics import log_loss

# Gather the true label row and the model output for every training index, one sample at a time.
train_loss = 0
train_lab, train_pred = [], []
for i, data in enumerate(train_data):
    mat, lab = create_matrix(data)
    # labels become a (1, 9) int row
    lab = np.array(lab).astype(int).reshape((1, 9))
    # the matrix is reshaped to a single-item batch with one channel before prediction
    pred = model.predict(mat.reshape((-1, 2650, 1350, 1)), verbose=0)
    train_lab.append(lab)
    train_pred.append(pred)
    # progress marker every 100 samples
    if not i % 100:
        print(i)

In [None]:

# Stack the per-sample rows into (n_samples, 9) arrays.
train_lab = np.array(train_lab).reshape(-1, 9)
train_pred = np.array(train_pred).reshape(-1, 9)

# The model emits nine independent sigmoid probabilities, so score each label column as its own
# binary problem and average. Passing the whole 2-D matrix to log_loss would treat the columns
# as mutually exclusive classes instead. labels=[0, 1] keeps a column that contains only one
# class from raising.
train_log_loss = np.mean([
    log_loss(train_lab[:, col], train_pred[:, col], labels=[0, 1])
    for col in range(train_lab.shape[1])
])
print(f"train log loss : {train_log_loss}")

In [None]:
from sklearn.metrics import log_loss

# Gather the true label row and the model output for every validation index, one sample at a time.
val_loss = 0
val_lab, val_pred = [], []
for i, data in enumerate(val_data):
    mat, lab = create_matrix(data)
    # labels become a (1, 9) int row
    lab = np.array(lab).astype(int).reshape((1, 9))
    # the matrix is reshaped to a single-item batch with one channel before prediction
    pred = model.predict(mat.reshape((-1, 2650, 1350, 1)), verbose=0)
    val_lab.append(lab)
    val_pred.append(pred)
    # progress marker every 100 samples
    if not i % 100:
        print(i)

In [None]:
# Stack the per-sample validation rows into two-dimensional arrays with 9 columns each.
val_lab = np.reshape(np.array(val_lab), (-1, 9))
val_pred = np.reshape(np.array(val_pred), (-1, 9))

In [None]:
# Score each of the label columns as its own binary problem and average the results. Passing
# the whole 2-D matrix to log_loss would treat the columns as mutually exclusive classes.
# labels=[0, 1] keeps a column that contains only one class from raising.
valid_log_loss = np.mean([
    log_loss(val_lab[:, col], val_pred[:, col], labels=[0, 1])
    for col in range(val_lab.shape[1])
])
print(f"valid log loss : {valid_log_loss}")

train log loss : 1.0316461173431106 \
valid log loss : 1.0560311513842007 \
sample predictions below

In [None]:
# Compare the rounded model output with the stored label for ten randomly drawn sample indices.
for i in range(10):
    random_index = np.random.randint(809)  ## 0.2 second # max = 1120
    sample_matrix, sample_label = create_matrix(random_index)
    print(f"{i+1}th prediction")
    # single-item batch with one channel; probabilities are rounded to 0/1
    sample_batch = sample_matrix.reshape(-1, 2650, 1350, 1)
    rounded_prediction = model.predict(sample_batch, verbose=0).round().astype(int)
    print("prediction : ", rounded_prediction.squeeze())
    print("true label : ", np.array(sample_label))