# A. Data description
###  input data 
<ul> 
<li>there are 1000 training examples</li>
<li>each of them is dataframe with time, mass , and intensity:: sorted by time then mass</li>
<li>dataframe lenghts are variable</li>
</ul>

### output binary clssification
<ul> 
<li>positive class probability</li>
</ul>


# B. Feature Extraction Steps
<ul>
<li>group mass values upto 1 decimal (0.0 to 650.0 , diff = 0.5)           /<br>-- total (1300) or multiply by 2  and take round() </li>
<li>take time values and fill remaining with (0.00 to 53.00 , diff = 0.02) /<br>-- total (2650) or multiply by 50 and take round() </li>
<li>create a Time X Mass matrix (2650,1300) with intensity values</li>

<li>noramlize intensity values --
    <ol> 
    <li>subtract mean</li>
    <li>minmax scale</li>
    </ol>   
</li>
<li>total 2650x1300 values = 3,445,000</li> 
</ul>

# C. Model architecture
<ul>
<li>feed this as convolutional layers (2,5 2d conv stride 2, 5 , that is comvolute these values)  . This will group similar time and mass values</li>
<li>feed this as pooling layers ( pooling layers with mean/ max pooling ) </li>
<li>further data reduction steps</li>
<li>then fully connected layers</li>
<li>sigmoid or softmax layer</li>
<li>output positive label probability</li>
</ul>


### 1. Loading data and libraries 

In [2]:
# for processing numbers and files csv
import pandas as pd
import numpy as np

# for visualization
# import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline

# preprocessing 
# from scipy.fft import fft
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# model related
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

# other
import copy
# from scipy.signal import find_peaks


In [3]:
metadata = pd.read_csv('metadata.csv').fillna(0)
train_labels = pd.read_csv('train_labels.csv')

In [4]:
metadata.head(2)

Unnamed: 0,sample_id,split,derivatized,features_path,features_md5_hash
0,S0000,train,0.0,train_features/S0000.csv,52ec6d6f8372500ab4e069b5fbdae6f9
1,S0001,train,0.0,train_features/S0001.csv,348f90baed8a8189bf0d4c7b9ed9f965


In [5]:
train_labels.head(2)

Unnamed: 0,sample_id,aromatic,hydrocarbon,carboxylic_acid,nitrogen_bearing_compound,chlorine_bearing_compound,sulfur_bearing_compound,alcohol,other_oxygen_bearing_compound,mineral
0,S0000,0,0,0,0,0,0,0,0,1
1,S0001,0,0,0,0,0,0,0,0,0


### 2. Preprocessing, Feature extraction

In [6]:
# constants
num_mass_values = 1350
num_time_values = 2650
matrix_shape = (num_time_values, num_mass_values)


In [7]:

# Function to create the Time X Mass matrix for a single data sample
def create_matrix(id ):
    
    sample_id, split, derivate, path, _ = metadata.iloc[id]
    data = pd.read_csv(path)

    label = None
    if (split == 'train'):
        label = train_labels.iloc[id].values[1:]

    # discretization of time and mass
    data.time = (data.time*50).round()
    data.mass = (data.mass*2).round()
    data.time = data.time.astype(int)
    data.mass = data.mass.astype(int)
    
    # scaling intensity
    data.intensity = MinMaxScaler().fit_transform(data.intensity.values.reshape(-1, 1))
    
    # filling matrix
    matrix = np.zeros(matrix_shape)
    matrix[data['time'],data['mass']] = data['intensity']

    return matrix , label


In [8]:
sample_matrix, sample_label= create_matrix(np.random.randint(1120)) ## 0.2 second # max = 1120
sample_matrix[0:100, 100:200],sample_label
# print(sample_matrix[500:1000,np.random.randint(0,100)])

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=object))

Dummy model

In [9]:

training_data = range(809)  # Load your training data


In [10]:
class DataGenerator(Sequence):
    
    def __init__(self, data, batch_size):
        self.data = data
        self.batch_size = batch_size
    def __len__(self):
        return len(self.data) // self.batch_size

    def __getitem__(self, idx):        
        
        low = idx * self.batch_size
        high = min(low + self.batch_size, len(self.data))
        batch_x = []
        batch_y = []

        for i,data_sample in enumerate(self.data[low:high]):
            processed_sample, processed_label = self.preprocess_data(data_sample)
            batch_x.append(processed_sample)
            batch_y.append(processed_label)
        return np.array(batch_x), np.array(batch_y)

    def preprocess_data(self, data_sample):
        # Perform the feature extraction steps and create the matrix
        matrix, label = create_matrix(data_sample)
        if type(label) == np.ndarray:
            label = np.array(label).astype(int)
        return matrix , label
    
    def getLabel(self, data_sample):
        matrix, label = create_matrix(data_sample)
        if type(label) == np.ndarray:
            label = np.array(label).astype(int)
        return label


In [11]:
train_data, val_data = train_test_split(training_data, test_size=0.2, random_state=42)
train_labels.iloc[[3,334,534]].values[:,1:].astype(int)

array([[0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0]])

In [12]:
# train_data

a = DataGenerator(train_data,2 )
b = DataGenerator(val_data,2 )
# a.data[:5],len(b)

In [13]:

print(len(a))
# print(a[-2])
print(a[322])

323


(array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]]), array([[0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0]]))


In [73]:
# Import necessary libraries
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization

# Create the model
model = Sequential()

model.add(Conv2D(4, kernel_size=(50,25), strides=(10,5), activation='relu',padding="valid", input_shape=(2650, 1350, 1)))
model.add(MaxPooling2D(pool_size=(25,25), strides=(10,10)))

# model.add(Conv2D(64, kernel_size=(5, 5), strides=(2, 3), activation='relu'))
# model.add(MaxPooling2D(pool_size=(15, 7), strides=(12, 5)))


# Flatten the input
model.add(Flatten())

# Add fully connected layers
model.add(Dense(32, activation='relu'))

model.add(Dense(9, activation='sigmoid'))  # Output layer

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC'])

# Split the training data into training and validation sets

# Define the batch size and number of epochs
batch_size = 8
num_epochs = 10

# Create instances of the custom data generator for training and validation
train_data_generator = DataGenerator(train_data, batch_size)
val_data_generator = DataGenerator(val_data, batch_size)

model.summary()

Model: "sequential_49"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_70 (Conv2D)          (None, 261, 266, 4)       5004      
                                                                 
 max_pooling2d_69 (MaxPooli  (None, 24, 25, 4)         0         
 ng2D)                                                           
                                                                 
 flatten_46 (Flatten)        (None, 2400)              0         
                                                                 
 dense_92 (Dense)            (None, 32)                76832     
                                                                 
 dense_93 (Dense)            (None, 9)                 297       
                                                                 
Total params: 82133 (320.83 KB)
Trainable params: 82133 (320.83 KB)
Non-trainable params: 0 (0.00 Byte)
_______________

In [74]:
from tensorflow.keras.callbacks import EarlyStopping
# Train the model using the data generators and perform validation  
es = EarlyStopping(monitor = 'loss', patience = 3 )

history = model.fit(train_data_generator, epochs=num_epochs, validation_data=val_data_generator, callbacks=[es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

KeyboardInterrupt: 

In [57]:
sample_matrix, sample_label= create_matrix(np.random.randint(1120)) ## 0.2 second # max = 1120
print(model.predict(sample_matrix.reshape(-1, 2650,1300,1)))
sample_label

ValueError: cannot reshape array of size 3577500 into shape (2650,1300,1)