* Compile, Train and Save the models here

* 1. Preprocessing

In [None]:
import numpy as np  # linear algebra
import pandas as pd  # CSV file
import config
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import davies_bouldin_score
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten

In [None]:
def Scale_and_PCALDA(path, n_components=2):
    """Standardize a feature CSV and plot per-class PCA and LDA projections.

    The CSV at ``path`` is read with pandas. Every column except the last is
    used as a feature and the last column is used as the class label. The
    features are standardized, projected with PCA, and one per-class
    histogram is drawn for each principal component. The Davies-Bouldin
    score of the PCA projection (using the labels as cluster assignments) is
    printed. Finally an LDA is fitted and the per-class histogram of the
    first discriminant component is drawn.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file to analyse.
    n_components : int, default 2
        Number of principal components to keep and plot.

    Returns
    -------
    None
        The function only prints and plots.
    """
    data = pd.read_csv(path)
    num_columns = data.shape[1]
    print(f"Num of Columns is {num_columns}")

    # Features are all columns but the last; the last column is the label.
    X = np.array(data.iloc[:, 0:num_columns - 1])
    y = np.array(data.iloc[:, num_columns - 1])

    # Standardize so that PCA/LDA are not dominated by large-scale features.
    # X_scaled has shape (n_samples, num_columns - 1).
    X_scaled = StandardScaler().fit_transform(X)
    classes = np.unique(y)

    # --- PCA ---
    PrincipleComps = PCA(n_components=n_components).fit_transform(X_scaled)

    for i in range(n_components):
        plt.figure()
        for clss in classes:
            plt.hist(PrincipleComps[y == clss, i],
                     bins="auto", alpha=0.5,
                     label=f"Class {clss}")
        plt.xlabel("Feature intervals")
        plt.ylabel("Frequency")
        plt.title(f"PCA by Class for feature column {i}")
        plt.legend()
        plt.grid(True)
        plt.show()

    score = davies_bouldin_score(PrincipleComps, y)
    print(f"The davies_bouldin_score for PCA is {score}")

    # --- LDA ---
    lda_mcc = LDA()
    lda_OP = lda_mcc.fit_transform(X_scaled, y)

    plt.figure()
    for c in classes:
        # Select the first discriminant explicitly: transform() returns a
        # 2-D array, and with more than two classes the extra columns would
        # otherwise be drawn as additional datasets under the same label.
        plt.hist(lda_OP[y == c, 0], bins=20, alpha=0.5, label=f"Class {c}")
    plt.xlabel("1D LDA Projection")
    plt.ylabel("Frequency")
    plt.title("LDA Projection onto First Component. 0 is cat, 1 is Dog")
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:

# Run the same scaling + PCA/LDA diagnostics over every extracted feature
# file, printing a heading before each one so the plots can be told apart.
feature_sets = [
    ("Original DATA", 'data.csv'),
    ("fs300_cc12", 'data_fs300_cc12.csv'),
    ("fs300_cc20", 'data_fs300_cc20.csv'),
    ("fs300_cc30", 'data_fs300_cc30.csv'),
    ("fs500_cc20", 'data_fs500_cc20.csv'),
]

for heading, csv_name in feature_sets:
    print(heading)
    Scale_and_PCALDA(path=config.Features + csv_name)

* 2. Training the model

* First approach- Vanilla NN 
* The fs300_cc20 features look good in the plots above, so we start with them.

In [None]:
# Load the fs300_cc20 feature table: every column but the last is a feature,
# the last column is the class label.
data = pd.read_csv(config.Features + 'data_fs300_cc20.csv')
num_columns = data.shape[1]
X = np.array(data.iloc[:, 0:num_columns - 1])
y = np.array(data.iloc[:, num_columns - 1])

# Split BEFORE scaling: fitting the scaler on all rows would let test-set
# mean/variance leak into the training data. The split itself depends only
# on y and random_state, so the same rows end up in each partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Fit the scaler on the training rows only, then apply it everywhere.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix in the same scaled space (train-set statistics only).
X_scaled = scaler.transform(X)



In [None]:

# Size the input layer from the training matrix instead of hard-coding 20,
# so the cell keeps working if a different feature file is loaded above.
n_features = X_train.shape[1]

# Two hidden Dense blocks (each followed by batch norm and dropout) and a
# single sigmoid unit for binary classification.
model = Sequential([
    Dense(64, input_shape=(n_features,), activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Early stopping to prevent overfitting: stop after 5 epochs without a
# val_loss improvement and roll back to the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model; 20% of the training rows are held out for validation.
history = model.fit(X_train, y_train,
                    validation_split=0.2,
                    epochs=50,
                    batch_size=32,
                    callbacks=[early_stop],
                    verbose=1)


In [None]:
# Score the trained network on the test split; with the single 'accuracy'
# metric compiled in, evaluate() yields the pair [loss, accuracy].
evaluation = model.evaluate(X_test, y_test)
loss, accuracy = evaluation
print(f"Test Accuracy: {accuracy:.2f}")


* Second approach- Vanilla NN and fs500_cc20 dataset

In [None]:
# Load the fs500_cc20 feature table: every column but the last is a feature,
# the last column is the class label.
data = pd.read_csv(config.Features + 'data_fs500_cc20.csv')
num_columns = data.shape[1]
X = np.array(data.iloc[:, 0:num_columns - 1])
y = np.array(data.iloc[:, num_columns - 1])

# Split BEFORE scaling: fitting the scaler on all rows would let test-set
# mean/variance leak into the training data. The split itself depends only
# on y and random_state, so the same rows end up in each partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Fit the scaler on the training rows only, then apply it everywhere.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix in the same scaled space (train-set statistics only).
X_scaled = scaler.transform(X)

In [None]:

# Define the model.
# The input width is taken from the training matrix instead of a hard-coded
# 20, so the cell keeps working if a different feature file is loaded above.
n_features = X_train.shape[1]

model = Sequential([
    Dense(64, input_shape=(n_features,), activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Early stopping to prevent overfitting: stop after 5 epochs without a
# val_loss improvement and roll back to the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model; 20% of the training rows are held out for validation.
history = model.fit(X_train, y_train,
                    validation_split=0.2,
                    epochs=80,
                    batch_size=32,
                    callbacks=[early_stop],
                    verbose=1)

In [None]:
# Measure loss and accuracy of the fitted model on the test split and
# report the accuracy rounded to two decimals.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy: {accuracy:.2f}")

* Approach 3 - 1D convolutional model and fs300_cc20 dataset

In [None]:
# Load the fs300_cc20 feature table: every column but the last is a feature,
# the last column is the class label.
data = pd.read_csv(config.Features + 'data_fs300_cc20.csv')
num_columns = data.shape[1]
X = np.array(data.iloc[:, 0:num_columns - 1])
y = np.array(data.iloc[:, num_columns - 1])

# Split BEFORE scaling: fitting the scaler on all rows would let test-set
# mean/variance leak into the training data. The split itself depends only
# on y and random_state, so the same rows end up in each partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Fit the scaler on the training rows only, then apply it everywhere.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix in the same scaled space (train-set statistics only).
X_scaled = scaler.transform(X)

In [None]:
# Define input shape.
# The feature matrix built from the CSV is 2-D (samples x coefficients), so
# there is no frame axis to convolve over. Conv1D needs 3-D input
# (batch, steps, channels); the model therefore accepts the 2-D rows as they
# are and reshapes each one to (num_mfcc, 1), i.e. the coefficients become
# the step axis with a single channel.
num_mfcc = X_train.shape[1]
input_shape = (num_mfcc, 1)

model = Sequential([
    tf.keras.Input(shape=(num_mfcc,)),
    tf.keras.layers.Reshape(input_shape),

    Conv1D(32, kernel_size=3, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),

    Conv1D(64, kernel_size=3, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),

    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()


In [None]:
# Validate on a slice of the TRAINING data, as the other approaches do.
# Passing the test split as validation_data would let early stopping pick
# the weights that score best on the test set, making the test accuracy
# reported afterwards optimistically biased.
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=30,
    batch_size=32,
    callbacks=[EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)]
)


In [None]:
# Compute loss and accuracy of the convolutional model on the test split
# and print the accuracy rounded to two decimals.
test_metrics = model.evaluate(X_test, y_test)
test_loss, test_acc = test_metrics
print(f"Test Accuracy: {test_acc:.2f}")
