# In [1]:
import pickle
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from sound_utils import *
from data_utils import *
from model import *
from model_utils import ModelUtils
import pandas as pd

##############################################################
##############################################################

# SEED = 13 # reproducible results: Same results in every run
# IN_PATH = ''
# DATA_PATH = '' 
# OUT_PATH = ''
# EPOCH = 20 # number of epochs to run for model

# np.random.seed(SEED) 
# torch.manual_seed(SEED)
# torch.cuda.manual_seed(SEED)
# torch.backends.cudnn.deterministic = True  # cuda algorithms
# os.environ['PYTHONHASHSEED'] = str(SEED)

#plt.style.use('seaborn') # use seaborn style plotting

##############################################################
##############################################################


# Run configuration: location of the audio clips and framing parameters.
wav_folder = "../data/cropped"
frame_length = 44_100  # e.g. one second of audio at a 44.1 kHz sampling rate
# NOTE: the misspelled name is kept on purpose; the data-loading call further
# down refers to it. Presumably a maximum signal length -- confirm against
# DataUtils.load_raw_data_for_training.
max_lentgh = 9_500_000

# Metadata table. The class count is derived from the data: it is the number
# of distinct values found in the 'region' column.
data_df = pd.read_csv("dummy_dataset.csv")
num_classes = data_df["region"].nunique()



# Read the raw audio and its labels through the project's loading helper.
data, labels = DataUtils.load_raw_data_for_training(
    wav_folder, frame_length, data_df, max_lentgh
)

# Per-class share of the whole data set. These values are not read again in
# the visible part of the script; they are handy for manual inspection.
unique_labels, label_counts = np.unique(labels, return_counts=True)
label_proportions = label_counts / len(labels)

# Stratified two-step split: 70% goes to training, then the remaining 30% is
# halved into validation and test. No random_state is passed, so the split
# differs from run to run.
train_data, temp_data, train_labels, temp_labels = train_test_split(
    data,
    labels,
    train_size=0.7,
    stratify=labels,
)
remaining_size = len(temp_data)  # size of the held-out 30% before halving
val_data, test_data, val_labels, test_labels = train_test_split(
    temp_data,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
)

# Wrap every split in the project's dataset class (train, validation, test).
train_dataset, val_dataset, test_dataset = (
    AudioDataset(samples, targets)
    for samples, targets in (
        (train_data, train_labels),
        (val_data, val_labels),
        (test_data, test_labels),
    )
)

# Report the split sizes as a quick sanity check.
print(f"Train set length: {len(train_data)}")
print(f"Validation set length: {len(val_data)}")
print(f"Test set length: {len(test_data)}")

batch_size = 2

# One loader per split, all sharing the batch size and the collate function.
# Only the training loader shuffles; validation and test keep a fixed order.
train_loader, val_loader, test_loader = (
    DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=should_shuffle,
        collate_fn=collate_fn,
    )
    for dataset, should_shuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

# Size of the third axis of the loaded data array, handed to the model helper.
input_shape = data.shape[2]

# Bundle the loaders and the model dimensions into the experiment helper.
model_experiment = ModelUtils(
    train_loader, val_loader, test_loader, input_shape, num_classes
)

# Use Apple's Metal backend (MPS) when it is available, otherwise the CPU.
# NOTE(review): `device` is not passed to anything in the visible part of the
# script -- confirm that ModelUtils selects its own device.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# NOTE(review): `epochs` is likewise not referenced again in the visible code.
epochs = 1

# Hyper-parameters handed to ModelUtils.train_evaluate.
params = dict(
    learning_rate=1e-4,
    num_layers=2,
    skip_connections=False,
    dropout_probability=0.5,
    hidden_layers=2,
    gradient_clipping=False,
    cell_type='rnn',
    hidden_size=128,
    bidirectional=False,
)

# NOTE(review): this rebinds `input_shape` (previously data.shape[2]) to an
# empty tuple before it is handed to model_summary. The experiment helper was
# already constructed with the earlier value -- confirm the empty tuple is
# what model_summary expects.
input_shape = ()
path = ''  # output prefix; empty means the current working directory
pickle_path = path + 'model_experiment.pkl'

# Write a summary of the model, then run training/validation and the test pass.
model_experiment.model_summary(
    train_loader,
    'raw_data_rnn_model_summary',
    input_shape,
    path,
)
model_experiment.train_evaluate(params)
model_experiment.test_model()

# Persist the whole experiment object so it can be reloaded later.
with open(pickle_path, 'wb') as file:
    pickle.dump(model_experiment, file)
    print("INFO: pickled model class at: ", pickle_path)

# Notebook output: KeyboardInterrupt (execution of the cell was interrupted)