In [3]:
import os
import librosa
from bigdl.util.common import init_engine, create_spark_conf
from bigdl.nn.layer import *
from bigdl.nn.criterion import ClassNLLCriterion
from bigdl.optim.optimizer import *
from pyspark import SparkContext
from pyspark.sql import SparkSession
from bigdl.util.common import Sample
import numpy as np

# Define the function to extract features
def extract_features(file_path):
    y, sr = librosa.load(file_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    return mfcc.flatten()

# Initialize Spark BigDL engine
conf = create_spark_conf().setMaster("local[*]").setAppName("Audio Classification with CNN")
sc = SparkContext(conf=conf)
init_engine()

# Initialize SparkSession
spark = SparkSession.builder.config(conf=sc.getConf()).getOrCreate()

# Load and preprocess the data
base_dir = os.path.expanduser('~/Downloads/data')
audio_files = []
labels = []

genres = os.listdir(base_dir)
for label, genre in enumerate(genres):
    genre_path = os.path.join(base_dir, genre)
    files = [os.path.join(genre_path, file) for file in os.listdir(genre_path) if file.endswith('.wav')]
    for file in files:
        features = extract_features(file)
        audio_files.append(features)
        labels.append(label)

# Convert data to RDD of Sample
rdd = sc.parallelize(zip(audio_files, labels)).map(
    lambda x: Sample.from_ndarray(np.array(x[0], dtype=np.float32), x[1] + 1)
)

# Split data into training and testing
train_rdd, test_rdd = rdd.randomSplit([0.8, 0.2])

# Build the model
model = Sequential()
model.add(Reshape([1, 20, -1]))
model.add(SpatialConvolution(1, 16, 5, 5))
model.add(ReLU())
model.add(SpatialMaxPooling(2, 2, 2, 2))
model.add(SpatialConvolution(16, 32, 3, 3))
model.add(ReLU())
model.add(SpatialMaxPooling(2, 2, 2, 2))
model.add(Flatten())
model.add(Linear(32 * 4 * 4, 100))
model.add(ReLU())
model.add(Linear(100, len(genres)))
model.add(LogSoftMax())

# Create an optimizer
optimizer = Optimizer(
    model=model,
    training_rdd=train_rdd,
    criterion=ClassNLLCriterion(),
    optim_method=Adam(),
    end_trigger=MaxEpoch(10),
    batch_size=64
)

# Train the model
trained_model = optimizer.optimize()

# Evaluate the model
predictions = trained_model.predict(test_rdd)
predicted_labels = predictions.collect()

# Printing a simple accuracy metric (for demonstration)
correct = sum(1 for pred, label in zip(predicted_labels, test_rdd.collect()) if np.argmax(pred) == label.label - 1)
accuracy = correct / float(test_rdd.count())
print("Test set accuracy:", accuracy)

# Stop Spark context
sc.stop()


ModuleNotFoundError: No module named 'bigdl.util'