In [1]:
import librosa
import os
import numpy as np
import pandas as pd

# Assuming recordings_path is the path to your AudioMNIST dataset
recordings_path = '/content/drive/MyDrive/CMPE_255_DataPrep/Task5/AudioMNIST/recordings'  # Adjust this path as needed

# List all audio files
audio_files = [f for f in os.listdir(recordings_path) if f.endswith('.wav')]


In [2]:
mfcc_features = []
labels = []

for file in audio_files:
    y, sr = librosa.load(os.path.join(recordings_path, file), sr=None)

    # Extract MFCCs and take the mean across time frames
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).mean(axis=1)
    mfcc_features.append(mfccs)

    # Assuming the label is part of the filename, e.g., "3_jackson_43.wav" for digit 3.
    labels.append(int(file.split("_")[0]))

# Create a DataFrame
df = pd.DataFrame(mfcc_features)
df['label'] = labels




In [3]:
pip install pandas-profiling


Collecting pandas-profiling
  Downloading pandas_profiling-3.6.6-py2.py3-none-any.whl (324 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.4/324.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ydata-profiling (from pandas-profiling)
  Downloading ydata_profiling-4.6.1-py2.py3-none-any.whl (357 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m357.5/357.5 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting pydantic>=2 (from ydata-profiling->pandas-profiling)
  Downloading pydantic-2.4.2-py3-none-any.whl (395 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.8/395.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting visions[type_image_path]==0.7.5 (from ydata-profiling->pandas-profiling)
  Downloading visions-0.7.5-py3-none-any.whl (102 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.7/102.7 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting htmlmin==0.1.12 (from yda

In [7]:
!pip install pandas_profiling --upgrade
!pip install typing_extensions --upgrade




In [9]:
pip install sweetviz


Collecting sweetviz
  Downloading sweetviz-2.2.1-py3-none-any.whl (15.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.1/15.1 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sweetviz
Successfully installed sweetviz-2.2.1


In [11]:
import sweetviz as sv

report = sv.analyze(df)
report.show_html("/content/drive/MyDrive/CMPE_255_DataPrep/Task5/AudioMNIST/sweet_viz.html")


                                             |          | [  0%]   00:00 -> (? left)

Report /content/drive/MyDrive/CMPE_255_DataPrep/Task5/AudioMNIST/sweet_viz.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [13]:
pip install tpot


Collecting tpot
  Downloading TPOT-0.12.1-py3-none-any.whl (87 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/87.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m81.9/87.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.4/87.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting deap>=1.2 (from tpot)
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting update-checker>=0.16 (from tpot)
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting stopit>=1.1.1 (from tpot)
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected 

In [14]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

X = df.drop(columns=['label'])
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train, y_train)
print("Test Score:", tpot.score(X_test, y_test))


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9119999999999999

Generation 2 - Current best internal CV score: 0.9351111111111111

Generation 3 - Current best internal CV score: 0.9351111111111111

Generation 4 - Current best internal CV score: 0.9466666666666667

Generation 5 - Current best internal CV score: 0.9466666666666667

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.9500000000000001, min_samples_leaf=1, min_samples_split=3, n_estimators=100)
Test Score: 0.9466666666666667
