
<div class="alert alert-success" style = "border-radius:10px;border-width:3px;border-color:white;font-family:Verdana,sans-serif;font-size:16px;">
<h2>Audio files exploration</h2>

This Jupyter notebook covers the initial data-exploration phase of our project. We begin by downloading the audio files needed for the analysis from the designated S3 bucket. Once they are downloaded, we run a series of exploratory analyses on them, such as inspecting the sampling rate and the duration of each audio file.

In [None]:
# Optional one-time environment setup, kept commented out so a normal run
# does not reinstall anything. pydub is imported by the next cell; ffmpeg is
# presumably the decoder pydub relies on for MP3 input -- TODO confirm.
#!pip install pydub
#!conda install -c anaconda ffmpeg -y

In [7]:
import json
import boto3
import os
from sagemaker.jumpstart import utils
from pydub import AudioSegment

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [8]:
# Location of a single sample recording: key prefix (folder) and file name.
key_prefix = "generali-es-sandbox-crm-voice-samples/set-sac-002/"
input_audio_file_name = "194167844752840717.mp3"

# JumpStart content bucket for the region of the default boto3 session.
s3_bucket = utils.get_jumpstart_content_bucket(boto3.Session().region_name)

# Low-level S3 client.
s3_client = boto3.client("s3")

In [10]:
# Download every object stored directly under the `key_prefix` folder of the
# bucket into the local `output_folder`, collecting the file names on the way.
s3 = boto3.resource('s3')
s3_client = boto3.client("s3")
my_bucket = s3.Bucket('generali-es-sandbox-crm-voice-samples')
key_prefix = 'set-sac-002'  # Specify the folder you want to process
output_folder = 'set-sac-002'  # Specify the output folder name
string_list = []

# The destination directory must exist before downloading; otherwise every
# download fails and the loop below would only print one error per file.
os.makedirs(output_folder, exist_ok=True)

for my_bucket_object in my_bucket.objects.filter(Prefix=key_prefix):
    parts = my_bucket_object.key.split('/')
    # Keep only keys shaped like "<folder>/<file name>": this skips the
    # folder placeholder (empty file name) and anything nested deeper.
    if len(parts) == 2 and parts[1]:
        string_list.append(parts[1])

        try:
            # Download the object and save it inside `output_folder`
            output_file_path = os.path.join(output_folder, parts[1])
            s3_client.download_file(my_bucket.name, my_bucket_object.key, output_file_path)
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error downloading {my_bucket_object.key}: {e}")

print(string_list)

['194168605277650801.mp3', '194168606434562669.mp3', '194168606612365203.mp3', '194168614568951389.mp3', '194168621997425321.mp3', '194168623392741371.mp3', '194168629939411029.mp3', '194168631809044828.mp3', '194168632660947924.mp3', '194168846235520202.mp3', '194168848538950766.mp3', '19416885444238515.mp3', '194168856040440527.mp3', '194168857982752288.mp3', '194168863289911256.mp3', '194168863866624258.mp3', '194168863895925074.mp3', '194168864624538335.mp3', '194168865336742168.mp3', '194168865805644855.mp3', '194168865817044938.mp3', '194168866156947235.mp3', '19416889747815611.mp3', '194168899709947375.mp3', '194168899958848899.mp3', '194168900261251454.mp3', '194168900479353328.mp3', '194168915454420857.mp3', '194168926105658648.mp3', '19416893179071571.mp3', '194168932324814157.mp3', '194168935090249590.mp3', '1941689574131503.mp3', '194168958983631781.mp3', '194168959096134437 (1).mp3', '194168959096134437.mp3', '194168977813847916.mp3', '194168986571440776.mp3', '19416898739

In [18]:
# Load the audio file
# (a single sample recording, decoded with pydub, used as a quick sanity check)
audio = AudioSegment.from_mp3("samples_prueba/194168605277650801.mp3")

# Determine the sampling rate
sampling_rate = audio.frame_rate
print("Sampling Rate:", sampling_rate, "Hz")
# Length of the clip; len() of the segment is treated as milliseconds
# elsewhere in this notebook (see get_audio_duration_ms).
print(len(audio))

Sampling Rate: 8000 Hz
306640


In [12]:
def get_audio_duration_ms(audio_path):
    """Decode the MP3 file at ``audio_path`` and return its length.

    Args:
        audio_path: Path of the MP3 file to load.

    Returns:
        The duration of the audio in milliseconds (``len`` of the decoded
        ``AudioSegment``).
    """
    return len(AudioSegment.from_mp3(audio_path))

In [13]:
def seconds_to_min_sec(seconds):
    """Format a duration in seconds as the string 'X min and Y seconds'.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        A string with the whole minutes and the remaining seconds, each
        truncated to an integer.
    """
    whole_minutes, leftover = divmod(seconds, 60)
    return f"{int(whole_minutes)} min and {int(leftover)} seconds"

In [15]:
# Compute and print total / max / min / mean duration of the .mp3 files
# found directly inside `audio_folder`.

# Directory containing the audio files
audio_folder = 'all_samples'

# List to store the duration (in milliseconds) of each audio file
durations_ms = []

# Iterate over each file in the directory, keeping only .mp3 files
for filename in os.listdir(audio_folder):
    if filename.endswith('.mp3'):
        file_path = os.path.join(audio_folder, filename)
        duration = get_audio_duration_ms(file_path)
        durations_ms.append(duration)

# Without this guard an empty folder fails further down with the opaque
# "max() arg is an empty sequence"; raise the same exception type with a
# message that names the actual problem.
if not durations_ms:
    raise ValueError(f"No .mp3 files found in '{audio_folder}'")

# Convert durations to seconds for easier interpretation
durations_sec = [d / 1000 for d in durations_ms]

# Calculate statistics
total_duration_sec = sum(durations_sec)
max_duration_sec = max(durations_sec)
min_duration_sec = min(durations_sec)
mean_duration_sec = total_duration_sec / len(durations_sec)

print(f"Total Time: {seconds_to_min_sec(total_duration_sec)}")
print(f"Max Time: {seconds_to_min_sec(max_duration_sec)}")
print(f"Min Time: {seconds_to_min_sec(min_duration_sec)}")
print(f"Mean Time: {seconds_to_min_sec(mean_duration_sec)}")

Total Time: 511 min and 50 seconds
Max Time: 9 min and 54 seconds
Min Time: 2 min and 3 seconds
Mean Time: 5 min and 48 seconds
