In [12]:
# main.py
import boto3
from decouple import config
import sqlite3
import torch
import pathlib as Path
import boto3
import torchaudio
from tqdm import tqdm
import numpy as np
import os
from smart_open import open

from crossmodal_alignment.retrieval_model import TransformersModel

In [2]:

# Load the AWS/S3 and database settings through python-decouple's `config`
# (per the original note, the values live in the project's .env file).
# `main()` and `build_audio_index_s3()` read these module-level names as
# globals when they create their boto3 clients, so this cell has to run
# before either function is called.
aws_access_key_id = config('AWS_ACCESS_KEY_ID')
aws_secret_access_key = config('AWS_SECRET_ACCESS_KEY')
bucket_name = config('AWS_BUCKET_NAME')  # bucket that is listed for audio objects
bucket_region = config('AWS_DEFAULT_REGION')  # passed to boto3 as `region_name`
database_name = config('DATABASE_NAME')  # handed to sqlite3.connect() in main()

In [3]:



def main(prefix='data/audio/dh-new_scapes'):
    """Register every ``.wav`` object under an S3 prefix in the SQLite database.

    Lists the objects stored under ``prefix`` in the bucket ``bucket_name`` and
    inserts one ``(s3_link, file_name)`` row per ``.wav`` key into the
    ``audio_files`` table, which is expected to exist already. All rows are
    committed together once the listing has been fully processed.

    Reads the module-level settings ``aws_access_key_id``,
    ``aws_secret_access_key``, ``bucket_region``, ``bucket_name`` and
    ``database_name``.

    Args:
        prefix: S3 key prefix to scan. Defaults to the value that used to be
            hard-coded, so ``main()`` without arguments behaves as before.
    """
    # Initialize the S3 client
    s3 = boto3.client('s3',
                      aws_access_key_id=aws_access_key_id,
                      aws_secret_access_key=aws_secret_access_key,
                      region_name=bucket_region)

    # Connect to the SQLite database
    conn = sqlite3.connect(database_name)
    try:
        cursor = conn.cursor()

        # A single list_objects_v2 call returns at most 1000 keys. The
        # paginator follows the continuation tokens so that larger prefixes
        # are not silently truncated.
        paginator = s3.get_paginator('list_objects_v2')
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
            # Pages without matches simply have no 'Contents' entry.
            for obj in page.get('Contents', []):
                key = obj['Key']
                if not key.endswith(".wav"):
                    continue
                print(f"Reading file: {key}")

                # Only the link is stored, so the object itself is not opened:
                # opening it would cost one extra S3 request per file without
                # using any of the returned data.
                s3_link = f's3://{bucket_name}/{key}'
                cursor.execute(
                    "INSERT INTO audio_files (s3_link, file_name) VALUES (?, ?)",
                    (s3_link, key),
                )

        # Commit only after every row was inserted successfully.
        conn.commit()
    finally:
        # Always release the connection, even when listing or inserting fails.
        conn.close()




In [4]:
def load_audio_input(audio_path, sampling_rate: int) -> torch.Tensor:
    """Load an audio source and return it as a mono waveform at ``sampling_rate``.

    Args:
        audio_path: The source handed straight to ``torchaudio.load``. In this
            notebook that is an open binary file object (the S3 stream), not
            only a filesystem path. The parameter is intentionally left
            unannotated: the module-level name ``Path`` is bound to the
            ``pathlib`` module itself, so it cannot describe this argument.
        sampling_rate: Target sample rate the audio is resampled to.

    Returns:
        The resampled waveform averaged over dimension 0.
    """
    waveform, source_rate = torchaudio.load(audio_path)
    waveform = torchaudio.functional.resample(waveform, source_rate, sampling_rate)
    # torchaudio.load returns (channels, frames) by default, so averaging over
    # dimension 0 downmixes multi-channel audio to a single channel.
    return waveform.mean(0)

In [15]:

def _iter_s3_keys(s3, bucket_name, prefix):
    """Yield each object key under ``prefix``, following S3 pagination."""
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # Pages without matches simply have no 'Contents' entry.
        for obj in page.get('Contents', []):
            key = obj['Key']
            # Keys ending in "/" are folder placeholders (such as the ones the
            # S3 console creates), not audio, and would fail to decode.
            if not key.endswith('/'):
                yield key


def build_audio_index_s3(bucket_name, folder_path, _audio_encoder, sampling_rate, max_files=10):
    """Embed the audio objects under an S3 prefix and stack the embeddings.

    Reads the module-level settings ``aws_access_key_id``,
    ``aws_secret_access_key`` and ``bucket_region`` to create the S3 client.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        folder_path: Key prefix whose objects are indexed.
        _audio_encoder: Callable applied to each mono waveform returned by
            ``load_audio_input``; must return a tensor. All returned tensors
            need the same shape because they are combined with ``torch.stack``.
        sampling_rate: Sample rate every file is resampled to before encoding.
        max_files: Upper bound on the number of objects to index. Defaults to
            10, the limit that used to be hard-coded; pass ``None`` to index
            every object under the prefix.

    Returns:
        A tuple ``(embeddings, s3_file_names)``: the stacked embeddings and the
        matching ``s3://`` URIs, in listing order.

    Raises:
        RuntimeError: If no object was found under ``folder_path``.
    """
    from itertools import islice

    # Initialize the S3 client
    s3 = boto3.client('s3',
                      aws_access_key_id=aws_access_key_id,
                      aws_secret_access_key=aws_secret_access_key,
                      region_name=bucket_region)

    s3_file_names = []
    audios = []

    # islice(..., None) places no limit; listing is lazy, so pages beyond the
    # requested number of files are never fetched.
    for key in islice(_iter_s3_keys(s3, bucket_name, folder_path), max_files):
        print(f"Reading file: {key}")
        s3_file_path = f's3://{bucket_name}/{key}'
        # Hand smart_open the client built above so the download uses the same
        # credentials and region as the listing, instead of whatever the
        # default boto3 credential chain happens to resolve to.
        with open(s3_file_path, 'rb', transport_params={'client': s3}) as file:
            input_audio = load_audio_input(file, sampling_rate)
        # The waveform is fully loaded at this point, so the S3 stream can be
        # closed before the (potentially slow) encoder runs.
        audios.append(_audio_encoder(input_audio))
        s3_file_names.append(s3_file_path)

    if not audios:
        # torch.stack([]) would raise an opaque RuntimeError; keep the
        # exception type but say what actually went wrong.
        raise RuntimeError(
            f"No audio objects found under s3://{bucket_name}/{folder_path}"
        )

    return torch.stack(audios), s3_file_names


In [16]:
# Key prefix, relative to the bucket root, that holds the reference audio.
folder_path = 'dh-new_scapes/'
model = TransformersModel()

# Embed the reference audio with the model's audio encoder, resampling each
# file to the rate the model reports.
ref_audios, ref_names = build_audio_index_s3(
    bucket_name,
    folder_path,
    model.get_audio_embedding,
    sampling_rate=model.sampling_rate,
)



Reading file: dh-new_scapes/uniform_soundscape_air_conditioner_car_horn_1.wav


It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


Reading file: dh-new_scapes/uniform_soundscape_air_conditioner_car_horn_12.wav


It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


Reading file: dh-new_scapes/uniform_soundscape_air_conditioner_car_horn_4.wav


It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


Reading file: dh-new_scapes/uniform_soundscape_air_conditioner_car_horn_8.wav


It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


Reading file: dh-new_scapes/uniform_soundscape_air_conditioner_car_horn_9.wav


It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


Reading file: dh-new_scapes/uniform_soundscape_air_conditioner_children_playing_1.wav


It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


Reading file: dh-new_scapes/uniform_soundscape_air_conditioner_children_playing_12.wav


It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


Reading file: dh-new_scapes/uniform_soundscape_air_conditioner_children_playing_4.wav


It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


Reading file: dh-new_scapes/uniform_soundscape_air_conditioner_children_playing_8.wav


It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


Reading file: dh-new_scapes/uniform_soundscape_air_conditioner_children_playing_9.wav


It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [19]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Access the environment variables
AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')

# Rest of your Streamlit app code...
# Confirm the key was picked up without echoing it: cell output is saved with
# the notebook, so printing the raw value would leak the credential to anyone
# who can read the file. Only the 4-character prefix is shown.
if AWS_ACCESS_KEY_ID:
    print(AWS_ACCESS_KEY_ID[:4] + '*' * max(len(AWS_ACCESS_KEY_ID) - 4, 0))
else:
    print("AWS_ACCESS_KEY_ID is not set")

AKIA****************
