<a href="https://colab.research.google.com/github/graylan0/customer-qml/blob/main/GPT4_Vision_Audio_Aesthetics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+https://github.com/suno-ai/bark.git

In [None]:
!pip install openai
!pip install librosa

In [None]:
from google.colab import userdata
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import base64
import requests
from bark import SAMPLE_RATE, generate_audio, preload_models
from scipy.io.wavfile import write as write_wav
import uuid
import os
import concurrent.futures


# Bark's preload_models(): presumably loads the model weights ahead of the
# first generate_audio call — TODO confirm against the Bark docs.
preload_models()
# Export the Colab secret into the process environment. The request code in
# this file reads the key with os.getenv('OPENAI_API_KEY'); merely calling
# userdata.get() and discarding the result leaves that lookup returning None.
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')


def generate_audio_for_sentence(sentence):
    """Generate audio for a single piece of text via Bark's ``generate_audio``.

    Args:
        sentence: The text handed unchanged to ``generate_audio``.

    Returns:
        Whatever ``generate_audio`` returns, or ``None`` if it raised. The
        error is printed rather than propagated so one bad element does not
        abort a whole batch.
    """
    try:
        waveform = generate_audio(sentence)
    except Exception as err:
        print("Error in generating audio for sentence: {}".format(err))
        return None
    return waveform

def generate_response(song_elements, num_threads=4):
    """Render song elements to audio and write them to a new WAV file.

    Each element is a dict. Elements that have a ``'lyrics'`` or ``'music'``
    key are rendered with ``generate_audio_for_sentence`` (``'lyrics'`` wins
    when both are present); all other elements are ignored. When a rendered
    element also has a ``'pause'`` key, that many seconds of silence follow
    its audio.

    Args:
        song_elements: Iterable of element dicts as described above.
        num_threads: Size of the thread pool used to render elements.

    Returns:
        The name of the written ``<uuid4>.wav`` file, or ``None`` when no
        audio was produced or the file could not be written.
    """
    playable = [
        element for element in song_elements
        if 'lyrics' in element or 'music' in element
    ]

    pieces = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [
            executor.submit(
                generate_audio_for_sentence,
                element.get('lyrics', element.get('music')),
            )
            for element in playable
        ]
        # Collect results in submission order, not completion order, so the
        # clips are concatenated in the same sequence as the song elements.
        for element, future in zip(playable, futures):
            try:
                audio_array = future.result()
                if audio_array is not None:
                    pieces.append(audio_array)
                    if 'pause' in element:
                        silence = np.zeros(int(element['pause'] * SAMPLE_RATE))
                        pieces.append(silence)
            except Exception as e:
                print("Error processing element: {}".format(e))

    if not pieces:
        return None

    audio = np.concatenate(pieces)
    if np.issubdtype(audio.dtype, np.floating):
        # Float samples are assumed to be nominally in [-1, 1] (TODO confirm
        # for Bark). Casting them straight to int16 truncates nearly every
        # sample to 0, i.e. silence, so scale to the int16 range first.
        audio = np.clip(audio, -1.0, 1.0) * 32767
    pcm = audio.astype(np.int16)

    file_name = str(uuid.uuid4()) + ".wav"
    try:
        write_wav(file_name, SAMPLE_RATE, pcm)
        return file_name
    except Exception as e:
        print("Error writing audio file: {}".format(e))
        return None

def create_spectrogram(audio_path, save_path):
    """Render a mel spectrogram of an audio file and save it as an image.

    Args:
        audio_path: Path of the audio file, loaded with ``librosa.load``.
        save_path: Destination passed to ``plt.savefig``.

    Returns:
        None. Errors are printed rather than raised, so on failure no image
        is guaranteed to exist at ``save_path``.
    """
    fig = None
    try:
        y, sr = librosa.load(audio_path)
        fig = plt.figure(figsize=(10, 4))
        S = librosa.feature.melspectrogram(y=y, sr=sr)
        # Convert power to decibels relative to the peak before plotting.
        librosa.display.specshow(librosa.power_to_db(S, ref=np.max), sr=sr, x_axis='time', y_axis='mel')
        plt.colorbar(format='%+2.0f dB')
        plt.title('Mel-frequency spectrogram')
        plt.tight_layout()
        plt.savefig(save_path)
    except Exception as e:
        print("Error creating spectrogram: {}".format(e))
    finally:
        # Close the figure on the error path too; otherwise every failed
        # call leaves an open figure behind.
        if fig is not None:
            plt.close(fig)

def encode_image(image_path):
    """Return the contents of a file as a base64-encoded UTF-8 string.

    Args:
        image_path: Path of the file to read in binary mode.

    Returns:
        The base64 text, or ``None`` if reading or encoding failed (the
        error is printed).
    """
    try:
        with open(image_path, "rb") as handle:
            raw_bytes = handle.read()
        encoded = base64.b64encode(raw_bytes)
        return encoded.decode('utf-8')
    except Exception as err:
        print("Error encoding image: {}".format(err))
        return None

def gpt4_vision_agent_prompt(image_path, timeout=60):
    """Ask the OpenAI chat-completions endpoint to describe a spectrogram image.

    The image is base64-encoded with ``encode_image`` and sent inline as a
    ``data:image/jpeg`` URL together with a fixed text prompt.

    Args:
        image_path: Path of the image file to analyse.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON response, or ``None`` if the image could not be
        encoded, the ``OPENAI_API_KEY`` environment variable is unset, or
        the request failed (errors are printed).
    """
    base64_image = encode_image(image_path)
    if not base64_image:
        return None

    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        # Without this check the header would literally read "Bearer None".
        print("Error in GPT-4 Vision Agent request: OPENAI_API_KEY is not set")
        return None

    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer {}".format(api_key)
    }
    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Analyze the aesthetics of this spectrogram. Describe its visual qualities, patterns, and any notable features."},
                    {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,{}".format(base64_image)}}
                ]
            }
        ],
        "max_tokens": 300
    }
    try:
        # requests applies no timeout by default, so a stalled connection
        # would otherwise block the caller forever.
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()
    except Exception as e:
        print("Error in GPT-4 Vision Agent request: {}".format(e))
        return None

def process_audio_and_analyze_aesthetics(song_elements):
    """Render song elements to audio, plot a spectrogram, and have it analysed.

    Args:
        song_elements: Element dicts passed to ``generate_response``.

    Returns:
        The result of ``gpt4_vision_agent_prompt`` for the new spectrogram,
        or ``None`` if no audio file or no spectrogram image was produced.
    """
    audio_file = generate_response(song_elements)
    if not audio_file:
        return None

    spectrogram_path = 'spectrogram.jpg'
    # The same path is reused on every call and create_spectrogram only
    # prints its errors, so drop any image left by an earlier call; otherwise
    # a failed render would silently analyse the previous spectrogram.
    try:
        os.remove(spectrogram_path)
    except FileNotFoundError:
        pass
    except OSError as e:
        print("Error removing stale spectrogram: {}".format(e))
        return None

    create_spectrogram(audio_file, spectrogram_path)
    if not os.path.exists(spectrogram_path):
        return None
    return gpt4_vision_agent_prompt(spectrogram_path)

def gpt4_audio_influencer_mixer_engineer_prompt(analysis_result):
    """Turn a spectrogram analysis into a request for music modifications.

    Args:
        analysis_result: Chat-completions style response dict; the text is
            read from ``['choices'][0]['message']['content']``.

    Returns:
        Whatever ``interact_with_gpt35`` returns for the built prompt, or
        ``None`` when ``analysis_result`` is falsy or any step raises (the
        error is printed).
    """
    if analysis_result:
        try:
            first_choice = analysis_result['choices'][0]
            visual_descriptions = first_choice['message']['content']
            prompt = "Based on the following spectrogram analysis: {}, suggest modifications to the music elements to enhance its aesthetic appeal and emotional impact.".format(visual_descriptions)
            return interact_with_gpt35(prompt)
        except Exception as err:
            print("Error in GPT-4 audio influencer mixer engineer prompt: {}".format(err))
    return None

def super_sim_advance_ai(number_of_iterations=5, song_elements=None):
    """Iteratively render, analyse, and adjust a list of song elements.

    Each iteration calls ``process_audio_and_analyze_aesthetics`` on the
    current elements and, when an analysis comes back, replaces them with
    the result of ``adjust_song_elements``.

    Args:
        number_of_iterations: How many render/analyse/adjust rounds to run.
        song_elements: Optional starting elements. Defaults to an empty
            list, in which case no element can be rendered and each round
            is expected to produce no analysis.

    Returns:
        The song elements after the final iteration.
    """
    # Copy so the caller's list object is never rebound or shared.
    song_elements = list(song_elements) if song_elements is not None else []
    for _ in range(number_of_iterations):
        analysis_result = process_audio_and_analyze_aesthetics(song_elements)
        if analysis_result:
            # NOTE(review): adjust_song_elements is not defined in the visible
            # part of this file; confirm it exists before relying on this.
            song_elements = adjust_song_elements(song_elements, analysis_result)
    return song_elements

# Run the iterative loop with its default arguments as soon as this cell/script executes.
super_sim_advance_ai()
