<a href="https://colab.research.google.com/github/designingEmergence/CircuitBendingTests/blob/main/colab_notebooks/Circuit_Bend_Text_to_Audio_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Mount Google Drive
# The experiment log and the generated .wav samples are written to paths under
# /content/drive (see Step 4), so Drive has to be mounted before anything else.

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Step 2: Install packages

!pip install --upgrade pip
!pip install --upgrade git+https://github.com/huggingface/transformers.git scipy soundfile

In [None]:
#Step 3: Import libraries

import torch
import torch.nn as nn
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from IPython.display import Audio
import os
import csv
from datetime import datetime
import copy
import soundfile as sf

In [None]:
#Step 4: Setup directories

# -----REPLACE THESE TWO PATHS WITH YOUR OWN DIRECTORIES-----
# log_directory holds the CSV experiment log; samples_directory holds the .wav samples.
log_directory = '/content/drive/MyDrive/Projects/Circuit_Bending_AI/experiment-logs'
samples_directory = '/content/drive/MyDrive/Projects/Circuit_Bending_AI/experiment-logs/text-to-audio-samples'

# Create both directories up front (no error if they already exist).
os.makedirs(log_directory, exist_ok=True)
os.makedirs(samples_directory, exist_ok=True)

# Start the CSV log with a header row, but only the first time, so that
# rows from earlier sessions are kept.
log_file = os.path.join(log_directory, 'experiment_log_text_to_audio.csv')
if not os.path.isfile(log_file):
    with open(log_file, 'w', newline='') as log_handle:
        csv.writer(log_handle).writerow(
            ['timestamp', 'model_name', 'layer_name', 'noise_factor', 'prompt', 'output_file']
        )


In [None]:
#Step 5: Import Model and processor from Hugging Face

# Checkpoint shared by the model and its processor. Change it in this one place
# to try another checkpoint, e.g. "facebook/musicgen-medium".
model_name = "facebook/musicgen-stereo-small"

# Use the GPU when one is available; otherwise fall back to the CPU so the
# model can still be loaded.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = MusicgenForConditionalGeneration.from_pretrained(model_name).to(device)
processor = AutoProcessor.from_pretrained(model_name)

config.json:   0%|          | 0.00/7.87k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/8.04G [00:00<?, ?B/s]

  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "leng

generation_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
#Step 6: Define functions

#Add noise to specific layers in a model
def add_noise_to_weights(model, noise_level=0.01, layer_query=None):
  """Return a deep copy of `model` with random noise added to selected parameters.

  The original model is not modified. The copy is made with `copy.deepcopy`,
  so the call needs enough free memory for a second full set of weights.

  Args:
    model: Module exposing `named_parameters()`.
    noise_level: Multiplier applied to standard-normal noise; each selected
      parameter receives `torch.randn_like(param) * noise_level`.
    layer_query: A substring, or an iterable of substrings, matched against
      parameter names. A parameter is selected when any substring occurs in
      its name. `None` (the default) selects every parameter.

  Returns:
    The noisy copy of `model`.
  """
  print ("Bending model weights...")

  # create a copy of the model to preserve the original
  model_copy = copy.deepcopy(model)

  # Normalise layer_query to a list. Materialising it matters for one-shot
  # iterables (generators, iterators): they would otherwise be exhausted after
  # the first parameter and never match any later one.
  if isinstance(layer_query, str):
    layer_query = [layer_query]
  elif layer_query is not None:
    layer_query = list(layer_query)

  bent_count = 0
  # Edit the weights in place without recording the change in autograd.
  with torch.no_grad():
    for layer_name, param in model_copy.named_parameters():
      # If layer_query isn't passed, all layers will be manipulated
      if layer_query is None or any(query in layer_name for query in layer_query):
        print(f"Bending {layer_name}...")
        param.add_(torch.randn_like(param) * noise_level)
        bent_count += 1

  # Make a query that matched nothing visible: otherwise the "bent" model is
  # just an identical copy and the experiment is silently meaningless.
  if bent_count == 0:
    print(f"Warning: no parameter name matched {layer_query!r}; the returned model is an unmodified copy.")

  return model_copy

def log_experiment(model_name, layer_name, noise_factor, prompt, output_file):
  """Append one experiment record to the CSV log at the module-level `log_file` path.

  The row is stamped with the current local time and written in the column
  order: timestamp, model_name, layer_name, noise_factor, prompt, output_file.
  """
  row = [
    datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    model_name,
    layer_name,
    noise_factor,
    prompt,
    output_file,
  ]
  # Open in append mode so earlier rows are preserved.
  with open(log_file, 'a', newline='') as log_handle:
    csv.writer(log_handle).writerow(row)

def save_audio_results(model, layer_query, noise_level, prompt, audio_values, samples_directory):
    """
    Saves the generated audio to a WAV file and logs the experiment details.

    The file is named ``<timestamp>_<layer label>_<noise_level>_<prompt>.wav``
    (with any '/' replaced by '_') and is written into `samples_directory`.
    On success a row is appended to the experiment log via `log_experiment`;
    if writing fails, the error is printed and nothing is logged.

    Args:
        model: The model that produced the audio. Its config supplies the
            sampling rate and the model name recorded in the log.
        layer_query: The layer query used for noise injection. Accepts the
            same forms as `add_noise_to_weights`: a string, an iterable of
            strings (joined with '+'), or None (labelled 'all').
        noise_level: The amount of noise added to the layer.
        prompt: The text prompt used for audio generation.
        audio_values: The generated audio data. Only the first item,
            transposed, is written (presumably a (batch, channels, samples)
            array -- confirm against the caller).
        samples_directory: The directory to save the audio file.
    """
    # Build a plain-string label so that every layer_query form accepted by
    # add_noise_to_weights can be used in the file name and in the log.
    if layer_query is None:
        layer_label = 'all'
    elif isinstance(layer_query, str):
        layer_label = layer_query
    else:
        layer_label = '+'.join(str(query) for query in layer_query)

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    output_file_name = timestamp + '_' + layer_label + '_' + str(noise_level) + '_' + prompt
    sampling_rate = model.config.audio_encoder.sampling_rate

    # Replace '/' with '_' in the output_file_name to avoid potential path issues in Google Drive
    output_file_name = output_file_name.replace('/', '_')

    output_file = os.path.join(samples_directory, output_file_name + '.wav')

    # Best effort: report a failed write instead of stopping the notebook.
    try:
        sf.write(output_file, audio_values[0].T, sampling_rate)
        log_experiment(model.config.name_or_path, layer_label, noise_level, prompt, output_file_name)
    except Exception as e:
        print(f"Error saving audio file: {e}")
        # Additional debugging information
        print(f"Output file path: {output_file}")

In [None]:
#Step 7: Run Standard Model
prompt = "high pitched arpeggiated bleeps"

# Tokenize the prompt and place it on whatever device the model lives on.
inputs = processor(
    text=[prompt],
    padding=True,
    return_tensors="pt",
).to(model.device)

# max_new_tokens defines the length of the audio: 256 tokens is about
# 5 seconds, so 512 should give roughly twice that.
audio_values = model.generate(**inputs, max_new_tokens=512)
sampling_rate = model.config.audio_encoder.sampling_rate

# Move the result to the CPU as a NumPy array, then drop the input tensors
# and release cached GPU memory before the next step.
audio_values = audio_values.cpu().numpy()
del inputs
torch.cuda.empty_cache()

Audio(audio_values[0], rate=sampling_rate)

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 22.17 GiB of which 2.88 MiB is free. Process 49466 has 22.16 GiB memory in use. Of the allocated memory 20.49 GiB is allocated by PyTorch, and 1.44 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
#Step 7.5: Save Standard Model Run

# The unmodified model is recorded with layer 'standard' and noise level '0'.
save_audio_results(
    model=model,
    layer_query='standard',
    noise_level='0',
    prompt=prompt,
    audio_values=audio_values,
    samples_directory=samples_directory,
)

In [None]:
#Step 8: Bend model
#adjust the noise level and layer_query to configure the bend


#between 0 and 1; a suggested starting point is 0.05, then work your way up (or down)
noise_level = 0.01

#Full list of layers and their parameters can be found here
#https://aluminum-canid-c49.notion.site/Bending-Layer-Types-Text-to-Audio-1599a3a56ba78009ace8f287dd6d56fb?pvs=4
layer_query = "decoder.model.decoder.embed_tokens"

# Release any bent model left over from a previous run of this cell before
# making a new one. add_noise_to_weights deep-copies the whole model, so
# keeping the old copy alive during the call can exhaust GPU memory.
bent_model = None
torch.cuda.empty_cache()

bent_model = add_noise_to_weights(model, noise_level=noise_level, layer_query=layer_query)

Bending model weights...


OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB. GPU 0 has a total capacity of 22.17 GiB of which 2.88 MiB is free. Process 49466 has 22.16 GiB memory in use. Of the allocated memory 20.49 GiB is allocated by PyTorch, and 1.44 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
#Step 9: Run Bent Model

prompt = "jazzy piano"

# Tokenize the prompt and place it on whatever device the bent model lives on.
inputs = processor(
    text= [prompt],
    padding=True,
    return_tensors="pt",
).to(bent_model.device)

audio_values = bent_model.generate(**inputs, max_new_tokens=512)
sampling_rate = bent_model.config.audio_encoder.sampling_rate

# Move the result to the CPU as a NumPy array, then drop the input tensors
# and release cached GPU memory (same cleanup as the standard-model run).
audio_values = audio_values.cpu().numpy()
del inputs
torch.cuda.empty_cache()

Audio(audio_values[0], rate=sampling_rate)

In [None]:
# Step 10: Save results

# Write the bent model's audio to the samples directory and record the run
# (layer query, noise level, prompt) in the experiment log.
save_audio_results(
    model=bent_model,
    layer_query=layer_query,
    noise_level=noise_level,
    prompt=prompt,
    audio_values=audio_values,
    samples_directory=samples_directory,
)