# AudioGen

In [1]:
from audiocraft.models import AudioGen

# Checkpoint to download. The saved output of this cell shows a CUDA
# out-of-memory error while loading it on a GPU with 3.63 GiB of memory.
AUDIOGEN_CHECKPOINT = 'facebook/audiogen-medium'
model = AudioGen.get_pretrained(AUDIOGEN_CHECKPOINT)

  WeightNorm.apply(module, name, dim)


OutOfMemoryError: CUDA out of memory. Tried to allocate 18.00 MiB. GPU 0 has a total capacity of 3.63 GiB of which 8.94 MiB is free. Including non-PyTorch memory, this process has 3.62 GiB memory in use. Of the allocated memory 3.48 GiB is allocated by PyTorch, and 77.56 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [2]:
# Sampling setup for AudioGen: top-k sampling with k=250 and a duration of 5.
audiogen_generation_params = dict(use_sampling=True, top_k=250, duration=5)
model.set_generation_params(**audiogen_generation_params)

NameError: name 'model' is not defined

## Audio Continuation

In [None]:
import math
import torchaudio
import torch
from audiocraft.utils.notebook import display_audio

def get_bip_bip(bip_duration=0.125, frequency=440,
                duration=0.5, sample_rate=16000, device="cuda"):
    """Generate a gated cosine tone (a series of "bip bip").

    Args:
        bip_duration: Length in seconds of each silent / sounding half-cycle.
        frequency: Tone frequency in Hz.
        duration: Total length of the signal in seconds.
        sample_rate: Number of samples per second.
        device: Torch device on which the signal is created.

    Returns:
        Float tensor of shape ``(1, int(duration * sample_rate))``.
    """
    # Honour the requested device rather than always allocating on CUDA.
    t = torch.arange(
        int(duration * sample_rate), device=device, dtype=torch.float) / sample_rate
    wav = torch.cos(2 * math.pi * frequency * t)[None]
    # Relative position inside each on/off cycle of length 2 * bip_duration.
    tp = (t % (2 * bip_duration)) / (2 * bip_duration)
    # Silent during the first half of every cycle, sounding during the second.
    envelope = (tp >= 0.5).float()
    return wav * envelope

In [None]:
# Prompt the generation with a synthetic signal, expanded to one copy per description.
bip_prompt = get_bip_bip(0.125).expand(2, -1, -1)
continuation_descriptions = ['Whistling with wind blowing',
                             'Typing on a typewriter']
res = model.generate_continuation(
    bip_prompt, 16000, continuation_descriptions, progress=True)
display_audio(res, 16000)

In [None]:
# Any audio file can serve as the prompt; trim it first if it is too long.
# Alternative prompt: "../assets/sirens_and_a_humming_engine_approach_and_pass.mp3"
prompt_waveform, prompt_sr = torchaudio.load("Yun Hi Chala Chal.mp3")

prompt_duration = 6
# Keep only the first `prompt_duration` seconds along the last (time) axis.
prompt_num_samples = int(prompt_duration * prompt_sr)
prompt_waveform = prompt_waveform[..., :prompt_num_samples]
output = model.generate_continuation(
    prompt_waveform,
    prompt_sample_rate=prompt_sr,
    progress=True,
)
display_audio(output, sample_rate=16000)

## Text-conditional Generation

In [None]:
from audiocraft.utils.notebook import display_audio

# Text prompts passed to the model in a single call.
sound_descriptions = [
    'Ar Rahman music styled tabla',
    # 'Subway train blowing its horn',
    'A cat meowing',
]
output = model.generate(descriptions=sound_descriptions, progress=True)
display_audio(output, sample_rate=16000)

# MAGNeT

In [None]:
from audiocraft.models import MAGNeT

# Checkpoint used for the text-to-music section below.
MAGNET_MUSIC_CHECKPOINT = 'facebook/magnet-small-10secs'
model = MAGNeT.get_pretrained(MAGNET_MUSIC_CHECKPOINT)

In [None]:
# The first entry of `decoding_steps` scales with the model's configured
# segment duration; the remaining three entries are fixed at 10.
first_stage_steps = int(20 * model.lm.cfg.dataset.segment_duration // 10)
model.set_generation_params(
    use_sampling=True,
    top_k=0,
    top_p=0.9,
    temperature=3.0,
    max_cfg_coef=10.0,
    min_cfg_coef=1.0,
    decoding_steps=[first_stage_steps, 10, 10, 10],
    span_arrangement='stride1',
)

### Text-conditional Generation - Music

In [None]:
from audiocraft.utils.notebook import display_audio

###### Text-to-music prompts - examples ######
# Exactly one `text` assignment should be active; the others are alternatives.
text = "80s bollywood music with deep instrumentals and base, hindi lyrics and very existential meaning"
# text = "80s electronic track with melodic synthesizers, catchy beat and groovy bass. 170 bpm"
# text = "Earthy tones, environmentally conscious, ukulele-infused, harmonic, breezy, easygoing, organic instrumentation, gentle grooves"
# text = "Funky groove with electric piano playing blue chords rhythmically"
# text = "Rock with saturated guitars, a heavy bass line and crazy drum break and fills."
# text = "A grand orchestral arrangement with thunderous percussion, epic brass fanfares, and soaring strings, creating a cinematic atmosphere fit for a heroic battle"

# The same prompt is submitted N_VARIATIONS times in one call.
N_VARIATIONS = 3
descriptions = [text] * N_VARIATIONS

print(f"text prompt: {text}\n")
output = model.generate(
    descriptions=descriptions,
    progress=True,
    return_tokens=True,
)
# Only the first element of the returned pair is played back.
generated_audio = output[0]
display_audio(generated_audio, sample_rate=model.compression_model.sample_rate)

In [None]:
### Text-conditional Generation - Sound Effects

In [None]:
from audiocraft.models import MAGNeT

# Checkpoint used for the sound-effects section; rebinds `model`.
MAGNET_AUDIO_CHECKPOINT = 'facebook/audio-magnet-small'
model = MAGNeT.get_pretrained(MAGNET_AUDIO_CHECKPOINT)

In [None]:
# The first entry of `decoding_steps` scales with the model's configured
# segment duration; the remaining three entries are fixed at 10.
first_stage_steps = int(20 * model.lm.cfg.dataset.segment_duration // 10)
model.set_generation_params(
    use_sampling=True,
    top_k=0,
    top_p=0.8,
    temperature=3.5,
    max_cfg_coef=20.0,
    min_cfg_coef=1.0,
    decoding_steps=[first_stage_steps, 10, 10, 10],
    span_arrangement='stride1',
)

In [None]:
from audiocraft.utils.notebook import display_audio

###### Text-to-audio prompts - examples ######
# Exactly one `text` assignment should be active; the other is an alternative.
text = "Seagulls squawking as ocean waves crash while wind blows heavily into a microphone."
# text = "A toilet flushing as music is playing and a man is singing in the distance."

# The same prompt is submitted N_VARIATIONS times in one call.
N_VARIATIONS = 3
descriptions = [text] * N_VARIATIONS

print(f"text prompt: {text}\n")
output = model.generate(
    descriptions=descriptions,
    progress=True,
    return_tokens=True,
)
# Only the first element of the returned pair is played back.
generated_audio = output[0]
display_audio(generated_audio, sample_rate=model.compression_model.sample_rate)

# MusicGen


In [None]:
from audiocraft.models import MultiBandDiffusion, MusicGen

# Toggle for the MultiBandDiffusion decoder used by later cells.
USE_DIFFUSION_DECODER = True
# Using small model, better results would be obtained with `medium` or `large`.
MUSICGEN_CHECKPOINT = 'facebook/musicgen-small'
model = MusicGen.get_pretrained(MUSICGEN_CHECKPOINT)
# `mbd` is only defined when the toggle is on; later cells guard on the same flag.
if USE_DIFFUSION_DECODER:
    mbd = MultiBandDiffusion.get_mbd_musicgen()

In [None]:
# Sampling setup for MusicGen: top-k sampling with k=250 and a duration of 30.
musicgen_generation_params = dict(use_sampling=True, top_k=250, duration=30)
model.set_generation_params(**musicgen_generation_params)

### Music Continuation

In [None]:
import math
import torchaudio
import torch
from audiocraft.utils.notebook import display_audio

def get_bip_bip(bip_duration=0.125, frequency=440,
                duration=0.5, sample_rate=32000, device="cuda"):
    """Generate a gated cosine tone (a series of "bip bip").

    Args:
        bip_duration: Length in seconds of each silent / sounding half-cycle.
        frequency: Tone frequency in Hz.
        duration: Total length of the signal in seconds.
        sample_rate: Number of samples per second.
        device: Torch device on which the signal is created.

    Returns:
        Float tensor of shape ``(1, int(duration * sample_rate))``.
    """
    # Honour the requested device rather than always allocating on CUDA.
    t = torch.arange(
        int(duration * sample_rate), device=device, dtype=torch.float) / sample_rate
    # Use the `frequency` argument instead of a hard-coded 440 Hz.
    wav = torch.cos(2 * math.pi * frequency * t)[None]
    # Relative position inside each on/off cycle of length 2 * bip_duration.
    tp = (t % (2 * bip_duration)) / (2 * bip_duration)
    # Silent during the first half of every cycle, sounding during the second.
    envelope = (tp >= 0.5).float()
    return wav * envelope

In [None]:
# A synthetic signal prompts both the tonality and the BPM of the generated
# audio; it is expanded to one copy per description.
bip_prompt = get_bip_bip(0.125).expand(2, -1, -1)
continuation_descriptions = [
    'Jazz jazz and only jazz',
    'Hindi tabla with flute and sitar',
]
res = model.generate_continuation(
    bip_prompt, 32000, continuation_descriptions, progress=True)
display_audio(res, 32000)

In [None]:
# Any audio file can serve as the prompt; trim it first if it is too long.
# Alternative prompt: torchaudio.load("../assets/bach.mp3")
prompt_waveform, prompt_sr = torchaudio.load("Yun Hi Chala Chal.mp3")
prompt_duration = 8
# Keep only the first `prompt_duration` seconds along the last (time) axis.
prompt_num_samples = int(prompt_duration * prompt_sr)
prompt_waveform = prompt_waveform[..., :prompt_num_samples]
output = model.generate_continuation(
    prompt_waveform,
    prompt_sample_rate=prompt_sr,
    progress=True,
    return_tokens=True,
)
# output[0] is played directly; output[1] is handed to the diffusion decoder.
generated_audio, generated_tokens = output[0], output[1]
display_audio(generated_audio, sample_rate=32000)
if USE_DIFFUSION_DECODER:
    out_diffusion = mbd.tokens_to_wav(generated_tokens)
    display_audio(out_diffusion, sample_rate=32000)

In [None]:
from audiocraft.utils.notebook import display_audio

# Active prompts; the commented entries are alternatives to try.
music_descriptions = [
    #'80s pop track with bassy drums and synth',
    #'90s rock song with loud guitars and heavy drums',
    #'Progressive rock drum and bass solo',
    #'Punk Rock song with loud drum and power guitar',
    #'Bluesy guitar instrumental with soulful licks and a driving rhythm section',
    #'Jazz Funk song with slap bass and powerful saxophone',
    # 'drum and bass beat with intense percussions',
    'Hindi tabla with flute and sitar',
]
output = model.generate(
    descriptions=music_descriptions,
    progress=True,
    return_tokens=True,
)
# output[0] is played directly; output[1] is handed to the diffusion decoder.
generated_audio, generated_tokens = output[0], output[1]
display_audio(generated_audio, sample_rate=32000)
if USE_DIFFUSION_DECODER:
    out_diffusion = mbd.tokens_to_wav(generated_tokens)
    display_audio(out_diffusion, sample_rate=32000)

### Melody-conditional Generation

In [None]:
import torchaudio
from audiocraft.utils.notebook import display_audio

# Switch to the melody-conditioned checkpoint; this rebinds `model`.
model = MusicGen.get_pretrained('facebook/musicgen-melody')
model.set_generation_params(duration=8)

melody_waveform, sr = torchaudio.load("Yun Hi Chala Chal.mp3")
# Add a leading axis and repeat the melody twice, once per description below.
melody_waveform = melody_waveform.unsqueeze(0).repeat(2, 1, 1)
melody_descriptions = [
    # '80s pop track with bassy drums and synth',
    '90s bollywood music with deep instrumentals and base, hindi lyrics and very existential meaning',
    'Hindi tabla with flute and sitar',
]
output = model.generate_with_chroma(
    descriptions=melody_descriptions,
    melody_wavs=melody_waveform,
    melody_sample_rate=sr,
    progress=True,
    return_tokens=True,
)
# output[0] is played directly; output[1] is handed to the diffusion decoder.
generated_audio, generated_tokens = output[0], output[1]
display_audio(generated_audio, sample_rate=32000)
if USE_DIFFUSION_DECODER:
    out_diffusion = mbd.tokens_to_wav(generated_tokens)
    display_audio(out_diffusion, sample_rate=32000)

# Create Dataset

In [None]:
import json
import csv
import pandas as pd

file_path = 'prompt_golden_data.csv'


def csv_to_json(csv_file_path, json_file_path):
    """Convert a CSV file with a header row into a JSON array of objects.

    Each CSV row becomes one JSON object keyed by the header names; all
    values are kept as strings, exactly as ``csv.DictReader`` yields them.

    Args:
        csv_file_path: Path of the CSV file to read (decoded as UTF-8).
        json_file_path: Path of the JSON file to write (overwritten if present).
    """
    # newline='' leaves line-ending handling to the csv module, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(csv_file_path, 'r', newline='', encoding='utf-8') as csv_file:
        data = list(csv.DictReader(csv_file))

    # Write the data to a JSON file
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4)

    print(f"CSV file '{csv_file_path}' has been converted to JSON file '{json_file_path}'.")

# Example usage
# Convert the raw CSV to JSON, then reshape every song into an
# instruction / input / output record for supervised fine-tuning.
csv_file_path = 'prompt_golden_data.csv'
json_file_path = 'music_lyrics.json'
# The other cells of this notebook register and read the dataset under this
# path, so write it there rather than to an unused 'restructured.json'.
dataset_file_path = 'data/music_lyrics_generation.json'

csv_to_json(csv_file_path, json_file_path)

# Step 1: Load the original JSON data
with open(json_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Step 2: Modify the data structure.
# Every field goes on its own "label: value" line so values no longer run
# straight into the next label.
new_data = [
    {
        'instruction': 'Write a song lyric based on the given inputs and verse prompt.',
        'input': '\n'.join([
            'genres: ' + song['genres'],
            'progression: ' + song['progression'],
            'start_key: ' + song['start_key'],
            'verse prompt: ' + song['prompts'],
        ]),
        'output': 'lyrics: ' + song['processed_lyrics'],
    }
    for song in data
]

# Step 3: Write the modified data to a new JSON file
with open(dataset_file_path, 'w', encoding='utf-8') as file:
    json.dump(new_data, file, indent=4)



# Register the dataset in data/dataset_info.json.
# The file is read, updated in memory and then rewritten: a handle opened
# with mode 'r' cannot be written to.
dataset_info_path = 'data/dataset_info.json'
with open(dataset_info_path, 'r', encoding='utf-8') as file:
    dataset_info = json.load(file)

# The registry is indexed by dataset name, so create the entry if it is missing
# instead of calling list-style .append() on it.
dataset_entry = dataset_info.setdefault('music_lyrics_generation', {})
dataset_entry['file_name'] = 'music_lyrics_generation.json'
# NOTE(review): this hash is hard-coded; confirm it matches the generated file.
dataset_entry['file_sha1'] = '7df69e4325ad88feef052b3c086b4434867b120a'

with open(dataset_info_path, 'w', encoding='utf-8') as file:
    json.dump(dataset_info, file, indent=4)
    


# Create a melody for the generated lyrics

In [None]:
import os
import sys
sys.path.append("../")
import yaml
import json
import numpy as np
import torch
import math
import copy
import re
from transformers import AutoModel, AutoTokenizer
# Checkpoint to load; swap in the commented line to try the pretrain variant.
# ckpt_path = '/Mar2Ding/songcomposer_pretrain'
ckpt_path = 'Mar2Ding/songcomposer_sft'
tokenizer = AutoTokenizer.from_pretrained(ckpt_path, trust_remote_code=True)
# Load the model, then move it to the GPU and convert it to half precision.
model = AutoModel.from_pretrained(ckpt_path, trust_remote_code=True)
model = model.cuda().half()

In [None]:
# Lyrics-to-melody request: the instruction, then the lyric lines between
# the <bol> and <eol> markers.
prompt = (
    'Compose a tune in harmony with the accompanying lyrics. <bol> Total 6 lines.'
    'The first line: NLP, the class that sets us free\n'
    'The second line: Prof. Srihari and Sayantal, our guiding team\n'
    'The third line: Natural Language Processing, oh so fine They want to grade us high, all the time\n'
    'The fourth line: Parsing, disambiguating, we’re on a roll NLP, you’re in our soul\n'
    'The fifth line: Natural Language Processing, oh so fine They want to grade us high, all the time\n'
    'The sixth line: Prof. Srihari, Sayantal Pal, we thank you NLP, our passion true\n<eol>'
)
model.inference(prompt, tokenizer)

In [None]:
# Melody text handed to gen_midi. Each "The ... line:" section holds
# '|'-separated notes written as three comma-separated <...> tokens.
# NOTE(review): presumably <pitch> , <duration> , <rest>; confirm against gen_midi.
line = 'The first line:<E4> , <154> , <88> |<E4> , <134> , <88> |<E4> , <137> , <79> |<F#4> , <151> , <79> |<E4> , <154> , <79> |<D#4> , <154> , <79> | <C#4> , <157> , <79> | <B3> , <172> , <127> The second line:<E4> , <151> , <88> |<E4> , <137> , <88> |<E4> , <137> , <79> |<F#4> , <151> , <79> |<E4> , <151> , <79> |<D#4> , <160> , <79> |<C#4> , <157> , <79> The third line:<B3> , <151> , <79> |<G#3> , <137> , <79> |<B3> , <151> , <79> |<G#3> , <189> , <79> |<F#3> , <157> , <79> |<G#3> , <137> , <79> The fourth line:<G#3> , <147> , <79> |<F#3> , <144> , <79> |<E3> , <151> , <79> |<F#3> , <141> , <79> |<G#3> , <166> , <79> |<B3> , <219> , <160> The fifth line:<E4> , <154> , <88> |<E4> , <130> , <88> |<E4> , <144> , <79> |<F#4> , <147> , <79> |<E4> , <157> , <79> |<D#4> , <154> , <79> |<C#4> , <151> , <79> |<B3> , <118> , <79> |<B3> , <118> , <79> |<G#3> , <207> , <79> |<B3> , <205> , <79> |的, <G#3> , <205> , <79>'
# gen_midi comes from the project-local `finetune` package.
from finetune.utils import gen_midi
# NOTE(review): the second argument ('text') is presumably the output name; confirm.
gen_midi(line, 'text')

# Fine-tune Llama 3 model

In [None]:
# Environment setup: start from a fresh clone of LLaMA-Factory under /content.
%cd /content/
# Remove any previous checkout so the clone below does not fail.
%rm -rf LLaMA-Factory
# The clone is not pinned to a tag or commit, so it tracks the default branch.
!git clone https://github.com/hiyouga/LLaMA-Factory.git
%cd LLaMA-Factory
%ls
# Install unsloth from its git repository with the colab-new extra (unpinned).
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# xformers is pinned to 0.0.25 and installed without its dependencies.
!pip install --no-deps xformers==0.0.25
# Install the cloned repository itself with the bitsandbytes extra.
!pip install .[bitsandbytes]

In [None]:
import torch

# Warn when no CUDA device is visible. A plain conditional is used because
# `assert` statements are removed when Python runs with -O, which would
# silently disable the check.
if not torch.cuda.is_available():
  print("Please set up a GPU before using LLaMA Factory")

## Fine-tune model via Command Line


In [None]:
%cd /content/LLaMA-Factory/
!GRADIO_SHARE=1 llamafactory-cli webui


import json

args = dict(
  stage="sft",                        # do supervised fine-tuning
  do_train=True,
  model_name_or_path="unsloth/llama-3-8b-Instruct-bnb-4bit", # use bnb-4bit-quantized Llama-3-8B-Instruct model
  dataset="identity,alpaca_gpt4_en,",             # use alpaca and identity datasets
  template="llama3",                     # use llama3 prompt template
  finetuning_type="lora",                   # use LoRA adapters to save memory
  lora_target="all",                     # attach LoRA adapters to all linear layers
  output_dir="llama3_lora",                  # the path to save LoRA adapters
  per_device_train_batch_size=2,               # the batch size
  gradient_accumulation_steps=4,               # the gradient accumulation steps
  lr_scheduler_type="cosine",                 # use cosine learning rate scheduler
  logging_steps=10,                      # log every 10 steps
  warmup_ratio=0.1,                      # use warmup scheduler
  save_steps=1000,                      # save checkpoint every 1000 steps
  learning_rate=5e-5,                     # the learning rate
  num_train_epochs=3.0,                    # the epochs of training
  max_samples=500,                      # use 500 examples in each dataset
  max_grad_norm=1.0,                     # clip gradient norm to 1.0
  quantization_bit=4,                     # use 4-bit QLoRA
  loraplus_lr_ratio=16.0,                   # use LoRA+ algorithm with lambda=16.0
  use_unsloth=True,                      # use UnslothAI's LoRA optimization for 2x faster training
  fp16=True,                         # use float16 mixed precision training
)

json.dump(args, open("train_llama3.json", "w", encoding="utf-8"), indent=2)

%cd /content/LLaMA-Factory/

!llamafactory-cli train train_llama3.json

In [None]:
import json

# Sanity check: load the generated dataset and show its first record.
with open("data/music_lyrics_generation.json", "r", encoding="utf-8") as dataset_file:
  dataset = json.load(dataset_file)

first_records = dataset[:1]
print(first_records)

### Infer the fine-tuned model

In [None]:
from llmtuner.chat import ChatModel
from llmtuner.extras.misc import torch_gc

%cd /content/LLaMA-Factory/

args = dict(
  model_name_or_path="unsloth/llama-3-8b-Instruct-bnb-4bit", # use bnb-4bit-quantized Llama-3-8B-Instruct model
  adapter_name_or_path="llama3_lora",            # load the saved LoRA adapters
  template="llama3",                     # same to the one in training
  finetuning_type="lora",                  # same to the one in training
  quantization_bit=4,                    # load 4-bit quantized model
  use_unsloth=True,                     # use UnslothAI's LoRA optimization for 2x faster generation
)
chat_model = ChatModel(args)

messages = []
print("Welcome to the CLI application, use `clear` to remove the history, use `exit` to exit the application.")
while True:
  query = input("\nUser: ")
  if query.strip() == "exit":
    break
  if query.strip() == "clear":
    messages = []
    torch_gc()
    print("History has been removed.")
    continue

  messages.append({"role": "user", "content": query})
  print("Assistant: ", end="", flush=True)

  response = ""
  for new_text in chat_model.stream_chat(messages):
    print(new_text, end="", flush=True)
    response += new_text
  print()
  messages.append({"role": "assistant", "content": response})

torch_gc()

In [None]:
# The repository is cloned without pinning a version, and the package was
# renamed from `llmtuner` to `llamafactory`; try the new name first and fall
# back to the old one.
try:
  from llamafactory.chat import ChatModel
  from llamafactory.extras.misc import torch_gc
except ImportError:
  from llmtuner import ChatModel
  from llmtuner.extras.misc import torch_gc


chat_model = ChatModel(dict(
  model_name_or_path="unsloth/llama-3-8b-Instruct-bnb-4bit", # use bnb-4bit-quantized Llama-3-8B-Instruct model
  adapter_name_or_path="llama3_lora",            # load the saved LoRA adapters
  finetuning_type="lora",                  # same to the one in training
  template="llama3",                     # same to the one in training
  quantization_bit=4,                    # load 4-bit quantized model
  use_unsloth=True,                     # use UnslothAI's LoRA optimization for 2x faster generation
))

# Built-in example request, sent when the user submits an empty line.
# It is assembled from adjacent literals because a plain "..." string cannot
# span several source lines.
EXAMPLE_QUERY = (
  "'instruction': 'Write a song lyric based on the given inputs and verse prompt.'\n"
  "\n"
  "'user': 'genres': ['canadian pop', 'pop', 'post-teen pop']\n"
  "        'progression':['A', 'Em', 'G']\n"
  "        'start_key': 'Bm'\n"
  "        'verse prompt': 'I was fifteen when the world put me on a pedestal and told me Im the best.'\n"
)

messages = []
print("Press Enter on an empty line to send the built-in example prompt; use `clear` to reset the history and `exit` to quit.")
while True:
  query = input("\nUser: ")
  if query.strip() == "exit":
    break

  if query.strip() == "clear":
    messages = []
    torch_gc()
    print("History has been removed.")
    continue

  # The generation steps run at loop level; placed after `continue` inside
  # the `clear` branch they could never execute.
  if not query.strip():
    query = EXAMPLE_QUERY

  messages.append({"role": "user", "content": query})     # add query to messages
  print("Assistant: ", end="", flush=True)
  response = ""
  for new_text in chat_model.stream_chat(messages):      # stream generation
    print(new_text, end="", flush=True)
    response += new_text
  print()
  messages.append({"role": "assistant", "content": response}) # add response to messages

torch_gc()