<a href="https://colab.research.google.com/github/dadaocao/ai-music-generator/blob/main/AI_Lyrics_Generator_Team_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# See integration and demo on: https://github.com/dadaocao/ai-music-generator.git

# Install
!pip install transformers datasets torch kaggle

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [None]:
from google.colab import files
files.upload()  # Upload kaggle.json

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"melodyxy","key":"8f36a50cf3111883efcf083aa058fc6e"}'}

In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset
!kaggle datasets download -d karnikakapoor/lyrics
!unzip lyrics.zip

Dataset URL: https://www.kaggle.com/datasets/karnikakapoor/lyrics
License(s): CC0-1.0
Archive:  lyrics.zip
  inflating: Songs.csv               


In [None]:
import pandas as pd
from datasets import Dataset

# Loading data
df = pd.read_csv("Songs.csv")

# Check the structure
print(df.head())

# Clean data: Delete empty values ​​+ merge Artist and Title as context
df = df.dropna(subset=["Lyrics"])  # Delete lines without lyrics
df["context"] = "Artist: " + df["Artist"] + " | Title: " + df["Title"]  # Added context column
df = df[["context", "Lyrics"]]  # Keep only the required columns
print(df.head())

         Artist                     Title  \
0  Taylor Swift                  cardigan   
1  Taylor Swift                     exile   
2  Taylor Swift                     Lover   
3  Taylor Swift                     the 1   
4  Taylor Swift  Look What You Made Me Do   

                                              Lyrics  
0  Vintage tee, brand new phone\nHigh heels on co...  
1  I can see you standing, honey\nWith his arms a...  
2  We could leave the Christmas lights up 'til Ja...  
3  I'm doing good, I'm on some new shit\nBeen say...  
4  I don't like your little games\nDon't like you...  
                                             context  \
0             Artist: Taylor Swift | Title: cardigan   
1                Artist: Taylor Swift | Title: exile   
2                Artist: Taylor Swift | Title: Lover   
3                Artist: Taylor Swift | Title: the 1   
4  Artist: Taylor Swift | Title: Look What You Ma...   

                                              Lyrics  
0  Vint

In [None]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1)  # 90% train，10% test

# demo
print(dataset["train"][0])



In [None]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # set token

def tokenize_function(examples):
    # Merge context and lyrics
    inputs = [f"{ctx}\nLyrics:\n{lyric}" for ctx, lyric in zip(examples["context"], examples["Lyrics"])]
    # Add labels
    tokenized = tokenizer(
        inputs,
        truncation=True,
        padding="max_length",
        max_length=256,
        return_tensors="pt"  # Return PyTorch
    )
    tokenized["labels"] = tokenized["input_ids"].clone()  # Copy input_ids as labels
    return tokenized


tokenized_dataset = dataset.map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/670 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

In [None]:
print(tokenized_dataset["train"].features)
print(tokenized_dataset["train"][0].keys())

{'context': Value(dtype='string', id=None), 'Lyrics': Value(dtype='string', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}
dict_keys(['context', 'Lyrics', 'input_ids', 'attention_mask', 'labels'])


In [None]:
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer

# 1. Load model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# 2. Training parameter configuration
training_args = TrainingArguments(
    output_dir="./lyrics_finetuned",  # Set path
    per_device_train_batch_size=8,    # Adjustment according to GPU memory
    num_train_epochs=2,               # Epoch
    logging_steps=100,                # Print logs every 100 steps
    save_steps=500,                   # Save the model every 500 steps
    evaluation_strategy="steps",      # Evaluate by step
    eval_steps=500,                   # Evaluate every 500 steps
    fp16=True,                        # GPU acceleration (must support mixed precision)?
    remove_unused_columns=False,      # Prevent labels from being automatically deleted
    report_to="none"                  # Disable W&B logging
)

# 3. Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# 4. Start training!
trainer.train()

# 5. Save the finely tuned model
model.save_pretrained("./my_lyrics_model")
tokenizer.save_pretrained("./my_lyrics_model")  # Save word analyzer

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


('./my_lyrics_model/tokenizer_config.json',
 './my_lyrics_model/special_tokens_map.json',
 './my_lyrics_model/vocab.json',
 './my_lyrics_model/merges.txt',
 './my_lyrics_model/added_tokens.json')

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Loading fine-tuned models and word participlers
model_path = "./my_lyrics_model"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Loading fine-tuned models and word participlers
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
print(f"Model loaded on: {device}")

Model loaded on: cpu


In [None]:
def generate_lyrics(artist, title, max_length=200, temperature=0.7):
    # Prompt input
    prompt = f"Artist: {artist} | Title: {title}\nLyrics:\n"

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate lyrics
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        temperature=temperature,
        do_sample=True,
        top_k=50,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode output
    lyrics = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return lyrics

In [None]:
# Test Case：Generate Taylor Swift style lyrics
lyrics1 = generate_lyrics(
    artist="Taylor Swift",
    title="Midnight Rain",
    temperature=0.9  # Control creativity/repeatability（0.1~1.0）
)
print(lyrics1)

Artist: Taylor Swift | Title: Midnight Rain
Lyrics:
Mama, I'll be right back
Baby, you never know what's to come
I can't wait until I hear it from you
Slay my face on the ground
It's all mine
I'll be right back

But it's not right, it's not right

It's not right I'm gonna kill you
I can't wait to get back
I can't wait to get back

Oh, I guess that means you're gonna stay
I don't wanna kill you

Mama, it's gonna be fine
I can't wait for you

Mama, no more pain
Oh my god
Baby, I love you

Mama, you're such a lovely girl
Oh, I like you
Oh, I like you

Baby, you're so much more than me
Oh, I just had to have you
Oh


In [None]:
from google.colab import files
!zip -r my_lyrics_model.zip ./my_lyrics_model
files.download("my_lyrics_model.zip")

  adding: my_lyrics_model/ (stored 0%)
  adding: my_lyrics_model/special_tokens_map.json (deflated 74%)
  adding: my_lyrics_model/tokenizer_config.json (deflated 56%)
  adding: my_lyrics_model/generation_config.json (deflated 24%)
  adding: my_lyrics_model/model.safetensors (deflated 7%)
  adding: my_lyrics_model/config.json (deflated 51%)
  adding: my_lyrics_model/vocab.json (deflated 68%)
  adding: my_lyrics_model/merges.txt (deflated 53%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>