# Train LLMs for rap generation

Check runtime is set to GPU. Hopefully Tesla T4 shows up below.

In [1]:
!nvidia-smi

Fri Mar 31 09:44:23 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install transformers datasets
from google.colab import drive
drive.mount('./mydata')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
Collectin

## Get train and test sets

In [3]:
import pandas as pd

In [4]:
from datasets import Dataset
train_df = pd.read_csv('mydata/MyDrive/train.csv')
test_df = pd.read_csv('mydata/MyDrive/test.csv')
train_lyrics = [lyric for lyric in list(train_df['lyric']) if isinstance(lyric, str)]
test_lyrics = [lyric for lyric in list(test_df['lyric']) if isinstance(lyric, str)]
train = Dataset.from_dict({'text': train_lyrics})
test = Dataset.from_dict({'text': test_lyrics})
print(train, test)

Dataset({
    features: ['text'],
    num_rows: 16824
}) Dataset({
    features: ['text'],
    num_rows: 4208
})


## Baseline

### Strong baseline: GPT2-large

In [15]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

device = "cuda"
model_id = "gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [16]:
from datasets import load_dataset
datasets = load_dataset("csv", data_files={
    "train": 'mydata/MyDrive/train.csv', "validation": 'mydata/MyDrive/test.csv'
    })
datasets = datasets.filter(lambda example: isinstance(example["lyric"], str))
datasets



  0%|          | 0/2 [00:00<?, ?it/s]



DatasetDict({
    train: Dataset({
        features: ['artist', 'song', 'lyric'],
        num_rows: 16824
    })
    validation: Dataset({
        features: ['artist', 'song', 'lyric'],
        num_rows: 4208
    })
})

In [17]:
def tokenize_function(examples):
    return tokenizer(examples['lyric'], truncation=True, max_length=1024)

In [18]:
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["artist", "song", "lyric"])
tokenized_datasets

Map (num_proc=4):   0%|          | 0/16824 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4208 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 16824
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4208
    })
})

In [19]:
block_size = 128
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [20]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/16824 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4208 [00:00<?, ? examples/s]

In [21]:
from transformers import Trainer, TrainingArguments
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-not_trained",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    weight_decay=0.01,
    num_train_epochs=2, 
)
# Don't train, just define to get perplexity

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [23]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 72.25


### Weak baseline: Distilled GPT2

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda"
model_checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_checkpoint).to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/353M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [6]:
from datasets import load_dataset
datasets = load_dataset("csv", data_files={
    "train": 'mydata/MyDrive/train.csv', "validation": 'mydata/MyDrive/test.csv'
    })
datasets = datasets.filter(lambda example: isinstance(example["lyric"], str))
datasets

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-7957b17e5f9de427/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-7957b17e5f9de427/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Filter:   0%|          | 0/16877 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4220 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['artist', 'song', 'lyric'],
        num_rows: 16824
    })
    validation: Dataset({
        features: ['artist', 'song', 'lyric'],
        num_rows: 4208
    })
})

In [7]:
def tokenize_function(examples):
    return tokenizer(examples['lyric'], truncation=True, max_length=1024)

In [8]:
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["artist", "song", "lyric"])
tokenized_datasets

Map (num_proc=4):   0%|          | 0/16824 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4208 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 16824
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4208
    })
})

In [9]:
block_size = 128
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [10]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/16824 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4208 [00:00<?, ? examples/s]

In [12]:
from transformers import Trainer, TrainingArguments
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-not_trained",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    weight_decay=0.01,
    num_train_epochs=2, 
)
# Don't train, just define to get perplexity

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [14]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 132.44


## Fine-tuning

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda"
model_checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForCausalLM.from_pretrained('mydata/MyDrive/model').to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset
datasets = load_dataset("csv", data_files={
    "train": 'mydata/MyDrive/train.csv', "validation": 'mydata/MyDrive/test.csv'
    })
datasets = datasets.filter(lambda example: isinstance(example["lyric"], str))
datasets

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-7957b17e5f9de427/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-7957b17e5f9de427/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Filter:   0%|          | 0/16877 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4220 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['artist', 'song', 'lyric'],
        num_rows: 16824
    })
    validation: Dataset({
        features: ['artist', 'song', 'lyric'],
        num_rows: 4208
    })
})

In [None]:
datasets['train']['lyric'][0]

'knockin doors down showin parts around\nima come through and show my wood pine\nknockin knockin doors down showin parts around\nima ima come through and show my wood pine ima ima come\nknockin doors knockin knockin doors down down\nknockin doors knock knock knock knock knockin doors down\nimaima come through ima ima come down\nima come through ima ima come down \npaper chasin on occasion\nwest deer park thats my location\nlife and death what we was facin\nboy gotta get that motivation\nback in the basement\nlook at my idols im feelin adjacent\ndoin the shit that i wish i was doing im feelin impatient\nthrowin parties and catching cases\nthats the life but that shit basic\nive cooked crack i cant erase it\nif i could would not replace it\ngrew up all alone\nhad to teach myself to tie my laces\nwhats the deal\nima play the field\nrattpack we keep it real\nnothin but love for the people that paved the way\nso a brother like me he can get it today\nantoine erykah and 3k\ncant forget those

In [None]:
def tokenize_function(examples):
    return tokenizer(examples['lyric'], truncation=True, max_length=1024)

In [None]:
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["artist", "song", "lyric"])
tokenized_datasets

Map (num_proc=4):   0%|          | 0/16824 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4208 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 16824
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4208
    })
})

In [None]:
tokenizer.decode(tokenized_datasets['train']['input_ids'][0])

'knockin doors down showin parts around\nima come through and show my wood pine\nknockin knockin doors down showin parts around\nima ima come through and show my wood pine ima ima come\nknockin doors knockin knockin doors down down\nknockin doors knock knock knock knock knockin doors down\nimaima come through ima ima come down\nima come through ima ima come down \npaper chasin on occasion\nwest deer park thats my location\nlife and death what we was facin\nboy gotta get that motivation\nback in the basement\nlook at my idols im feelin adjacent\ndoin the shit that i wish i was doing im feelin impatient\nthrowin parties and catching cases\nthats the life but that shit basic\nive cooked crack i cant erase it\nif i could would not replace it\ngrew up all alone\nhad to teach myself to tie my laces\nwhats the deal\nima play the field\nrattpack we keep it real\nnothin but love for the people that paved the way\nso a brother like me he can get it today\nantoine erykah and 3k\ncant forget those

In [None]:
block_size = 128
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/16824 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4208 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-finetuned-rap1",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    weight_decay=0.01,
    num_train_epochs=2, 
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,3.6379,3.748195
2,3.6065,3.712628


TrainOutput(global_step=22560, training_loss=3.582404184003248, metrics={'train_runtime': 5138.2216, 'train_samples_per_second': 35.122, 'train_steps_per_second': 4.391, 'total_flos': 5894397424041984.0, 'train_loss': 3.582404184003248, 'epoch': 2.0})

In [None]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 40.96


In [None]:
trainer.save_model('mydata/MyDrive/model')

# Evaluation

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!pip install eng_to_ipa syllables

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting eng_to_ipa
  Downloading eng_to_ipa-0.0.2.tar.gz (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m78.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting syllables
  Downloading syllables-1.0.7-py3-none-any.whl (15 kB)
Collecting importlib-metadata<6.0.0,>=5.1.0
  Downloading importlib_metadata-5.2.0-py3-none-any.whl (21 kB)
Collecting cmudict<2.0.0,>=1.0.11
  Downloading cmudict-1.0.13-py3-none-any.whl (939 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.3/939.3 KB[0m [31m66.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: eng_to_ipa
  Building wheel for eng_to_ipa (setup.py) ... [?25l[?25hdone
  Created wheel for eng_to_ipa: filename=eng_to_ipa-0.0.2-py3-none-any.whl size=2822631 sha256=3daa7bcfdf9f4eac0f64cc2262f5ec365dd0c26e505535debc

In [None]:
import lyrics

In [None]:
import importlib
importlib.reload(lyrics)

<module 'lyrics' from '/content/lyrics.py'>

In [None]:
first_lines = [song[:song.index('\n')+1] for song in test_df['lyric'] if isinstance(song, str) and '\n' in song]

In [None]:
first_lines[0]

'godfathers in the house\n'

## Our fine-tuned model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained("distilgpt2", use_fast=True)
model = AutoModelForCausalLM.from_pretrained('mydata/MyDrive/mlp/model').to(device)

In [None]:
import torch
torch.manual_seed(0)

<torch._C.Generator at 0x7f73188f29b0>

In [None]:
rd = []
scd = []
lr = []
uw = []
i = 1
for first_line in first_lines:
  if i % 20 == 1:
    print(f"Processing {i}/{len(first_lines)}")
  i += 1

  input_ids = tokenizer([first_line], return_tensors="pt").to(device).input_ids
  sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_length=100, 
    top_k=50,
    pad_token_id=tokenizer.eos_token_id
  )

  o = tokenizer.decode(sample_output[0], skip_special_tokens=True)
  rd.append(lyrics.get_rhyme_density(o))
  scd.append(lyrics.get_syllable_count_difference(o))
  lr.append(lyrics.get_longest_rhyme(o))
  uw.append(lyrics.get_unique_words(o))

Processing 1/4200
Processing 21/4200
Processing 41/4200
Processing 61/4200
Processing 81/4200
Processing 101/4200
Processing 121/4200
Processing 141/4200
Processing 161/4200
Processing 181/4200
Processing 201/4200


Input length of input_ids is 105, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 221/4200
Processing 241/4200


Input length of input_ids is 114, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 261/4200
Processing 281/4200
Processing 301/4200
Processing 321/4200


Input length of input_ids is 122, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 341/4200
Processing 361/4200
Processing 381/4200
Processing 401/4200
Processing 421/4200
Processing 441/4200
Processing 461/4200


Input length of input_ids is 109, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 481/4200
Processing 501/4200
Processing 521/4200
Processing 541/4200
Processing 561/4200
Processing 581/4200
Processing 601/4200
Processing 621/4200
Processing 641/4200
Processing 661/4200
Processing 681/4200
Processing 701/4200
Processing 721/4200
Processing 741/4200
Processing 761/4200
Processing 781/4200
Processing 801/4200
Processing 821/4200
Processing 841/4200
Processing 861/4200
Processing 881/4200
Processing 901/4200


Input length of input_ids is 120, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 921/4200
Processing 941/4200
Processing 961/4200
Processing 981/4200
Processing 1001/4200
Processing 1021/4200
Processing 1041/4200
Processing 1061/4200
Processing 1081/4200
Processing 1101/4200
Processing 1121/4200
Processing 1141/4200
Processing 1161/4200
Processing 1181/4200
Processing 1201/4200
Processing 1221/4200
Processing 1241/4200
Processing 1261/4200
Processing 1281/4200
Processing 1301/4200
Processing 1321/4200
Processing 1341/4200
Processing 1361/4200
Processing 1381/4200
Processing 1401/4200
Processing 1421/4200
Processing 1441/4200
Processing 1461/4200
Processing 1481/4200
Processing 1501/4200
Processing 1521/4200
Processing 1541/4200
Processing 1561/4200
Processing 1581/4200
Processing 1601/4200
Processing 1621/4200
Processing 1641/4200
Processing 1661/4200
Processing 1681/4200
Processing 1701/4200
Processing 1721/4200
Processing 1741/4200
Processing 1761/4200
Processing 1781/4200
Processing 1801/4200
Processing 1821/4200
Processing 1841/4200
Processing 1861/4

Input length of input_ids is 113, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 2461/4200
Processing 2481/4200
Processing 2501/4200


Input length of input_ids is 106, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 2521/4200
Processing 2541/4200
Processing 2561/4200
Processing 2581/4200
Processing 2601/4200
Processing 2621/4200
Processing 2641/4200
Processing 2661/4200
Processing 2681/4200
Processing 2701/4200
Processing 2721/4200
Processing 2741/4200
Processing 2761/4200
Processing 2781/4200
Processing 2801/4200
Processing 2821/4200
Processing 2841/4200
Processing 2861/4200
Processing 2881/4200
Processing 2901/4200
Processing 2921/4200
Processing 2941/4200
Processing 2961/4200
Processing 2981/4200


Input length of input_ids is 120, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 3001/4200
Processing 3021/4200
Processing 3041/4200
Processing 3061/4200
Processing 3081/4200
Processing 3101/4200
Processing 3121/4200
Processing 3141/4200
Processing 3161/4200
Processing 3181/4200
Processing 3201/4200
Processing 3221/4200
Processing 3241/4200
Processing 3261/4200
Processing 3281/4200
Processing 3301/4200
Processing 3321/4200
Processing 3341/4200
Processing 3361/4200
Processing 3381/4200
Processing 3401/4200
Processing 3421/4200
Processing 3441/4200


Input length of input_ids is 216, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 3461/4200
Processing 3481/4200
Processing 3501/4200
Processing 3521/4200
Processing 3541/4200
Processing 3561/4200
Processing 3581/4200
Processing 3601/4200
Processing 3621/4200
Processing 3641/4200
Processing 3661/4200
Processing 3681/4200


Input length of input_ids is 143, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 3701/4200
Processing 3721/4200
Processing 3741/4200
Processing 3761/4200
Processing 3781/4200
Processing 3801/4200
Processing 3821/4200
Processing 3841/4200
Processing 3861/4200
Processing 3881/4200
Processing 3901/4200
Processing 3921/4200
Processing 3941/4200
Processing 3961/4200
Processing 3981/4200
Processing 4001/4200
Processing 4021/4200
Processing 4041/4200
Processing 4061/4200
Processing 4081/4200
Processing 4101/4200
Processing 4121/4200
Processing 4141/4200
Processing 4161/4200
Processing 4181/4200


In [None]:
import numpy as np
rd = np.array(rd)
scd = np.array(scd)
lr = np.array(lr)
uw = np.array(uw)

In [None]:
print(round(np.mean(rd),2), round(np.std(rd),2))

0.54 0.24


In [None]:
print(round(np.mean(scd),2), round(np.std(scd),2))

2.2 4.57


In [None]:
print(round(np.mean(lr),2), round(np.std(lr),2))

3.3 1.13


In [None]:
print(round(np.mean(uw),2), round(np.std(uw),2))

0.51 0.15


## Baseline DistilGPT2

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda"
model_checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_checkpoint).to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/353M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
import torch
torch.manual_seed(0)

<torch._C.Generator at 0x7f22d42620b0>

In [None]:
prompt = '(Below are ONLY lyrics from a rap song)\n\n'

In [None]:
# rd = []
# scd = []
# lr = []
uw = []
i = 1
for first_line in first_lines:
  if i % 20 == 1:
    print(f"Processing {i}/{len(first_lines)}")
  i += 1

  input_ids = tokenizer([prompt + first_line], return_tensors="pt").to(device).input_ids
  sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_length=100,
    top_k=50,
    pad_token_id=tokenizer.eos_token_id
  )

  o = tokenizer.decode(sample_output[0], skip_special_tokens=True)
  o = o.replace(prompt, '')
  # rd.append(lyrics.get_rhyme_density(o))
  # scd.append(lyrics.get_syllable_count_difference(o))
  # lr.append(lyrics.get_longest_rhyme(o))
  try:
    uw.append(lyrics.get_unique_words(o))
  except ZeroDivisionError:
    continue

Processing 1/4200
Processing 21/4200
Processing 41/4200
Processing 61/4200
Processing 81/4200
Processing 101/4200
Processing 121/4200
Processing 141/4200
Processing 161/4200
Processing 181/4200
Processing 201/4200


Input length of input_ids is 117, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 221/4200
Processing 241/4200


Input length of input_ids is 126, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 261/4200
Processing 281/4200
Processing 301/4200


Input length of input_ids is 106, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 321/4200


Input length of input_ids is 134, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 341/4200
Processing 361/4200
Processing 381/4200
Processing 401/4200
Processing 421/4200
Processing 441/4200
Processing 461/4200


Input length of input_ids is 121, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 481/4200
Processing 501/4200
Processing 521/4200
Processing 541/4200
Processing 561/4200
Processing 581/4200
Processing 601/4200
Processing 621/4200
Processing 641/4200
Processing 661/4200
Processing 681/4200
Processing 701/4200
Processing 721/4200
Processing 741/4200
Processing 761/4200
Processing 781/4200
Processing 801/4200
Processing 821/4200
Processing 841/4200
Processing 861/4200
Processing 881/4200
Processing 901/4200


Input length of input_ids is 132, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 921/4200
Processing 941/4200
Processing 961/4200
Processing 981/4200
Processing 1001/4200
Processing 1021/4200
Processing 1041/4200
Processing 1061/4200
Processing 1081/4200
Processing 1101/4200
Processing 1121/4200
Processing 1141/4200
Processing 1161/4200
Processing 1181/4200
Processing 1201/4200
Processing 1221/4200
Processing 1241/4200


Input length of input_ids is 100, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 1261/4200
Processing 1281/4200
Processing 1301/4200
Processing 1321/4200
Processing 1341/4200
Processing 1361/4200
Processing 1381/4200
Processing 1401/4200
Processing 1421/4200
Processing 1441/4200
Processing 1461/4200
Processing 1481/4200
Processing 1501/4200
Processing 1521/4200
Processing 1541/4200
Processing 1561/4200
Processing 1581/4200
Processing 1601/4200
Processing 1621/4200
Processing 1641/4200
Processing 1661/4200
Processing 1681/4200
Processing 1701/4200
Processing 1721/4200
Processing 1741/4200
Processing 1761/4200
Processing 1781/4200
Processing 1801/4200
Processing 1821/4200
Processing 1841/4200
Processing 1861/4200
Processing 1881/4200
Processing 1901/4200
Processing 1921/4200
Processing 1941/4200
Processing 1961/4200
Processing 1981/4200
Processing 2001/4200
Processing 2021/4200
Processing 2041/4200
Processing 2061/4200
Processing 2081/4200
Processing 2101/4200
Processing 2121/4200
Processing 2141/4200
Processing 2161/4200
Processing 2181/4200
Processing 22

Input length of input_ids is 125, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 2461/4200
Processing 2481/4200
Processing 2501/4200


Input length of input_ids is 118, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 2521/4200
Processing 2541/4200
Processing 2561/4200
Processing 2581/4200
Processing 2601/4200
Processing 2621/4200
Processing 2641/4200
Processing 2661/4200
Processing 2681/4200
Processing 2701/4200
Processing 2721/4200
Processing 2741/4200
Processing 2761/4200


Input length of input_ids is 102, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 2781/4200
Processing 2801/4200
Processing 2821/4200
Processing 2841/4200
Processing 2861/4200
Processing 2881/4200
Processing 2901/4200
Processing 2921/4200
Processing 2941/4200
Processing 2961/4200
Processing 2981/4200


Input length of input_ids is 132, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 3001/4200
Processing 3021/4200
Processing 3041/4200
Processing 3061/4200
Processing 3081/4200
Processing 3101/4200
Processing 3121/4200
Processing 3141/4200
Processing 3161/4200
Processing 3181/4200
Processing 3201/4200
Processing 3221/4200
Processing 3241/4200
Processing 3261/4200
Processing 3281/4200
Processing 3301/4200
Processing 3321/4200
Processing 3341/4200
Processing 3361/4200
Processing 3381/4200
Processing 3401/4200
Processing 3421/4200
Processing 3441/4200


Input length of input_ids is 228, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 3461/4200
Processing 3481/4200
Processing 3501/4200
Processing 3521/4200
Processing 3541/4200
Processing 3561/4200
Processing 3581/4200
Processing 3601/4200
Processing 3621/4200
Processing 3641/4200


ZeroDivisionError: ignored

In [None]:
import numpy as np
rd = np.array(rd)
scd = np.array(scd)
lr = np.array(lr)
uw = np.array(uw)

In [None]:
print(round(np.mean(rd),2), round(np.std(rd),2))

0.55 0.37


In [None]:
print(round(np.mean(scd),2), round(np.std(scd),2))

5.26 8.77


In [None]:
print(round(np.mean(lr),2), round(np.std(lr),2))

3.01 1.44


In [None]:
print(round(np.mean(uw),2), round(np.std(uw),2))

0.59 0.22


## Baseline GPT2-large

In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

device = "cuda"
model_id = "gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
import torch
torch.manual_seed(0)

<torch._C.Generator at 0x7f49393684d0>

In [None]:
prompt = '(Below are ONLY lyrics from a rap song)\n'

In [None]:
rd = []
scd = []
lr = []
uw = []
i = 1
for first_line in first_lines[:1000]:
  if i % 20 == 1:
    print(f"Processing {i}/{len(first_lines)}")
  i += 1

  input_ids = tokenizer([prompt + first_line], return_tensors="pt").to(device).input_ids
  sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_length=100,
    top_k=50,
    pad_token_id=tokenizer.eos_token_id
  )

  o = tokenizer.decode(sample_output[0], skip_special_tokens=True)
  o = o.replace(prompt, '')
  rd.append(lyrics.get_rhyme_density(o))
  scd.append(lyrics.get_syllable_count_difference(o))
  lr.append(lyrics.get_longest_rhyme(o))
  uw.append(lyrics.get_unique_words(o))

Processing 1/4200
Processing 21/4200
Processing 41/4200
Processing 61/4200
Processing 81/4200
Processing 101/4200
Processing 121/4200
Processing 141/4200
Processing 161/4200
Processing 181/4200
Processing 201/4200


Input length of input_ids is 116, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 221/4200
Processing 241/4200


Input length of input_ids is 125, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 261/4200
Processing 281/4200
Processing 301/4200


Input length of input_ids is 105, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 321/4200


Input length of input_ids is 133, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 341/4200
Processing 361/4200
Processing 381/4200
Processing 401/4200
Processing 421/4200
Processing 441/4200
Processing 461/4200


Input length of input_ids is 120, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 481/4200
Processing 501/4200
Processing 521/4200
Processing 541/4200
Processing 561/4200
Processing 581/4200
Processing 601/4200
Processing 621/4200
Processing 641/4200
Processing 661/4200
Processing 681/4200
Processing 701/4200
Processing 721/4200
Processing 741/4200
Processing 761/4200
Processing 781/4200
Processing 801/4200
Processing 821/4200
Processing 841/4200
Processing 861/4200
Processing 881/4200
Processing 901/4200


Input length of input_ids is 131, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Processing 921/4200
Processing 941/4200
Processing 961/4200
Processing 981/4200


In [None]:
import numpy as np
print(round(np.mean(rd),2), round(np.std(rd),2))

0.56 0.32


In [None]:
print(round(np.mean(scd),2), round(np.std(scd),2))

3.19 7.19


In [None]:
print(round(np.mean(lr),2), round(np.std(lr),2))

3.04 1.18


In [None]:
print(round(np.mean(uw),2), round(np.std(uw),2))

0.58 0.19


# Qualitative

In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

device = "cuda"
model_id = "gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

In [None]:
prompt = '(Below are ONLY lyrics from a rap song)\n'
torch.manual_seed(100)
for start in first_lines[:10]:
  input_ids = tokenizer([prompt + start], return_tensors="pt").to(device).input_ids
  sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_length=100,
    top_k=50,
    pad_token_id=tokenizer.eos_token_id
  )

  o = tokenizer.decode(sample_output[0], skip_special_tokens=True)
  o = o.replace(prompt, '')
  print(o)
  print('=========')

godfathers in the house
and the house in me (above), I say it's too cold
I don't let nothing stay to me
the house in me, I can hear them calling in my chest
if I get lost on thursday (the day of the soul), he's there, right in my sights
the house in me, I'm in the mood for love
the house in me, so cold so
tell me what ya
what ya say nah
and he says hey
but the nigga tells me to wait
I ain't play nah
I ain't play wit dat nigga
I ain't play wit dat nigga
So I'm waiting
wait for tha sky to fall
wait for tha sky to fall
I don't gots no reason to keep
I don't gots no reason to keep

cut the music up
Just wait til the world stops
But wait till the world stops
I'm gonna tell you what's in my life,
How I made more than a hundred thousand dollars a week
I've been here before so many times
And my life is a thousand miles long
No no no, it's not like that,
I can't wait until I get out the door
Well, I'll be back
 calling all hustlers calling all players
calling all hustlers call
call all hustlers

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda"
model_checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_checkpoint).to(device)

In [None]:
prompt = '(Below are ONLY lyrics from a rap song)\n\n'
for start in first_lines[:10]:
  input_ids = tokenizer([prompt + start], return_tensors="pt").to(device).input_ids
  sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_length=100,
    top_k=50,
    pad_token_id=tokenizer.eos_token_id
  )

  o = tokenizer.decode(sample_output[0], skip_special_tokens=True)
  o = o.replace(prompt, '')
  print(o)
  print('=========')

godfathers in the house
The two have a heart
the two have never talked

a true friendship and love
the two have never been lonely
And all that he cared for
A love has never been lost
A romantic home in my back yard
My mother had a heart
The two have never had that love
A home in my back yard
This year the two have two great children. They're also
tell me what ya
Tell me what ya
Tell me what you know
Tell me who's going to get your money, please
Tell me what you aren't
Tell me how big
Tell me who's going to buy your shit
Tell me how big
Tell me what you haven't
Tell me who's going to buy your shit
Tell me who's going to buy your shit
Tell me who's looking at
cut the music up
with the lyrics written by Eric Clapton as well as the lyrics written by Michael Jackson in the band The Cure.

The lyrics were posted on this page to the internet in an attempt to explain why Eric Clapton, the son and mentor of the rap artists, is not doing his best. But if you know him, here's how.
The source of t

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained("distilgpt2", use_fast=True)
model = AutoModelForCausalLM.from_pretrained('mydata/MyDrive/mlp/model').to(device)

In [None]:
for start in first_lines[:10]:
  input_ids = tokenizer([start], return_tensors="pt").to(device).input_ids
  sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_length=100,
    top_k=50,
    pad_token_id=tokenizer.eos_token_id
  )

  o = tokenizer.decode(sample_output[0], skip_special_tokens=True)
  print(o)
  print('=========')

godfathers in the house
and one day he gets a message from my brother named ron brown
i know you feel my pain
and they say im coming with the plan to make it so easy by
im leaving but you know i gotta give you a chance
i say i aint going down down but im looking up to my man
so i keep it good to see you smile in the night
and i love your smile
and i cant stop my tears
and i
tell me what ya
what you wanna do when you can go to sleep
oh oh oh oh
you know what i want
how you wanna
what you wanna do when you can go to sleep
oh oh oh oh oh yeah yeahi am an artist with a mission that makes me feel like im trapped in a cage
cause all the time i got to go and i be alone
theres a way that everybody knows
and i wonder where im going because im stuck in a cage
cut the music up
the beats is loud im sure to send them higher
dont go to sleep youre in the mind of the lord
youre not the one to see the god within within
and all this noise around my head youre the chosen
youre not the one to see me fall