### Install Dependencies 

In [None]:
!pip install tokenizers
!pip install transformers[torch]
!pip install accelerate

### Kill the Kernel (uncomment to force-restart the runtime)

In [None]:
# import os
# os._exit(00)

### Check That a GPU Is Available

In [2]:
import torch
# Cell output is True when PyTorch reports that CUDA is available.
torch.cuda.is_available()

True

#### VERSION CHECK

In [None]:
import accelerate
import transformers

# Display the installed versions as a (transformers, accelerate) tuple.
transformers.__version__, accelerate.__version__

### Connect to GDrive 

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Let's make some folders

In [None]:
# !mkdir -p /content/drive/MyDrive/models/xmas/tokenizer

Converting Your CSV to JSON

In [None]:
import csv
import json
def convert_csv_to_json(csv_file_path,
                        json_file_path='../data/All_Playlists_Combined.json',
                        encoding='utf-8'):
    """Convert a CSV file with a header row into a JSON array of objects.

    Each CSV row becomes one JSON object whose keys are the column names
    taken from the header row (this is what ``csv.DictReader`` yields).

    Args:
        csv_file_path: Path of the CSV file to read.
        json_file_path: Path the JSON text is written to. Defaults to the
            location the rest of this notebook reads from.
        encoding: Text encoding used for both the CSV input and the JSON
            output. Explicit so results do not depend on the platform default.

    Returns:
        The JSON document as a string (indented by 4 spaces).

    Raises:
        OSError: If either file cannot be opened.
    """
    # Read every row up front so the input file is closed before writing.
    with open(csv_file_path, 'r', encoding=encoding) as csv_file:
        rows = list(csv.DictReader(csv_file))

    json_data = json.dumps(rows, indent=4)

    with open(json_file_path, 'w', encoding=encoding) as json_file:
        json_file.write(json_data)

    return json_data

# Location of the playlist CSV export to convert
csv_file_path = '../data/All Playlists Combined.csv'

# Run the conversion; the JSON text is kept in json_data for inspection
json_data = convert_csv_to_json(csv_file_path)

# Emit a status line followed by the JSON text on the next line
print("Conversion completed. JSON data:", json_data, sep="\n")

Helper Functions to normalize your data

In [17]:
import re
def remove_special_characters_and_spaces(input_string: str) -> str:
    """Return ``input_string`` with everything except ASCII letters and digits removed.

    Whitespace, punctuation, and non-ASCII characters are all dropped.
    """
    # Matches any run of characters outside a-z, A-Z, 0-9.
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]+')
    return non_alphanumeric.sub('', input_string)

def remove_special_characters(input_string: str) -> str:
    """Return ``input_string`` keeping only ASCII letters, digits, and whitespace.

    Unlike ``remove_special_characters_and_spaces``, whitespace is preserved;
    punctuation and non-ASCII characters are dropped.
    """
    # Matches any run of characters that is neither a-z, A-Z, 0-9 nor whitespace.
    not_alnum_or_space = re.compile(r'[^a-zA-Z0-9\s]+')
    return not_alnum_or_space.sub('', input_string)

### Proto Prompting 

In [18]:
from traitlets import traitlets
from dateutil.parser import parse
from datetime import datetime
import json
stats_file = "../data/All_Playlists_Combined.json"

# Load the song records first and close the input before writing anything.
# Each record must provide 'track_name' and 'lyrics' keys (a KeyError is raised otherwise).
with open(stats_file, 'r', encoding='utf-8') as json_file:
    xmas_songs = json.load(json_file)

# Build one prompt line per song: the title is stripped of special characters
# and newlines in the lyrics are replaced by spaces so each song stays on one line.
lines = []
for song in xmas_songs:
    title = remove_special_characters(song['track_name'])
    lyrics = re.sub(r'\n', ' ', song['lyrics'])
    lines.append(f"<s>##TITLE {title} ###LYRICS {lyrics} </s>\n")

# The `with` statement closes the output file; no explicit close() is needed.
with open('../data/All_Playlists_Combine.txt', 'w', encoding='utf-8') as text_file:
    text_file.writelines(lines)

#### Training a BPE Tokenizer 

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from pathlib import Path
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
# BPE tokenizer that maps unknown pieces to "<unk>" and pre-splits on whitespace.
tokenizer = Tokenizer(BPE(unk_token="<unk>"))

tokenizer.pre_tokenizer = Whitespace()

# Special tokens are registered with the trainer in this order.
trainer = BpeTrainer(special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>"
    ])

tokenizer.train(files=["../data/All_Playlists_Combine.txt"], trainer=trainer)

# Make sure the target directory exists before saving; saving into a
# missing directory fails, and nothing earlier in the notebook creates it.
Path("../models/xmas").mkdir(parents=True, exist_ok=True)
tokenizer.save("../models/xmas/tokenizer.json")

# Quick sanity check: show how a sample lyric line is tokenized.
output = tokenizer.encode("Sleigh bells ring are you listening")
print(output.tokens)

['Sleigh', 'bells', 'ring', 'are', 'you', 'listening']


In [7]:
from transformers import RobertaConfig

# Derive the vocabulary size from the saved tokenizer instead of hard-coding it
# (it was 12306 for the original corpus), so the model's embedding table always
# matches the tokenizer even after the tokenizer is retrained on different data.
from tokenizers import Tokenizer

config = RobertaConfig(
    vocab_size=Tokenizer.from_file("../models/xmas/tokenizer.json").get_vocab_size(),
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [8]:
from transformers import RobertaTokenizerFast

# Reload the saved tokenizer file through RobertaTokenizerFast.
# This rebinds `tokenizer`, replacing the raw `tokenizers.Tokenizer` object created during BPE training.
tokenizer = RobertaTokenizerFast(tokenizer_file="../models/xmas/tokenizer.json")

In [9]:
from transformers import RobertaForMaskedLM

# Build the masked-LM model directly from `config` (no from_pretrained call is involved).
model = RobertaForMaskedLM(config=config)
# Display the model's parameter count as the cell output.
model.num_parameters()

52979730

In [19]:
from transformers import LineByLineTextDataset

# Build the training dataset from the prompt text file, using the RoBERTa tokenizer and a block size of 128.
# NOTE(review): assumes each non-empty line of the file becomes one example truncated to block_size,
# and that this class is still available in the installed transformers version — confirm.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="../data/All_Playlists_Combine.txt",
    block_size=128,
)

In [11]:
from transformers import DataCollatorForLanguageModeling

# Collator configured for masked-language-model training (mlm=True) with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [25]:
from transformers import Trainer, TrainingArguments

# Training configuration. Checkpoints and the final model share "../models/xmas/".
# NOTE(review): the recorded training output in this notebook ends at epoch 100.0, which does not
# match num_train_epochs=500 — that output appears to come from an earlier configuration; confirm.
training_args = TrainingArguments(
    output_dir="../models/xmas/",
    overwrite_output_dir=True,
    num_train_epochs=500,
    per_device_train_batch_size=64,
    save_steps=500,  # checkpoint interval, in optimizer steps
    save_total_limit=2,  # cap on the number of checkpoints kept on disk
    prediction_loss_only=True,
)

# Rebinds `trainer` (previously the BpeTrainer used for tokenizer training) to the Hugging Face Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
# Resume from the most recent checkpoint found in args.output_dir ("../models/xmas/").
# A string argument must point at a specific checkpoint directory written by Trainer
# (e.g. ".../checkpoint-2000"), not at the output directory itself.
trainer.train(resume_from_checkpoint=True)

In [None]:
# Save the trainer's model into "../models/xmas/" (the same path used as output_dir).
trainer.save_model("../models/xmas/")

In [21]:
# Run the training loop with the arguments configured on `trainer`; no checkpoint is resumed here.
trainer.train()

Step,Training Loss
500,5.5848
1000,4.2596
1500,3.5162
2000,3.1308


TrainOutput(global_step=2000, training_loss=4.122851196289062, metrics={'train_runtime': 1460.1922, 'train_samples_per_second': 84.989, 'train_steps_per_second': 1.37, 'total_flos': 4110973722777600.0, 'train_loss': 4.122851196289062, 'epoch': 100.0})

In [None]:
# Resume from the most recent checkpoint found in args.output_dir ("../models/xmas/").
# A string argument must point at a specific checkpoint directory written by Trainer
# (e.g. ".../checkpoint-2000"), not at the output directory itself.
trainer.train(resume_from_checkpoint=True)

In [27]:
# Save the trainer's model into "../models/xmas/" (the same path used as output_dir).
trainer.save_model("../models/xmas/")

In [None]:
import json
import matplotlib.pyplot as plt
# Read the saved trainer state, then close the file before doing any plotting.
with open("../models/xmas/checkpoint-6000/trainer_state.json", "r") as f:
  data = json.load(f)

# Figure-wide styling applied through matplotlib's rcParams.
params = {'legend.fontsize': 'small',
          'figure.figsize': (15, 10),
          'axes.labelsize': 'x-small',
          'axes.titlesize':'x-small',
          'xtick.labelsize':'x-small',
          'ytick.labelsize':'x-small'}
plt.rcParams.update(params)

# Keep only the log entries that carry a 'loss' value.
loss_value = [tick['loss'] for tick in data['log_history'] if 'loss' in tick]

# The x axis is the position of each entry in the filtered log, not the training step.
plt.plot(range(len(loss_value)), loss_value, label='loss', alpha=0.15)
plt.xlabel('logged loss entry')
plt.ylabel('loss')
plt.title('Training loss')
plt.legend()  # the line is labelled, so actually draw the legend
plt.savefig("../models/xmas/loss.jpg")
plt.show()

In [None]:
from transformers import pipeline

# Both pipelines load the same saved checkpoint and share the notebook's tokenizer.
# NOTE(review): this checkpoint number (7800) differs from the one read by the loss-plot
# cell (6000) — confirm both directories exist for the run being inspected.
checkpoint_dir = "../models/xmas/checkpoint-7800"

# Masked-token prediction, returning the 20 highest-scoring candidates.
fill_mask = pipeline("fill-mask", model=checkpoint_dir, tokenizer=tokenizer, top_k=20)

# NOTE(review): the model was trained above as RobertaForMaskedLM, and the recorded output
# warns about `is_decoder=True` — confirm that text generation from this checkpoint is intended.
fill_text = pipeline("text-generation", model=checkpoint_dir, tokenizer=tokenizer)

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


In [None]:
# Prompt the text-generation pipeline using the same "##TITLE ... ###LYRICS" layout as the training lines.
fill_text("##TITLE Rockin Around The Christmas Tree ###LYRICS ")