In [1]:
%cd ..
%load_ext autoreload
%autoreload 2

/home/dasol/userdata/AI_music_generation_challenge


In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel
from pathlib import Path
from pyabc import pyabc
from tqdm import tqdm

In [3]:
dir_path = "dataset/the_session/"
dir_path = Path(dir_path)
abc_list = list(dir_path.rglob('*.abc')) + list(dir_path.rglob('*.ABC'))

In [4]:
from urllib import request
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
title_dict = defaultdict(list)

title_list = []
for page in tqdm(range(1, 1963)):
    url = 'https://thesession.org/tunes/search?type=&mode=&q=&page=' + f'{page}'
    request = requests.get(url)
    soup = BeautifulSoup(request.content, features='html.parser')
    request.close()

    find_title = soup.find('ol', attrs={'class':'manifest-inventory split'})
    for tune in find_title.findAll('li', attrs={'class':'manifest-item'}):
        title = tune.find('a', attrs={'class':'manifest-item-title'}).text
        title_dict['titles'].append(title)

100%|██████████| 1962/1962 [1:22:17<00:00,  2.52s/it]


In [5]:
torch.save(title_dict, '/home/dasol/userdata/AI_music_generation_challenge/dasol_notebooks/session_title.pt')

In [12]:
session_pt = torch.load('/home/dasol/userdata/AI_music_generation_challenge/dasol_notebooks/session_title.pt')
session_titles = list(session_pt['titles'])
session_titles[:10]

['Drowsy Maggie',
 'The Kesh',
 'Cooley’s',
 'The Butterfly',
 'Morrison’s',
 'The Silver Spear',
 'The Maid Behind The Bar',
 'The Banshee',
 'Banish Misfortune',
 'Out On The Ocean']

In [13]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').cuda()
model.resize_token_embeddings(len(tokenizer))

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Embedding(50259, 1024)

In [45]:
session_titles

['Drowsy Maggie',
 'The Kesh',
 'Cooley’s',
 'The Butterfly',
 'Morrison’s',
 'The Silver Spear',
 'The Maid Behind The Bar',
 'The Banshee',
 'Banish Misfortune',
 'Out On The Ocean',
 'The Wind That Shakes The Barley',
 'Tam Lin',
 'The Musical Priest',
 'The Connaughtman’s Rambles',
 'The Rights Of Man',
 'The Blarney Pilgrim',
 'King Of The Fairies',
 'The Harvest Home',
 'The Kid On The Mountain',
 'The Gravel Walks',
 'The Swallowtail',
 'The Mason’s Apron',
 'The Sally Gardens',
 'The Lilting Banshee',
 'Sí Beag Sí Mór',
 'The Lark In The Morning',
 'The Mountain Road',
 'Saint Anne’s',
 'The Boys Of Bluehill',
 'The Cliffs Of Moher',
 'Calliope House',
 'The Star Of Munster',
 'The Merry Blacksmith',
 'The Cup Of Tea',
 'The Congress',
 'Off To California',
 'Tripping Up The Stairs',
 'The Morning Dew',
 'Toss The Feathers',
 'The Bucks Of Oranmore',
 'Inisheer',
 'The Irish Washerwoman',
 'The Wise Maid',
 'John Ryan’s',
 'Miss McLeod’s',
 'A Fig For A Kiss',
 'The Pigeon On T

In [15]:
class TTLDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [27]:
max_length = max([len(tokenizer.encode(title)) for title in session_titles])
dataset = TTLDataset(session_titles, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [29]:
len(train_dataset), max_length

(17658, 29)

In [30]:
tokenizer.decode(train_dataset[3][0])

'<|startoftext|> Grant’s Rant<|endoftext|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|>'

In [31]:
training_args = TrainingArguments(output_dir='./results', num_train_epochs=5, logging_steps=100, save_steps=5000,
                                  per_device_train_batch_size=10, per_device_eval_batch_size=1,
                                  warmup_steps=10, weight_decay=0.05, logging_dir='./logs', report_to = 'none')

In [32]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 17658
  Num Epochs = 5
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 8830


Step,Training Loss
100,2.119
200,0.9451
300,0.8941
400,0.8734
500,0.8704
600,0.865
700,0.8711
800,0.876
900,0.857
1000,0.8415


Saving model checkpoint to ./results/checkpoint-5000
Configuration saved in ./results/checkpoint-5000/config.json
Model weights saved in ./results/checkpoint-5000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=8830, training_loss=0.6173514020537466, metrics={'train_runtime': 1007.1254, 'train_samples_per_second': 87.665, 'train_steps_per_second': 8.768, 'total_flos': 4644247158743040.0, 'train_loss': 0.6173514020537466, 'epoch': 5.0})

In [33]:
generated = tokenizer("<|startoftext|> ", return_tensors="pt").input_ids.cuda()

sample_outputs = model.generate(generated, do_sample=True, top_k=50, 
                                max_length=30, top_p=0.95, temperature=1.9, num_return_sequences=100)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [34]:
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

0:  igeonian
1:  ’Widowinden
2:  ’Crow Row Doth Arrive
3: ????? O �er Bread
4:  ier M. Holland
5:  ____________
6:  icky Irish
7:    W Geordie Y Fuwch
8:  ______________ Side 1 ______________ Side 2
9:  iddow
10:  ́Wearmouth Cottage
11:  eryanshet  Dhomhone
12:  xtreme étaitienne
13:  Âñeedà Sgé Rámsés
14:  _____________________________________________________
15:  ____ Munch
16:  ’Uvin Bair
17:  ich Bin Jeh Sion Isengard Isogen
18:  eryn Terracotta Hilltop
19:  ét Fête Aux Briandntistriens
20:   Sliabh Dave
21:   Hungarian Cottage
22:  ’n Coleman A
23:   Errigal Eadhte Gledherty
24:  Â Stenson’s
25:   Søren Kier
26:  ’Toir I bEdgar
27:    Mrs Flanagan Of Tulse Ten Minutes ’ Seven
28: ???? A Tribute From David
29:  _____, But I Can’t Dance
30: ?????
31:  ́ireann Smithe An Culyek An Øllagon
32:  ********, A Child’s Welcome From America
33:  ________ The Farmer Boy
34:  était Drummond
35:  ________________
36:   Dn y Wuallwynyn O Briyn
37:    The Reelin Horn
38:  _____ North
39:  ige A C

In [43]:
prompt_text = ' '

encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt").cuda()
output_sequences = model.generate(
    input_ids=encoded_prompt,
    max_length=max_length,
    temperature=0.7,
    top_k=0,
    top_p=0.9,
    repetition_penalty=1.0,
    do_sample=True,
    num_return_sequences=100,
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [44]:
for i, output in enumerate(output_sequences):
    print("{}: {}".format(i, tokenizer.decode(output, skip_special_tokens=True)))

0:  The Blue Shutters
1:  Gan Ainm
2:  The Purple Wisent
3:  The Blue Ball
4:  The Kilkenny Girl
5:  Reel For Angus
6:  The Trip To Kilmurry
7:  Reel Des Ti-Minous
8:  The Hen’s March To Winter
9:  Molly From Monro
10:  The Fairy’s Dream
11:  The Ballymun Regatta
12:  Saratoga
13:  Trip To Kilkenny
14:  The Post Road To Boston
15:  The Doon-Hampstead
16:  Sylvain Barou’s
17:  The Hags Money
18:  The Wishing Tree
19:  The Road To Sligo
20:  O’Connor’s
21:  Maggie’s
22:  The Old Grey Goose
23:  The New Brig Of Glasgow
24:  The Piper’s Picnic
25:  Byrne’s
26:  The Crooked Stovepipe
27:  The Barnacle
28:  The Galway Farmer
29:  The Rat In The Kitchen
30:  Bwlch Llanberis
31:  The Monoceros
32:  Hommage A Guy Thomas
33:  The Old Woman From The Glen
34:  The Rambuck’s
35:  The Lads Of Alnwick
36:  The Fox In The Henhouse
37:  The Maids Of Ardagh
38:  The Mighty Turtledove
39:  The Whirlpool
40:  The Streamstown
41:  A Tune For You, Dear One
42:  The New York
43:  The Merry Maids Of Fife
44: 