In [1]:
import pandas as pd
import spacy
# spaCy pipeline used by LyricAnalysis and changeVerbs.
nlp=spacy.load('en_core_web_sm')
# Add a few extra tokens to spaCy's default stop-word set.
nlp.Defaults.stop_words |= {"ya","ai","™"}
from collections import Counter
from sylcofunc import sylco
import datasets
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer
from transformers import AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling
from transformers import pipeline
# Lyrics dataset; the code below reads its 'ALink', 'SName' and 'Lyric' columns.
df=pd.read_csv('data/lyrics-data.csv')

In [2]:
def getLyricsFromArtist(df,artist,string=True):
    """
    Collects the song lyrics of one artist from df.

    :param df: Song dataframe to use; its 'ALink' and 'Lyric' columns are read
    :param artist: Name of artist e.g "Bruno Mars"
    :param string: If True (default) all lyrics are joined into one
        space-separated string; if False the 'Lyric' column is returned as is
    :return: lyric (str when string is True, otherwise a pandas Series)
    """
    # Build the link key the rows are matched on, e.g. "Bruno Mars" -> "/bruno-mars/"
    artistlink='/'+artist.lower().replace(' ','-')+'/'
    lyric=df.loc[df['ALink']==artistlink,'Lyric']
    if not string:
        return lyric
    # A missing lyric is NaN (a float) and would make str.join raise TypeError,
    # so rows without text are skipped before joining.
    return ' '.join(lyric.dropna().astype(str).to_list())


## The Class for Lyric Analysis


In [3]:
class LyricAnalysis:
    """
    Runs the spaCy pipeline over a lyric and keeps its named entities and noun chunks.

    Attributes set by the constructor:
      lyric        -- the raw lyric text
      doc          -- the result of nlp(lyric)
      ents         -- list of (entity text, entity label)
      nounchunks   -- list of (stripped chunk text, tag of the chunk root); chunks whose
                      text equals an entity text, or whose lowercased text is a stop
                      word, are left out
      nostopchunks -- the noun chunks whose first token is not a stop word
      chunkDict    -- leading stop words -> list of (rest of the chunk, tag)
    """
    def __init__(self,lyric):
        self.lyric=lyric
        self.doc=nlp(lyric)
        #Named entities
        self.ents=[(ent.text, ent.label_) for ent in self.doc.ents]
        #NP chunks
        # The entity texts are collected once, not once per chunk
        entTexts={text for (text,_label) in self.ents}
        stopWords=nlp.Defaults.stop_words
        self.nounchunks=[
            (chunk.text.strip(), chunk.root.tag_)
            for chunk in self.doc.noun_chunks
            if chunk.text not in entTexts and chunk.text.lower() not in stopWords
        ]
        (self.nostopchunks,self.chunkDict)=self.chunksToDict()
    
    def getEnts(self,label):
        """
        Return the texts of all entities with the given label, in document order.

        :param label: entity label to filter on, e.g. 'PERSON'
        :return: list of entity texts (duplicates are kept)
        """
        return [ent for (ent,entlabel) in self.ents if entlabel==label]
    
    def chunksToDict(self):
        """
        Turns the list of nounchunks into a dictionary keyed by leading stopwords and a list of noun chunks that do not start with a stopword.
        
        For example:
        nounchunks=[('a town', 'NN'), ('a place', 'NN'), ('movie scenes', 'NNS'), ('Noise', 'NNP'), ('the streets', 'NNS')]
        
        returns:
        [('movie scenes', 'NNS'), ('Noise', 'NNP')]
        {'a': [('town', 'NN'), ('place', 'NN')], 'the': [('streets', 'NNS')]}

        A chunk that consists of stopwords only ends up in neither result.

        :return: (nostopchunks, chunkDict)
        """
        nostopchunks=[]
        chunkDict={}
        for chunk in self.nounchunks:
            chunkText,chunkTag=chunk
            # The chunk text is parsed again on its own to get token boundaries and stop flags
            doc=nlp(chunkText)
            hasStop=False
            for i,token in enumerate(doc):
                if token.is_stop:
                    hasStop=True
                    continue
                # First non-stop token: everything before it is the key, the remainder the value
                if hasStop:
                    chunkDict.setdefault(str(doc[:i]),[]).append((str(doc[i:]),chunkTag))
                break
            if not hasStop:
                nostopchunks.append(chunk)
        return nostopchunks,chunkDict

        


## Function for swap of entities


In [4]:
def findRhyme(word,possibleswaps):
    """
    Return the first word from possibleswaps that has the same number of syllables as word.

    Despite the name no rhyme is checked: candidates are compared only by the
    syllable count that sylco reports.

    :param word: word to find a replacement for
    :param possibleswaps: candidate words, tried in the given order
    :return: the first candidate with an equal syllable count, or word itself when none fits
    """
    # The syllable count of word is the same for every candidate, so it is computed once
    target=sylco(word)
    return next((possible for possible in possibleswaps if sylco(possible)==target),word)

In [5]:
def swapEnts(fromSong,toArtist,entLabels):
    """
    Replaces the entities of fromSong with entities of toArtist, label by label.

    For every label the entities of both sides are ordered from most to least
    frequent. Each song entity is then replaced, everywhere in the lyric, by the
    replacement findRhyme picks among the artist entities that were not handed
    out before; findRhyme returns the entity itself when nothing fits.

    :param fromSong: analysis of the song; its lyric and getEnts(label) are used
    :param toArtist: analysis of the artist; its getEnts(label) is used
    :param entLabels: list of entity labels to make the swap with
    :return: (str of new song, list of (old entity, new entity))
    """
    newLyric=fromSong.lyric
    swaps=[]
    taken=[]
    for label in entLabels:
        # most_common() yields (text, count) pairs by descending count; only the text is kept
        songEnts=[text for (text,_count) in Counter(fromSong.getEnts(label)).most_common()]
        artistEnts=[text for (text,_count) in Counter(toArtist.getEnts(label)).most_common()]
        for oldEnt in songEnts:
            stillFree=[candidate for candidate in artistEnts if candidate not in taken]
            replacement=findRhyme(oldEnt,stillFree)
            newLyric=newLyric.replace(oldEnt,replacement)
            taken.append(replacement)
            swaps.append((oldEnt,replacement))
    return (newLyric,swaps)

## Function for swap of noun phrases


In [7]:
# Finds a word with equal syllable count that also has the same tag
def findMatch(word,tag,list):
    """
    Return the first candidate that has the same tag and the same syllable count as word.

    :param word: text to find a replacement for
    :param tag: tag the replacement must carry
    :param list: (word, tag) candidates, tried in order (the parameter name
        shadows the builtin but is kept for existing callers)
    :return: the matching (word, tag) pair, or the input (word, tag) when nothing matches
    """
    # Loop-invariant, so computed once instead of once per candidate
    target=sylco(word)
    for (lword, ltag) in list:
        if ltag==tag and sylco(lword)==target:
            return lword, ltag
    # No match: hand the input back unchanged. The loop variable must not be
    # used here, it does not exist when the candidate list is empty.
    return word, tag

In [6]:
def swapNpsWithStop(fromSong,toArtist):
    """
    Swaps the noun chunks of the song that start with stop words.

    Only chunks of the artist that start with the same stop words are used as
    replacements, so "a town" can become "a fan" but never "the fan". Chunks on
    both sides are tried from most to least frequent and every artist chunk is
    used at most once.

    :param fromSong: analysis of the song; its lyric and chunkDict are read, not modified
    :param toArtist: analysis of the artist; its chunkDict is read
    :return: (new lyric, list of (old phrase, new phrase)); a phrase that found
        no replacement is listed as mapping to itself
    """
    changes=[]
    output=fromSong.lyric
    for key,songChunks in fromSong.chunkDict.items():
        # Stop-word prefixes the artist never uses cannot be swapped. They are
        # skipped instead of popped so that fromSong.chunkDict stays intact.
        if key not in toArtist.chunkDict:
            continue
        #sort by freq
        cf=[obj[0] for obj in Counter(songChunks).most_common()]
        ct=[obj[0] for obj in Counter(toArtist.chunkDict[key]).most_common()]
        for (word,tag) in cf:
            if ct:
                newWord, newTag=findMatch(word,tag,ct)
            else:
                # All candidates are used up: the phrase stays as it is
                newWord, newTag=word,tag
            # A real match is a candidate carrying the requested tag. It is taken
            # out of the pool here, before the newline clean-up below, because the
            # cleaned text would no longer be found in ct.
            if newTag==tag and (newWord,newTag) in ct:
                ct.remove((newWord,newTag))
            newWord=newWord.replace('\n',' ')
            output=output.replace(f'{key} {word}',f'{key} {newWord}')
            changes.append((f'{key} {word}',f'{key} {newWord}'))
    return output,changes

In [8]:
def swapNps(fromSong,toArtist):
    """
    Swaps the noun chunks of the song that do not start with a stop word.

    Chunks on both sides are tried from most to least frequent; findMatch picks
    the replacement and every artist chunk is used at most once. The replacement
    is applied to every occurrence of the chunk text in the lyric.

    :param fromSong: analysis of the song; its lyric and nostopchunks are read
    :param toArtist: analysis of the artist; its nostopchunks are read
    :return: (new lyric, list of (old chunk, new chunk)); a chunk that found no
        replacement is listed as mapping to itself
    """
    changes=[]
    output=fromSong.lyric
    #sort by freq
    cf=[chunk[0] for chunk in Counter(fromSong.nostopchunks).most_common()]
    ct=[chunk[0] for chunk in Counter(toArtist.nostopchunks).most_common()]
    for (word,tag) in cf:
        if ct:
            newWord, newTag=findMatch(word,tag,ct)
        else:
            # No candidates (left): the chunk stays as it is
            newWord, newTag=word,tag
        # A real match is a candidate carrying the requested tag. It is taken out
        # of the pool here, before the newline clean-up below, because the cleaned
        # text would no longer be found in ct.
        if newTag==tag and (newWord,newTag) in ct:
            ct.remove((newWord,newTag))
        newWord=newWord.replace('\n',' ')
        output=output.replace(word,newWord)
        changes.append((word,newWord))
    return output,changes

## The swap of verbphrases


## Preparing the dataset for model


In [9]:
def getPhrasesFromArtist(artist):
    """
    Collects the lyric lines of one artist from the module-level dataframe df.

    :param artist: Name of artist e.g "Bruno Mars"
    :return: list of lines in songs of artist
    """
    # Build the link key the rows are matched on, e.g. "Bruno Mars" -> "/bruno-mars/"
    artistlink='/'+artist.lower().replace(' ','-')+'/'
    lyric=df.loc[df['ALink']==artistlink,'Lyric']
    # A missing lyric is NaN (a float without .split), so those rows are dropped first
    return [phrase for song in lyric.dropna() for phrase in str(song).split('\n')]

In [10]:
# block_size = tokenizer.model_max_length
# Chunk length group_texts uses when it cuts the concatenated lists into blocks.
block_size = 128

In [11]:
#Code from https://huggingface.co/docs/transformers/tasks/language_modeling#preprocess
def group_texts(examples):
    """
    Concatenate every column of a batch and cut it into chunks of block_size.

    :param examples: mapping of column name -> list of lists (one inner list per example);
        must contain "input_ids"
    :return: dict with the same columns, each a list of block_size-long lists, plus
        "labels" as a copy of the "input_ids" chunks
    """
    from itertools import chain

    # Concatenate all texts. chain runs in linear time, sum(lists, []) copies the
    # growing list on every step.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

## Masked language modeling


In [12]:
from huggingface_hub import notebook_login
# Hub account name; getModel uses it as the namespace of the fine-tuned model repositories.
username='davidlekve'
# Shows the interactive Hub login widget (the VBox output of this cell).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
# Base checkpoint that trainNewModel fine-tunes for an artist.
model_checkpoint = "distilroberta-base"
# Tokenizer of the base checkpoint; used by tokenize_function, the data collator and the fill-mask pipeline.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [14]:
def tokenize_function(examples):
    """
    Tokenize the "text" column of a batch with the module-level tokenizer.

    :param examples: batch mapping that has a "text" entry
    :return: whatever the tokenizer returns for those texts
    """
    texts = examples["text"]
    return tokenizer(texts)

In [15]:
def trainNewModel(artist):
    """
    Fine-tunes the base masked language model on the lyric lines of one artist and pushes it to the Hub.

    The lyric lines are split 90/10 into train and test, tokenized, grouped into
    block_size chunks by group_texts, and used to train AutoModelForMaskedLM
    loaded from model_checkpoint, with 15% of the tokens masked. The output
    directory is named "<checkpoint>-finetuned-<artist in lower case with dashes>".

    :param artist: Name of artist e.g "Bruno Mars"
    :return: None
    """
    # getPhrasesFromArtist reads the module-level df itself and takes only the artist name
    lyric={'text': getPhrasesFromArtist(artist)}
    ds=datasets.Dataset.from_dict(lyric)
    ds=ds.train_test_split(test_size=0.1)

    tokenized_datasets = ds.map(tokenize_function, batched=True, num_proc=1, remove_columns=["text"])

    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        batch_size=1000,
        num_proc=1,
    )

    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

    model_name = model_checkpoint.split("/")[-1]

    # num_train_epochs is not set, so the TrainingArguments default applies.
    # NOTE(review): newer transformers releases name the first keyword below
    # eval_strategy - confirm against the installed version.
    training_args = TrainingArguments(
        f"{model_name}-finetuned-{artist.lower().replace(' ','-')}",
        evaluation_strategy = "epoch",
        learning_rate=2e-5,
        weight_decay=0.01,
        push_to_hub=True,
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=lm_datasets["train"],
        eval_dataset=lm_datasets["test"],
        data_collator=data_collator,
    )

    trainer.train()
    trainer.push_to_hub()


## Changing the verbs

In [16]:
#Function that changes a word. Replace is not used to minimize changing the middle of a word.
def replaceWord(doc,word,newWord):
    """
    Rebuild the text of doc with the first token equal to word swapped for newWord.

    :param doc: iterable of tokens that offer text, text_with_ws and whitespace_
    :param word: token text to look for (exact match)
    :param newWord: text to put in its place
    :return: the rebuilt text; later occurrences of word are left untouched
    """
    pieces=[]
    swapped=False
    for token in doc:
        if token.text==word and not swapped:
            # Keep the whitespace that followed the replaced token
            pieces.append(newWord+token.whitespace_)
            swapped=True
        else:
            pieces.append(token.text_with_ws)
    return "".join(pieces)

In [17]:
def changeVerbs(song,generator):
    """
    Replaces the first non-stop-word verb of every line with a prediction of the generator.

    The verb is masked with '<mask>' and the line is handed to generator. Among
    the predicted tokens, findRhyme picks the replacement; if it picks none of
    them, the line is kept unchanged.

    :param song: lyric text; lines are separated by newline characters
    :param generator: callable that takes the masked line and returns a list of
        dicts with 'token_str' and 'sequence' entries
    :return: (new lyric with a newline after every line, list of (old verb, new verb))
    """
    verbtags=("VB","VBD","VBG","VBN","VBP","VBZ")
    stopWords=nlp.Defaults.stop_words
    newLines=[]
    changes=[]
    for phrase in song.split('\n'):
        doc=nlp(phrase)
        # First token tagged as a verb that is not a stop word, if there is one
        verb=next(
            (word.text for word in doc if word.tag_ in verbtags and word.text.lower() not in stopWords),
            None,
        )
        newphrase=phrase
        if verb is not None:
            modified=replaceWord(doc,verb,'<mask>')
            results=generator(modified)
            wordresults=[result['token_str'].strip() for result in results]
            newword=findRhyme(verb,wordresults)
            # findRhyme hands the verb itself back when no prediction fits; the
            # line is then only rewritten if the verb happens to be a prediction too.
            if newword in wordresults:
                index=wordresults.index(newword)
                newphrase=results[index]['sequence'].strip()
                changes.append((verb,newword))
        newLines.append(newphrase+'\n')
    return ''.join(newLines),changes
    

## Generating new Lyrics

In [18]:
#Function that runs through all the necessary code to change the song.
def generate(artistsong):
    """
    Runs the entity, noun-phrase and verb swaps and stores the results on artistsong.

    Sets:
      entresults  -- the (lyric, changes) pair returned by swapEnts
      nounresutls -- (lyric after entity and noun swaps, the noun changes)
      verbresult  -- the (lyric, changes) pair returned by changeVerbs
      changes     -- all recorded (old, new) pairs in the order they were made

    :param artistsong: object offering songAnalysis, artistAnalysis, entlabels and generator
    :return: None
    """
    song=artistsong.songAnalysis
    artist=artistsong.artistAnalysis

    entsSwapped=swapEnts(song,artist,artistsong.entlabels)
    artistsong.entresults=entsSwapped
    entChanges=entsSwapped[1]
    changes=list(entChanges)

    changes+=swapNps(song,artist)[1]

    # The lyric with the stop-word chunks swapped is the base text; the entity
    # and plain noun-chunk changes collected so far are applied on top of it.
    lyric,stopChanges=swapNpsWithStop(song,artist)
    for (old,new) in changes:
        lyric=lyric.replace(old,new)

    changes.extend(stopChanges)
    nounChanges=[change for change in changes if change not in entChanges]
    artistsong.nounresutls=(lyric,nounChanges)

    verbSwapped=changeVerbs(lyric,artistsong.generator)
    artistsong.verbresult=verbSwapped
    changes.extend(verbSwapped[1])
    artistsong.changes=changes


In [19]:
#Object that stores all the information
class GeneratedLyricsFromArtist:
    """
    Bundles everything needed to rewrite one song in the style of one artist.

    The constructor analyses all lyrics of the artist and the lyric of the song
    and obtains a fill-mask pipeline for the artist. generate() later fills in
    entresults, nounresutls, verbresult and changes.
    """
    def __init__(self,artist,song):
        """
        :param artist: Name of artist e.g "Bruno Mars"
        :param song: song title as it appears in the 'SName' column of df
        """
        self.artistName=artist
        self.artistLyric=getLyricsFromArtist(df,self.artistName)
        self.artistAnalysis=LyricAnalysis(self.artistLyric)
        self.songTitle=song
        # Entity labels that swapEnts is run with
        self.entlabels=['PERSON','FAC','ORG','GPE','LOC']
        self.songAnalysis=self.getSongAnalysis()
        self.generator=self.getModel()
        self.entresults=None
        self.nounresutls=None
        self.verbresult=None
        self.changes=None

    def getSongAnalysis(self):
        """
        Analyse the lyric of the first row of df whose 'SName' equals the song title.

        :return: LyricAnalysis of that lyric
        :raises IndexError: if no row has this title
        """
        matches=df.loc[df['SName']==self.songTitle,'Lyric']
        if matches.empty:
            raise IndexError(f"No song titled {self.songTitle!r} in the lyrics dataframe")
        return LyricAnalysis(matches.iloc[0])

    def _loadPipeline(self):
        """Load the fine-tuned model of the artist and wrap it in a fill-mask pipeline."""
        model=AutoModelForMaskedLM.from_pretrained(f"{username}/distilroberta-base-finetuned-{self.artistName.lower().replace(' ','-')}")
        return pipeline('fill-mask', model = model, tokenizer=tokenizer)

    def getModel(self):
        """
        Return the fill-mask pipeline of the artist, training the model first if it cannot be loaded.

        :return: fill-mask pipeline using the fine-tuned model and the base tokenizer
        """
        try:
            return self._loadPipeline()
        except OSError:
            # Only a failed load (reported by from_pretrained as OSError) starts
            # a training run; any other error is a real problem and propagates.
            trainNewModel(self.artistName)
            return self._loadPipeline()

## The running of Code:

Some possible artists are:

Kendrick Lamar,
The Beatles,
Bruno Mars,
Billy Ray Cyrus

Some possible songs are:
'Empire State Of Mind (part. Ii)','Bohemian Rhapsody','Hotline Bling'

To get the results, create a `GeneratedLyricsFromArtist` object with a selected artist and song.
Run `generate` with the object as its parameter.
Then print the different results stored on the object (`entresults`, `nounresutls`, `verbresult`, `changes`).

`object.verbresult[0]` is the final rewritten lyric of the song.

An example is:

In [21]:
# Analyse Kendrick Lamar's lyrics and the chosen song, and load (or train) the artist's fill-mask model.
kendricklamar=GeneratedLyricsFromArtist('Kendrick Lamar','Empire State Of Mind (part. Ii)')

In [22]:
# Run the entity, noun-phrase and verb swaps; the results are stored on the object.
generate(kendricklamar)

In [50]:
# Lyric after all three swap stages (entities, noun phrases, verbs).
kendricklamar.verbresult[0]

"money, fuckin (2x)\n\ngrew up in a fan,\nThat is famous as a world of picket signs\nBitch is always loud\nThere are people all around\nAnd the streets are mean\nIf I could make it here\nI could make it anywhere\nThat's what they say\nget my vibe in hoes\nOr my name in Things going down Metro\n\nEven if it ain't all it seems\nI'm a traffic jam of bitches\nBaby, I'm from\n\n(Compton)\nfuckin, everybody where bitches are made of\nThere's nothing you can't do\nNow you're in fuckin\nThese streets will make you feel Blow Hurt people will inspire you\ndo it for fuckin, fuckin, fuckin\n\nOn the city, there ain't never a nigga\nproblems, so hard\nSuch a melting pot on the mirror selling life\nBabies pray to God\nGod a swimming pool\ntake me down from Nile to the rough Compton\nSomeone was tonight with a woman\nFor more than from an empty fridge\n\nI'm not to make it by any means\nI'm a traffic jam of bitches\nBaby, I'm from\n\n(Compton)\nfuckin, everybody where bitches are made of\nThere's not

In [51]:
# Every (original, replacement) pair recorded by generate.
kendricklamar.changes

[('Grew', 'don'),
 ('Broadway', 'Metro'),
 ('the Brooklyn Bridge\nSomeone', 'Long Beach Boulevard\nFlagging'),
 ('Chorus', 'Compton'),
 ('New York', 'fuckin'),
 ('Harlem', 'Nile'),
 ('dreams', 'bitches'),
 ('concrete jungle', 'everybody'),
 ('brand', 'Blow'),
 ('new\nBig lights', 'Hurt people'),
 ('Oooh oooh', 'money'),
 ('movie scenes', 'picket signs'),
 ('Noise', 'Bitch'),
 ('sirens', 'people'),
 ('lights', 'hoes'),
 ('marquees', 'Things'),
 ('Ladies', 'problems'),
 ('rock', 'life'),
 ('Preachers', 'Babies'),
 ('Hail', 'God'),
 ('big dreams', 'dollars'),
 ('Everybody', 'communication'),
 ('a pocketful', 'a traffic jam'),
 ('a town', 'a fan'),
 ('a place', 'a world'),
 ('a curfew', 'a nigga'),
 ('a gypsy cab', 'a swimming pool'),
 ('a hunger', 'a woman'),
 ('the air', 'the world'),
 ('the streets', 'the streets'),
 ('the avenue', 'the city'),
 ('the corner', 'the mirror'),
 ('the Brooklyn Bridge', 'the rough Compton'),
 ('the big city\nStreet lights', 'the illuminati'),
 ('the world',

In [28]:
# Same song, rewritten with Billy Ray Cyrus as the target artist.
billyray=GeneratedLyricsFromArtist('Billy Ray Cyrus','Empire State Of Mind (part. Ii)')

In [29]:
# Run all swap stages for the Billy Ray Cyrus version.
generate(billyray)

In [30]:
# Same artist, applied to a different song.
billyrayHotline=GeneratedLyricsFromArtist('Billy Ray Cyrus','Hotline Bling')

In [31]:
# Run all swap stages for 'Hotline Bling'.
generate(billyrayHotline)

In [44]:
# Entity swaps only: the list of (original entity, replacement) pairs from swapEnts.
kendricklamar.entresults[1]

[('Grew', 'don'),
 ('Broadway', 'Metro'),
 ('the Brooklyn Bridge\nSomeone', 'Long Beach Boulevard\nFlagging'),
 ('Chorus', 'Compton'),
 ('New York', 'fuckin'),
 ('Harlem', 'Nile')]

In [49]:
# Entity swaps for 'Hotline Bling'; an empty list means swapEnts recorded no changes for the configured labels.
billyrayHotline.entresults[1]

[]