<a href="https://colab.research.google.com/github/d-sanjukta/Lyrics-Generator/blob/main/3_Artist_specific_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## <b>3. <u>FINE TUNING TRAINED LLM (ARTIST SPECIFIC TRAINING)</u></b>

* We will fine-tune the genre-based language models on five selected artists for each specific genre.
* This process aims to capture the unique style and characteristics of each artist, enabling the generation of content that reflects their artistic expression and creativity.


______________________________________________________________________________________________________________

In [None]:
# !pip install transformers
# !pip install datasets
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel,pipeline
import re

In [None]:
# Mount Google Drive at /content/drive (Colab only); every dataset and
# saved-weights path used further down lives under /content/drive/MyDrive.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


### FUNCTIONS


In [None]:
## FUNCTION TO PRE_PROCESS THE DATA :

def pre_processing_lyrics(dataset):
  '''
  Clean the lyrics of a dataset and mark line breaks and song ends.

  INPUT : Dataset (anything indexable by column name, e.g. a pandas DataFrame)
          holding the lyrics in a 'lyrics' column or, failing that, a 'Lyric' column.
          Other features (genre, artist, title, views, ...) are ignored.
  Output: List with one processed string per input lyric:
            * characters other than word characters, whitespace and apostrophes are removed,
            * words that do not start with a run of letters ending on a word boundary
              (e.g. words containing digits or underscores) are dropped,
            * a newline character is put in front of every word starting with a capital letter,
            * ' </s>' is appended as the end-of-song marker.
          A lyric with no usable words is kept as just the end-of-song marker.
  Raises: KeyError if neither the 'lyrics' nor the 'Lyric' column exists.
  '''
  # Extract the lyrics column; the datasets use two different column names.
  try:
    songs = list(dataset['lyrics'])
  except KeyError:
    songs = list(dataset['Lyric'])

  # Matches a word that starts with letters only (no digits / underscores).
  pattern = re.compile(r'\b[^\W\d_]+\b')

  final_lyrics = []
  for lyric in songs:
    # Clean each comma-separated piece on its own, then glue the pieces back with ','.
    cleaned_splits = []
    for split in str(lyric).split(','):
      cleaned_split = re.sub(r'[^\w\s\']', '', split)
      cleaned_splits.append(' '.join(word for word in cleaned_split.split() if pattern.match(word)))
    song = ','.join(cleaned_splits)

    # Apply the 'next line' token at every capitalised word.
    # word[:1] (instead of word[0]) is safe for the empty string that results
    # when a lyric has no usable words at all.
    words = ['\n' + word if word[:1].isupper() else word
             for word in song.replace('  ', '').split(' ')]

    # NOTE(review): '</s>' is used as the end-of-sequence marker here; GPT-2's own
    # end-of-text token is '<|endoftext|>' - confirm '</s>' is what the models expect.
    final_lyrics.append(' '.join(words + [' </s>']))

  print (f"Number of Total Processed Lyrics : {len(final_lyrics)}")

  return final_lyrics

##############################################################################################################################################################################

# Function to make chunks of equal number of words: default = 128 words/tokens

def create_chunks(tokens , window = 128, stride = 20):
  '''
  Cut a token sequence into overlapping chunks of equal length.

  Input :
    tokens : sliceable sequence of tokens (list, numpy array, ...).
    window : number of tokens in every chunk.
    stride : offset between the starts of two consecutive chunks.
  Output: list of the slices tokens[start:start + window] for start = 0, stride,
          2 * stride, ... Only complete chunks are returned, including the one
          that ends exactly on the last token; a sequence shorter than `window`
          gives an empty list.
  Raises: ValueError if `window` or `stride` is smaller than 1
          (a stride of 0 would never advance).
  '''
  if window < 1 or stride < 1:
    raise ValueError(f'window and stride must be >= 1 (got window={window}, stride={stride})')

  # Last index at which a full window still fits; negative when tokens is too short,
  # in which case the range below is empty.
  last_start = len(tokens) - window

  return [tokens[start:start + window] for start in range(0, last_start + 1, stride)]

# Define the  function to take inputs and generate labels and masks.
def create_data(inputs):
    """Turn one chunk of token ids into a (features, labels) training pair.

    The features are every token but the last, together with an all-ones
    attention mask of the same shape; the labels are the same chunk shifted
    left by one position (every token but the first).
    """
    context = inputs[:-1]
    next_tokens = inputs[1:]

    features = dict(input_ids=context, attention_mask=tf.ones_like(context))
    return features, next_tokens

##################################################################################################################################################################

## FUNCTION TO PREPARE THE 'TENSORFLOW' DATASET:

def prepare_dataset(lyrics, model = 'gpt2', n_words = 128):
  '''
  Build the batched TensorFlow training and validation datasets.

  The lyrics are split 80/20 in the order given (no shuffling before the split).
  Each split is tokenized with the pre-trained tokenizer, concatenated into one
  token stream and cut into overlapping chunks by `create_chunks`; `create_data`
  then turns every chunk into an (inputs, labels) pair.

  Input :
    lyrics  : list of processed lyric strings.
    model   : name/path handed to GPT2Tokenizer.from_pretrained.
    n_words : number of tokens per chunk (the window passed to create_chunks).
  Output: (train_dataset, val_dataset) - tf.data.Dataset objects in batches of 8;
          the training set is shuffled and capped at 10000 chunks, the validation
          set is capped at 500 chunks.
  Raises: ValueError if a split contains no tokens or is too short for one chunk.
  '''

  # Split the lyrics into train and validation sets
  val_split = 0.2  # 20% of the data will be used for validation

  split_index = int(len(lyrics) * (1 - val_split))
  train_lyrics = lyrics[:split_index]
  val_lyrics = lyrics[split_index:]

  # Load the pre-trained GPT-2 tokenizer
  tokenizer = GPT2Tokenizer.from_pretrained(model)

  def encode_and_chunk(texts, split_name):
    # Tokenize every lyric exactly once and drop those that give no tokens.
    # The tokenizer may warn that a whole song exceeds the model's maximum length;
    # the stream is re-cut into n_words-token chunks before it reaches the model.
    encoded = [ids for ids in (tokenizer.encode(text) for text in texts) if ids]
    if not encoded:
      raise ValueError(f'No tokens available for the {split_name} split.')

    chunks = create_chunks(np.concatenate(encoded), window = n_words)
    if not chunks:
      raise ValueError(f'The {split_name} split is too short to build a chunk of {n_words} tokens.')
    return chunks

  # Restricting the training and validation sets to 10000 and 500 chunks respectively
  # (due to system and time constraints). Slicing is a no-op for shorter lists.
  encoded_train_lyrics = encode_and_chunk(train_lyrics, 'training')[:10000]
  encoded_val_lyrics = encode_and_chunk(val_lyrics, 'validation')[:500]

  print (f'''
  Size of Training and Validation set:
  Training Size   : {len(encoded_train_lyrics)}
  Validation Size : {len(encoded_val_lyrics)}
  ''')

  # Prepare the tensorflow datasets
  train_dataset = tf.data.Dataset.from_tensor_slices(encoded_train_lyrics)
  val_dataset = tf.data.Dataset.from_tensor_slices(encoded_val_lyrics)

  train_dataset = train_dataset.map(create_data).shuffle(5000).batch(8)
  val_dataset = val_dataset.map(create_data).batch(8)

  return train_dataset, val_dataset

##############################################################################################################################################################################

## FUNCTION TO TRAIN THE MODEL:

def training_model(train_data, validation_data,path,
                   model = 'gpt2',
                   learning_rate = 1e-7,
                   n_epochs = 3):
  '''
  Fine-tune a GPT-2 language model starting from previously saved weights.

  Input :
    train_data      : dataset passed to fit() for training.
    validation_data : dataset passed to fit() for validation.
    path            : weights file loaded into the model before training.
    model           : name/path handed to TFGPT2LMHeadModel.from_pretrained.
    learning_rate   : learning rate of the Adam optimizer.
    n_epochs        : number of training epochs.
  Output: (history, model) - the object returned by fit() and the trained model.
  '''
  # Build the pre-trained architecture, then overwrite its weights with the saved ones.
  lm_model = TFGPT2LMHeadModel.from_pretrained(model)
  lm_model.load_weights(path)

  # Compile with Adam and a sparse cross-entropy computed on raw logits.
  lm_model.compile(
      optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate),
      loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
      metrics = ['accuracy'],
      jit_compile = True,
  )

  # Train and hand back both the training history and the fitted model.
  training_history = lm_model.fit(train_data,
                                  epochs = n_epochs,
                                  validation_data = validation_data)

  return training_history, lm_model

### TRAINING BEGINS HERE

* We will fine-tune all artist-specific models for a particular genre at once.

In [None]:
# Artists to fine-tune on, keyed by genre (the top popular artists per genre, based on the EDA).
# The names are used verbatim in the dataset and weights file names.
genre_artist = {
    'Country': [
        'Luke_Combs',
        'Johnny_Cash',
        'John_Denver',
        'Dolly_Parton',
        'Morgan_Wallen',
    ],
    'RB': [
        'The_Weekend',
        'Chris_Brown',
        'Dua_Lipa',
        'Ed_Sheeran',
        'Justin_Bieber',
    ],
    'Rock': [
        'Queen',
        'The_Beatles',
        'Pink_Floyd',
        'Maroon5',
        'Cold_Play',
    ],
    'Misc': [
        'Scott_Cawthon',
        'Emily_Dickinson',
        'Robert_Burns',
    ],
    'Pop': [
        'Taylor_Swift',
        'Ariana_Grande',
        'Rihanna',
        'Ed_Sheeran',
        'Lana_Del_Rey',
    ],
    'Rap': [
        'Drake',
        'Eminem',
        'Kanye_West',
        'Kendrick_Lamar',
        'Nicki_Minaj',
    ],
}

#### ARTISTS GENRE : COUNTRY

In [None]:
genre = 'Country'
artists = genre_artist[genre]

# Fine-tune the genre model once per artist and save each artist's weights separately.
for artist in artists:
  print (f'''
  FINE-TUNING '{genre}' PRE_TRAINED MODEL FOR ARTIST '{artist}'
  ''')

  ## Getting the dataset: prefer the pickled dataset, fall back to the CSV file.
  # `except Exception` (rather than a bare `except`) keeps the fallback but lets
  # KeyboardInterrupt / SystemExit stop the cell instead of being reported as a missing file.
  # NOTE(review): pickle.load can execute code stored in the file - only load datasets you created.
  try:
    # Trying for pickle dataset
    try:
      with open(f'/content/drive/MyDrive/UNIV.AI/AI-3 Language Models/Project/Project Landing /Datasets/Artist_Dataset/{genre}/{artist}.pickle', 'rb') as f:
        df = pickle.load(f)
    # Trying for pandas dataset
    except Exception:
        df = pd.read_csv(f'/content/drive/MyDrive/UNIV.AI/AI-3 Language Models/Project/Project Landing /Datasets/Artist_Dataset/{genre}/{artist}.csv')
  except Exception:
    df = None
    print (f'File not found for artist : {artist}')
    # NOTE(review): this ends the loop, so the remaining artists of this genre are skipped too.
    break

  # Data Pre-Processing
  songs  = pre_processing_lyrics(df)

  # Preparing Tensorflow Dataset
  train_data , val_data = prepare_dataset(songs)

  # Training Model: start from the weights of the genre-level model.
  path = f'/content/drive/MyDrive/UNIV.AI/AI-3 Language Models/Project/Project Landing /Saved Models/Genre Models/{genre}_model_weights.h5'

  _, model = training_model(train_data , val_data , path)

  # saving model as .h5 file
  model.save_weights(f'/content/drive/MyDrive/UNIV.AI/AI-3 Language Models/Project/Project Landing /Saved Models/Artist Models/{genre}_models/{artist}_weights.h5')

  print ("############################################################################################################################################################################################")



  FINE-TUNING 'Country' PRE_TRAINED MODEL FOR ARTIST 'Luke_Combs' 
  
Number of Total Processed Lyrics : 67

  Size of Training and Validation set: 
  Training Size   : 1155
  Validation Size : 286
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'Country' PRE_TRAINED MODEL FOR ARTIST 'Johnny_Cash' 
  
Number of Total Processed Lyrics : 797


Token indices sequence length is longer than the specified maximum sequence length for this model (1267 > 1024). Running this sequence through the model will result in indexing errors



  Size of Training and Validation set: 
  Training Size   : 5000
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'Country' PRE_TRAINED MODEL FOR ARTIST 'John_Denver' 
  
Number of Total Processed Lyrics : 51

  Size of Training and Validation set: 
  Training Size   : 666
  Validation Size : 191
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'Country' PRE_TRAINED MODEL FOR ARTIST 'Dolly_Parton' 
  
Number of Total Processed Lyrics : 598


Token indices sequence length is longer than the specified maximum sequence length for this model (1036 > 1024). Running this sequence through the model will result in indexing errors



  Size of Training and Validation set: 
  Training Size   : 5000
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'Country' PRE_TRAINED MODEL FOR ARTIST 'Morgan_Wallen' 
  
Number of Total Processed Lyrics : 84

  Size of Training and Validation set: 
  Training Size   : 1676
  Validation Size : 290
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################


#### ARTISTS GENRE : RHYTHM & BLUES

In [None]:
genre = 'RB'
artists = genre_artist[genre]

# Fine-tune the genre model once per artist and save each artist's weights separately.
for artist in artists:
  print (f'''
  FINE-TUNING '{genre}' PRE_TRAINED MODEL FOR ARTIST '{artist}'
  ''')

  ## Getting the dataset: prefer the pickled dataset, fall back to the CSV file.
  # `except Exception` (rather than a bare `except`) keeps the fallback but lets
  # KeyboardInterrupt / SystemExit stop the cell instead of being reported as a missing file.
  # NOTE(review): pickle.load can execute code stored in the file - only load datasets you created.
  try:
    # Trying for pickle dataset
    try:
      with open(f'/content/drive/MyDrive/UNIV.AI/AI-3 Language Models/Project/Project Landing /Datasets/Artist_Dataset/{genre}/{artist}.pickle', 'rb') as f:
        df = pickle.load(f)
    # Trying for pandas dataset
    except Exception:
        df = pd.read_csv(f'/content/drive/MyDrive/UNIV.AI/AI-3 Language Models/Project/Project Landing /Datasets/Artist_Dataset/{genre}/{artist}.csv')
  except Exception:
    df = None
    print (f'File not found for artist : {artist}')
    # NOTE(review): this ends the loop, so the remaining artists of this genre are skipped too.
    break

  # Data Pre-Processing
  songs  = pre_processing_lyrics(df)

  # Preparing Tensorflow Dataset
  train_data , val_data = prepare_dataset(songs)

  # Training Model: start from the weights of the genre-level model.
  path = f'/content/drive/MyDrive/UNIV.AI/AI-3 Language Models/Project/Project Landing /Saved Models/Genre Models/{genre}_model_weights.h5'

  _, model = training_model(train_data , val_data , path)

  # saving model as .h5 file
  model.save_weights(f'/content/drive/MyDrive/UNIV.AI/AI-3 Language Models/Project/Project Landing /Saved Models/Artist Models/{genre}_models/{artist}_weights.h5')

  print ("############################################################################################################################################################################################")



  FINE-TUNING 'RB' PRE_TRAINED MODEL FOR ARTIST 'The_Weekend' 
  
Number of Total Processed Lyrics : 192


Token indices sequence length is longer than the specified maximum sequence length for this model (1254 > 1024). Running this sequence through the model will result in indexing errors



  Size of Training and Validation set: 
  Training Size   : 4435
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'RB' PRE_TRAINED MODEL FOR ARTIST 'Chris_Brown' 
  


Token indices sequence length is longer than the specified maximum sequence length for this model (1250 > 1024). Running this sequence through the model will result in indexing errors


Number of Total Processed Lyrics : 544

  Size of Training and Validation set: 
  Training Size   : 5000
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'RB' PRE_TRAINED MODEL FOR ARTIST 'Dua_Lipa' 
  
Number of Total Processed Lyrics : 247

  Size of Training and Validation set: 
  Training Size   : 3522
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'RB' PRE_TRAINED MODEL FOR ARTIST 'Ed_Sheeran' 
  
Number of Total Processed Lyrics : 296


Token indices sequence length is longer than the specified maximum sequence length for this model (1895 > 1024). Running this sequence through the model will result in indexing errors



  Size of Training and Validation set: 
  Training Size   : 5000
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'RB' PRE_TRAINED MODEL FOR ARTIST 'Justin_Bieber' 
  
Number of Total Processed Lyrics : 348


Token indices sequence length is longer than the specified maximum sequence length for this model (1148 > 1024). Running this sequence through the model will result in indexing errors



  Size of Training and Validation set: 
  Training Size   : 5000
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################


#### ARTISTS GENRE : ROCK

In [None]:
genre = 'Rock'
artists = genre_artist[genre]

# Fine-tune the genre model once per artist and save each artist's weights separately.
for artist in artists:
  print (f'''
  FINE-TUNING '{genre}' PRE_TRAINED MODEL FOR ARTIST '{artist}'
  ''')

  ## Getting the dataset: prefer the pickled dataset, fall back to the CSV file.
  # `except Exception` (rather than a bare `except`) keeps the fallback but lets
  # KeyboardInterrupt / SystemExit stop the cell instead of being reported as a missing file.
  # NOTE(review): pickle.load can execute code stored in the file - only load datasets you created.
  try:
    # Trying for pickle dataset
    try:
      with open(f'/content/drive/MyDrive/UNIV.AI/AI-3 Language Models/Project/Project Landing /Datasets/Artist_Dataset/{genre}/{artist}.pickle', 'rb') as f:
        df = pickle.load(f)
    # Trying for pandas dataset
    except Exception:
        df = pd.read_csv(f'/content/drive/MyDrive/UNIV.AI/AI-3 Language Models/Project/Project Landing /Datasets/Artist_Dataset/{genre}/{artist}.csv')
  except Exception:
    df = None
    print (f'File not found for artist : {artist}')
    # NOTE(review): this ends the loop, so the remaining artists of this genre are skipped too.
    break

  # Data Pre-Processing
  songs  = pre_processing_lyrics(df)

  # Preparing Tensorflow Dataset
  train_data , val_data = prepare_dataset(songs)

  # Training Model: start from the weights of the genre-level model.
  path = f'/content/drive/MyDrive/UNIV.AI/AI-3 Language Models/Project/Project Landing /Saved Models/Genre Models/{genre}_model_weights.h5'

  _, model = training_model(train_data , val_data , path)

  # saving model as .h5 file
  model.save_weights(f'/content/drive/MyDrive/UNIV.AI/AI-3 Language Models/Project/Project Landing /Saved Models/Artist Models/{genre}_models/{artist}_weights.h5')

  print ("############################################################################################################################################################################################")



  FINE-TUNING 'Rock' PRE_TRAINED MODEL FOR ARTIST 'Queen' 
  
Number of Total Processed Lyrics : 475

  Size of Training and Validation set: 
  Training Size   : 5000
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'Rock' PRE_TRAINED MODEL FOR ARTIST 'The_Beatles' 
  
Number of Total Processed Lyrics : 557


Token indices sequence length is longer than the specified maximum sequence length for this model (1778 > 1024). Running this sequence through the model will result in indexing errors



  Size of Training and Validation set: 
  Training Size   : 5000
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'Rock' PRE_TRAINED MODEL FOR ARTIST 'Pink_Floyd' 
  
Number of Total Processed Lyrics : 278

  Size of Training and Validation set: 
  Training Size   : 2531
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'Rock' PRE_TRAINED MODEL FOR ARTIST 'Maroon5' 
  
Number of Total Processed Lyrics : 197

  Size of Training and Validation set: 
  Training Size   : 3029
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'Rock' PRE_TRAINED MODEL FOR ARTIST 'Cold_Play' 
  
Number of Total Processed Lyrics : 223

  Size of Training and Validation set: 
  Training Size   : 2719
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################


#### ARTISTS GENRE : POP

In [None]:
genre = 'Pop'
artists = genre_artist[genre]

# Fine-tune the genre model once per artist and save each artist's weights separately.
for artist in artists:
  print (f'''
  FINE-TUNING '{genre}' PRE_TRAINED MODEL FOR ARTIST '{artist}'
  ''')

  ## Getting the dataset: prefer the pickled dataset, fall back to the CSV file.
  # `except Exception` (rather than a bare `except`) keeps the fallback but lets
  # KeyboardInterrupt / SystemExit stop the cell instead of being reported as a missing file.
  # NOTE(review): pickle.load can execute code stored in the file - only load datasets you created.
  try:
    # Trying for pickle dataset
    try:
      with open(f'/content/drive/MyDrive/UNIV.ai/Project Landing /Datasets/Artist_Dataset/{genre}/{artist}.pickle', 'rb') as f:
        df = pickle.load(f)
    # Trying for pandas dataset
    except Exception:
        df = pd.read_csv(f'/content/drive/MyDrive/UNIV.ai/Project Landing /Datasets/Artist_Dataset/{genre}/{artist}.csv')
  except Exception:
    df = None
    print (f'File not found for artist : {artist}')
    # NOTE(review): this ends the loop, so the remaining artists of this genre are skipped too.
    break

  # Data Pre-Processing
  songs  = pre_processing_lyrics(df)

  # Preparing Tensorflow Dataset
  train_data , val_data = prepare_dataset(songs)

  # Training Model: start from the weights of the genre-level model.
  path = f'/content/drive/MyDrive/UNIV.ai/Project Landing /Saved Models/Genre Models/{genre}_model_weights.h5'

  _, model = training_model(train_data , val_data , path)

  # saving model as .h5 file
  model.save_weights(f'/content/drive/MyDrive/UNIV.ai/Project Landing /Saved Models/Artist Models/{genre}_models/{artist}_weights.h5')

  print ("############################################################################################################################################################################################")



  FINE-TUNING 'Pop' PRE_TRAINED MODEL FOR ARTIST 'Taylor_Swift' 
  
Number of Total Processed Lyrics : 479


Token indices sequence length is longer than the specified maximum sequence length for this model (2909 > 1024). Running this sequence through the model will result in indexing errors



  Size of Training and Validation set: 
  Training Size   : 5000
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'Pop' PRE_TRAINED MODEL FOR ARTIST 'Ariana_Grande' 
  
Number of Total Processed Lyrics : 252


Token indices sequence length is longer than the specified maximum sequence length for this model (1416 > 1024). Running this sequence through the model will result in indexing errors



  Size of Training and Validation set: 
  Training Size   : 5000
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'Pop' PRE_TRAINED MODEL FOR ARTIST 'Rihanna' 
  
Number of Total Processed Lyrics : 405


Token indices sequence length is longer than the specified maximum sequence length for this model (1176 > 1024). Running this sequence through the model will result in indexing errors



  Size of Training and Validation set: 
  Training Size   : 5000
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'Pop' PRE_TRAINED MODEL FOR ARTIST 'Ed_Sheeran' 
  
Number of Total Processed Lyrics : 281


Token indices sequence length is longer than the specified maximum sequence length for this model (1216 > 1024). Running this sequence through the model will result in indexing errors



  Size of Training and Validation set: 
  Training Size   : 5000
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'Pop' PRE_TRAINED MODEL FOR ARTIST 'Lana_Del_Rey' 
  
Number of Total Processed Lyrics : 436


Token indices sequence length is longer than the specified maximum sequence length for this model (1185 > 1024). Running this sequence through the model will result in indexing errors



  Size of Training and Validation set: 
  Training Size   : 5000
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################


#### ARTISTS GENRE : RAP

In [None]:
genre = 'Rap'
artists = genre_artist[genre]

# Fine-tune the genre model once per artist and save each artist's weights separately.
for artist in artists:
  print (f'''
  FINE-TUNING '{genre}' PRE_TRAINED MODEL FOR ARTIST '{artist}'
  ''')

  ## Getting the dataset: prefer the pickled dataset, fall back to the CSV file.
  # `except Exception` (rather than a bare `except`) keeps the fallback but lets
  # KeyboardInterrupt / SystemExit stop the cell instead of being reported as a missing file.
  # NOTE(review): pickle.load can execute code stored in the file - only load datasets you created.
  try:
    # Trying for pickle dataset
    try:
      with open(f'/content/drive/MyDrive/UNIV.AI/Project AI 3 NLP/Project Landing /Datasets/Artist_Dataset/{genre}/{artist}.pickle', 'rb') as f:
        df = pickle.load(f)
    # Trying for pandas dataset
    except Exception:
        df = pd.read_csv(f'/content/drive/MyDrive/UNIV.AI/Project AI 3 NLP/Project Landing /Datasets/Artist_Dataset/{genre}/{artist}.csv')
  except Exception:
    df = None
    print (f'File not found for artist : {artist}')
    # NOTE(review): this ends the loop, so the remaining artists of this genre are skipped too.
    break

  # Data Pre-Processing
  songs  = pre_processing_lyrics(df)

  # Preparing Tensorflow Dataset
  train_data , val_data = prepare_dataset(songs)

  # Training Model: start from the weights of the genre-level model.
  path = f'/content/drive/MyDrive/UNIV.AI/Project AI 3 NLP/Project Landing /Saved Models/Genre Models/{genre}_model_weights.h5'

  _, model = training_model(train_data , val_data , path)

  # saving model as .h5 file
  model.save_weights(f'/content/drive/MyDrive/UNIV.AI/Project AI 3 NLP/Project Landing /Saved Models/Artist Models/{genre}_models/{artist}_weights.h5')

  print ("############################################################################################################################################################################################")



  FINE-TUNING 'Rap' PRE_TRAINED MODEL FOR ARTIST 'Drake' 
  
Number of Total Processed Lyrics : 426


Token indices sequence length is longer than the specified maximum sequence length for this model (1184 > 1024). Running this sequence through the model will result in indexing errors



  Size of Training and Validation set: 
  Training Size   : 10000
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'Rap' PRE_TRAINED MODEL FOR ARTIST 'Eminem' 
  
Number of Total Processed Lyrics : 496


Token indices sequence length is longer than the specified maximum sequence length for this model (1487 > 1024). Running this sequence through the model will result in indexing errors



  Size of Training and Validation set: 
  Training Size   : 10000
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'Rap' PRE_TRAINED MODEL FOR ARTIST 'Kanye_West' 
  
Number of Total Processed Lyrics : 773


Token indices sequence length is longer than the specified maximum sequence length for this model (1168 > 1024). Running this sequence through the model will result in indexing errors



  Size of Training and Validation set: 
  Training Size   : 10000
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'Rap' PRE_TRAINED MODEL FOR ARTIST 'Kendrick_Lamar' 
  
Number of Total Processed Lyrics : 326


Token indices sequence length is longer than the specified maximum sequence length for this model (1109 > 1024). Running this sequence through the model will result in indexing errors



  Size of Training and Validation set: 
  Training Size   : 10000
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'Rap' PRE_TRAINED MODEL FOR ARTIST 'Nicki_Minaj' 
  
Number of Total Processed Lyrics : 323


Token indices sequence length is longer than the specified maximum sequence length for this model (1063 > 1024). Running this sequence through the model will result in indexing errors



  Size of Training and Validation set: 
  Training Size   : 7054
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################


#### ARTISTS GENRE : MISCELLANEOUS

In [None]:
# Fine-tune the genre-level model once for every artist listed under this
# genre, then save each artist's weights to its own .h5 file.
genre = 'Misc'
artists = genre_artist[genre]

# Genre-level weights file handed to training_model for every artist.
# NOTE(review): presumably training_model loads these weights as the starting
# point for fine-tuning -- confirm against its definition earlier in the notebook.
path = f'/content/drive/MyDrive/UNIV.ai/Project Landing /Saved Models/Genre Models/{genre}_model_weights.h5'

for artist in artists:
  print (f'''
  FINE-TUNING '{genre}' PRE_TRAINED MODEL FOR ARTIST '{artist}'
  ''')

  ## Getting the dataset:
  # The artist dataset may be stored as a pickle or as a CSV: try the pickle
  # first and fall back to the CSV if the pickle cannot be loaded.
  # NOTE(review): pickle.load can execute arbitrary code contained in the file;
  # only load pickles that were produced by this project.
  try:
    with open(f'/content/drive/MyDrive/UNIV.ai/Project Landing /Datasets/Artist_Dataset/{genre}/{artist}.pickle', 'rb') as f:
      df = pickle.load(f)
  except Exception:
    # `except Exception` instead of a bare `except:` so that interrupting the
    # kernel (KeyboardInterrupt) still stops the cell instead of being treated
    # as a missing file.
    try:
      df = pd.read_csv(f'/content/drive/MyDrive/UNIV.ai/Project Landing /Datasets/Artist_Dataset/{genre}/{artist}.csv')
    except Exception as load_error:
      df = None
      print (f'File not found for artist : {artist}')
      # Show the underlying error: the failure is not necessarily a missing file.
      print (f'Reason : {load_error!r}')
      # Skip only this artist; the remaining artists are still fine-tuned.
      continue
  ###############################################################################################################################################################

  # Data Pre-Processing
  songs  = pre_processing_lyrics(df)

  # Preparing Tensorflow Dataset
  train_data , val_data = prepare_dataset(songs)

  # Training Model (the first returned value is not needed here):
  _, model = training_model(train_data , val_data , path)

  ################################################################################################################################################################

  # saving model as .h5 file

  model.save_weights(f'/content/drive/MyDrive/UNIV.ai/Project Landing /Saved Models/Artist Models/{genre}_models/{artist}_weights.h5')

  print ("############################################################################################################################################################################################")


Token indices sequence length is longer than the specified maximum sequence length for this model (1879 > 1024). Running this sequence through the model will result in indexing errors



  FINE-TUNING 'Misc' PRE_TRAINED MODEL FOR ARTIST 'Scott_Cawthon' 
  
Number of Total Processed Lyrics : 87

  Size of Training and Validation set: 
  Training Size   : 2012
  Validation Size : 327
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'Misc' PRE_TRAINED MODEL FOR ARTIST 'Emily_Dickinson' 
  
Number of Total Processed Lyrics : 1167


Token indices sequence length is longer than the specified maximum sequence length for this model (1057 > 1024). Running this sequence through the model will result in indexing errors



  Size of Training and Validation set: 
  Training Size   : 5000
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################

  FINE-TUNING 'Misc' PRE_TRAINED MODEL FOR ARTIST 'Robert_Burns' 
  
Number of Total Processed Lyrics : 544


Token indices sequence length is longer than the specified maximum sequence length for this model (1310 > 1024). Running this sequence through the model will result in indexing errors



  Size of Training and Validation set: 
  Training Size   : 5000
  Validation Size : 500
  


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
Epoch 3/3
############################################################################################################################################################################################
