In [None]:
import numpy as np

# **Loading JSON data file**

In [1]:
import json
# JSON text is UTF-8 by specification; pin the encoding so non-ASCII characters
# decode the same way regardless of the platform's default locale.
with open(r'data.json', 'r', encoding='utf-8') as file:
    raw_data = json.load(file)

# Show which fields the first record carries.
print(raw_data[0].keys())

dict_keys(['track', 'artist', 'album_type', 'album_name', 'album_artist', 'duration', 'listener_count', 'play_count', 'popularity', 'genre', 'age', 'explicit', 'thumbnail', 'song_url', 'summary', 'lyrics', 'in_movie', 'movie_name', 'youtube_link', 'language', 'sentiment', 'tempo', 'melspectrogram'])


### **Removing non-required fields from the JSON**

In [2]:
# Fields that are dropped from every record before feature extraction.
to_remove = ['song_url', 'summary', 'tempo', 'melspectrogram', 'thumbnail', 'movie_name']
for record in raw_data:
    for field in to_remove:
        # pop() with a default keeps this cell idempotent: re-running it (or hitting
        # a record that lacks one of the fields) no longer raises KeyError.
        record.pop(field, None)

In [3]:
# Re-inspect the first record's fields after the names in `to_remove` were popped.
print(raw_data[0].keys())

dict_keys(['track', 'artist', 'album_type', 'album_name', 'album_artist', 'duration', 'listener_count', 'play_count', 'popularity', 'genre', 'age', 'explicit', 'thumbnail', 'lyrics', 'in_movie', 'movie_name', 'youtube_link', 'language', 'sentiment'])


In [4]:
# Peek at how the 'explicit' flag is stored in the first record.
raw_data[0]['explicit']

False

# **Extracting Metadata**

In [None]:
def extract_metadata(record):
    """Flatten one song record into a metadata vector.

    The resulting layout is::

        [album_type, duration, listener_count, play_count, popularity,
         age, explicit, in_movie, *language]

    where ``album_type`` is 0 for ``'single'`` and 1 otherwise, and
    ``explicit`` / ``in_movie`` are 0 when the field equals ``False`` and 1
    otherwise.

    Args:
        record: Mapping with the keys read below.

    Returns:
        ``np.ndarray`` holding the values in the order shown above.
    """
    metadata = [
        0 if record['album_type'] == 'single' else 1,
        record['duration'],
        record['listener_count'],
        record['play_count'],
        record['popularity'],
        record['age'],
        0 if record['explicit'] == False else 1,
        0 if record['in_movie'] == False else 1,
    ]

    # The previous code appended a generator *object* here, which made
    # np.asarray produce an object array containing an unevaluated generator.
    # NOTE(review): assumes 'language' is an iterable of numeric values
    # (e.g. a one-hot vector) — confirm against the dataset schema.
    language = record['language']
    if isinstance(language, str):
        # A plain string is kept as one value instead of being split into characters.
        metadata.append(language)
    else:
        metadata.extend(language)

    return np.asarray(metadata)

### **Creating a separate field for genre and sentiment**
These two fields will be given the highest weight, since they affect the model the most.

In [None]:
def genre_and_sentiment(model, record):
    """Collect the high-weight semantic features of one record.

    Only the sentiment part is implemented so far.

    Args:
        model: Currently unused.
            TODO: presumably intended for encoding ``record['genre']`` — confirm.
        record: Mapping that provides a ``'sentiment'`` entry.

    Returns:
        List with the record's sentiment value(s).
    """
    semantics = []

    # The previous code appended a generator object and never returned the list.
    # NOTE(review): assumes 'sentiment' is an iterable of scores — confirm.
    sentiment = record['sentiment']
    if isinstance(sentiment, str):
        # A plain string label is kept whole instead of being split into characters.
        semantics.append(sentiment)
    else:
        semantics.extend(sentiment)

    # TODO: add the genre representation once its encoding is decided.
    return semantics

# **Extracting the lyrical features**

### **Custom features**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def extract_topics(lyrics, num_topics=5, num_words=10, random_state=None):
    """Fit an LDA topic model on the lyrics and list each topic's top words.

    Args:
        lyrics: Iterable of lyric strings (one document per song).
        num_topics: Number of LDA components to fit.
        num_words: Number of highest-weighted words to report per topic.
        random_state: Optional seed forwarded to ``LatentDirichletAllocation``;
            pass an int to make the topics reproducible between runs.

    Returns:
        A list with one entry per topic; each entry is a list of ``num_words``
        vocabulary terms ordered from lowest to highest weight within the top
        set (``argsort`` is ascending, so the strongest word comes last).
    """
    vectorizer = CountVectorizer(stop_words='english')
    dtm = vectorizer.fit_transform(lyrics)

    lda = LatentDirichletAllocation(n_components=num_topics, random_state=random_state)
    lda.fit(dtm)

    # Build the vocabulary array once rather than once per reported word.
    vocabulary = vectorizer.get_feature_names_out()

    return [
        [vocabulary[i] for i in topic.argsort()[-num_words:]]
        for topic in lda.components_
    ]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_tfidf(lyrics):
    """Compute TF-IDF weights for the lyrics with English stop words removed.

    Args:
        lyrics: Iterable of lyric strings.

    Returns:
        Tuple ``(tfidf_matrix, feature_names)``: the result of fitting and
        transforming the lyrics, and the fitted vocabulary terms.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    weights = tfidf.fit_transform(lyrics)
    return weights, tfidf.get_feature_names_out()

In [None]:
import textstat

def readability_scores(lyrics):
    """Compute a set of textstat readability metrics for every lyric.

    Args:
        lyrics: Iterable of lyric strings.

    Returns:
        List with one dict per lyric, keyed by metric name.
    """
    def _score(text):
        # One textstat call per metric; the keys mirror the function names.
        return {
            'flesch_reading_ease': textstat.flesch_reading_ease(text),
            'smog_index': textstat.smog_index(text),
            'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
            'coleman_liau_index': textstat.coleman_liau_index(text),
            'automated_readability_index': textstat.automated_readability_index(text),
            'dale_chall_readability_score': textstat.dale_chall_readability_score(text),
            'difficult_words': textstat.difficult_words(text),
            'linsear_write_formula': textstat.linsear_write_formula(text),
            'gunning_fog': textstat.gunning_fog(text),
        }

    return [_score(text) for text in lyrics]

In [None]:
def ngram_frequency(lyrics, n=2):
    """Count n-grams of exactly length ``n`` across the lyrics.

    Args:
        lyrics: Iterable of lyric strings.
        n: n-gram length; used as both the lower and upper bound of the range.

    Returns:
        Tuple ``(ngram_matrix, feature_names)``: the document/n-gram count
        matrix and the n-gram strings of the fitted vocabulary.
    """
    counter = CountVectorizer(stop_words='english', ngram_range=(n, n))
    counts = counter.fit_transform(lyrics)
    return counts, counter.get_feature_names_out()

In [None]:
import spacy

def figurative_language(lyrics, nlp=None):
    """Collect entities labelled METAPHOR or SIMILE from each lyric.

    Args:
        lyrics: Iterable of lyric strings.
        nlp: Optional spaCy pipeline to use. When omitted, ``en_core_web_sm``
            is loaded on every call; pass a pre-loaded pipeline to avoid the
            reload cost or to use a custom model.

    Returns:
        List with one dict per lyric, holding the matched entity texts under
        the keys ``'metaphors'`` and ``'similes'``.

    NOTE(review): the stock ``en_core_web_sm`` entity labels do not appear to
    include METAPHOR or SIMILE, so with the default pipeline both lists are
    expected to stay empty. A pipeline trained with those labels has to be
    supplied through ``nlp`` for this to produce results — confirm.
    """
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")

    figurative_features = []
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # invoking it separately for each lyric.
    for doc in nlp.pipe(lyrics):
        figurative_features.append({
            'metaphors': [ent.text for ent in doc.ents if ent.label_ == 'METAPHOR'],
            'similes': [ent.text for ent in doc.ents if ent.label_ == 'SIMILE'],
        })
    return figurative_features

### **Entire lyrics**

In [None]:
from transformers import AutoModel

In [None]:
# Load the Jina v2 English embedding model used for whole-lyric embeddings.
# NOTE(review): trust_remote_code=True allows Python code shipped in the model
# repository to run locally — consider pinning `revision=` to a reviewed commit.
model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)

In [None]:
def get_lyric_embeddings(model, lyrics):
    """Embed the lyrics with the supplied model.

    Args:
        model: Object exposing an ``encode`` method.
        lyrics: Input handed to ``model.encode`` unchanged.

    Returns:
        Whatever ``model.encode(lyrics)`` returns.
    """
    embeddings = model.encode(lyrics)
    return embeddings

# **Extracting melspectrogram based features**

In [None]:
from pytube import YouTube as YT
import librosa

In [None]:
def get_audio_file(yt_link):
    """Download the first audio-only stream of a YouTube video.

    Args:
        yt_link: URL of the YouTube video.

    Returns:
        The value returned by the stream's ``download()`` call
        (presumably the path of the saved file — confirm against pytube).

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    stream = YT(yt_link).streams.filter(only_audio=True).first()
    if stream is None:
        # first() yields None for an empty query; fail with a clear message
        # instead of an AttributeError on None.download().
        raise ValueError(f"No audio-only stream available for {yt_link!r}")
    return stream.download()

def get_mel_spectrogram(audio_path, n_fft=128, hop_length=512, n_mels=128):
    """Load an audio file and compute its mel spectrogram in decibels.

    Args:
        audio_path: Path of the audio file to load.
        n_fft: FFT window size used for the spectrogram.
        hop_length: Number of samples between successive frames.
        n_mels: Number of mel bands.

    Returns:
        ``[audio, sr, mel]``: the loaded waveform, its sampling rate, and the
        mel power spectrogram converted to dB relative to its maximum.

    NOTE(review): the defaults reproduce the previous hard-coded behaviour,
    but they look unintentional — ``n_fft=128`` gives only 65 frequency bins
    for 128 mel bands, and a 512-sample hop is larger than the 128-sample
    window, so parts of the signal fall between frames. Consider something
    like ``n_fft=2048`` — confirm before changing the defaults.
    """
    audio, sr = librosa.load(audio_path)
    mel_power = librosa.feature.melspectrogram(
        y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
    )
    # ref=np.max puts the loudest bin at 0 dB, so every value is <= 0.
    mel = librosa.power_to_db(mel_power, ref=np.max)
    return [audio, sr, mel]

### **Using EfficientNetB6 to generate melspectrogram feature maps**

In [None]:
# Width of the final Dense layer in cnn_top_model, i.e. the size of the image embedding.
EMBEDDING_DIM = 16

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Flatten, Dense

In [None]:
def cnn_top_model(efficientnet_model_output):
  """Build the dense head that maps backbone feature maps to an embedding.

  The feature maps are flattened, passed through GELU Dense layers of width
  256, 128 and 32, and projected to ``EMBEDDING_DIM`` units with tanh.

  An earlier draft sketched three inception-style blocks (parallel 1x1 / 3x3 /
  5x5 convolutions plus a pooled branch, with 64, 32 and 16 filters, tanh
  activations and L2 regularisation) in front of the flatten step; that draft
  was disabled and is not part of the model.

  Args:
    efficientnet_model_output: Tensor produced by the backbone model.

  Returns:
    Output tensor of the final Dense layer.
  """
  features = Flatten()(efficientnet_model_output)

  for units in (256, 128, 32):
    features = Dense(units, activation=tf.nn.gelu)(features)

  embedding = Dense(EMBEDDING_DIM, activation=tf.nn.tanh)(features)
  return embedding

In [None]:
def image_embedding_model(input_shape):
  """Build the image-embedding model: frozen EfficientNetB6 plus a dense head.

  Args:
    input_shape: Shape of one input image, without the batch dimension.
      NOTE(review): the ImageNet weights presumably require a 3-channel
      input — confirm how the melspectrogram is converted to an image.

  Returns:
    ``tf.keras.Model`` mapping an input image to the output of ``cnn_top_model``.
  """
  _inputs = tf.keras.Input(shape=input_shape)

  # EfficientNetB6 lives under tf.keras.applications, not directly under tf.keras.
  base_model = tf.keras.applications.EfficientNetB6(
      include_top=False, weights="imagenet", input_shape=input_shape)
  # A Model is not iterable; freeze the whole backbone through its trainable flag.
  base_model.trainable = False

  efficientnet_model_output = base_model(_inputs)
  # cnn_top_model takes only the backbone output.
  top_model_output = cnn_top_model(efficientnet_model_output)

  return tf.keras.Model(inputs=_inputs, outputs=top_model_output)

### **Extracting audio based features from melspectrogram**

In [None]:
def audio_features_from_melspectrogram(y, sr, mel):
    """Extract hand-crafted audio descriptors for one track.

    Args:
        y: Audio time series.
        sr: Sampling rate of ``y``.
        mel: Mel spectrogram in dB, as returned by ``get_mel_spectrogram``.
            Only the MFCCs are derived from it.

    Returns:
        ``[mfccs, spectral_centroid, spectral_bandwidth, spectral_contrast,
        spectral_rolloff, zero_crossing_rate, chroma, tonnetz, tempo]``.

    The spectral descriptors and chroma were previously computed with
    ``S=mel``. Those functions expect a linear-frequency STFT magnitude or
    power spectrogram: centroid, bandwidth and rolloff reject the negative
    values of a dB spectrogram, and mel bands do not line up with the FFT bin
    frequencies they assume. They are now computed from an STFT of ``y``.
    """
    # MFCCs are defined on a log-power mel spectrogram, which is what `mel` is.
    mfccs = librosa.feature.mfcc(S=mel, sr=sr)

    # One shared STFT magnitude for every STFT-domain feature.
    stft_magnitude = np.abs(librosa.stft(y))

    spectral_centroid = librosa.feature.spectral_centroid(S=stft_magnitude, sr=sr)
    spectral_bandwidth = librosa.feature.spectral_bandwidth(S=stft_magnitude, sr=sr)
    spectral_contrast = librosa.feature.spectral_contrast(S=stft_magnitude, sr=sr)
    spectral_rolloff = librosa.feature.spectral_rolloff(S=stft_magnitude, sr=sr)

    zero_crossing_rate = librosa.feature.zero_crossing_rate(y)

    # chroma_stft takes a power spectrogram, hence the squared magnitude.
    chroma = librosa.feature.chroma_stft(S=stft_magnitude ** 2, sr=sr)

    # Tonnetz is computed on the harmonic component of the signal.
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)

    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)

    return [mfccs, spectral_centroid, spectral_bandwidth, spectral_contrast,
            spectral_rolloff, zero_crossing_rate, chroma, tonnetz, tempo]