## NLP mix: ritmos distintos que dicen lo mismo

In [None]:
import json
import random
import re
from typing import List, Tuple

import lyricsgenius as lg
import numpy as np
import pandas as pd
import spacy
import yake
from flair.data import Sentence
from flair.models import SequenceTagger
from pysentimiento import create_analyzer
from sentence_transformers import SentenceTransformer
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
from tqdm import tqdm


In [None]:
# Module-level NLP resources shared by the cells below.
# YAKE keyword extractor configured for Spanish, returning the top 5 keywords.
kw_extractor = yake.KeywordExtractor(lan="es", top=5)
# spaCy Spanish pipeline; keyw_rank() reads `doc._.phrases` from its output.
nlp = spacy.load("es_core_news_sm")
# NOTE(review): the "textrank" factory is presumably registered by importing
# `pytextrank`, which the visible source never imports — confirm before running.
nlp.add_pipe("textrank")
# NOTE(review): `tagger` is not referenced anywhere else in the visible source.
tagger = SequenceTagger.load("flair/ner-spanish-large")
# NOTE(review): `pipeline` is not imported in the visible source (presumably
# `transformers.pipeline`), so this line raises NameError as written; `ner` is
# also not referenced anywhere else in the visible source.
ner = pipeline("ner", model="dccuchile/bert-base-spanish-wwm-uncased", grouped_entities=True)

## Build Dataset

In [None]:
def get_songs_by_artist(artist_name: str, max_songs: int = 40) -> Tuple[List[str], List[str], List[str]]:
    """
    Fetches song titles and lyrics for a given artist using the Genius API.

    Uses the module-level ``genius`` client, whose search options are
    (re)configured on every call.

    Args:
        artist_name (str): The name of the artist to search for.
        max_songs (int): The maximum number of songs to retrieve. Defaults to 40.

    Returns:
        Tuple[List[str], List[str], List[str]]: Lists of artist names, song titles, and lyrics.
        The three lists always have the same length; they are all empty when
        the search returns no artist.
    """
    genius.skip_non_songs = True
    genius.excluded_terms = ["(Remix)", "(Live)"]
    genius.remove_section_headers = True
    genius.verbose = False

    artist_obj = genius.search_artist(artist_name, max_songs=max_songs, sort="popularity")

    artist_list: List[str] = []
    title_list: List[str] = []
    lyrics_list: List[str] = []

    # The search yields no artist object when nothing matches; return empty
    # lists instead of failing on `.songs`.
    if artist_obj is None:
        return artist_list, title_list, lyrics_list

    for song in artist_obj.songs:
        artist_list.append(song.artist)
        title_list.append(song.title)
        # Best effort: keep the row aligned with a placeholder if the lyrics
        # cannot be read for this song.
        try:
            lyrics_list.append(song.lyrics)
        except Exception:
            lyrics_list.append("Lyrics not found")

    return artist_list, title_list, lyrics_list


def clean_text_from_keywords(text: str, keywords: List[str]) -> str:
    """
    Cleans text by iteratively trimming content before each keyword found.

    Keywords are processed in list order. Each keyword is searched
    case-insensitively in what remains of the text after the previous trims,
    and everything up to and including its first occurrence is dropped.
    The original casing of the remaining text is preserved.

    Args:
        text (str): The input text to clean.
        keywords (List[str]): Keywords to search for (matched case-insensitively).
            Empty keywords are ignored.

    Returns:
        str: The text remaining after the last trim, stripped of surrounding
        whitespace. If no keyword matches, the stripped input is returned.
    """
    for word in keywords:
        if not word:
            # An empty keyword would match everywhere and carries no information.
            continue
        # Search the *current* remainder so trimming is truly iterative, and
        # slice the original-cased text so the result is not lowercased.
        match = re.search(re.escape(word), text, flags=re.IGNORECASE)
        if match:
            text = text[match.end():]
    return text.strip()

In [None]:
# Path to the JSON credentials file that is opened below; it is left blank
# here and must be set to a real file before running.
credentials_path = ''

In [None]:
# Load the credentials dictionary from the JSON file at `credentials_path`.
# The encoding is fixed to UTF-8 so the read does not depend on the platform
# default encoding.
with open(credentials_path, 'r', encoding='utf-8') as file:
    credentials = json.load(file)

## Genius API

In [None]:
# Read the Genius access token from the loaded credentials and build the
# client that get_songs_by_artist() uses as a module-level global.
# NOTE(review): the "acces_token" key spelling must match the credentials file — confirm.
api_keyLlyrics = credentials["api_genius"]["acces_token"]
genius = lg.Genius(api_keyLlyrics)

In [None]:
# Artist names used as Genius search queries and as the "artist" label of the
# resulting rows. One comma after every entry: a missing comma makes Python
# silently concatenate two adjacent string literals into a single name.
l_artists = [
    "Andrea echeverri", "Bad Bunny", "Dario Gomez", "Diomedes Diaz", "Soda Stereo",
    "Mercedes Soza", "Helenita Vargas", "Ismael Rivera", "Shakira",
    "Karol g", "Eladio Carrion", "Celia Cruz", "Willie Colon",
    "Garzón y Collazos", "Julio Jaramillo", "El caballero Gaucho", "Miguel Mateos",
    "Miriam Hernández", "Young Miko", "Kany Garcia", "Natalia Jiménez",
    "ekhymosis", "Sergio Vargas", "Romeo Santos", "Silvio Rodrigez",
    "alcolirykoz", "Penyair", "Angela Aguilar", "La India", "ivy queen",
    "proyecto uno", "Javier Solis", "el alfa", "Peso Pluma",
    "Fuerza Regida",
]

In [None]:
# Download up to 60 songs per artist and stack them into a single DataFrame
# with the columns "artist", "title" and "letter" (the lyrics).
artist_frames = []
for artist in tqdm(l_artists):
    # The keyword must be `max_songs` (the function's parameter name).
    l_artist, l_titles, l_letters = get_songs_by_artist(artist, max_songs=60)
    # Rows are labelled with the query name rather than the artist names
    # reported by Genius (l_artist), so every row matches an entry of l_artists.
    df_artist = pd.DataFrame({
        "artist": [artist] * len(l_letters),
        "title": l_titles,
        "letter": l_letters,
    })
    artist_frames.append(df_artist)

# Concatenate once at the end instead of re-copying the growing frame on
# every iteration; fall back to an empty frame when there were no artists.
df_artist_result = pd.concat(artist_frames) if artist_frames else pd.DataFrame()

In [None]:
# Persist the downloaded lyrics table as a pickle file.
# NOTE(review): the target path is an empty string here — set a real file path before running.
df_artist_result.to_pickle("")

## Group verses

In [None]:
def group_verses_by_author(df: pd.DataFrame, lyrics_column: str, group_size: int = 3) -> pd.DataFrame:
    """
    Break each song's lyrics into consecutive chunks of lines.

    The lyrics are split on newlines, blank lines are discarded, and every
    run of up to `group_size` remaining lines is joined with single spaces.

    Args:
        df (pd.DataFrame): Input rows; must provide 'artist', 'title' and the lyrics column.
        lyrics_column (str): Name of the column holding the lyrics text.
        group_size (int): How many lines go into each chunk. Defaults to 3.

    Returns:
        pd.DataFrame: One row per chunk with columns 'artist', 'title' and
                      'verse_group'; the last chunk of a song may be shorter.
    """
    out_artists, out_titles, out_groups = [], [], []

    for _, song in df.iterrows():
        song_artist = song["artist"]
        song_title = song["title"]

        # Trim every line and keep only those that still have content.
        lines = []
        for raw_line in str(song[lyrics_column]).split('\n'):
            trimmed = raw_line.strip()
            if trimmed:
                lines.append(trimmed)

        chunks = [
            " ".join(lines[start:start + group_size])
            for start in range(0, len(lines), group_size)
        ]

        # Repeat the song metadata once per chunk so the three lists stay aligned.
        out_artists.extend([song_artist] * len(chunks))
        out_titles.extend([song_title] * len(chunks))
        out_groups.extend(chunks)

    return pd.DataFrame({
        "artist": out_artists,
        "title": out_titles,
        "verse_group": out_groups
    })

In [None]:
# Split every song into groups of 4 lyric lines and show the resulting shape.
# NOTE(review): no visible cell creates a "clean_lyrics" column on
# df_artist_result (the build loop only writes "artist", "title" and
# "letter") — confirm it is added elsewhere before this cell runs.
df_verses_4 = group_verses_by_author(df_artist_result,"clean_lyrics",4)
df_verses_4.shape

## Análisis de sentimientos y emociones

In [None]:
# pysentimiento analyzers for Spanish text: one for the "sentiment" task and
# one for the "emotion" task. Both are used as module-level globals by
# sentiment_analysis() and emotion_analysis().
analyzer_sen = create_analyzer(task="sentiment", lang="es")
analyzer_emo = create_analyzer(task="emotion", lang="es")

In [None]:
def sentiment_analysis(text):
    """Score `text` with the module-level sentiment analyzer.

    Newlines are replaced by spaces before prediction.

    Returns:
        tuple: (NEG probability, NEU probability, POS probability, predicted label).
    """
    prediction = analyzer_sen.predict(text.replace('\n', ' '))
    scores = prediction.probas
    return scores['NEG'], scores['NEU'], scores['POS'], prediction.output
def emotion_analysis(text):
    """Score `text` with the module-level emotion analyzer.

    Newlines are replaced by spaces before prediction.

    Returns:
        tuple: probabilities for joy, surprise, sadness, disgust, fear, anger
        and others (in that order), followed by the predicted label.
    """
    prediction = analyzer_emo.predict(text.replace('\n', ' '))
    scores = prediction.probas
    ordered_labels = ('joy', 'surprise', 'sadness', 'disgust', 'fear', 'anger', 'others')
    ordered_scores = tuple(scores[label] for label in ordered_labels)
    return ordered_scores + (prediction.output,)

In [None]:
# Run sentiment analysis on every verse group; the four returned values
# (NEG, NEU, POS probabilities and the predicted label) become four columns.
df_verses_4[['chorus_neg', 'chorus_neu', 'chorus_pos','chorus_sen']] = df_verses_4['verse_group'].apply(lambda x: pd.Series(sentiment_analysis(x)))

In [None]:
# Run emotion analysis on every verse group; the seven probabilities and the
# predicted label returned by emotion_analysis() become eight columns, in the
# same order as the function returns them.
df_verses_4[['letter_joy', 'letter_surprise', 
            'letter_sadness','letter_disgust',
            'letter_fear','letter_anger',
            'letter_others','letter_emo'
            ]] = df_verses_4['verse_group'].apply(lambda x: pd.Series(emotion_analysis(x)))

## Keywords

In [None]:
def keyw_yake(text):
    """Return the keywords that the module-level YAKE extractor finds in `text`.

    The extractor yields (keyword, score) pairs; only the keywords are kept,
    in the order the extractor returned them.
    """
    return [keyword for keyword, _score in kw_extractor.extract_keywords(text)]
def keyw_rank(text):
    """Return the text of the first five phrases the spaCy pipeline attaches to `text`.

    The phrases are read from `doc._.phrases` of the module-level `nlp`
    pipeline; fewer than five are returned when fewer are available.
    """
    parsed = nlp(text)
    return [candidate.text for candidate in parsed._.phrases[:5]]

In [None]:
# Extract keywords for every verse group with both extractors; each cell of
# the new columns holds a list of strings.
# NOTE(review): no visible cell creates a 'verse_group_clean' column on
# df_verses_4 (only 'verse_group' exists) — confirm it is added elsewhere.
df_verses_4['keyw_yake']=df_verses_4['verse_group_clean'].apply(keyw_yake)
df_verses_4['keyw_rank']=df_verses_4['verse_group_clean'].apply(keyw_rank)

## Clustering and embedding computation

In [None]:
# Sentence-embedding model used below to encode the verse groups before clustering.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

In [None]:
# Unique (sentiment label, emotion label) combinations present in the data.
comb_sen = df_verses_4[['chorus_sen', 'letter_emo']].drop_duplicates()

# One sub-DataFrame per combination, keyed by the (sentiment, emotion) tuple.
# Each value is an explicit copy: later cells add columns to these frames, and
# writing into a filtered slice of df_verses_4 triggers pandas'
# SettingWithCopyWarning.
sub_dataframes = {
    (fila['chorus_sen'], fila['letter_emo']): df_verses_4[
        (df_verses_4['chorus_sen'] == fila['chorus_sen']) & (df_verses_4['letter_emo'] == fila['letter_emo'])
    ].copy()
    for _, fila in comb_sen.iterrows()
}

In [None]:
# Estimate a cluster count for every (sentiment, emotion) subset with an
# elbow heuristic on the MiniBatchKMeans inertia curve. A value of 0 means
# "too small to cluster" and is treated as a single group downstream.
size_cluster = {}
for comb in tqdm(sub_dataframes.keys()):
    df_sub_lyrycs = sub_dataframes[comb].reset_index()

    # Candidate cluster counts: 1 .. len/20 - 1.
    K = range(1, int(len(df_sub_lyrycs)/20))
    if len(K) < 3:
        # A second difference needs at least three inertia values. Handle the
        # small-subset case explicitly instead of relying on a bare `except`,
        # which also hid unrelated errors (e.g. NameError from missing imports).
        size_cluster[comb] = 0
        continue

    # Pass a plain list of strings to the encoder.
    embeddings = model.encode(df_sub_lyrycs['verse_group'].tolist())

    inertia = []
    for k in tqdm(K):
        mbk = MiniBatchKMeans(n_clusters=k, random_state=42, batch_size=128)
        mbk.fit(embeddings)
        inertia.append(mbk.inertia_)

    # Approximate second derivative of the inertia curve. Entry i is built
    # from k = i+1, i+2, i+3 and is therefore centred on k = i + 2.
    inertia_diff2 = np.diff(inertia, n=2)
    # The elbow is where the curve bends the most, i.e. the LARGEST second
    # difference (the smallest one marks the flattest part of the curve).
    k_opt = int(np.argmax(inertia_diff2)) + 2
    size_cluster[comb] = k_opt

In [None]:
# Cluster every (sentiment, emotion) subset with its estimated cluster count
# and keep both the embeddings and the per-row cluster labels, keyed by the
# same combination tuple.
labels_cluster = []
dict_embeddings = {}
dict_labels = {}
for comb, k in tqdm(size_cluster.items()):
    df_cluster = sub_dataframes[comb].reset_index()
    # Pass a plain list of strings to the encoder rather than a pandas Series,
    # so encoding does not depend on the Series having a 0..n-1 integer index.
    embeddings = model.encode(df_cluster['verse_group'].tolist())
    dict_embeddings[comb] = embeddings
    if k == 0:
        # No cluster count was estimated for this subset: one single group.
        labels_ = [0]*len(df_cluster)
    else:
        clustering = MiniBatchKMeans(n_clusters=k, random_state=42, batch_size=128)
        labels_ = clustering.fit_predict(embeddings)
    dict_labels[comb] = labels_

In [None]:
# Attach the cluster labels to each subset as a new 'cluster_label' column.
# The labels were computed in the subset's row order, so they are assigned
# positionally.
for comb in tqdm(sub_dataframes.keys()):
    sub_dataframes[comb]['cluster_label']=dict_labels[comb]

In [None]:
# Build the analysis frames: merge both keyword lists into a single
# 'key_words' column and drop the intermediate score/keyword columns.
clusters_dataframes = {}
drop_col = ['chorus_neg','chorus_neu','chorus_pos','letter_joy','letter_surprise','letter_sadness','letter_disgust',
            'letter_fear','letter_anger','letter_others','keyw_bert','keyw_yake','keyw_rank']
for comb in sub_dataframes.keys():
    # Element-wise list concatenation: YAKE keywords followed by the ranked phrases.
    sub_dataframes[comb]['key_words'] = sub_dataframes[comb]['keyw_yake'] + sub_dataframes[comb]['keyw_rank']
    # errors="ignore": 'keyw_bert' is not created by any visible cell, and a
    # strict drop raises KeyError when a listed column is absent.
    clusters_dataframes[comb] = sub_dataframes[comb].drop(columns=drop_col, errors="ignore")

## Analysis

In [None]:
def comb_from_artis(dict_df, artist):
    """List the keys of `dict_df` whose DataFrame has `artist` in its 'artist' column.

    Keys are returned in the dictionary's iteration order.
    """
    return [
        key
        for key, frame in dict_df.items()
        if artist in frame['artist'].unique()
    ]
def title_from_comb_artist(dict_df, artist, comb):
    """Return the unique titles by `artist` inside the DataFrame stored under `comb`.

    When the artist has no rows there, a message is printed and None is returned.
    """
    frame = dict_df[comb]
    artist_rows = frame[frame['artist'] == artist]

    if len(artist_rows) != 0:
        return artist_rows['title'].unique()

    print('Not Artist for comb')
    return None
def verse_from_attributes(dict_df,artist,comb,title):
    """Return the unique verse groups of one song inside the DataFrame stored under `comb`.

    Rows are selected by exact match on both 'artist' and 'title'. When
    nothing matches, a message is printed and None is returned.
    """
    frame = dict_df[comb]
    same_artist = frame['artist'] == artist
    same_title = frame['title'] == title
    selection = frame[same_artist & same_title]

    if len(selection) != 0:
        return selection['verse_group'].unique()

    print('Not verse for comb')
    return None
def random_verse_from_attributes(dict_df, artist, comb, title):
    """Pick one random verse group of a song inside the DataFrame stored under `comb`.

    Rows are selected by exact match on both 'artist' and 'title'; missing
    verse values are ignored and duplicates are collapsed before choosing.
    Relies on the standard-library `random` module being imported at file level.

    Args:
        dict_df: Mapping from combination key to DataFrame with 'artist',
            'title' and 'verse_group' columns.
        artist: Artist label to match.
        comb: Key of the DataFrame to search.
        title: Song title to match.

    Returns:
        One verse group, or None (after printing a message) when the song has
        no rows or no non-missing verses under `comb`.
    """
    df = dict_df[comb]
    df_filter = df[(df['artist'] == artist) & (df['title'] == title)]

    if df_filter.empty:
        print('No verses found for this combination.')
        return None

    verses = df_filter['verse_group'].dropna().unique()
    if len(verses) == 0:
        print('No verses available.')
        return None
    # Choose from a plain list so the result is an ordinary element regardless
    # of the array type pandas returns from unique().
    return random.choice(list(verses))
def att_from_verse(dict_df,verse):
    """Return the attributes of the first row whose 'verse_group' equals `verse`.

    Every DataFrame in `dict_df` is scanned; if several contain the verse, the
    last one scanned wins. The result is a dict of that row's values (including
    the 'index' column produced by reset_index), or an empty dict when the
    verse is not found anywhere.
    """
    found = {}
    for frame in dict_df.values():
        if verse not in frame['verse_group'].unique():
            continue
        matching_rows = frame[frame['verse_group'] == verse].reset_index()
        found = dict(matching_rows.iloc[0])
    return found
def cluster_from_verse(dict_df,verse):
    """Return the other verse groups that share a cluster with `verse`.

    Every DataFrame in `dict_df` is scanned; if several contain the verse, the
    last one scanned wins. Within that DataFrame the cluster label of the first
    matching row is used, and all rows with the same label but a different
    'verse_group' are returned, de-duplicated by 'verse_group'.

    Args:
        dict_df: Mapping from combination key to DataFrame with 'verse_group'
            and 'cluster_label' columns.
        verse: The verse group text to look up.

    Returns:
        pd.DataFrame: The cluster neighbours, or an empty DataFrame when the
        verse is not present in any DataFrame.
    """
    df_cluster = None
    for df in dict_df.values():
        matches = df[df['verse_group'] == verse]
        if matches.empty:
            continue
        cluster_verse = matches.iloc[0]['cluster_label']
        df_cluster = df[(df['cluster_label'] == cluster_verse) &
                        (df['verse_group'] != verse)]

    if df_cluster is None:
        # Verse not found: return an empty frame instead of calling
        # drop_duplicates on a frame that has no 'verse_group' column.
        return pd.DataFrame()
    return df_cluster.drop_duplicates(subset=['verse_group'])
def jaccard_similarity(set1, set2):
    """Return |set1 ∩ set2| / |set1 ∪ set2|, or 0 when both sets are empty."""
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union else 0


def jaccard_distance(set1, set2):
    """Return the Jaccard distance, defined as 1 - jaccard_similarity(set1, set2)."""
    return 1 - jaccard_similarity(set1, set2)


def top_k_jaccard(df, keywords_column, query_keywords, k=5):
    """Return the `k` rows whose keyword sets are closest to `query_keywords`.

    Args:
        df: DataFrame whose `keywords_column` holds an iterable of keywords per row.
        keywords_column: Name of the keyword column.
        query_keywords: Iterable of keywords to compare against.
        k: Maximum number of rows to return. Defaults to 5.

    Returns:
        pd.DataFrame: A copy of the closest rows with an added
        'jaccard_distance' column, sorted by ascending distance and with a
        fresh 0..n-1 index. An empty input yields an empty result.
    """
    df_copy = df.copy()

    if df.empty:
        # Nothing to rank (this also covers a frame without any columns).
        df_copy["jaccard_distance"] = pd.Series(dtype=float)
        return df_copy.reset_index(drop=True)

    query_set = set(query_keywords)
    df_copy["jaccard_distance"] = [
        jaccard_distance(query_set, set(keywords))
        for keywords in df[keywords_column]
    ]

    # Stable sort so rows with equal distance keep their original order.
    ranked = df_copy.sort_values("jaccard_distance", kind="stable")
    return ranked.head(k).reset_index(drop=True)
def top_k_cosine_from_keywords(df, keywords_column, query_keywords, k=5):
    """Return the `k` rows whose keywords are closest to the query in TF-IDF cosine distance.

    Each row's keyword list and the query keywords are joined into
    space-separated texts, vectorised together with a TF-IDF model fitted on
    those texts, and compared by cosine distance. Relies on `cosine_distances`
    (sklearn.metrics.pairwise) being imported at file level.

    Args:
        df: DataFrame whose `keywords_column` holds a list of strings per row.
        keywords_column: Name of the keyword column.
        query_keywords: List of query keyword strings.
        k: Maximum number of rows to return. Defaults to 5.

    Returns:
        pd.DataFrame: A copy of the closest rows with an added
        'cosine_distance' column, sorted by ascending distance and with a
        fresh 0..n-1 index. An empty input yields an empty result.
    """
    df_copy = df.copy()

    if df.empty:
        # Nothing to rank; the pairwise distance call rejects zero-row input.
        df_copy["cosine_distance"] = pd.Series(dtype=float)
        return df_copy.reset_index(drop=True)

    # Turn each keyword list into one text (keywords separated by spaces).
    df_keywords_text = df[keywords_column].apply(lambda x: ' '.join(x))
    query_text = ' '.join(query_keywords)

    # Fit TF-IDF on the rows plus the query so they share one vocabulary.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df_keywords_text.tolist() + [query_text])

    # The last row is the query; everything before it is the data.
    query_vector = tfidf_matrix[-1]
    data_matrix = tfidf_matrix[:-1]

    df_copy["cosine_distance"] = cosine_distances(data_matrix, query_vector).flatten()

    return df_copy.sort_values("cosine_distance").head(k).reset_index(drop=True)

In [None]:
# List the (sentiment, emotion) combinations that contain verse groups by this artist.
comb_from_artis(clusters_dataframes,'Diomedes Diaz')

In [None]:
# Artists that have at least one verse group in the ('NEU', 'fear') subset.
clusters_dataframes[('NEU', 'fear')]['artist'].unique()

In [None]:
# Titles by this artist that have verse groups in the ('NEG', 'sadness') subset.
title_from_comb_artist(clusters_dataframes,'Diomedes Diaz',('NEG', 'sadness'))

In [None]:
# End-to-end example: pick a random verse group of one song, look up its
# attributes and cluster neighbours, then rank those neighbours by keyword
# similarity (Jaccard distance and TF-IDF cosine distance).
# NOTE(review): 'Ismaie' does not match any entry of l_artists (the 'artist'
# column is filled from that list), so the lookup presumably returns None and
# the following lines fail — confirm the intended artist name.
verse_proof = random_verse_from_attributes(clusters_dataframes,'Ismaie',('NEG', 'anger'),'No Me Toquen Ese Vals')
data_verse = att_from_verse(clusters_dataframes,verse_proof)
df_cluster_verse = cluster_from_verse(clusters_dataframes,verse_proof)
df_top_jacc = top_k_jaccard(df_cluster_verse,"key_words", data_verse['key_words'], k=5)
df_top_emb = top_k_cosine_from_keywords(df_cluster_verse,"key_words", data_verse['key_words'], k=5)