In [1]:
from dash import Dash, html, dcc, Input, Output, dash_table
import dash
import pandas as pd
import plotly.express as px

import pandas as pd
from os import path
import numpy as np
import matplotlib.pyplot as plt
import sqlite3

from sklearn.metrics.pairwise import euclidean_distances
from sklearn.decomposition import PCA
from sklearn.cluster import AffinityPropagation


In [2]:
# Load the token-level annotations and keep only the content-bearing tokens.
nlp_file_path = "CroLyrics_data/nlp_all.csv"
df = pd.read_csv(nlp_file_path)

# Keep only these word types, discard tokens without a lemma, and renumber rows.
df_main_words = (
    df.loc[df["upos"].isin(["ADV", "ADJ", "NOUN", "VERB", "PROPN", "DET"])]
    .dropna(subset=["lemma"])
    .reset_index(drop=True)
)

# Artist_ID is the part of Song_ID that precedes the first underscore.
df_main_words["Artist_ID"] = df_main_words["Song_ID"].str.split("_").map(lambda parts: parts[0])


In [3]:
def get_artist_name(artist_code, db_path='CroLyrics_data/info.db'):
    """Look up an artist's name by its code in the SQLite info database.

    Parameters
    ----------
    artist_code : str
        Value matched against the ``code`` column of the ``artists`` table.
    db_path : str, optional
        Path of the SQLite database file to query. Defaults to
        ``'CroLyrics_data/info.db'``.

    Returns
    -------
    tuple or None
        The first matching row as a one-element tuple ``(name,)``, or ``None``
        when no row has that code.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Named placeholder keeps the lookup parameterized.
        row = conn.execute(
            "SELECT name FROM artists WHERE code=:code", {'code': artist_code}
        ).fetchone()
    finally:
        # Always release the connection; this function is called once per
        # printed artist, so leaked handles would pile up.
        conn.close()
    return row

In [4]:
# First occurrence of every (lemma, artist) pair: one row per lemma an artist uses.
df_unique_lemmas_artist = df_main_words.loc[
    ~df_main_words.duplicated(subset=["lemma", "Artist_ID"])
]
# First occurrence of every (lemma, song) pair: one row per lemma a song contains.
df_unique_lemmas_song = df_main_words.loc[
    ~df_main_words.duplicated(subset=["lemma", "Song_ID"])
]

In [5]:
# Build the artist x lemma TF-IDF table.

# Number of distinct artists using each lemma; keep lemmas used by more than 4 artists.
artist_counts = df_unique_lemmas_artist["lemma"].value_counts()
lemma_frequencies_unique_artist = artist_counts[artist_counts > 4]

# Number of distinct songs containing each lemma; keep lemmas found in more than
# 9 songs that also passed the artist filter above.
song_counts = df_unique_lemmas_song["lemma"].value_counts()
keep_lemma = (song_counts > 9) & song_counts.index.isin(lemma_frequencies_unique_artist.index)
lemma_frequencies_unique_songs = song_counts[keep_lemma]

# The song total is taken from the unfiltered token table.
num_songs = len(df["Song_ID"].unique())
artist_ids = df_main_words["Artist_ID"].unique()

# One row is appended per artist; columns appear as new lemmas are encountered.
df_tf_idf = pd.DataFrame()
for artist in artist_ids:
    artist_rows = df_unique_lemmas_song.loc[df_unique_lemmas_song["Artist_ID"] == artist]
    # Songs of this artist that contain each retained lemma ...
    lemma_freq_artist = artist_rows["lemma"].value_counts()
    lemma_freq_artist = lemma_freq_artist[
        lemma_freq_artist.index.isin(lemma_frequencies_unique_songs.index)
    ]
    # ... normalised so the artist's values sum to 1.
    lemma_freq_artist = lemma_freq_artist / lemma_freq_artist.sum()

    tf = np.log(1 + lemma_freq_artist)
    idf = np.log(num_songs / lemma_frequencies_unique_songs[lemma_freq_artist.index])
    df_tf_idf.loc[artist, lemma_freq_artist.index] = tf * idf

In [6]:
# Zeros are turned into NaN so that columns without any non-zero value can be
# dropped; the remaining gaps are then filled back with 0.
df_tf_idf = (
    df_tf_idf
    .replace(0, np.nan)
    .dropna(axis="columns", how="all")
    .fillna(0)
)
df_tf_idf

Unnamed: 0,sav,moj,taj,samo,ljubav,znati,tvoj,srce,život,mnogo,...,skupo,pokazivati,složiti,zuriti,princeza,bečar,Drava,snaša,ekran,bevanda
zT3Xu5sD,0.008447,0.00997,0.010059,0.012108,0.013058,0.010256,0.01125,0.01209,0.014433,0.014166,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Xp4QBL1e,0.009671,0.008467,0.01151,0.009388,0.008164,0.01167,0.007684,0.006869,0.003102,0.006222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fv0Kbouf,0.007101,0.008783,0.009396,0.01041,0.008657,0.009901,0.011152,0.00788,0.006757,0.010162,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dPHel+/u,0.007195,0.009719,0.012554,0.00704,0.009893,0.012622,0.011688,0.00738,0.003637,0.008654,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wNGUjqQ7,0.009942,0.010681,0.01126,0.010419,0.009432,0.009562,0.008912,0.012881,0.015437,0.008517,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
b1GNJHQO,0.008616,0.007939,0.010854,0.014224,0.014614,0.009362,0.005342,0.012635,0.010073,0.01313,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538Exp8s,0.008091,0.00999,0.009861,0.01097,0.011655,0.009601,0.011246,0.011128,0.006814,0.008788,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RH/jD1hv,0.009734,0.012656,0.009032,0.012718,0.013081,0.009699,0.013347,0.015223,0.010109,0.008827,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Wm5tG4b5,0.006464,0.009588,0.009451,0.008712,0.009112,0.00774,0.007001,0.00897,0.012129,0.00551,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42+RWAHa,0.008734,0.010429,0.010884,0.011426,0.013088,0.009548,0.011911,0.016017,0.014824,0.007458,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Project the artist x lemma TF-IDF matrix onto its first 10 principal components.

# A "clusters" label column is written into df_tf_idf further down the notebook.
# Excluding it here keeps this cell safe to re-run: cluster labels are never fed
# back into the PCA as if they were a lemma feature.
tf_idf_features = df_tf_idf.drop(columns=["clusters"], errors="ignore")

# With the default svd_solver="auto", scikit-learn may choose the randomized solver
# for a matrix of this shape, so the seed is fixed to make the projection (and the
# clustering built on it) reproducible.
pca = PCA(n_components=10, random_state=5)
tf_idf_pca = pca.fit_transform(tf_idf_features)

# Share of the total variance retained by the 10 components.
print(np.sum(pca.explained_variance_ratio_))

print(pca.singular_values_)
# components_ has shape (n_components, n_features); this replaces the
# `pca.n_features_` attribute, which is no longer available in recent scikit-learn.
print(pca.n_samples_, pca.components_.shape[1])


0.47409752718686415
[0.18665609 0.15584944 0.14914888 0.12615037 0.12216796 0.11957515
 0.11121202 0.10727459 0.10066612 0.09824126]
52 2106


In [8]:
# Sanity check: (number of artist rows, number of lemma columns) of the TF-IDF table.
df_tf_idf.shape

(52, 2106)

In [9]:

# Cluster the artists in PCA space with affinity propagation (fixed seed).
clustering = AffinityPropagation(random_state=5)
clustering.fit(tf_idf_pca)

# Row positions (into tf_idf_pca) of the exemplar chosen for each cluster.
cluster_centers_indices = clustering.cluster_centers_indices_

# predict() is handed a C-ordered copy of the projection
# (presumably it requires C-contiguous input -- TODO confirm).
tf_idf_pca = tf_idf_pca.copy(order='C')
clusters = clustering.predict(tf_idf_pca)


In [10]:
# Store each artist's cluster label next to its TF-IDF values. From here on
# df_tf_idf holds one non-feature column, so any code that treats the frame as a
# pure feature matrix has to exclude "clusters".
df_tf_idf["clusters"] = clusters


In [11]:
# List the members of every cluster, in value_counts order (largest cluster first).
cluster_counts = df_tf_idf["clusters"].value_counts()
for cluster_label in cluster_counts.index:
    print("/////////")
    print(cluster_label)
    member_codes = df_tf_idf.loc[df_tf_idf["clusters"] == cluster_label].index
    for artist_code in member_codes:
        print(get_artist_name(artist_code))

/////////
11
('Crvena Jabuka',)
('Zlatko Pejaković',)
('Leteći Odred',)
('Severina',)
('Danijela Martinović',)
('Novi Fosili',)
('Plavi Orkestar',)
('Vlado Kalember',)
('Jasna Zlokić',)
('Jole',)
('Željko Bebek',)
('Luka Nižetić',)
('Jasmin Stavros',)
('Magazin',)
('Ivan Zak',)
('Gazde',)
/////////
2
('Gibonni',)
('Colonia',)
('Jacques Houdek',)
('Josipa Lisac',)
('Tony Cetinski',)
('Nina Badrić',)
('Vanna',)
('Boris Novković',)
('Divlje Jagode',)
/////////
1
('Mišo Kovač',)
('Doris Dragović',)
('Petar Grašo',)
('Oliver Dragojević',)
('Tereza Kesovija',)
('Goran Karan',)
/////////
5
('Thompson',)
('Miroslav Škoro',)
('Slavonske Lole',)
('Najbolji Hrvatski Tamburaši',)
('Mate Bulić',)
('Krunoslav Kićo Slabinac',)
/////////
0
('Prljavo Kazalište',)
('Parni Valjak',)
('Zabranjeno Pušenje',)
('Bijelo Dugme',)
('Hladno Pivo',)
/////////
7
('Haustor',)
('Darko Rundek',)
/////////
9
('Mladen Grdović',)
('Vinko Coce',)
/////////
3
('Maja Šuput',)
/////////
4
('Aerodrom',)
/////////
6
('Baruni'

In [12]:
## Merge every undersized cluster (fewer than `min_cluster_size` members) into the
## cluster whose representative is closest to the undersized cluster's own representative.
min_cluster_size = 3

# A cluster label is a position in cluster_centers_indices, which in turn holds the
# row position of that cluster's representative in tf_idf_pca / df_tf_idf.
small_labels = [
    label
    for label, size in zip(cluster_counts.index, cluster_counts.values)
    if size < min_cluster_size
]
# Only clusters that are kept may receive members. Otherwise a tiny cluster could be
# merged into another tiny cluster and the outcome would depend on processing order.
kept_labels = np.array(
    [
        label
        for label, size in zip(cluster_counts.index, cluster_counts.values)
        if size >= min_cluster_size
    ],
    dtype=int,
)

for small_label in small_labels:
    if kept_labels.size == 0:
        # No cluster is large enough to merge into; leave the labels untouched.
        break

    # The distances depend only on the cluster, so compute them once per cluster
    # rather than once per member.
    own_center = tf_idf_pca[cluster_centers_indices[small_label]].reshape(1, -1)
    kept_centers = tf_idf_pca[cluster_centers_indices[kept_labels]]
    distances = euclidean_distances(own_center, kept_centers)[0]

    # Plain int label, so the "clusters" column keeps its integer dtype.
    nearest_label = int(kept_labels[np.argmin(distances)])
    nearest_center_code = df_tf_idf.index[cluster_centers_indices[nearest_label]]

    artist_codes = df_tf_idf.loc[df_tf_idf["clusters"] == small_label].index
    for artist_code in artist_codes:
        print(get_artist_name(artist_code))
        print(get_artist_name(nearest_center_code))
        print("////")

        ## reassign to new cluster
        df_tf_idf.loc[artist_code, "clusters"] = nearest_label


('Haustor',)
('Prljavo Kazalište',)
////
('Darko Rundek',)
('Prljavo Kazalište',)
////
('Mladen Grdović',)
('Oliver Dragojević',)
////
('Vinko Coce',)
('Oliver Dragojević',)
////
('Maja Šuput',)
('Colonia',)
////
('Aerodrom',)
('Colonia',)
////
('Baruni',)
('Magazin',)
////
('Psihomodo Pop',)
('Prljavo Kazalište',)
////
('Maja Šuput & EnJoy',)
('Magazin',)
////
('Dalmatino',)
('Oliver Dragojević',)
////


In [13]:
# After the merge: list every remaining cluster with its representative first,
# followed by all of its members.
cluster_counts = df_tf_idf["clusters"].value_counts()
separator = "/////////"
for cluster_label in cluster_counts.index:
    print(separator)
    print("Representative of the cluster")
    representative_code = df_tf_idf.index[cluster_centers_indices[cluster_label]]
    print(get_artist_name(representative_code))
    print(separator)
    member_codes = df_tf_idf.loc[df_tf_idf["clusters"] == cluster_label].index
    for artist_code in member_codes:
        print(get_artist_name(artist_code))
    print(separator)


/////////
Representative of the cluster
('Magazin',)
/////////
('Crvena Jabuka',)
('Zlatko Pejaković',)
('Leteći Odred',)
('Severina',)
('Danijela Martinović',)
('Novi Fosili',)
('Plavi Orkestar',)
('Baruni',)
('Vlado Kalember',)
('Jasna Zlokić',)
('Jole',)
('Željko Bebek',)
('Luka Nižetić',)
('Jasmin Stavros',)
('Maja Šuput & EnJoy',)
('Magazin',)
('Ivan Zak',)
('Gazde',)
/////////
/////////
Representative of the cluster
('Colonia',)
/////////
('Gibonni',)
('Colonia',)
('Maja Šuput',)
('Aerodrom',)
('Jacques Houdek',)
('Josipa Lisac',)
('Tony Cetinski',)
('Nina Badrić',)
('Vanna',)
('Boris Novković',)
('Divlje Jagode',)
/////////
/////////
Representative of the cluster
('Oliver Dragojević',)
/////////
('Mišo Kovač',)
('Doris Dragović',)
('Petar Grašo',)
('Oliver Dragojević',)
('Tereza Kesovija',)
('Mladen Grdović',)
('Vinko Coce',)
('Goran Karan',)
('Dalmatino',)
/////////
/////////
Representative of the cluster
('Prljavo Kazalište',)
/////////
('Prljavo Kazalište',)
('Parni Valjak',)