In [16]:
import pandas as pd
import numpy as np
import sqlite3

from sklearn.metrics.pairwise import euclidean_distances
from sklearn.decomposition import PCA
from sklearn.cluster import AffinityPropagation

In [17]:
# Load the NLP annotation table and keep the rows used for the lyric analysis.
nlp_file_path = "CroLyrics_data/nlp_all.csv"
df = pd.read_csv(nlp_file_path)

# Universal POS tags that are kept; every other word type is discarded.
kept_upos = ["ADV", "ADJ", "NOUN", "VERB", "PROPN", "DET"]

df_main_words = (
    df.loc[df["upos"].isin(kept_upos)]
    .dropna(subset=["lemma"])          # rows without a lemma cannot be counted
    .reset_index(drop=True)
    # The artist code is whatever precedes the first "_" in Song_ID.
    .assign(Artist_ID=lambda words: words["Song_ID"].str.split("_").map(lambda parts: parts[0]))
)


In [18]:
def get_artist_name(artist_code):
    """Look up an artist's name by code in ``CroLyrics_data/info.db``.

    Parameters
    ----------
    artist_code
        Value compared against the ``code`` column of the ``artists`` table.

    Returns
    -------
    tuple or None
        The first matching row as a one-element tuple ``(name,)``, or ``None``
        when no row has that code.
    """
    conn = sqlite3.connect('CroLyrics_data/info.db')
    try:
        cursor = conn.execute("SELECT name FROM artists WHERE code=:code", {'code': artist_code})
        return cursor.fetchone()
    finally:
        # This function is called once per artist; close explicitly so each call
        # does not leave an open database handle behind.
        conn.close()

In [19]:
# Keep only the first row of every (lemma, artist) pair, so a lemma counts once per artist.
repeated_within_artist = df_main_words.duplicated(subset=["lemma", "Artist_ID"])
df_unique_lemmas_artist = df_main_words[~repeated_within_artist]

# Keep only the first row of every (lemma, song) pair, so a lemma counts once per song.
repeated_within_song = df_main_words.duplicated(subset=["lemma", "Song_ID"])
df_unique_lemmas_song = df_main_words[~repeated_within_song]

In [20]:
# Build the artist x lemma TF-IDF matrix.

# Vocabulary thresholds: a lemma is kept only if it is used by at least
# MIN_ARTISTS_PER_LEMMA different artists and appears in at least
# MIN_SONGS_PER_LEMMA different songs.
MIN_ARTISTS_PER_LEMMA = 5
MIN_SONGS_PER_LEMMA = 10

# Number of artists using each lemma.
lemma_frequencies_unique_artist = df_unique_lemmas_artist.lemma.value_counts()
lemma_frequencies_unique_artist = lemma_frequencies_unique_artist[
    lemma_frequencies_unique_artist >= MIN_ARTISTS_PER_LEMMA
]

# Number of songs containing each lemma (the document frequency).
lemma_frequencies_unique_songs = df_unique_lemmas_song.lemma.value_counts()
lemma_frequencies_unique_songs = lemma_frequencies_unique_songs[
    lemma_frequencies_unique_songs >= MIN_SONGS_PER_LEMMA
]

# Apply the artist threshold to the song-level counts as well.
lemma_frequencies_unique_songs = lemma_frequencies_unique_songs[
    lemma_frequencies_unique_songs.index.isin(lemma_frequencies_unique_artist.index)
]

num_songs = len(df.Song_ID.unique())
artist_ids = df_main_words.Artist_ID.unique()

# One Series of TF-IDF weights per artist. The frame is assembled once after the
# loop instead of being enlarged cell by cell with `.loc`, which re-allocates the
# frame on every new row/column and relies on setting-with-enlargement.
tf_idf_rows = {}
for artist in artist_ids:
    # In how many of this artist's songs each lemma occurs, restricted to the vocabulary.
    lemma_freq_artist = df_unique_lemmas_song[df_unique_lemmas_song.Artist_ID == artist].lemma.value_counts()
    lemma_freq_artist = lemma_freq_artist[lemma_freq_artist.index.isin(lemma_frequencies_unique_songs.index)]
    # Normalise to shares so artists with many songs are comparable to artists with few.
    lemma_freq_artist = lemma_freq_artist / lemma_freq_artist.sum()
    tf = np.log(1 + lemma_freq_artist)
    idf = np.log(num_songs / lemma_frequencies_unique_songs[lemma_freq_artist.index])
    tf_idf_rows[artist] = tf * idf

# Columns in order of first appearance and rows in artist_ids order; lemmas an
# artist never uses are left as NaN. An artist with no vocabulary lemma at all
# gets an all-NaN row.
lemma_order = list(dict.fromkeys(lemma for row in tf_idf_rows.values() for lemma in row.index))
df_tf_idf = pd.DataFrame(tf_idf_rows).T.reindex(index=list(tf_idf_rows), columns=lemma_order)

In [21]:
# Treat zeros as missing so that lemma columns with no weight for any artist
# (all zero or never filled) are dropped, then turn the remaining gaps into 0.
df_tf_idf = df_tf_idf.replace(0, np.nan).dropna(axis=1, how="all")
df_tf_idf = df_tf_idf.fillna(0)

# Replace artist codes with artist names. get_artist_name returns None when the
# code has no match; such labels are kept unchanged instead of raising
# TypeError on `None[0]`. This also covers re-running the cell, when the index
# already holds names rather than codes.
artist_labels = []
for artist_code in df_tf_idf.index:
    name_row = get_artist_name(artist_code)
    artist_labels.append(name_row[0] if name_row is not None else artist_code)
df_tf_idf.index = artist_labels

df_tf_idf.head(5)

Unnamed: 0,sav,moj,taj,samo,ljubav,znati,tvoj,srce,život,mnogo,...,skupo,pokazivati,složiti,zuriti,princeza,bečar,Drava,snaša,ekran,bevanda
Mišo Kovač,0.008447,0.00997,0.010059,0.012108,0.013058,0.010256,0.01125,0.01209,0.014433,0.014166,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Prljavo Kazalište,0.009671,0.008467,0.01151,0.009388,0.008164,0.01167,0.007684,0.006869,0.003102,0.006222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Crvena Jabuka,0.007101,0.008783,0.009396,0.01041,0.008657,0.009901,0.011152,0.00788,0.006757,0.010162,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Gibonni,0.007195,0.009719,0.012554,0.00704,0.009893,0.012622,0.011688,0.00738,0.003637,0.008654,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zlatko Pejaković,0.009942,0.010681,0.01126,0.010419,0.009432,0.009562,0.008912,0.012881,0.015437,0.008517,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Project the TF-IDF vectors onto 10 principal components.
# The "clusters"/"cluster_name" columns that later cells add to df_tf_idf are
# excluded here, so the cell can be re-run after clustering without manually
# dropping them first. On a fresh run nothing is dropped.
tf_idf_features = df_tf_idf.drop(columns=["clusters", "cluster_name"], errors="ignore")

pca = PCA(n_components=10, random_state=42)
tf_idf_pca = pca.fit_transform(tf_idf_features)

# Share of the total variance retained by the 10 components.
print(np.sum(pca.explained_variance_ratio_))

print(pca.singular_values_)
# `PCA.n_features_` is no longer available in recent scikit-learn releases;
# the width of the fitted matrix is the same number and works on every version.
print(pca.n_samples_, tf_idf_features.shape[1])


0.47409481114206226
[0.1866561  0.15584943 0.14914961 0.12615655 0.12217117 0.11957741
 0.11123976 0.1072669  0.10067015 0.09819336]
52 2106


In [23]:
# Sanity check of the matrix size: (rows = artists, columns = lemma features).
df_tf_idf.shape

(52, 2106)

In [24]:

# Fit affinity propagation on the PCA-reduced artist vectors.
clustering = AffinityPropagation(random_state=42)
clustering.fit(tf_idf_pca)

# predict() is given a C-ordered copy of the same matrix that was used for fitting.
tf_idf_pca = tf_idf_pca.copy(order='C')
clusters = clustering.predict(tf_idf_pca)

# Row positions in tf_idf_pca of the exemplar chosen for each cluster.
cluster_centers_indices = clustering.cluster_centers_indices_


In [25]:
# Store each artist's cluster label next to its TF-IDF features.
# NOTE: from here on df_tf_idf has a non-feature column named "clusters".
df_tf_idf["clusters"] = clusters


In [26]:
# Number of artists per cluster label; value_counts lists the largest cluster first.
cluster_counts = df_tf_idf["clusters"].value_counts()

# Print every cluster label followed by the artists assigned to it.
for cluster_label in cluster_counts.index:
    print("/////////")
    print(cluster_label)
    members = df_tf_idf.index[df_tf_idf["clusters"] == cluster_label]
    for member in members:
        print(member)

/////////
11
Crvena Jabuka
Zlatko Pejaković
Leteći Odred
Severina
Danijela Martinović
Novi Fosili
Plavi Orkestar
Vlado Kalember
Jasna Zlokić
Jole
Željko Bebek
Luka Nižetić
Jasmin Stavros
Magazin
Ivan Zak
Gazde
/////////
2
Gibonni
Colonia
Jacques Houdek
Josipa Lisac
Tony Cetinski
Nina Badrić
Vanna
Boris Novković
Divlje Jagode
/////////
1
Mišo Kovač
Doris Dragović
Petar Grašo
Oliver Dragojević
Tereza Kesovija
Goran Karan
/////////
5
Thompson
Miroslav Škoro
Slavonske Lole
Najbolji Hrvatski Tamburaši
Mate Bulić
Krunoslav Kićo Slabinac
/////////
0
Prljavo Kazalište
Parni Valjak
Zabranjeno Pušenje
Bijelo Dugme
Hladno Pivo
/////////
7
Haustor
Darko Rundek
/////////
9
Mladen Grdović
Vinko Coce
/////////
3
Maja Šuput
/////////
4
Aerodrom
/////////
6
Baruni
/////////
8
Psihomodo Pop
/////////
10
Maja Šuput & EnJoy
/////////
12
Dalmatino


In [27]:
# Human-readable names for the clusters. Every label 0..len(cluster_counts)-1
# starts out unnamed (NaN); only the hand-picked labels below receive a name.
cluster_names_dict = dict.fromkeys(range(len(cluster_counts)), np.nan)
cluster_names_dict.update({
    11: "Rujanfest",
    2: "Urbano",
    1: "Dalmacija",
    5: "Domovina",
    0: "Cro-rock",
})


In [28]:
# Translate every cluster label into its name; a label missing from the dict raises KeyError.
df_tf_idf["cluster_name"] = df_tf_idf["clusters"].map(cluster_names_dict.__getitem__)

In [29]:
# Merge undersized clusters into the nearest cluster that is being kept.
#
# For every cluster with fewer than MIN_CLUSTER_SIZE members, the Euclidean
# distance (in PCA space) from its exemplar to the exemplar of each kept cluster
# is computed, and all of its members are moved to the closest one.
#
# Only kept clusters are candidates. When every exemplar is a candidate, an
# undersized cluster can be merged into another undersized cluster: the artist
# then either inherits that cluster's cluster_name (NaN for clusters that were
# never named) or, if that cluster was processed earlier, ends up wherever it
# was sent, which makes the result depend on processing order.
#
# cluster_counts must still describe the clustering before any reassignment;
# if it has been recomputed afterwards there is nothing undersized left and
# this cell does nothing.
MIN_CLUSTER_SIZE = 3

small_labels = [label for label, size in cluster_counts.items() if size < MIN_CLUSTER_SIZE]
kept_labels = [label for label, size in cluster_counts.items() if size >= MIN_CLUSTER_SIZE]
# A cluster label is used as the position of its exemplar in cluster_centers_indices.
kept_center_rows = [cluster_centers_indices[label] for label in kept_labels]

for small_label in small_labels:
    if not kept_center_rows:
        break  # no cluster is large enough to absorb the others

    # The distances depend only on the cluster, so they are computed once per
    # cluster rather than once per artist.
    exemplar = tf_idf_pca[cluster_centers_indices[small_label]].reshape(1, -1)
    distances = euclidean_distances(exemplar, tf_idf_pca[kept_center_rows])[0]
    nearest_row = kept_center_rows[int(np.argmin(distances))]

    # Exemplars of kept clusters are never reassigned by this loop, so their
    # rows still carry the label and name of the cluster they represent.
    target_label = df_tf_idf["clusters"].iloc[nearest_row]
    target_name = df_tf_idf["cluster_name"].iloc[nearest_row]

    members = df_tf_idf.index[df_tf_idf["clusters"] == small_label]
    for artist_name in members:
        print(artist_name)
        print(target_name)
        print("////")

    ##reassign to new cluster
    df_tf_idf.loc[members, "clusters"] = target_label
    df_tf_idf.loc[members, "cluster_name"] = target_name


Haustor
Cro-rock
////
Darko Rundek
Cro-rock
////
Mladen Grdović
Dalmacija
////
Vinko Coce
Dalmacija
////
Maja Šuput
Urbano
////
Aerodrom
Urbano
////
Baruni
Rujanfest
////
Psihomodo Pop
Cro-rock
////
Maja Šuput & EnJoy
Rujanfest
////
Dalmatino
Dalmacija
////


In [30]:
# Recount the clusters now that the undersized ones have been merged away.
cluster_counts = df_tf_idf["clusters"].value_counts()

# Print every remaining cluster under its name, followed by its artists.
for cluster_label in cluster_counts.index:
    print("/////////")
    print(cluster_names_dict[cluster_label])
    print("/////////")
    members = df_tf_idf.index[df_tf_idf["clusters"] == cluster_label]
    for member in members:
        print(member)
    print("/////////")


/////////
Rujanfest
/////////
Crvena Jabuka
Zlatko Pejaković
Leteći Odred
Severina
Danijela Martinović
Novi Fosili
Plavi Orkestar
Baruni
Vlado Kalember
Jasna Zlokić
Jole
Željko Bebek
Luka Nižetić
Jasmin Stavros
Maja Šuput & EnJoy
Magazin
Ivan Zak
Gazde
/////////
/////////
Urbano
/////////
Gibonni
Colonia
Maja Šuput
Aerodrom
Jacques Houdek
Josipa Lisac
Tony Cetinski
Nina Badrić
Vanna
Boris Novković
Divlje Jagode
/////////
/////////
Dalmacija
/////////
Mišo Kovač
Doris Dragović
Petar Grašo
Oliver Dragojević
Tereza Kesovija
Mladen Grdović
Vinko Coce
Goran Karan
Dalmatino
/////////
/////////
Cro-rock
/////////
Prljavo Kazalište
Parni Valjak
Zabranjeno Pušenje
Haustor
Bijelo Dugme
Psihomodo Pop
Darko Rundek
Hladno Pivo
/////////
/////////
Domovina
/////////
Thompson
Miroslav Škoro
Slavonske Lole
Najbolji Hrvatski Tamburaši
Mate Bulić
Krunoslav Kićo Slabinac
/////////
