In [22]:
import numpy as np
import pandas as pd
import ampligraph
import tensorflow as tf

## Dataset
We will use a set of 1000 songs collected through the Yandex Music API. The data is stored in a CSV file with the following columns:
- song
- artist
- album
- album_id
- time
- playlist
- lyrics
- genre

In [23]:
# Load the raw song table from the CSV file (path is relative to this notebook).
df = pd.read_csv('../../data/yandex.csv')

In [24]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,id,song,artist,album,album_id,time,playlist,lyrics,genre,path
0,106259,Группа крови,КИНО,Группа крови,10100,2025-01-12 19:19:14.546605,Лучшие песни русского рока,"Тёплое место, но улицы ждут\nОтпечатков наших ...",rock,
1,25949305,Дорога в облака,Браво,Дорога в облака,32851240,2025-01-12 19:19:14.962741,Лучшие песни русского рока,,rock,
2,34905249,Поворот,Машина времени,Лучшие песни 1979-1985,4741960,2025-01-12 19:19:15.703078,Лучшие песни русского рока,Мы себе давали слово - не сходить с пути прямо...,rock,
3,34084451,Моё сердце,Сплин,25-й кадр,4181557,2025-01-12 19:19:16.410526,Лучшие песни русского рока,Мы не знали друг друга до этого лета\nМы болта...,rock,
4,732394,почему,Земфира,земфира,81430,2025-01-12 19:19:17.166311,Лучшие песни русского рока,Ты стучала в дверь открытую\nЯ молчала как уби...,rock,


In [25]:
# Randomly flag roughly 80% of the rows as training rows. A generator seeded with a
# fixed value keeps the assignment identical on every "Restart & Run All".
df['train'] = np.random.default_rng(0).choice([True, False], size=df.shape[0], p=[0.8, 0.2])

### Create artist dataset
We create a set from the artist and genre columns.

In [26]:
from knowledge_graph.utils import get_artist_genre_set

# Build the artist/genre table with the project helper, write it to CSV
# (the graph generator loads this file later in the notebook), and preview it.
artists_df = get_artist_genre_set(df)
artists_df.to_csv('../../data/artists.csv', index=False)  # index=False: the row index is not written as a column
artists_df.head()

Unnamed: 0,artist,genre
0,кино,rock
1,браво,rock
2,машина_времени,rock
3,сплин,rock
4,земфира,rock


## Knowledge graph creation
We are going to create a knowledge graph from scratch based on the song information. The idea is that each song will be connected to its artist, album, genre and playlist. We will also connect each artist to the genres they play.

We will use the `Ontology` and `GraphGenerator` classes from the `knowledge_graph` module.

In [27]:
from knowledge_graph.ontologies import Ontology
from knowledge_graph.graphs import GraphGenerator


# Property name -> class given under the property's "domain" key.
# NOTE(review): every property is declared as a "DatatypeProperty" and "domain" names
# the class of the *target* entity (e.g. Artist for `by`); confirm this matches how
# the project's Ontology class interprets these keys.
_property_domains = {
    "by": "Artist",
    "inAlbum": "Album",
    "inGenre": "Genre",
    "playsGenre": "Genre",
    "inPlaylist": "Playlist",
}

ontology = Ontology(
    properties={
        property_name: {"type": "DatatypeProperty", "domain": domain_class}
        for property_name, domain_class in _property_domains.items()
    },
    classes=["Song", "Artist", "Album", "Genre", "Playlist"],
)

generator = GraphGenerator(ontology, namespace="music:")

# Song table: the "song" column plus the four song-level properties.
song_properties = ["by", "inAlbum", "inGenre", "inPlaylist"]
generator.load_dataset("../../data/yandex.csv", "song", song_properties)

# Artist table: the "artist" column plus the artist-level property.
generator.load_dataset("../../data/artists.csv", "artist", ["playsGenre"])

We can now generate the graph and save it to a file. We also save the triples that will be used to train the embeddings.

In [28]:
# Write the graph to an OWL file using the XML format.
generator.serialize("music_graph.owl", format="xml")
# Export the triples to CSV; the next cell reads this file back with pandas.
generator.save_triplets("music_triplets.csv")

In [29]:
# Reload the exported triples and preview the first rows.
triples_df = pd.read_csv("music_triplets.csv")
triples_df.head()

Unnamed: 0,subject,predicate,object
0,breezeblocks,by,altj
1,смех_в_суде,inGenre,jazz
2,мои_берега_prod_by_mal0,inPlaylist,новый_локальный_фолк
3,heartbeat,inPlaylist,блюз_лучшее
4,дядя_ваня,playsGenre,blues


In [30]:
# List the distinct relation types present in the exported triples.
triples_df["predicate"].unique()

array(['by', 'inGenre', 'inPlaylist', 'playsGenre', 'inAlbum'],
      dtype=object)

In [31]:
# Show every triple whose subject is this one song.
triples_df.loc[triples_df["subject"] == "спокойная_ночь"]

Unnamed: 0,subject,predicate,object
186,спокойная_ночь,inPlaylist,лучшие_песни_русского_рока
1252,спокойная_ночь,by,кино
3886,спокойная_ночь,inAlbum,легенда
4213,спокойная_ночь,inGenre,rock


## Training knowledge graph embeddings

In [32]:
# Export the triples a second time with as_ids=True and load that file.
# NOTE: this rebinds `triples_df`, which until now held the name-based triples.
generator.save_triplets("music_triplets_ids.csv", as_ids=True)
triples_df = pd.read_csv("music_triplets_ids.csv")

In [33]:
# Display the ID-based triples as a plain NumPy array.
triples_df.to_numpy()

array([['Song_891', 'by', 'Artist_517'],
       ['Song_392', 'inGenre', 'Genre_4'],
       ['Song_1005', 'inPlaylist', 'Playlist_20'],
       ...,
       ['Song_128', 'inGenre', 'Song_133'],
       ['Song_1049', 'by', 'Artist_760'],
       ['Song_147', 'inAlbum', 'Album_129']], dtype=object)

In [34]:
from ampligraph.evaluation import train_test_split_no_unseen

# Hold out 500 triples for validation.
# NOTE(review): going by the helper's name, held-out triples should only mention
# entities that also occur in the training part; confirm in the AmpliGraph docs.
triples_array = np.array(triples_df)
X_train, X_valid = train_test_split_no_unseen(triples_array, test_size=500)

In [35]:
# Report the shape of each split (the "Test" line refers to X_valid).
for label, split in (('Train set size: ', X_train), ('Test set size: ', X_valid)):
    print(label, split.shape)

Train set size:  (4649, 3)
Test set size:  (500, 3)


In [None]:
from ampligraph.latent_features import ScoringBasedEmbeddingModel
from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer

# ComplEx scoring model with a fixed seed.
# NOTE(review): per the AmpliGraph API, `k` is the embedding size and `eta` the number
# of negatives generated per positive triple; confirm against the installed version.
model = ScoringBasedEmbeddingModel(k=100,
                                   eta=20,
                                   scoring_type='ComplEx',
                                   seed=0)

# Optimizer, loss and regularizer definition:
# Adam with learning rate 1e-4, the 'multiclass_nll' loss, and an 'LP' regularizer
# configured with p=3 and lambda=1e-5.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
loss = get_loss('multiclass_nll')
regularizer = get_regularizer('LP', {'p': 3, 'lambda': 1e-5})

# Compilation of the model with the components defined above.
model.compile(optimizer=optimizer, loss=loss, entity_relation_regularizer=regularizer)

### Training model

In [None]:
# Size the batches so that one epoch is split into about 50 of them.
batches_per_epoch = 50
model.fit(
    X_train,
    batch_size=X_train.shape[0] // batches_per_epoch,
    epochs=300,
    verbose=True,
)

### Evaluation
The model was trained on Google Colaboratory, which produced the following results:

In [None]:
# Triples handed to the evaluation filter: both the training and the held-out split.
known_triples = {'train': X_train, 'test': X_valid}

# Rank the held-out triples, corrupting subject and object ('s,o').
ranks = model.evaluate(
    X_valid,
    use_filter=known_triples,
    corrupt_side='s,o',
    verbose=True,
)

In [None]:
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

# Compute all ranking metrics first, then print them in a fixed order.
mr, mrr = mr_score(ranks), mrr_score(ranks)
hits_10, hits_3, hits_1 = (hits_at_n_score(ranks, n=cutoff) for cutoff in (10, 3, 1))

metric_report = (
    ("MRR: %.2f", mrr),
    ("MR: %.2f", mr),
    ("Hits@10: %.2f", hits_10),
    ("Hits@3: %.2f", hits_3),
    ("Hits@1: %.2f", hits_1),
)
for template, value in metric_report:
    print(template % (value))

```
MRR: 0.55
MR: 37.44
Hits@10: 0.63
Hits@3: 0.56
Hits@1: 0.50
```

### Clustering by genre

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
from incf.countryutils import transformations
%matplotlib inline

In [None]:
# Reload both exports: the ID-based triples and the name-based triples.
# `triples_df` is rebound here to the name-based file.
triples_ids_df = pd.read_csv("music_triplets_ids.csv")
triples_df = pd.read_csv("music_triplets.csv")

In [None]:
# The mapping pairs row i of the ID export with row i of the name export, so both
# files must list the same triples in the same order. zip() would silently truncate
# or mis-pair otherwise, so fail loudly if the two frames are not aligned.
assert len(triples_ids_df) == len(triples_df), "ID and name triple files differ in length"
assert (triples_ids_df["predicate"] == triples_df["predicate"]).all(), \
    "ID and name triple files are not row-aligned"

# Entity ID -> readable name, collected from the subject and the object columns.
id_to_name_map = {
    **dict(zip(triples_ids_df["subject"], triples_df["subject"])),
    **dict(zip(triples_ids_df["object"], triples_df["object"])),
}

In [None]:
# Keep only the training triples whose subject ID contains "Song".
songs_df = pd.DataFrame(X_train, columns=["subject", "predicate", "object"])
# .copy() makes the filtered frame independent, so adding a column below does not
# write into a slice of the unfiltered frame (avoids SettingWithCopyWarning).
songs_df = songs_df[songs_df["subject"].str.contains("Song", na=False)].copy()
# Attach the readable song name for each ID.
songs_df["song"] = songs_df["subject"].map(id_to_name_map)

In [None]:
# Unique song IDs and their learned embeddings, keyed by song ID.
songs = songs_df["subject"].unique()
songs_embeddings = dict(zip(songs, model.get_embeddings(songs)))

# Project the embeddings to 2-D for plotting. random_state is fixed so the projection
# is repeatable even when scikit-learn selects a randomized SVD solver.
embeddings_2d = PCA(n_components=2, random_state=0).fit_transform(
    np.array(list(songs_embeddings.values()))
)

In [None]:
from ampligraph.discovery import find_clusters
from sklearn.cluster import KMeans

# K-means configuration: 6 clusters, 50 initialisations, fixed random state.
kmeans_options = dict(n_clusters=6, n_init=50, max_iter=500, random_state=0)
clustering_algorithm = KMeans(**kmeans_options)

# Cluster the song embeddings with the configured algorithm.
clusters = find_clusters(songs, model, clustering_algorithm, mode='e')

In [None]:
# Cluster labels as strings such as "cluster0", "cluster1", ...
cluster_labels = "cluster" + pd.Series(clusters).astype(str)

# One row per song: its ID, its two PCA coordinates and its cluster label.
plot_df = pd.DataFrame(
    {
        "songs": songs,
        "embedding1": embeddings_2d[:, 0],
        "embedding2": embeddings_2d[:, 1],
        "cluster": cluster_labels,
    }
)

In [None]:
# Song names that get a text label in the cluster plot.
rock_songs = [
    "московский_бит",
    "девушка_по_городу",
    "ты_не_один",
    "ариведерчи",
    "ленинградский_рокнролл",
    "поэзия",
    "хару_мамбуру",
    "феллини",
    "эльдорадо",
    "искала",
    "иду_курю",
]

jazz_songs = [
    "смех_в_суде",
    "the_little_swallow",
    "чувство_спокойный_работа_из_дома",
    "атмосфера_гостиная_в_отеле",
]

def plot_clusters(hue):
    """Scatter-plot the 2-D song embeddings in ``plot_df``, coloured by one column.

    Only songs whose name appears in ``rock_songs`` or ``jazz_songs`` receive a text
    label; the labels are then spread out with ``adjust_text``.

    Parameters
    ----------
    hue : str
        Name of the ``plot_df`` column used to colour the points, e.g. ``"cluster"``.

    Notes
    -----
    Reads the notebook globals ``plot_df``, ``id_to_name_map``, ``rock_songs`` and
    ``jazz_songs``. Draws a new 12x12 figure and returns nothing.
    """
    # Kept from the original cell; presumably pins any randomness used while the
    # labels are being placed.
    np.random.seed(0)

    # Set membership instead of scanning two lists for every plotted point.
    labelled_titles = set(rock_songs) | set(jazz_songs)

    # Explicit Axes object: every artist is drawn on `ax` rather than on whichever
    # axes happen to be current in pyplot's global state.
    _, ax = plt.subplots(figsize=(12, 12))
    ax.set_title("{} embeddings".format(hue).capitalize())
    sns.scatterplot(data=plot_df, x="embedding1", y="embedding2", hue=hue, ax=ax)

    texts = []
    for song_id, x, y in zip(plot_df["songs"], plot_df["embedding1"], plot_df["embedding2"]):
        title = id_to_name_map[song_id]
        if title in labelled_titles:
            # Small offset so the label does not sit on top of its marker.
            texts.append(ax.text(x + 0.02, y + 0.01, str(title)))
    adjust_text(texts, ax=ax)

In [None]:
# Colour the songs by their K-means cluster label.
plot_clusters("cluster")

![Rock songs](../../media/rock_jazz_songs.png)
