This notebook is for initial testing of, and experimentation with, the recommendation system. The final, refined code is split into `preprocessing.py` and `recommender.py`.

Installing all needed libraries

In [None]:
%pip install numpy
%pip install pandas
%pip install sklearn
%pip install seaborn

Importing the installed libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sb

Reading and preprocessing the music data 

In [None]:
# Build the working song table in a single chain so that every step produces a
# new frame: no in-place edits of a filtered frame (which can trigger
# SettingWithCopyWarning) and the cell gives the same result on every re-run.
data = (
    pd.read_csv('spotify_songs.csv')
    # keep English-language songs only
    .loc[lambda songs: songs['language'] == 'en']
    .drop(columns=['language', 'playlist_name', 'playlist_id'])
    # one row per (track, artist) pair
    .drop_duplicates(subset=['track_name', 'track_artist'])
    # `infer_datetime_format` is deprecated since pandas 2.0 (format inference
    # is the default there), so it is no longer passed
    .assign(track_album_release_date=lambda songs: pd.to_datetime(songs['track_album_release_date']))
    # chronological order is what get_closest_n relies on
    .sort_values(by=['track_album_release_date'])
    # after this, a row's index label equals its position
    .reset_index(drop=True)
)
print(data.columns)
songs_count = data.shape[0]
print(songs_count)

Sectioning off data for recommendation subsystems

In [None]:
# Per-subsystem inputs, all taken from the preprocessed `data` table:
# the lyrics text, three "energy" columns and three "mood" columns.
lyrics_data = data.loc[:, 'lyrics']
energy_data = data.loc[:, ['danceability', 'tempo', 'acousticness']]
mood_data = data.loc[:, ['mode', 'key', 'valence']]

Using TF-IDF vectors and cosine similarity to make lyrics comparable

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Turn every song's lyrics into a TF-IDF vector (English stop words removed)
# and compare all songs pairwise with cosine similarity.
lyric_vectorizer = TfidfVectorizer(stop_words='english')
# The vectors get their own name instead of overwriting `lyrics_data`, so this
# cell can be re-run without first re-running the cell that defines the raw
# lyrics. Missing lyrics are replaced by an empty string because the
# vectorizer cannot process NaN documents.
lyrics_tfidf = lyric_vectorizer.fit_transform(lyrics_data.fillna(''))
lyric_similarity_matrix = cosine_similarity(lyrics_tfidf)

Using Euclidean distance to make energy and mood comparable

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

def _min_max_scale(features):
    """Rescale each column of `features` linearly onto the range [0, 1]."""
    lowest = features.min()
    # A constant column has zero spread; dividing by 1 instead maps it to all
    # zeros rather than producing NaN.
    spread = (features.max() - lowest).replace(0, 1)
    return (features - lowest) / spread


# Euclidean distance adds up squared differences per column, so a column with
# a wide numeric range would outweigh the others. Scaling first lets every
# feature contribute comparably.
# NOTE(review): assumes the columns follow the usual Spotify audio-feature
# ranges (tempo in BPM, key 0-11, the rest within [0, 1]) - confirm with
# `energy_data.describe()` / `mood_data.describe()`.
energy_difference_matrix = euclidean_distances(_min_max_scale(energy_data))
mood_difference_matrix = euclidean_distances(_min_max_scale(mood_data))

Utility functions

In [None]:
def sort_by_popularity(songs, descending=True):
    """Order `songs` by their `track_popularity` column.

    Args:
        songs: DataFrame with a `track_popularity` column.
        descending: when truthy, most popular songs come first; otherwise
            least popular first.

    Returns:
        A new DataFrame with the rows reordered.
    """
    least_popular_first = songs.sort_values(by=['track_popularity'])
    if not descending:
        return least_popular_first
    # The descending order is the ascending result read backwards.
    return least_popular_first.iloc[::-1]


def get_similar(track_index, count, comparison_matrix, select_smallest):
    """Return the `count` songs that rank closest to one song.

    Args:
        track_index: position of the reference song; selects the row of
            `comparison_matrix` and is removed from the candidates.
        count: maximum number of songs to return.
        comparison_matrix: pairwise song-by-song matrix.
        select_smallest: True picks the smallest values of the row (suits a
            distance matrix), False picks the largest (suits a similarity
            matrix).

    Returns:
        An independent copy of the selected rows of the global `data`.
    """
    ranking = np.argsort(np.asarray(comparison_matrix[track_index]))
    # A song must never be recommended for itself.
    ranking = ranking[ranking != track_index]
    if not select_smallest:
        ranking = ranking[::-1]
    return data.iloc[ranking[:count]].copy()


def songs_as_dict(songs, include_fields):
    """Convert `songs` to a dict keyed by index label.

    Only the columns named in `include_fields` are kept; each value is a
    `{column: value}` dict for one row.
    """
    selected_columns = songs.loc[:, include_fields]
    return selected_columns.to_dict(orient='index')


def get_closest_n(track_index, count):
    """Return the `count` rows of `data` positioned nearest to `track_index`.

    A window of `count + 1` consecutive rows is placed around the track
    (`count // 2` rows before it, the remainder after it), shifted inwards
    when it would run past either end of the table, and the track itself is
    then removed. Unlike the split-in-half approach this also yields exactly
    `count` rows for odd `count`.

    Args:
        track_index: position of the reference song in the global `data`.
        count: number of neighbouring songs wanted.

    Returns:
        A DataFrame with up to `count` rows (fewer only if `data` itself has
        fewer than `count + 1` rows).

    Raises:
        KeyError: if `track_index` is not a valid position in `data`.
    """
    total = len(data)
    if not 0 <= track_index < total:
        raise KeyError(track_index)
    # Clamp the window start so the window stays inside [0, total).
    start = max(0, min(track_index - count // 2, total - (count + 1)))
    window = data.iloc[start:start + count + 1]
    # Remove the reference track by its position inside the window, so the
    # result does not depend on index labels matching positions.
    return window.drop(window.index[track_index - start])
    

Getters for recommendation subsystems

In [None]:
def get_by_same_artist(track_index, count):
    """Return up to `count` other songs by the artist of the given song.

    Args:
        track_index: position of the reference song in the global `data`.
        count: maximum number of songs to return.

    Returns:
        An independent copy of the matching rows (reference song excluded),
        in the order they appear in `data`.
    """
    artist = data['track_artist'].iloc[track_index]
    by_artist = data[data['track_artist'] == artist]
    # Look the label up from the position so both refer to the same row even
    # if the index is not a plain 0..n-1 range.
    others = by_artist.drop(data.index[track_index])
    # Copy, like get_similar does, so callers can add columns to the result
    # without touching (or getting chained-assignment warnings about) `data`.
    return others.head(count).copy()


def get_lyrically_similar(track_index, count):
    """Return the `count` songs with the highest lyric similarity to a song.

    Rows of `lyric_similarity_matrix` hold similarity scores, so the largest
    values are selected.
    """
    return get_similar(
        track_index,
        count,
        comparison_matrix=lyric_similarity_matrix,
        select_smallest=False,
    )


def get_energy_similar(track_index, count):
    """Return the `count` songs whose energy features are closest to a song.

    Rows of `energy_difference_matrix` hold distances, so the smallest values
    are selected.
    """
    return get_similar(
        track_index,
        count,
        comparison_matrix=energy_difference_matrix,
        select_smallest=True,
    )


def get_mood_similar(track_index, count):
    """Return the `count` songs whose mood features are closest to a song.

    Rows of `mood_difference_matrix` hold distances, so the smallest values
    are selected.
    """
    return get_similar(
        track_index,
        count,
        comparison_matrix=mood_difference_matrix,
        select_smallest=True,
    )


def get_random(count, random_state=None):
    """Return randomly chosen songs from the global `data`.

    Args:
        count: number of songs wanted. If the table holds fewer songs, all of
            them are returned instead of raising, matching the other getters,
            which also return fewer rows when not enough candidates exist.
        random_state: optional seed (or random generator) forwarded to
            `DataFrame.sample`; pass a fixed value for reproducible picks.

    Returns:
        A DataFrame with the sampled rows.
    """
    sample_size = min(count, len(data))
    return data.sample(n=sample_size, random_state=random_state)


def get_released_around_same_time(track_index, count):
    """Return the `count` songs positioned nearest to a song in `data`.

    `data` is sorted by album release date, so positional neighbours are the
    songs released closest in time.
    """
    return get_closest_n(track_index=track_index, count=count)

Recommendation subsystems

In [None]:
def _label_and_rank(songs, recommendation_type, prioritisePopular):
    """Tag `songs` with the subsystem name and order them by popularity.

    `assign` builds a new frame, so the frame handed over by the getter is
    never modified; this avoids chained-assignment warnings when a getter
    returns a slice of `data`.
    """
    labelled = songs.assign(recommendation_type=recommendation_type)
    return sort_by_popularity(labelled, prioritisePopular)


def recommend_by_same_artist(track_index, count, prioritisePopular):
    """Recommend up to `count` other songs by the same artist.

    Rows carry `recommendation_type == 'by same artist'` and are ordered most
    popular first when `prioritisePopular` is truthy, least popular first
    otherwise.
    """
    return _label_and_rank(get_by_same_artist(track_index, count), 'by same artist', prioritisePopular)


def recommend_lyrically_similar(track_index, count, prioritisePopular):
    """Recommend `count` songs with similar lyrics.

    Rows carry `recommendation_type == 'lyrically similar'`; ordering follows
    `prioritisePopular` as in the other recommenders.
    """
    return _label_and_rank(get_lyrically_similar(track_index, count), 'lyrically similar', prioritisePopular)


def recommend_energy_similar(track_index, count, prioritisePopular):
    """Recommend `count` songs with similar energy features.

    Rows carry `recommendation_type == 'similar energy'`; ordering follows
    `prioritisePopular` as in the other recommenders.
    """
    return _label_and_rank(get_energy_similar(track_index, count), 'similar energy', prioritisePopular)


def recommend_mood_similar(track_index, count, prioritisePopular):
    """Recommend `count` songs with similar mood features.

    Rows carry `recommendation_type == 'similar mood'`; ordering follows
    `prioritisePopular` as in the other recommenders.
    """
    return _label_and_rank(get_mood_similar(track_index, count), 'similar mood', prioritisePopular)


def recommend_released_around_same_time(track_index, count, prioritisePopular):
    """Recommend `count` songs released around the same time.

    Rows carry `recommendation_type == 'released around same time'`; ordering
    follows `prioritisePopular` as in the other recommenders.
    """
    return _label_and_rank(
        get_released_around_same_time(track_index, count),
        'released around same time',
        prioritisePopular,
    )


def recommend_random(count, prioritisePopular):
    """Recommend `count` randomly chosen songs.

    Rows carry `recommendation_type == 'random'`; ordering follows
    `prioritisePopular` as in the other recommenders.
    """
    return _label_and_rank(get_random(count), 'random', prioritisePopular)

Hybrid recommendation system

In [None]:
def hybrid_recommend(track_index, count=6, prioritisePopular=True):
    """Combine the recommendations of all subsystems for one song.

    Args:
        track_index: position (and, since `data` has a reset index, label) of
            the song to recommend for.
        count: number of songs requested from each subsystem.
        prioritisePopular: forwarded to every subsystem to control the
            popularity ordering of its results.

    Returns:
        A dict keyed by the song's index in `data`; each value holds
        `track_name`, `track_artist` and `recommendation_type`. Every song
        appears once, labelled with the first subsystem that suggested it, and
        the reference song itself is never included.
    """
    # The list order is the priority order used when several subsystems pick
    # the same song: the earliest entry keeps the song.
    subsystem_results = [
        recommend_by_same_artist(track_index, count, prioritisePopular),
        recommend_lyrically_similar(track_index, count, prioritisePopular),
        recommend_energy_similar(track_index, count, prioritisePopular),
        recommend_mood_similar(track_index, count, prioritisePopular),
        recommend_random(count, prioritisePopular),
        recommend_released_around_same_time(track_index, count, prioritisePopular),
    ]
    all_recommendations = pd.concat(subsystem_results)
    # Deduplicate on the index (the song's identity). Comparing whole rows is
    # not enough: the same song coming from two subsystems differs in
    # `recommendation_type`, survives, and the resulting duplicate index label
    # makes `to_dict(orient='index')` raise ValueError.
    all_recommendations = all_recommendations[~all_recommendations.index.duplicated(keep='first')]
    # The random subsystem does not know the reference song and may pick it.
    all_recommendations = all_recommendations.drop(index=track_index, errors='ignore')
    return songs_as_dict(all_recommendations, include_fields=['track_name', 'track_artist', 'recommendation_type'])

# Example run: combined recommendations for the song at position 4982 of `data`,
# using the default count and popularity ordering.
hybrid_recommend(track_index=4982)

In [None]:
# Scratch cell for ad-hoc testing / visualization of the song table.
# Displays every song whose artist is exactly 'Taylor Swift'.
data.loc[data['track_artist'].eq('Taylor Swift')]

# Other one-off checks, kept disabled:
# data.iloc[2274]
# sb.displot(data=data, x='track_album_release_date')
# data[data['track_artist'] == 'Queen'].head(30)
# sonic_data.describe()   # NOTE(review): `sonic_data` is not defined in the cells shown here
# data['track_artist'].value_counts()