In [257]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

mySongs = pd.read_csv('./songdata.csv')
mySongs.head()


Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [258]:
mySongs.shape

(57650, 4)

The dataset is big, so for the sake of speed, I will only be using 1000 rows. Also the link is not an important feature.

In [259]:
mySongs.head()
trimSongs = mySongs.sample(n=1000).drop('link', axis=1).reset_index(drop=True)
trimSongs.shape

(1000, 3)

\n linebreaks need to be trimmed from the text column

In [260]:
trimSongs['text'] = trimSongs['text'].replace(r'\n', '', regex=True)
trimSongs.head()

Unnamed: 0,artist,song,text
0,Bread,Blue Satin Pillow,"I'd like to take you home girl, and make you m..."
1,R. Kelly,Can You Feel It,Uhn clap your hands everybody Can you feel it...
2,Kirsty Maccoll,"Please Help Me, I'm Falling","Please help me, I'm falling in love with you ..."
3,System Of A Down,Snowblind,What you get and what you see Things that don...
4,John Prine,You Got Gold,Is there ever enough space between us To keep...


In [261]:
tfidf = TfidfVectorizer(analyzer='word', stop_words='english')
lyrics_matrix = tfidf.fit_transform(trimSongs['text'])
cosine_similarities = cosine_similarity(lyrics_matrix) 

In [262]:
from collections import defaultdict
similarities = defaultdict(list)
for i in range(len(cosine_similarities)):
    similar_indices = cosine_similarities[i].argsort()[-50:][::-1] 
    songName = trimSongs['song'].iloc[i]
    for j in range(1, len(similar_indices)):
         myIndex = similar_indices[j]
         mySims = cosine_similarities[i][myIndex]
         mySimSong = trimSongs['song'][myIndex]
         mySimArtist = trimSongs['artist'][myIndex]
         similarities[songName].append([mySims, mySimSong, mySimArtist])
print(similarities[trimSongs['song'].iloc[0]])

[[0.29679028700198706, 'Black Girl', 'Lenny Kravitz'], [0.273577244738358, "I'll Never Find Someone Like You", 'Backstreet Boys'], [0.25550900072717364, 'Body Shots', 'Chris Brown'], [0.2439313094300042, 'One Less Lonely Girl', 'Justin Bieber'], [0.21788297636155862, "It's Alright", 'Kiss'], [0.21541233554295205, "Ain't No Way (You Won't Love Me)", 'Chris Brown'], [0.19918514744510543, 'What Kind Of Girl', 'Air Supply'], [0.19588278943791365, 'He Is Your Brother', 'ABBA'], [0.18256412578797437, 'Satisfy You', 'Puff Daddy'], [0.17779251638187296, 'Bad Time', 'Counting Crows'], [0.1683302174913516, 'The Girl Is On To You', 'Bette Midler'], [0.16480775927264063, "I'll Never Break Your Heart", 'Backstreet Boys'], [0.1626953269483792, 'Make Love (Royalty)', 'Chris Brown'], [0.16079440419275914, 'Almost Like Love', 'Yes'], [0.15942366854515977, 'Skateaway', 'Dire Straits'], [0.1576310623103913, "Feelin' You In Stereo", 'R. Kelly'], [0.155038180227417, 'Love Is What We Make It', 'Kenny Rogers

In [263]:
#Example song 
mySong = trimSongs['song'].iloc[100]
print(f'The song name for this song: {mySong}')
print('The 5 recommended songs for this song are: ')
testSong = similarities[mySong]
for i in range(5):
    thisSong = testSong[i] 
    print(f"{i+1}: ")
    print(f"Similarity score: {thisSong[0]}, song name: {thisSong[1]}, song artist: {thisSong[2]}")
    print("-------------------------------------")




The song name for this song: Billy's Bones
The 5 recommended songs for this song are: 
1: 
Similarity score: 0.09785781071548152, song name: Good Old Days, song artist: P!nk
-------------------------------------
2: 
Similarity score: 0.08982070613195875, song name: Thursday, song artist: The Weeknd
-------------------------------------
3: 
Similarity score: 0.08509083195996053, song name: A Boy Named Sue, song artist: Johnny Cash
-------------------------------------
4: 
Similarity score: 0.07843325206152275, song name: A Sight For Sore Eyes, song artist: Tom Waits
-------------------------------------
5: 
Similarity score: 0.07706483110874492, song name: Sunday, Monday Or Always, song artist: Bing Crosby
-------------------------------------


Like this, we can find similar songs based on the lyric content sepcifically being similar using cosine similarity.