In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [2]:
df = pd.read_csv("spotify_millsongdata.csv")

In [3]:
df.shape

(57650, 4)

In [4]:
df.describe()

Unnamed: 0,artist,song,link,text
count,57650,57650,57650,57650
unique,643,44824,57650,57494
top,Donna Summer,Have Yourself A Merry Little Christmas,/a/abba/ahes+my+kind+of+girl_20598417.html,I just came back from a lovely trip along the ...
freq,191,35,1,6


In [14]:
df[df.song=='Careless Whisper']

Unnamed: 0,artist,song,link,text
33754,George Michael,Careless Whisper,/g/george+michael/careless+whisper_20059228.html,I feel so unsure \r\nAs I take your hand and ...
56084,Wham!,Careless Whisper,/w/wham/careless+whisper_20146061.html,I feel so unsure \r\nAs I take your hand and ...


In [11]:
df=df.sample(5000).drop('link',axis='columns')
df

Unnamed: 0,artist,song,text
14334,Noa,Drive My Car,Asked a girl what she wanted to be \r\nShe to...
22263,Young Jeezy,Last Of A Dying Breed,[Verse 1 - Young Jeezy] \r\nWelcome to the li...
57121,Yello,Dr. Van Steiner,I can't separate your fiction from fact \r\nY...
6027,Fiona Apple,Jonathan,"Jonathan, call again \r\nTake me to Coney Isl..."
6351,Frank Sinatra,Barbara,"Where there is sunrise, there is Barbara, only..."
...,...,...,...
36785,Indigo Girls,Come A Long Way,I've come a long way \r\nI was a show on ice ...
11135,Lady Gaga,Starstruck,Groove. slam. work it back. filter that. baby ...
35272,Hank Williams Jr.,Games People Play,Oh the games people play now \r\nEvery night ...
46944,Opeth,Under The Weeping Moon,Once again I've cried \r\nUnto the moon \r\n...


text preproccesing

In [12]:
df['text']=df['text'].str.lower().replace(r'\w\s',' ').replace(r'\n',' ',regex=True)
df

Unnamed: 0,artist,song,text
14334,Noa,Drive My Car,asked a girl what she wanted to be \r she tol...
22263,Young Jeezy,Last Of A Dying Breed,[verse 1 - young jeezy] \r welcome to the lif...
57121,Yello,Dr. Van Steiner,i can't separate your fiction from fact \r yo...
6027,Fiona Apple,Jonathan,"jonathan, call again \r take me to coney isla..."
6351,Frank Sinatra,Barbara,"where there is sunrise, there is barbara, only..."
...,...,...,...
36785,Indigo Girls,Come A Long Way,i've come a long way \r i was a show on ice ...
11135,Lady Gaga,Starstruck,groove. slam. work it back. filter that. baby ...
35272,Hank Williams Jr.,Games People Play,oh the games people play now \r every night a...
46944,Opeth,Under The Weeping Moon,once again i've cried \r unto the moon \r th...


In [13]:
import nltk 
from nltk.stem.porter import PorterStemmer

In [14]:
stemmer = PorterStemmer()

In [15]:
def token(txt):
    token = nltk.word_tokenize(txt)
    a=[stemmer.stem(w) for w in token]
    return ' '.join(a)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
df['text']=df['text'].apply(token)

In [18]:
tfid=TfidfVectorizer(analyzer='word',stop_words='english')

In [19]:
matrix=tfid.fit_transform(df['text'])

In [20]:
similar = cosine_similarity(matrix)

In [21]:
similar[1]

array([0.08059529, 1.        , 0.00975392, ..., 0.113097  , 0.03925294,
       0.05764855])

In [22]:
df.reset_index(drop=True, inplace=True)
k = len(df)  # Ensure k matches the number of rows in df
df.index = range(1, k + 1)

In [23]:
df[df['song']=='Let There Be Love'].index[0]

2317

Recommender function

In [24]:
import numpy as np

def recommender(song_name):
    if song_name not in df['song'].values:
        return "Song not found in the database."
    
    idx = df[df['song'] == song_name].index[0]
    
    if idx >= len(similar):
        return "Index out of bounds in similarity matrix."
    
    # Ensure similar[idx] is a 1D array or list
    if isinstance(similar[idx], (np.ndarray, list)):
        similar_row = np.array(similar[idx]).flatten()  # Flatten to 1D if it's not already 1D
        distance = sorted(list(enumerate(similar_row)), reverse=True, key=lambda x: x[1])
    else:
        return "similar[idx] is not a valid list or array."
    
    song = []
    
    for s_id in distance[1:5]:  # Skipping the first element, which is the same song
        try:
            song.append(df.iloc[s_id[0]].song)
        except IndexError as e:
            return f"IndexError: {e} - Invalid index in distance list"
    
    return song


In [25]:
recommender("Let There Be Love")

["All I've Got To Do",
 'We Gotta Get Out Of Here',
 'Sugarcane',
 'You Better Ask']

In [26]:
import pickle

In [27]:
pickle.dump(similar,open("similarity","wb"))

In [28]:
pickle.dump(df,open("df","wb"))