In [5]:
import pandas as pd

In [6]:
df = pd.read_csv("spotify_millsongdata.csv")

In [7]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [8]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [9]:
df.shape

(57650, 4)

In [10]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [11]:
df = df.sample(5000) 
if 'link' in df.columns:  
    df = df.drop('link', axis=1)  
df = df.reset_index(drop=True) 

In [12]:
df.head(5)

Unnamed: 0,artist,song,text
0,Elton John,Ball And Chain,I got a ball and chain hanging around my heart...
1,Everclear,Nervous And Weird,You know I want to be the way you want me to ...
2,Jason Mraz,If It Kills Me,"Hello, tell me you know \r\nYeah, you figured..."
3,Uncle Tupelo,Train,"A quarter after two \r\nSittin' in my car, wa..."
4,Donna Summer,Our Love,"Dropping you this line to give you peace, \r\..."


In [13]:
df['text'][0]

"I got a ball and chain hanging around my heart  \r\nYou were the one to blame for tearing my world apart  \r\nI got a heart so true, you got a heart of ice  \r\nA little more love from you, it could have been paradise  \r\n  \r\nI got a ball and chain hanging around my heart  \r\nAnd if it's all the same maybe we should part  \r\nAnd I'm singing do do do...  \r\nI'm singing do do do...  \r\n  \r\nYou had to tie me down inside a cage of doubt  \r\nI'm sick of being kicked around so this is where I get out  \r\n  \r\nOh oui baby you tried to hold me but you were slowly  \r\nDriving me insane  \r\nOh oui baby  \r\n  \r\nI got a ball and chain hanging around my heart  \r\nYou were the one to blame for tearing my world apart  \r\nI got a heart so true, you got a heart of ice  \r\nA little more love from you, it could have been paradise  \r\n  \r\nI couldn't take your pain, you couldn't take my love  \r\nSo I'm gonna quit this game 'cause baby I've had enough\r\n\r\n"

In [14]:
df.shape

(5000, 3)

Text Cleaning/ Text Preprocessing

In [15]:
df['text'] = df['text'].str.lower().replace(r'^\s+', ' ', regex=True)

In [16]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def token(txt):
    tokenized = word_tokenize(txt) 
    stemmed = [stemmer.stem(w) for w in tokenized] 
    return " ".join(stemmed)

print(token('you are beautiful, beauty'))

you are beauti , beauti


In [17]:
df['text'].apply(lambda x: token(x))

0       i got a ball and chain hang around my heart yo...
1       you know i want to be the way you want me to b...
2       hello , tell me you know yeah , you figur me o...
3       a quarter after two sittin ' in my car , watch...
4       drop you thi line to give you peac , and to se...
                              ...                        
4995    there were night when the wind wa so cold that...
4996    [ vers 1 ] realiti suck , too much pain i ca n...
4997    here come old flat top he come groovin ' up sl...
4998    it delici the way your walk go easi on your to...
4999    say it 's alright joe say it 's alright joe , ...
Name: text, Length: 5000, dtype: object

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [20]:
similarity[0]

array([1.        , 0.00811853, 0.04564859, ..., 0.1137758 , 0.00397801,
       0.01280361])

In [25]:
matching_rows = df[df['song'] == 'Ball And Chain']

if not matching_rows.empty:
    idx = matching_rows.index[0]
    print(f"Index of the song: {idx}")
else:
    print("Song 'waiting for the man' not found in the dataset")

Index of the song: 0


In [22]:
def recommendation(song_df):
    if song_df in df['song'].values:
        idx = df[df['song'] == song_df].index[0]
        
        distances = sorted(list(enumerate(similarity[idx])), reverse=True, key=lambda x: x[1])
        
        songs = []
        
        for i in distances[1:8]:
            song = df.iloc[i[0]].song
            songs.append(song)
        
        return songs
    else:
        return f"Song '{song_df}' not found in the dataset"

In [26]:
recommendation('Ball And Chain')

['Tearing And Breaking',
 'Lose Again',
 'I Got You',
 'Anything',
 'Pain In My Heart',
 'Key 2 Your Heart',
 'You Put This Love In My Heart']

In [24]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))