In [42]:
import pandas as pd

In [43]:
df = pd.read_csv("spotify_millsongdata.csv") # Import the dataset

In [44]:
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [45]:
df.shape # Cheack the size of dataset 

(57650, 4)

In [46]:
df.isnull().sum()  # check if columns are empty or not 

artist    0
song      0
link      0
text      0
dtype: int64

In [47]:
df =df.sample(20000).drop('link', axis=1).reset_index(drop=True) # Rmove the links we dont want tham 

In [48]:
df.head(10)

Unnamed: 0,artist,song,text
0,Diana Ross,I've Got A Crush On You,How glad the many millions \r\nOf Toms and Di...
1,Ziggy Marley,Hand To Mouth,Power to the workers \r\nMore power \r\nPowe...
2,Widespread Panic,Expiration Day,I'm a machinist at the Springfield Armory \r\...
3,Bette Midler,Make Yourself Comfortable,I've got some records here \r\nTo put you in ...
4,Frankie Laine,Mam'selle,It was Montmartre \r\nIt was midnight Come to...
5,Scorpions,Polar Nights,"Down, down \r\nThat's bringing me down \r\nT..."
6,Bette Midler,Let It Snow! Let It Snow! Let It Snow!,"Oh the weather outside is frightful, \r\nBut ..."
7,Counting Crows,Carriage,If anything it should have been a better thing...
8,Cher,It Gets Me Where I Want To Go,It gets me where I want to go \r\n \r\nIt ge...
9,Elton John,Hercules,Ooh I got a busted wing and a hornet sting \r...


In [49]:
df['text'][0]

"How glad the many millions  \r\nOf Toms and Dicks and Williams  \r\nWould be  \r\nTo capture me  \r\n  \r\nBut you had such persistence  \r\nYou wore down my resistance  \r\nI fell  \r\nAnd it was swell  \r\n  \r\nYou're my big and brave and handsome Romeo  \r\nHow I won you I will never never know  \r\nIt's not that you're attractive  \r\nBut oh my heart grew active  \r\nWhen you came into view  \r\n  \r\nI've got a crush on you, sweetie pie  \r\nAll the day and nighttime, hear me sigh  \r\nI never had the least notion  \r\nThat I could fall with so much emotion  \r\nTaken from  \r\nCould you coo  \r\nCould you care  \r\nFor a cunning cottage we could share  \r\nThe world will pardon my mush  \r\nCause I've got a crush, my baby, on you  \r\n  \r\nCould you coo  \r\nCould you care  \r\nFor a cunning cottage we could share  \r\nThe world will pardon my mush  \r\nCause I have got a crush,  \r\nMy baby, on you\r\n\r\n"

In [50]:
df.shape

(20000, 3)

In [51]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True) #Text cleaning and removing /r and text 

In [52]:
df.tail() #Show last 5 rows

Unnamed: 0,artist,song,text
19995,Modern Talking,It's Your Smile,"i cry the whole night, just for you \r my tea..."
19996,Ingrid Michaelson,Be OK,"i just want to be okay, be okay, be okay \r i..."
19997,Randy Travis,Highway Junkie,"\r a hundred cups of coffee, five hundred ci..."
19998,Ian Hunter,Big Time,(ian hunter) \r one more town on the merry-go...
19999,Pogues,Love You 'till The End,i just want to see you \r when you're all alo...


In [53]:
# Tokenization 
import nltk  
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\surya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [54]:
stemmer = PorterStemmer()

In [55]:
def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [56]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [59]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [60]:
similarity[0]

array([1.        , 0.01483385, 0.00679951, ..., 0.02063201, 0.03535127,
       0.0191452 ])

In [64]:
df[df['song'] == 'Highway Junkie']

Unnamed: 0,artist,song,text
19997,Randy Travis,Highway Junkie,"a hundr cup of coffe , five hundr cigarett a t..."


## RECOMMENDER FUNCTION 

In [71]:
def recommendation(song_df):
    # Check if the song exists in the DataFrame
    if song_df not in df['song'].values:
        return "Song not found in the dataset"
    
    # Get the index of the song
    idx = df[df['song'] == song_df].index[0]
    
    # Check if the index is valid
    if idx >= len(similarity):
        return "Invalid index"
    
    # Sort similarities and get top 20 similar songs
    distances = sorted(enumerate(similarity[idx]), reverse=True, key=lambda x: x[1])
    songs = [df.iloc[m_id[0]].song for m_id in distances[1:21]]
    
    return songs


In [72]:
recommendation('Highway Junkie')

['Road Beneath My Wheels',
 'Heading Out To The Highway',
 'Give Me The Highway',
 'Highway To Hell',
 'Rub Me Raw',
 'Junkie',
 'Burn Out',
 'Radio',
 "Highway Don't Care",
 'Key To The Highway',
 "I've Got A Name",
 'Nothing Like A Hundred Miles',
 "I'm Gonna Be A Wheel Someday",
 'American Beauty',
 "I Won't Cry For You",
 'Key To The Highway',
 'Wheel Of Fortune',
 'Real Thing',
 "It's My Life",
 'On The Line']

In [74]:
# storing the file
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))