In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('spotify_songs.csv')

In [3]:
df

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...
...,...,...,...,...
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57650 entries, 0 to 57649
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  57650 non-null  object
 1   song    57650 non-null  object
 2   link    57650 non-null  object
 3   text    57650 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


In [5]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [6]:
df.duplicated().sum()

0

In [7]:
df = df.drop('link', axis = 1).reset_index(drop=True)

In [8]:
df = df.sample(10000, random_state=42).reset_index(drop=True)

In [9]:
df.shape

(10000, 3)

In [10]:
df['text']

0       Like to have you 'round  \r\nWith all the lies...
1       This Little Light of Mine (Light of Mine),  \r...
2       She says she's no good with words but I'm wors...
3       Hey mama, mama, come a look at sister,  \r\nSh...
4       I see it all through my window it seems.  \r\n...
                              ...                        
9995    You know I'm hotblooded, baby...  \r\n  \r\nGe...
9996    You're the end of the rainbow, my pot of gold ...
9997    Artist: Raw Theme Lyrics  \r\nSong: Across The...
9998    I wonder who's sleeping in your sheets tonight...
9999    When you're driving down the highway at night ...
Name: text, Length: 10000, dtype: object

## Text Preprocessing

In [11]:
df['text'].str.lower().replace(r'^\w\s',' ').replace(r'\n', ' ', regex = True)

0       like to have you 'round  \r with all the lies ...
1       this little light of mine (light of mine),  \r...
2       she says she's no good with words but i'm wors...
3       hey mama, mama, come a look at sister,  \r she...
4       i see it all through my window it seems.  \r n...
                              ...                        
9995    you know i'm hotblooded, baby...  \r   \r get ...
9996    you're the end of the rainbow, my pot of gold ...
9997    artist: raw theme lyrics  \r song: across the ...
9998    i wonder who's sleeping in your sheets tonight...
9999    when you're driving down the highway at night ...
Name: text, Length: 10000, dtype: object

In [12]:
import nltk
from nltk.stem.porter import PorterStemmer

In [13]:
nltk.download('punkt')
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sahga\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
def token(txt):
    token = nltk.word_tokenize(txt)
    a = [stemmer.stem(w) for w in token]
    return " ".join(a)

In [15]:
token("you are beautiful, beauty")

'you are beauti , beauti'

In [16]:
df['text'] = df['text'].apply(lambda x: token(x))

In [17]:
df.head()

Unnamed: 0,artist,song,text
0,Wishbone Ash,Right Or Wrong,like to have you 'round with all the lie that ...
1,Aerosmith,This Little Light Of Mine,"thi littl light of mine ( light of mine ) , I ..."
2,Fall Out Boy,"Dance, Dance",she say she 's no good with word but I 'm wors...
3,Janis Joplin,Easy Rider,"hey mama , mama , come a look at sister , she ..."
4,Moody Blues,Peak Hour,I see it all through my window it seem . never...


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [19]:
vector = TfidfVectorizer(analyzer = 'word', stop_words='english')

In [20]:
matrix = vector.fit_transform(df['text'])

In [21]:
similar = cosine_similarity(matrix)

In [22]:
similar[0]

array([1.        , 0.        , 0.03540184, ..., 0.00999171, 0.05348343,
       0.04432875])

In [23]:
 df[df['song']=='Waiting For The Man'].index[0]

1593

In [24]:
def recommender(song_name):
    index = df[df['song']==song_name].index[0]
    distance = sorted(list(enumerate(similar[index])),reverse = True, key = lambda x:x[1])
    song=[]
    for s_id in distance[1:6]:
        song.append(df.iloc[s_id[0]].song)
    return song

In [25]:
recommender("Waiting For The Man")

['The Wait',
 'All My Life',
 'Waiting For Girl Like You',
 "I'm Still Waiting",
 'Hate']

In [26]:
import pickle

In [27]:
pickle.dump(similar,open("musics_similarity.pkl","wb"))

In [29]:
pickle.dump(df,open("df_musics.pkl","wb"))