In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("spotify_millsongdata.csv")

In [3]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [5]:
df.shape

(57650, 4)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57650 entries, 0 to 57649
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  57650 non-null  object
 1   song    57650 non-null  object
 2   link    57650 non-null  object
 3   text    57650 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


In [6]:
# checking any null values
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [7]:
# Checking any duplicated columns
df.duplicated().sum()

0

Removing 'link' column and taking 5000 random columns to train our model and finally reseting the index.

In [8]:
df =df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [9]:
df.head(10)

Unnamed: 0,artist,song,text
0,Doobie Brothers,Excited,"Week by week, day by day \r\nYou got me chang..."
1,Johnny Cash,I Am A Pilgrim,I am a pilgrim and a stranger \r\nTraveling t...
2,Devo,Happy Guy,Well its an open road \r\nAnd its full of lif...
3,Frankie Goes To Hollywood,Warriors Of The Wasteland,From diamond mine to the factory \r\nEverybod...
4,Diana Ross,All For One,"All for one and one for all \r\nYou live, you..."
5,Whitesnake,Dancing Girls,"I don't need a doctor, I don't need a priest, ..."
6,Elton John,Big Dipper,Now I saw you talking to a cute little slip of...
7,Justin Bieber,One Life,"So girl \r\nYou just be honest with me, I kno..."
8,Frankie Goes To Hollywood,Born To Run,In the day we sweat it out in the streets of a...
9,Ingrid Michaelson,Handsome Hands,I think I'm the only one who really knows \r\...


In [11]:
# Showing the first column after sampling
df['text'][0]

"Week by week, day by day  \r\nYou got me changin' in so many ways  \r\nThinking of you i feel the call  \r\nSomebody get a doctor  \r\nI'm starting to fall  \r\n  \r\nI get excited don't you understand  \r\nI get excited 'cause it's part of the plan  \r\n  \r\nDay by day, hour by hour  \r\nI get a shiver from your lovin' power  \r\nFire in your eyes just lets me know  \r\nAin't no way i ever let go  \r\n  \r\nI get excited don't you understand  \r\nI get excited 'cause it's part of the plan  \r\n  \r\nHot love, what you bring to me  \r\nI get excited by the novelty  \r\nHot love, like a fever you feed  \r\nBlack lace and promise is all that i need  \r\nYou bring it on  \r\n  \r\nI love it baby, i love it all  \r\nAnytime, anytime you call  \r\nDay or night as the world moves on  \r\nIt's time for action, the talkin' is done\r\n\r\n"

Now checking the shape

In [12]:
df.shape

(5000, 3)

## Text Cleaning 

In [13]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

After cleaning the text 

In [14]:
df['text']

0       week by week, day by day  \r you got me changi...
1       i am a pilgrim and a stranger  \r traveling th...
2       well its an open road  \r and its full of life...
3       from diamond mine to the factory  \r everybody...
4       all for one and one for all  \r you live, you ...
                              ...                        
4995    sweet bonnie brown, looking like a baby  \r co...
4996    it would be easier to take the wet from water ...
4997    don't get any big ideas  \r they're not gonna ...
4998    there's a danger zone, not a stranger zone  \r...
4999    [intro]  \r yeah, yeah, yeh  \r them n-ggas is...
Name: text, Length: 5000, dtype: object

## Data Preprocessing

In [15]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

Convert all the tages to token

In [16]:
def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

Explaining how it works

In [17]:
# Converted all similar words into a single word.
tokenization("You are beautiful, beauty")

'you are beauti , beauti'

#### Now applying to all of the text to convert it into Tokens. 

In [18]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

Converted into tokens

In [19]:
df['text']

0       week by week , day by day you got me changin '...
1       i am a pilgrim and a stranger travel through t...
2       well it an open road and it full of life but i...
3       from diamond mine to the factori everybodi 's ...
4       all for one and one for all you live , you giv...
                              ...                        
4995    sweet bonni brown , look like a babi come down...
4996    it would be easier to take the wet from water ...
4997    do n't get ani big idea they 're not gon na ha...
4998    there 's a danger zone , not a stranger zone t...
4999    [ intro ] yeah , yeah , yeh them n-gga is talk...
Name: text, Length: 5000, dtype: object

Convert into vectors and then finding the similarities

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')

In [22]:
matrix = tfidvector.fit_transform(df['text'])

In [23]:
matrix

<5000x17366 sparse matrix of type '<class 'numpy.float64'>'
	with 274390 stored elements in Compressed Sparse Row format>

Now finding the similarities

In [25]:
similarity = cosine_similarity(matrix)

In [26]:
# For the first song
similarity[0]

array([1.        , 0.01325554, 0.0532997 , ..., 0.01119195, 0.00229915,
       0.05300793])

## Recommendation System

In [27]:
df[df['song'] == 'One Life']

Unnamed: 0,artist,song,text
7,Justin Bieber,One Life,"so girl you just be honest with me , i know we..."


In [28]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:5]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

#### Getting the recommendations

In [29]:
recommendation('One Life')

['Numb', 'I Wanna Be Loved', 'Get Up', 'I Wanna Be']

#### Saving the files

In [21]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))