In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("spotify_millsongdata.csv")

In [5]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [6]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [7]:
df.shape

(57650, 4)

In [8]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [9]:
df = df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [10]:
df.head(10)

Unnamed: 0,artist,song,text
0,Alice Cooper,Department Of Youth,We're in trouble all the time \r\nYou read ab...
1,John McDermott,The Secret Of Christmas,It's not the glow you feel \r\nWhen snow appe...
2,Inna,Sun Is Up,All the people tonight \r\nPut your hands in ...
3,Phil Collins,You'll Be In My Heart,Come stop your crying \r\nIt will be alright ...
4,Yellowcard,Southern Air,I've watched the world go by \r\nOutside a wi...
5,Erasure,Sunday Girl,Blinded by the vision \r\nI turn and face my ...
6,Uncle Tupelo,Before I Break,On liquor I spend my last dime \r\n \r\nSund...
7,Arrogant Worms,Malcolm,Billy solves his problems by calling up his mo...
8,Katy Perry,Crocodile Tears,"[Intro] \r\nOh, when we go numb \r\nOh, when..."
9,Dolly Parton,Applejack,(Dolly Parton) \r\nHe lived by the apple orch...


In [11]:
df['text'][0]

"We're in trouble all the time  \r\nYou read about us all in the papers  \r\nWe walk around and bump into walls - a blind delegation  \r\nAnd we ain't afraid of high power  \r\nWe're bullet proof  \r\nAnd we've never heard of Eisenhower  \r\nMissile power, justice or truth  \r\n  \r\nWe're the Department of Youth  \r\nYour new Department of Youth  \r\nWe're the Department of Youth  \r\nJust me and youth  \r\n  \r\nWe talk about this whole stupid world  \r\nAnd still come out laughing  \r\nWe never make any sense  \r\nBut hell that never mattered  \r\nBut we'll make it through our blackest hour  \r\nWe're living proof  \r\nAnd we've never heard of Billy Sunday  \r\nDamon Runyon, manners or couth  \r\nWe're the Department of Youth  \r\nYour new Department of Youth  \r\nWe're the Department of Youth  \r\nJust me and youth  \r\n  \r\nWe're the Department of Youth  \r\nThe new Department of Youth  \r\nWe're the Department of Youth  \r\nWe've got the power  \r\nWe're the Department of Youth 

In [12]:
df.shape

(5000, 3)

Text Cleaning/ Text Preprocessing

In [13]:
df['text'] = df['text'].str.lower().str.replace(r'\s+', ' ', regex=True).str.strip()

In [14]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

In [15]:
stemmer = PorterStemmer()

In [16]:
def token(txt):
    token = nltk.word_tokenize(txt, preserve_line=True)
    stemming = [stemmer.stem(w) for w in token]
    return " ".join(stemming)

In [17]:
df['text'] = df['text'].apply(lambda x: token(x))

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
tfidvector = TfidfVectorizer(analyzer='word', stop_words='english')

In [20]:
matrix = tfidvector.fit_transform(df['text'])

In [21]:
similarity = cosine_similarity(matrix)

In [22]:
similarity[0]

array([1.        , 0.00772689, 0.01811737, ..., 0.00773648, 0.01416255,
       0.00992895], shape=(5000,))

In [24]:
df[df['song'] == 'Before I Break']

Unnamed: 0,artist,song,text
6,Uncle Tupelo,Before I Break,"on liquor i spend my last dime sunday morn , 8..."


Recommander Function

In [25]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [26]:
recommendation('Before I Break')

["Can't We Talk It Over?",
 'Thank You',
 'Give Thanks',
 "Can't Stop Thinking About You",
 'I Thank You',
 'Two Rights',
 'Pictures In My Head',
 "Can't Live Without You",
 'Drink New Blood',
 'I Got Drunk',
 'I Want To Spend The Night',
 "Just Can't Get Enough",
 "I Can't Breakaway",
 "You're So Fine",
 'In A Word',
 "Can't Get Enough",
 'Angel Dream',
 'A Moment Suspended In Time',
 'Think Of You',
 'Living Like A Legend']

In [27]:
import pickle
pickle.dump(similarity, open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))