In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('songdata.csv')
df.head(3)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...


In [3]:
df.shape

(57650, 4)

In [4]:
df = df.sample(n=5000).drop('link', axis=1).reset_index(drop=True)

In [5]:
df.shape

(5000, 3)

In [6]:
df['text'] = df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

In [7]:
df['text'][0]

"i am stopping by to tell you   i'll be leaving here for good,   to go back to where i came from,   like you always said i should.   i would be a liar if i said that i was fine,   'cause you opened up my head   and you lost some of my mind.      i've been a fool.   i've been full of you.   i've been lying here so long i don't know who i'm lying to.   but i'm set on fire   in spite of you.   i will burn another bridge and hope the river burns up too.   when i'm gone,   you'll be gone too.      our hands are stained the color of the sky above our heads,   with things we haven't done   and words we haven't said.   still the pain will come as some surprise.   when you burn a bridge the smoke gets in your eyes.      no, i've been a fool.   i've been full of you.   i've been lying here so long i don't know who i'm lying to.   but i'm set on fire   in spite of you.   i will burn another bridge and hope the river burns up too.   when i'm gone,   you'll be gone too.      the ashes blow away,   

In [8]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [15]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [18]:
similarity[0]

array([1.        , 0.17020727, 0.14450938, ..., 0.05107118, 0.034936  ,
       0.00731841])

In [24]:
df['song'][0]

'Bridge Burning'

In [35]:
# df[df['song']==''].index[0]

# recommedation function

In [30]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [31]:
recommendation('Alma Mater')

['Learn To Say Goodbye',
 'Girl Goodbye',
 'Goodbye Stranger',
 "I'll Never Say Goodbye",
 'Ggodbye Angel',
 'Sitting In The Window Of My Room',
 "I'll Be Back Someday",
 'High School Musical',
 'Gone Crazy',
 'Everybody Knows That You Are Insane',
 "That's My Impression",
 'Lately',
 "Everybody's Crazy",
 'Looking For You',
 'Miss The Lights',
 'Miss Me Blind',
 'Learning How To Smile',
 'A Goodbye Joke',
 'Kiss And Say Goodbye',
 'Good Night']

In [37]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))

In [34]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sanaya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True