In [13]:
import numpy as np
import pandas as pd

In [14]:
# loading the data from the csv file to apandas dataframe
songs_data = pd.read_csv('Top Hits Spotify from 2000-2019.csv')

In [15]:
# printing the first 5 rows of the dataframe
songs_data.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age,index,index.1
0,14675,frankie j,how to deal,2003,pop,choose doesnt wanna live life wife pursue life...,77,0.000835,0.000835,0.242045,...,0.744395,0.697408,0.058634,0.0,0.640354,0.49548,obscene,0.242857,5068,107
1,14676,blu cantrell,breathe,2003,pop,hood lookin good course kors purse choo boot t...,149,0.000462,0.302297,0.000462,...,0.799632,0.815963,0.048192,0.0,0.462077,0.663653,violence,0.242857,5069,108
2,14677,the jayhawks,save it for a rainy day,2003,pop,pretty little hairdo disguise live look like t...,61,0.001224,0.192401,0.321653,...,0.624174,0.823526,0.267068,0.0,0.703215,0.763756,world/life,0.242857,5070,109
3,14682,brand new,guernica,2003,pop,young word word worry ones phone receive body ...,63,0.001012,0.321647,0.068682,...,0.493122,0.75207,0.112449,5e-06,0.382729,0.91291,violence,0.242857,5071,110
4,14688,american hi-fi,the art of losing,2003,pop,outta time single stand underdog modern world ...,96,0.000721,0.148926,0.000721,...,0.403228,0.850114,0.001726,0.000463,0.669209,0.904902,obscene,0.242857,5072,111


In [16]:
# number of rows and columns in the data frame
songs_data.shape

(26553, 33)

In [17]:
songs_data = songs_data[['artist_name','track_name','genre','lyrics','topic','index.1']]

In [18]:
songs_data.head()

Unnamed: 0,artist_name,track_name,genre,lyrics,topic,index.1
0,frankie j,how to deal,pop,choose doesnt wanna live life wife pursue life...,obscene,107
1,blu cantrell,breathe,pop,hood lookin good course kors purse choo boot t...,violence,108
2,the jayhawks,save it for a rainy day,pop,pretty little hairdo disguise live look like t...,world/life,109
3,brand new,guernica,pop,young word word worry ones phone receive body ...,violence,110
4,american hi-fi,the art of losing,pop,outta time single stand underdog modern world ...,obscene,111


In [19]:
# selecting the relevant features for recommendation

selected_features = ['artist_name','track_name','genre','lyrics','topic','index.1']
print(selected_features)

['artist_name', 'track_name', 'genre', 'lyrics', 'topic', 'index.1']


In [20]:
# replacing the null valuess with null string

for feature in selected_features:
  songs_data[feature] = songs_data[feature].fillna('')

In [21]:
# Missing Value?
# if yes; songs_data.dropna(inplace=True)
songs_data.isnull().sum()

artist_name    0
track_name     0
genre          0
lyrics         0
topic          0
index.1        0
dtype: int64

In [22]:
# Check duplicated data rows? 
songs_data.duplicated().sum()

0

In [23]:
songs_data.drop_duplicates(inplace=True)


In [24]:
songs_data.duplicated().sum()

0

In [25]:
songs_data.head()

Unnamed: 0,artist_name,track_name,genre,lyrics,topic,index.1
0,frankie j,how to deal,pop,choose doesnt wanna live life wife pursue life...,obscene,107
1,blu cantrell,breathe,pop,hood lookin good course kors purse choo boot t...,violence,108
2,the jayhawks,save it for a rainy day,pop,pretty little hairdo disguise live look like t...,world/life,109
3,brand new,guernica,pop,young word word worry ones phone receive body ...,violence,110
4,american hi-fi,the art of losing,pop,outta time single stand underdog modern world ...,obscene,111


In [26]:
# combining all the 4 selected features

songs_data['total'] = songs_data['artist_name'] + songs_data['genre'] + songs_data['lyrics'] + songs_data['topic']

In [27]:
songs_data = songs_data.drop(columns=['artist_name','genre','lyrics','topic'])
songs_data.head()

Unnamed: 0,track_name,index.1,total
0,how to deal,107,frankie jpopchoose doesnt wanna live life wife...
1,breathe,108,blu cantrellpophood lookin good course kors pu...
2,save it for a rainy day,109,the jayhawkspoppretty little hairdo disguise l...
3,guernica,110,brand newpopyoung word word worry ones phone r...
4,the art of losing,111,american hi-fipopoutta time single stand under...


In [28]:
pip install nltk


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [29]:
import nltk
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')

nltk.data.path.append('/path/to/nltk_data')
from nltk.stem import PorterStemmer

In [30]:
ps = PorterStemmer()

In [31]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [32]:
songs_data['total'] = songs_data['total'].apply(stem)

In [33]:
songs_data.head()

Unnamed: 0,track_name,index.1,total
0,how to deal,107,franki jpopchoos doesnt wanna live life wife p...
1,breathe,108,blu cantrellpophood lookin good cours kor purs...
2,save it for a rainy day,109,the jayhawkspoppretti littl hairdo disguis liv...
3,guernica,110,brand newpopyoung word word worri one phone re...
4,the art of losing,111,american hi-fipopoutta time singl stand underd...


In [34]:
# converting the text data to feature vectors
from sklearn.feature_extraction.text import TfidfVectorizer


In [35]:
feature_vectors = TfidfVectorizer().fit_transform(songs_data['total'])

In [36]:
print(feature_vectors)

  (0, 37181)	0.12372833287924322
  (0, 67810)	0.10386573956913253
  (0, 25181)	0.040598537688119544
  (0, 26208)	0.05477455134982689
  (0, 59480)	0.0431477026923744
  (0, 37125)	0.14299892144925494
  (0, 58234)	0.07646648543908756
  (0, 39429)	0.1124184594600895
  (0, 23506)	0.11613492320752562
  (0, 24774)	0.04759485426237633
  (0, 62869)	0.03441989330909279
  (0, 45707)	0.048125782559104116
  (0, 37484)	0.09787191506841088
  (0, 12603)	0.02798259914491791
  (0, 28633)	0.07105205743413706
  (0, 56303)	0.06336169684507487
  (0, 41166)	0.048609910370387706
  (0, 30860)	0.09141315956328908
  (0, 27636)	0.03256094840069763
  (0, 34486)	0.09713485036273328
  (0, 33992)	0.05104567179258961
  (0, 30159)	0.05209213125613699
  (0, 54260)	0.03862964513129505
  (0, 58293)	0.06819564617031423
  (0, 27648)	0.1355655713213182
  :	:
  (26543, 3721)	0.6678677088474166
  (26543, 26319)	0.5730333298306336
  (26543, 36286)	0.474958447005755
  (26544, 26320)	1.0
  (26545, 63159)	0.7230621438222928
  (265

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
# getting the similarity scores using cosine similarity

similarity = cosine_similarity(feature_vectors)

In [39]:
similarity

array([[1.        , 0.01623531, 0.00672317, ..., 0.        , 0.        ,
        0.        ],
       [0.01623531, 1.        , 0.00217149, ..., 0.        , 0.        ,
        0.        ],
       [0.00672317, 0.00217149, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [40]:
print(similarity.shape)

(26553, 26553)


### Recommendation

In [30]:
#def recommend(song):
 #   index=songs_data[songs_data['total']==song].index[0]
  #  distances = sorted(list(enumerate(similarity[index])), reverse =True, key = lambda x:x[1])
    
   # for i in distances[1:6]:
    #    print(songs_data.iloc[i[0]].title)  
    

In [41]:
#recommend("hello")

In [42]:
import difflib


In [43]:
# getting the song name from the user
#song_name = input(' Enter your favourite song name : ')

 Enter your favourite song name : no one


In [44]:
# creating a list with all the songs names given in the dataset

#list_of_all_titles = songs_data['track_name'].tolist()
#print(list_of_all_titles)



In [45]:
# finding the close match for the song name given by the user

#find_close_match = difflib.get_close_matches(song_name, list_of_all_titles)
#print(find_close_match)

['no one', 'no phone', 'no money']


In [46]:
#close_match = find_close_match[0]
#print(close_match)

no one


In [47]:
# finding the index of the song with title

#index_of_the_song = songs_data[songs_data.track_name == close_match]['index.1'].values[0]
#print(index_of_the_song)

10303


In [48]:
song_name = input(' Enter your favourite song name : ')

list_of_all_titles = songs_data['track_name'].tolist()

find_close_match = difflib.get_close_matches(song_name, list_of_all_titles)

close_match = find_close_match[0]

index_of_the_song = songs_data[songs_data.track_name == close_match]['index.1'].values[0]

similarity_score = list(enumerate(similarity[index_of_the_song]))

sorted_similar_songs = sorted(similarity_score, key = lambda x:x[1], reverse = True) 

print('Songs suggested for you : \n')

i = 1

for song in sorted_similar_songs:
  index = song[0]
  title_from_index = songs_data[songs_data.index==index]['track_name'].values[0]
  if (i<10):
    print(i, '.',title_from_index)
    i+=1

 Enter your favourite song name : no one
Songs suggested for you : 

1 . lipstick, powder & paint
2 . lipstick
3 . portrait of a fool
4 . lipstick
5 . ain't worth the powder
6 . redefine
7 . barnacles
8 . infinite regression
9 . life


In [49]:
import pickle

In [51]:
pickle.dump(songs_data, open('C:/Users/fz.hannou/Desktop/Project IA_Music_Recommender_System/model/songs_list.pkl','wb'))

In [52]:
pickle.dump(similarity, open('C:/Users/fz.hannou/Desktop/Project IA_Music_Recommender_System/model/similarity.pkl','wb'))