In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
df = pd.read_csv("data_w_genres.csv")

In [7]:
df.columns

Index(['artists', 'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'valence', 'popularity', 'key', 'mode', 'count', 'genres'],
      dtype='object')

In [8]:
df_genres = df[['artists', 'popularity', 'genres']]
df_genres

Unnamed: 0,artists,popularity,genres
0,"""Cats"" 1981 Original London Cast",38.000000,['show tunes']
1,"""Cats"" 1983 Broadway Cast",33.076923,[]
2,"""Fiddler On The Roof” Motion Picture Chorus",34.285714,[]
3,"""Fiddler On The Roof” Motion Picture Orchestra",34.444444,[]
4,"""Joseph And The Amazing Technicolor Dreamcoat""...",42.555556,[]
...,...,...,...
27616,鳳飛飛,35.000000,"['c-pop', 'classic mandopop', 'vintage chinese..."
27617,黃品源,47.000000,"['c-pop', 'classic cantopop', 'classic mandopop']"
27618,黃國隆,20.000000,[]
27619,黃蜀娟,23.000000,[]


In [9]:
df_genres['genres'] = df_genres['genres'].apply(lambda x: x[1:-1].split(', '))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genres['genres'] = df_genres['genres'].apply(lambda x: x[1:-1].split(', '))


In [10]:
from sklearn.preprocessing import MultiLabelBinarizer


# Binarise labels
mlb = MultiLabelBinarizer()
expandedLabelData = mlb.fit_transform(df_genres['genres'])
labelClasses = mlb.classes_


# Create a pandas.DataFrame from our output
expandedLabels = pd.DataFrame(expandedLabelData, columns=labelClasses)

In [11]:
expandedLabels

Unnamed: 0,Unnamed: 1,"""australian children's music""","""black 'n' roll""","""british children's music""","""canadian children's music""","""canzone d'autore""","""children's choir""","""children's folk""","""children's music""","""children's story""",...,'yugoslav rock','zapstep','zen','zhongguo feng','zimdancehall','zolo','zouglou','zouk riddim','zouk','zydeco'
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27616,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27617,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27618,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27619,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
final_df = pd.merge(df_genres.iloc[:,:2], 
                    expandedLabels.iloc[:,1:], 
                    left_index=True, 
                    right_index=True)

In [13]:
final_df

Unnamed: 0,artists,popularity,"""australian children's music""","""black 'n' roll""","""british children's music""","""canadian children's music""","""canzone d'autore""","""children's choir""","""children's folk""","""children's music""",...,'yugoslav rock','zapstep','zen','zhongguo feng','zimdancehall','zolo','zouglou','zouk riddim','zouk','zydeco'
0,"""Cats"" 1981 Original London Cast",38.000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"""Cats"" 1983 Broadway Cast",33.076923,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"""Fiddler On The Roof” Motion Picture Chorus",34.285714,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"""Fiddler On The Roof” Motion Picture Orchestra",34.444444,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"""Joseph And The Amazing Technicolor Dreamcoat""...",42.555556,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27616,鳳飛飛,35.000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27617,黃品源,47.000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27618,黃國隆,20.000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27619,黃蜀娟,23.000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Let's model

In [14]:
X = final_df.iloc[:,1:]
y = final_df.iloc[:,0]

In [15]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)

knn.fit(X, y)

KNeighborsClassifier(n_neighbors=3)

Example with User Input (including Popularity)

In [16]:
from IPython.core import display as ICD

artist_input = str(input('Give an artist : '))

results = knn.kneighbors(X = final_df.iloc[:,1:][final_df['artists'] == artist_input], 
                         n_neighbors=3, 
                         return_distance=True)

for i in range(1,3):
    print("\n")
    print(final_df.artists[results[1].tolist()[0][i]])
    
    output = final_df[(final_df['artists'] == artist_input) |(final_df['artists'] == final_df.artists[results[1].tolist()[0][i]])]
    output2 = output.T
    output2.columns = output2.iloc[0]
    output2 = output2.drop('artists', axis=0)
    ICD.display(output2.iloc[[0]])
    ICD.display(output2[(output2[artist_input] == 1) | (output2[final_df.artists[results[1].tolist()[0][i]]] == 1)])
    
    try: 
        from googlesearch import search 
    except ImportError:  
        print("No module named 'google' found") 
        
    query = df.artists[results[1].tolist()[0][i]]
    for j in search(query, tld="co.in", num=3, stop=3, pause=2): 
        print(j)
    print("\n")

Give an artist : Madonna


A


artists,A,Madonna
popularity,48,47.4449


artists,A,Madonna
'dance pop',1,1
'pop',1,1
'post-teen pop',1,0


https://en.wikipedia.org/wiki/A
https://en.wikipedia.org/wiki/A#History
https://en.wikipedia.org/wiki/A#Use_in_writing_systems




Selena


artists,Madonna,Selena
popularity,47.4449,46.5556


artists,Madonna,Selena
'dance pop',1,1
'pop',1,1
'post-teen pop',0,1


https://en.wikipedia.org/wiki/Selena
https://en.wikipedia.org/wiki/Selena_(film)
https://en.wikipedia.org/wiki/Murder_of_Selena




Example with User Input (Without Popularity)

In [45]:
X = final_df.iloc[:,2:]
y = final_df.iloc[:,0]

In [46]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)

knn.fit(X, y)

KNeighborsClassifier(n_neighbors=3)

In [55]:
from IPython.core import display as ICD

artist_input = str(input('Give an artist : '))

results = knn.kneighbors(X = final_df.iloc[:,2:][final_df['artists'] == artist_input], 
                         n_neighbors=3, 
                         return_distance=True)

for i in range(1,3):
    print("\n")
    print(final_df.artists[results[1].tolist()[0][i]])
    
    output = final_df[(final_df['artists'] == artist_input) |(final_df['artists'] == final_df.artists[results[1].tolist()[0][i]])]
    output2 = output.T
    output2.columns = output2.iloc[0]
    output2 = output2.drop('artists', axis=0)
    ICD.display(output2[(output2[artist_input] == 1) | (output2[final_df.artists[results[1].tolist()[0][i]]] == 1)])
    
    try: 
        from googlesearch import search 
    except ImportError:  
        print("No module named 'google' found") 
        
    query = df.artists[results[1].tolist()[0][i]]
    for j in search(query, tld="co.in", num=3, stop=3, pause=2): 
        print(j)
    print("\n")

Give an artist : Drake


NF


artists,Drake,NF
'canadian hip hop',1,0
'canadian pop',1,0
'hip hop',1,1
'pop rap',1,1
'rap',1,1
'toronto rap',1,0


https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcTOIzFU05_Ok9wQDbCuDX0M93pAjRILpVGolO6mz3rJqUEvzj7i
https://en.wikipedia.org/wiki/NF_(rapper)
https://en.wikipedia.org/wiki/The_Search_(NF_album)




Kid Cudi


artists,Drake,Kid Cudi
'canadian hip hop',1,0
'canadian pop',1,0
'hip hop',1,1
'pop rap',1,1
'rap',1,1
'toronto rap',1,0


https://en.wikipedia.org/wiki/Kid_Cudi
https://en.wikipedia.org/wiki/Kid_Cudi_discography
https://en.wikipedia.org/wiki/A_Kid_Named_Cudi


