In [141]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from scipy.spatial import distance
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv("SpotifyFeatures.csv")
data.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Opera,Giuseppe Verdi,"Stiffelio, Act III: Ei fugge! … Lina, pensai c...",7EsKYeHtTc4H4xWiTqSVZA,21,0.986,0.313,490867,0.231,0.000431,C#,0.0964,-14.287,Major,0.0547,86.001,4/4,0.0886
1,Opera,Giacomo Puccini,Madama Butterfly / Act 1: ... E soffitto e pareti,7MfmRBvqaW0I6UTxXnad8p,18,0.972,0.36,176797,0.201,0.028,D#,0.133,-19.794,Major,0.0581,131.798,4/4,0.369
2,Opera,Giacomo Puccini,"Turandot / Act 2: Gloria, gloria, o vincitore",7pBo1GDhIysyUMFXiDVoON,10,0.935,0.168,266184,0.47,0.0204,C,0.363,-8.415,Major,0.0383,75.126,3/4,0.0696
3,Opera,Giuseppe Verdi,"Rigoletto, Act IV: Venti scudi hai tu detto?",02mvYZX5aKNzdqEo6jF20m,17,0.961,0.25,288573,0.00605,0.0,D,0.12,-33.44,Major,0.048,76.493,4/4,0.038
4,Opera,Giuseppe Verdi,"Don Carlo / Act 4: ""Ella giammai m'amò!""",03TW0jwGMGhUabAjOpB1T9,19,0.985,0.142,629760,0.058,0.146,D,0.0969,-23.625,Major,0.0493,172.935,4/4,0.0382


In [142]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228159 entries, 0 to 228158
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   genre             228159 non-null  object 
 1   artist_name       228159 non-null  object 
 2   track_name        228159 non-null  object 
 3   track_id          228159 non-null  object 
 4   popularity        228159 non-null  int64  
 5   acousticness      228159 non-null  float64
 6   danceability      228159 non-null  float64
 7   duration_ms       228159 non-null  int64  
 8   energy            228159 non-null  float64
 9   instrumentalness  228159 non-null  float64
 10  key               228159 non-null  object 
 11  liveness          228159 non-null  float64
 12  loudness          228159 non-null  float64
 13  mode              228159 non-null  object 
 14  speechiness       228159 non-null  float64
 15  tempo             228159 non-null  float64
 16  time_signature    22

## Normalization and One Hot Encoding

In [143]:
# Tidying data for normalization
data = data.drop(["track_id","key","mode","time_signature"],1)
df = data.copy()

# what we are trying to predict (our y)
df = df.drop(["artist_name","track_name"],1)

# our predictors/categories/classes 
col = ['popularity', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
       'tempo', 'valence']

# Normalizing the categories - col variable ^^^

# z score
# The standard score of a sample x is calculated as:
#     z = (x - u) / s
# how far our training set/prediction is from the mean (standard deviations away)
# 'Template like' z score transformation
scaler = StandardScaler()
df[col] = scaler.fit_transform(df[col])

# One Hot Encoding for each category
encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")


# Predicting the categories based on genre
enc = pd.DataFrame(encoder.fit_transform(np.array(df["genre"]).reshape(-1,1)))
enc.columns = df["genre"].unique()
enc.head()

Unnamed: 0,Opera,A Capella,Alternative,Blues,Dance,Pop,Electronic,R&B,Children’s Music,Folk,...,Country,Reggaeton,Ska,Indie,Rock,Soul,Soundtrack,Jazz,World,Movie
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [144]:
# were able to create predictions related to how far each individual prediction is from our mean/original training set (std away)
df[enc.columns] = enc
df = df.drop("genre",1)
df.head()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,Country,Reggaeton,Ska,Indie,Rock,Soul,Soundtrack,Jazz,World,Movie
0,-1.343388,1.806566,-1.311222,2.179132,-1.343051,-0.468048,-0.600265,-0.830223,-0.36369,-1.023107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.517034,1.766724,-1.055716,-0.512624,-1.45818,-0.373777,-0.414456,-1.757175,-0.345437,0.46805,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.980089,1.661426,-2.099485,0.253472,-0.425852,-0.399765,0.753194,0.158166,-0.451738,-1.377199,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.574916,1.735419,-1.653709,0.445359,-2.20633,-0.469522,-0.480454,-4.054102,-0.399661,-1.332689,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.459152,1.80372,-2.240829,3.369523,-2.006964,0.029716,-0.597727,-2.402018,-0.392682,1.807477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


$$ y(songs) = popularity*(Weight = -1.343388) + acousticness* .... $$
y = [x][w]

In [145]:
# re-inializing the tranformed data set (one hot endcoded) to include original name and artists for song prediction
df["name"] = data["track_name"]
df["artist"] = data["artist_name"]

# df_2 only has the clategories we utilized with one hot encoding
df_2 = df.drop(["artist","name"],1)
df_2.head()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,Country,Reggaeton,Ska,Indie,Rock,Soul,Soundtrack,Jazz,World,Movie
0,-1.343388,1.806566,-1.311222,2.179132,-1.343051,-0.468048,-0.600265,-0.830223,-0.36369,-1.023107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.517034,1.766724,-1.055716,-0.512624,-1.45818,-0.373777,-0.414456,-1.757175,-0.345437,0.46805,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.980089,1.661426,-2.099485,0.253472,-0.425852,-0.399765,0.753194,0.158166,-0.451738,-1.377199,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.574916,1.735419,-1.653709,0.445359,-2.20633,-0.469522,-0.480454,-4.054102,-0.399661,-1.332689,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.459152,1.80372,-2.240829,3.369523,-2.006964,0.029716,-0.597727,-2.402018,-0.392682,1.807477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Song finder

In [146]:
def sim_track_find(word,artist):
    a = 0
    b = 0
    song = []
    indexes = []
    for i in data["track_name"]:
        if word.lower() in i.lower() and artist.lower() in data["artist_name"][a].lower():
            song.append(df_2[a:a+1].values)
            indexes.append(a)
            b+=1
        a+=1
    if b == 0:
        print("Nothing found. Please try something else :)")
        return 0
        
    return song[0][0], indexes[0]
# grabing the normalized, one hot encoded prediction weights for this song
# y = popularity * weight = 2.41893446 + x * w + ...

In [None]:
word = "ocean eyes"
artist = "billie"

sim_track_find(word,artist)
arr = sim_track_find(word,artist)[0]
arr = pd.DataFrame(arr, columns = ["Song Similarity Features"])
# arr
# return that songs features as an array and the index of that song.

## Cosine Distance

In [None]:
X_cosine = []
Y_cosine = []
def similar_tracks_cosine(number,song = "",artist = ""):

    if (sim_track_find(song,artist) == 0):
        return 0
    else:
        # our x values = features ( y = x * w)
        x=sim_track_find(song,artist)[0]
        
        # index of the row where the song exists in data = 12802
        index = sim_track_find(song,artist)[1]
    p = []
    count=0
    for i in df_2.values:
        # find the cosine distance between each training set (x values) and the features/weights
        # cosine euclidean jaccard
        p.append([distance.cosine(x,i),count])
        count+=1
    # smallest to largest distances
    p.sort()
    song_names = df["name"]
    artist_names = df["artist"]
    print("\nSimilar songs to ",song_names[index]," by ", artist_names[index],"\n")
#     songs = []
#     artists = []
    for i in range(1,number+1):
#         print("Statistical Score: ", p[i][1])
#         print("Feature Sets: ", p[i][0])
        
        X_cosine.append(p[i][0])
        Y_cosine.append(p[i][1])
#         songs.append(song_names[p[i][1]])
#         artists.append(artist_names[p[i][1]])
#         print(i,"- ",song_names[p[i][1]],", ",artist_names[p[i][1]])
#     data = np.array([Y_cosine, songs, artists])
#     table = pd.DataFrame(data)
#     print(table)

# song = "ocean eyes"
# artist = "billie"
# num = 5

similar_tracks_cosine(num,song,artist)
    
X_euclidean = []
Y_euclidean = []
def similar_tracks_euclidean(number,song = "",artist = ""):

    if (sim_track_find(song,artist) == 0):
        return 0
    else:
        # our x values = features ( y = x * w)
        x=sim_track_find(song,artist)[0]
        
        # index of the row where the song exists in data = 12802
        index = sim_track_find(song,artist)[1]
    p = []
    count=0
    for i in df_2.values:
        # find the cosine distance between each training set (x values) and the features/weights
        # cosine euclidean jaccard
        p.append([distance.euclidean(x,i),count])
        count+=1
    # smallest to largest distances
    p.sort()
    song_names = df["name"]
    artist_names = df["artist"]
    print("\nSimilar songs to ",song_names[index]," by ", artist_names[index],"\n")
    for i in range(1,number+1):
#         print("Statistical Score: ", p[i][1])
#         print("Feature Sets: ", p[i][0])
        X_euclidean.append(p[i][0])
        Y_euclidean.append(p[i][1])
#         print(i,"- ",song_names[p[i][1]],", ",artist_names[p[i][1]])
#         print()

X_jaccard = []
Y_jaccard = []
def similar_tracks_jaccard(number,song = "",artist = ""):

    if (sim_track_find(song,artist) == 0):
        return 0
    else:
        # our x values = features ( y = x * w)
        x=sim_track_find(song,artist)[0]
        
        # index of the row where the song exists in data = 12802
        index = sim_track_find(song,artist)[1]
    p = []
    count=0
    for i in df_2.values:
        # find the cosine distance between each training set (x values) and the features/weights
        # cosine euclidean jaccard
        p.append([distance.jaccard(x,i),count])
        count+=1
    # smallest to largest distances
    p.sort()
    song_names = df["name"]
    artist_names = df["artist"]
    print("\nSimilar songs to ",song_names[index]," by ", artist_names[index],"\n")
    for i in range(1,number+1):
#         print("Statistical Score: ", p[i][1])
#         print("Feature Sets: ", p[i][0])
        X_jaccard.append(p[i][0])
        Y_jaccard.append(p[i][1])
#         print(i,"- ",song_names[p[i][1]],", ",artist_names[p[i][1]])
#         print()

In [None]:
song = "ocean eyes"
artist = "billie"
num = 5

similar_tracks_cosine(num,song,artist)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

song = "ocean eyes"
artist = "billie"
num = 5

similar_tracks_cosine(num,song,artist)
x = X_cosine
y = Y_cosine
# print(X_cosine)
plt.scatter(x, y)
plt.title("Cosine Similarity")
plt.xlabel("Statistical Score")
plt.ylabel("Feature Sets")
plt.show()

The statistical score allows us to rank each individual song between the similarity of the original song.
The closer the point is to zero on the x axis, the more similar the song is to the original song.
The y axis represents the prominent feature sets ranking which in this case, for cosine, utilizes it as a tie-breaker between songs.

In [None]:
similar_tracks_euclidean(num,song,artist)
x = X_euclidean
y = Y_euclidean
plt.scatter(x, y)
# print(X_euclidean)
plt.title("Euclidean Distance")
plt.xlabel("Statistical Score")
plt.ylabel("Feature Sets")
plt.show()

In [None]:
similar_tracks_jaccard(num,song,artist)
x = X_jaccard
y = Y_jaccard
# print(X_jaccard)
plt.scatter(x, y)
plt.title("Jaccard")
plt.xlabel("Statistical Score")
plt.ylabel("Feature Sets")
plt.show()

# Jaccard
    Similar songs to  ocean eyes  by  Billie Eilish 

    Statistical Score:  12756
    1 -  goosebumps ,  Travis Scott

    Statistical Score:  12759
    2 -  Young Dumb & Broke ,  Khalid

    Statistical Score:  12761
    3 -  Love Lies (with Normani) ,  Khalid

    Statistical Score:  12762
    4 -  Mine ,  Bazzi

    Statistical Score:  12763
    5 -  Ric Flair Drip (& Metro Boomin) ,  Offset

# Euclidean
    Similar songs to  ocean eyes  by  Billie Eilish 

    Statistical Score:  13821
    1 -  I Can't Fall in Love Without You ,  Zara Larsson

    Statistical Score:  12746
    2 -  idontwannabeyouanymore ,  Billie Eilish

    Statistical Score:  13004
    3 -  Let It Go ,  James Bay

    Statistical Score:  12957
    4 -  Little Do You Know ,  Alex & Sierra

    Statistical Score:  12976
    5 -  Someone Like You ,  Adele

# Cosine
    Similar songs to  ocean eyes  by  Billie Eilish 

    Statistical Score:  33596
    1 -  Abrázame Muy Fuerte ,  Juan Gabriel

    Statistical Score:  13821
    2 -  I Can't Fall in Love Without You ,  Zara Larsson

    Statistical Score:  12746
    3 -  idontwannabeyouanymore ,  Billie Eilish

    Statistical Score:  13751
    4 -  Make It To Me ,  Sam Smith

    Statistical Score:  13004
    5 -  Let It Go ,  James Bay

In [None]:
from math import sqrt
# A dictionary of movie critics and their ratings of a small
# set of movies
critics={
    'user': {
        'Lady in the Water': 5,
        'Snakes on a Plane': 4,
        'Just My Luck': 3,
        'Superman Returns': 4,
        'You, Me and Dupree': 3.9,
        'The Night Listener': 4.1
    },
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5
    },
'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
 'Superman Returns': 3.5, 'The Night Listener': 4.0},
'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
 'The Night Listener': 4.5, 'Superman Returns': 4.0,
 'You, Me and Dupree': 2.5},
'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
 'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
 'You, Me and Dupree': 2.0},
'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
 'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}

# Returns a distance-based similarity score for person1 and person2
def sim_distance(prefs, person1, person2):
    si = {}
    for item in prefs[person1]:
        if item in prefs[person2]:
            si[item] = 1

    if len(si)==0: return 0
    sum_of_squares = sum([pow(prefs[person1][item] - prefs[person2][item], 2) for item in si])

    return 1/(1+sqrt(sum_of_squares))

def sim_pearson(prefs,p1,p2):
  # Get the list of mutually rated items
  si={}
  for item in prefs[p1]:
    if item in prefs[p2]: si[item]=1

  # Find the number of elements
  n=len(si)

  # if they have no ratings in common, return 0
  if n==0: return 0

  # Add up all the preferences
  sum1=sum([prefs[p1][it] for it in si])
  sum2=sum([prefs[p2][it] for it in si])

  # Sum up the squares
  sum1Sq=sum([pow(prefs[p1][it],2) for it in si])
  sum2Sq=sum([pow(prefs[p2][it],2) for it in si])

  # Sum up the products
  pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])

  # Calculate Pearson score
  num=pSum-(sum1*sum2/n)
  den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
  if den==0: return 0
  r=num/den

  return r

val = sim_distance(critics, 'Lisa Rose', 'Gene Seymour')
print(val)

val1 = sim_pearson(critics, 'Lisa Rose', 'Gene Seymour')
print(val1)

# Returns the best matches for person from the prefs dictionary.
# Number of results and similarity function are optional params.
def topMatches(prefs,person,n=5,similarity=sim_pearson):
  scores=[(similarity(prefs,person,other),other)
                  for other in prefs if other!=person]

  # Sort the list so the highest scores appear at the top
  scores.sort(  )
  scores.reverse(  )
  return scores[0:n]

val2 = topMatches(critics, 'user', n=3)
print(val2)

In [None]:
# from math import sqrt
# # A dictionary of movie critics and their ratings of a small
# # set of movies
# critics={
#     'user': {
#         'Lady in the Water': 5,
#         'Snakes on a Plane': 4,
#         'Just My Luck': 3,
#         'Superman Returns': 4,
#         'You, Me and Dupree': 3.9,
#         'The Night Listener': 4.1
#     }
# }

# def sim_pearson(prefs,p1,p2):
#   # Get the list of mutually rated items
#   si={}
#   for item in prefs[p1]:
#     if item in prefs[p2]: si[item]=1

#   # Find the number of elements
#   n=len(si)

#   # if they have no ratings in common, return 0
#   if n==0: return 0

#   # Add up all the preferences
#   sum1=sum([prefs[p1][it] for it in si])
#   sum2=sum([prefs[p2][it] for it in si])

#   # Sum up the squares
#   sum1Sq=sum([pow(prefs[p1][it],2) for it in si])
#   sum2Sq=sum([pow(prefs[p2][it],2) for it in si])

#   # Sum up the products
#   pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])

#   # Calculate Pearson score
#   num=pSum-(sum1*sum2/n)
#   den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
#   if den==0: return 0
#   r=num/den

#   return r

# val = sim_distance(critics, 'Lisa Rose', 'Gene Seymour')
# print(val)

# val1 = sim_pearson(critics, 'Lisa Rose', 'Gene Seymour')
# print(val1)

# # Returns the best matches for person from the prefs dictionary.
# # Number of results and similarity function are optional params.
# def topMatches(prefs,person,n=5,similarity=sim_pearson):
#   scores=[(similarity(prefs,person,other),other)
#                   for other in prefs if other!=person]

#   # Sort the list so the highest scores appear at the top
#   scores.sort(  )
#   scores.reverse(  )
#   return scores[0:n]

# val2 = topMatches(critics, 'user', n=3)
# print(val2)