## Import pandas and read files 

In [537]:
import pandas as pd 
pd.set_option("display.max_rows", 6)

# read the csv files and make them DataFrames
taylorData = pd.read_csv("taylor_swift_spotify.csv")
spotifyData = pd.read_csv("spotify-tracks-dataset.csv") 

Since Taylor Swift often releases the same titled tracks under different albums. We will get rid of all duplicates to start with a cleaner dataset. 

In [538]:
# drop any duplicate tracks 
# (Taylor sometimes has the same tracks in different albums) 
taylorData = taylorData.drop_duplicates(subset = ["name"], keep="last")
taylorData = taylorData.reset_index(drop=True)

## Find spotify track data 
To input your desired song, use the spotify track link. To also choose the number of recommendations you want, declare that variable. 

In [539]:
spotifyLink = "https://open.spotify.com/track/3hUxzQpSfdDqwM3ZTFQY0K?si=42d0177e14ef4cfd"
numberOfRecs = 3

The link will now be used to find the track's id, as well as where the track is located within the spotify database. 

In [540]:
def getTrackId(link): 
    # uses a spotify link to access the spotify id of a track 
    arr = link.split("/") 
    id = arr[len(arr)-1] # get the last element 
    arr = id.split("?")
    id = arr[0]
    return id

def findSong(link, dataset=spotifyData): 
    # returns a dataframe with the desired song title
    id = getTrackId(link) # get the id
    song = dataset.loc[dataset["track_id"] == id]
    return song.reset_index(drop=True)


song = findSong(spotifyLink)
print(song.to_string())

   Unnamed: 0                track_id       artists album_name track_name  popularity  duration_ms  explicit  danceability  energy  key  loudness  mode  speechiness  acousticness  instrumentalness  liveness  valence   tempo  time_signature track_genre
0       81556  3hUxzQpSfdDqwM3ZTFQY0K  Taylor Swift   folklore     august          86       261922     False         0.532   0.623    5    -9.208     1       0.0331         0.538          0.000073    0.0925    0.403  89.937               4         pop


## Get data from track 
Assuming the track has been found, a dictionary of all the categories with number values will be created. These categories include: 
- danceability
- loudness 
- speechiness 
- acousticness
- instrumentalness 
- valence
- liveness 
- tempo 

In [541]:
# These are the attributes that will be used in recommending a song
categories = {"danceability", 
              "energy", 
              "loudness", 
              "speechiness", 
              "acousticness", 
              "instrumentalness", 
              "valence",
              "liveness",
              "tempo"}

def buildDict(df, categories): 
    # builds a dictionary with all the given categories 
    if (len(df) < 1): 
        return {}
    else: 
        dict = {} 
        row = df.index.values.tolist()[0]
        for cat in categories: 
            dict.update({cat : df[cat][row]})
        return dict

dict = buildDict(song, categories)
print(dict)

{'loudness': -9.208, 'acousticness': 0.538, 'tempo': 89.937, 'speechiness': 0.0331, 'energy': 0.623, 'danceability': 0.532, 'liveness': 0.0925, 'instrumentalness': 7.28e-05, 'valence': 0.403}


## Generating song similarity 

To make a recommendation, we must find songs that are similar to the chosen track. Therefore, we will look through the Taylor Swift discography dataset and find out how it compares to the inputted track. 

The similarity between tracks in this project will be defined by the mean percent error between the user's track and a given Taylor Swift song. This will be represented by the "match sum." The lower the match sum, the more similar the two tracks are. 

In [542]:
def calculateAvgError(dict, row):
    sum = 0
    for key in dict.keys(): 
        if (row[key] == 0):
            delta = 1
        else:  
            delta = (dict[key] - row[key])/row[key]
            delta = abs(delta)

        sum += delta
    return sum/len(dict)
taylorDataMatched = taylorData.copy()

# create a column with the "match sum":
# a sum of all the distances between the inputted song's attributes and Taylor's songs
taylorDataMatched["match_sum"] = taylorDataMatched.apply(lambda row: calculateAvgError(dict, row), axis=1) 

## Sorting and cleaning 
To find the top "matched" Taylor Swift songs, we will sort the dataset in ascending order based on the match sum. 

In [543]:
# sort the data based on the match sum 
taylorDataMatched = taylorDataMatched.sort_values("match_sum")

If the user inputs a Taylor Swift song, it wouldn't be interesting if the program recommended that very same song. Therefore, we get rid of the suggestion if they're the same. 

In [544]:
firstTSRec = taylorDataMatched["name"][0]
songTitle = song["track_name"][0] # assuming the chosen track exists in the spotify dataset

if (firstTSRec == songTitle): 
    taylorDataMatched = taylorDataMatched.drop(index=0) # drop the row with that track
    
taylorDataMatched = taylorDataMatched.reset_index(drop=True)
taylorDataMatched

Unnamed: 0.1,Unnamed: 0,name,album,release_date,track_number,id,uri,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,duration_ms,match_sum
0,267,august,folklore,2020-07-24,8,3hUxzQpSfdDqwM3ZTFQY0K,spotify:track:3hUxzQpSfdDqwM3ZTFQY0K,0.538000,0.532,0.623,0.000073,0.0925,-9.208,0.0331,89.937,0.403,88,261922,0.000000
1,197,‘tis the damn season,evermore,2020-12-11,4,7dW84mWkdWE5a6lFWxJCBG,spotify:track:7dW84mWkdWE5a6lFWxJCBG,0.735000,0.575,0.434,0.000066,0.1050,-8.193,0.0312,145.916,0.348,74,229840,0.192057
2,270,invisible string,folklore,2020-07-24,11,6VsvKPJ4xjVNKpI8VVZ3SV,spotify:track:6VsvKPJ4xjVNKpI8VVZ3SV,0.838000,0.653,0.452,0.000077,0.1080,-11.143,0.0550,83.433,0.450,77,252880,0.208620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,121,State Of Grace (Taylor's Version),Red (Taylor's Version),2021-11-12,1,6lzc0Al0zfZOIFsFvBS1ki,spotify:track:6lzc0Al0zfZOIFsFvBS1ki,0.000328,0.594,0.713,0.000000,0.1140,-5.314,0.0503,129.958,0.328,70,295413,182.474938
327,409,State Of Grace,Red,2012-10-22,1,786NsUYn4GGUf8AOt0SQhP,spotify:track:786NsUYn4GGUf8AOt0SQhP,0.000197,0.588,0.825,0.001380,0.0885,-5.882,0.0328,129.968,0.397,40,295186,303.577191
328,163,Change (Taylor’s Version),Fearless (Taylor's Version),2021-04-09,13,3ExweHKZF9B752DPQByRVT,spotify:track:3ExweHKZF9B752DPQByRVT,0.000191,0.499,0.815,0.000000,0.1810,-4.063,0.0341,95.999,0.344,64,279359,313.230542


## Final Results
The final recommendations based on your inputted song: 

In [545]:
taylorDataMatched.head(numberOfRecs)

Unnamed: 0.1,Unnamed: 0,name,album,release_date,track_number,id,uri,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,duration_ms,match_sum
0,267,august,folklore,2020-07-24,8,3hUxzQpSfdDqwM3ZTFQY0K,spotify:track:3hUxzQpSfdDqwM3ZTFQY0K,0.538,0.532,0.623,7.3e-05,0.0925,-9.208,0.0331,89.937,0.403,88,261922,0.0
1,197,‘tis the damn season,evermore,2020-12-11,4,7dW84mWkdWE5a6lFWxJCBG,spotify:track:7dW84mWkdWE5a6lFWxJCBG,0.735,0.575,0.434,6.6e-05,0.105,-8.193,0.0312,145.916,0.348,74,229840,0.192057
2,270,invisible string,folklore,2020-07-24,11,6VsvKPJ4xjVNKpI8VVZ3SV,spotify:track:6VsvKPJ4xjVNKpI8VVZ3SV,0.838,0.653,0.452,7.7e-05,0.108,-11.143,0.055,83.433,0.45,77,252880,0.20862
