## Import pandas and read files 

In [None]:
import pandas as pd 
# show at most 6 rows when a DataFrame is displayed, to keep notebook output short
pd.set_option("display.max_rows", 6)

# read the csv files and make them DataFrames
# (paths are relative to the current working directory)
# taylorData: the Taylor Swift tracks that recommendations are drawn from
taylorData = pd.read_csv("Datasets/taylor_swift_spotify.csv")
# spotifyData: the general track dataset used to look up the user's chosen song
spotifyData = pd.read_csv("Datasets/spotify-tracks-dataset.csv") 

Since Taylor Swift often releases tracks with the same title on different albums, we will drop all duplicate titles to start with a cleaner dataset. 

In [None]:
# Keep one row per track title. The same song often appears on several
# albums; only the last occurrence in the file is retained.
taylorData = (
    taylorData
    .drop_duplicates(subset=["name"], keep="last")
    .reset_index(drop=True)  # renumber the rows 0..n-1 after dropping
)

## Find Spotify Track Data 
To choose a song, paste its Spotify track link into `spotifyLink` below. To choose how many recommendations you want, set `numberOfRecs`. 

### Link and Number of Recommendations

In [None]:
# replace this spotify link
# (the track id is taken from the last "/"-separated part of the link,
#  up to the "?" if there is one)
spotifyLink = "https://open.spotify.com/track/3hUxzQpSfdDqwM3ZTFQY0K?si=0a4b2162f6f54e95"

# how many Taylor Swift songs to show in the final results
numberOfRecs = 3

The link will now be used to extract the track's id and to look that track up in the Spotify dataset. 

In [None]:
def getTrackId(link): 
    # Extracts the spotify track id from a spotify track link.
    #
    # link: a track url such as
    #       "https://open.spotify.com/track/<id>?si=<token>"
    # returns: the last path segment of the link as a string, without any
    #          query string ("?...") or fragment ("#...")
    #
    # Surrounding whitespace and a trailing "/" are ignored, so links such as
    # ".../track/<id>/?si=..." still yield "<id>" instead of an empty string.
    path = link.strip()
    # cut the query/fragment off first, so a "/" inside them cannot be
    # mistaken for a path separator
    path = path.split("?", 1)[0].split("#", 1)[0]
    # the id is whatever follows the last remaining "/"
    return path.rstrip("/").rsplit("/", 1)[-1]

def findSong(link, dataset=spotifyData): 
    # Looks up the track that a spotify link points to.
    #
    # link: spotify track link; its id is extracted with getTrackId
    # dataset: dataframe with a "track_id" column (defaults to spotifyData)
    # returns: a dataframe of the rows whose "track_id" equals the link's id,
    #          re-indexed from 0 (empty when no row matches)
    trackId = getTrackId(link)
    isMatch = dataset["track_id"] == trackId
    return dataset.loc[isMatch].reset_index(drop=True)

# find the song's data
# (`song` is a dataframe of the matching row(s) from spotifyData;
#  it is empty when the link's track id is not in the dataset)
song = findSong(spotifyLink)
# to_string() renders the whole dataframe as text instead of a shortened view
print(song.to_string())

## Get data from track 
Assuming the track has been found, a dictionary of all the categories with number values will be created. These categories include: 
- danceability
- energy
- loudness 
- speechiness 
- acousticness
- instrumentalness 
- valence
- liveness 
- tempo 

In [None]:
# These are the attributes that will be used in recommending a song
# (each name is used as a column label when reading values from the dataframes).
# NOTE: this is a set, so the order in which the attributes are visited
# is not fixed.
categories = {"danceability", 
              "energy", 
              "loudness", 
              "speechiness", 
              "acousticness", 
              "instrumentalness", 
              "valence",
              "liveness",
              "tempo"}

def buildDict(df, categories): 
    # Maps every name in `categories` to the value that column holds in the
    # first row of `df`.
    #
    # returns: {category: value}, or {} when the dataframe has no rows
    if len(df) == 0:
        return {}

    firstLabel = df.index.values.tolist()[0]  # index label of the first row
    return {cat: df[cat][firstLabel] for cat in categories}

# attribute values of the chosen song, keyed by category name
# ({} when the song was not found in the dataset)
# NOTE(review): this variable shadows the builtin `dict` for the rest of the notebook
dict = buildDict(song, categories)
print(dict)

## Generating song similarity 

To make a recommendation, we must find songs that are similar to the chosen track. Therefore, we will look through the Taylor Swift discography dataset and find out how it compares to the inputted track. 

The similarity between tracks in this project is defined by the mean percent error between the user's track and a given Taylor Swift song, taken across the attributes listed above. This value is stored as the "match". The lower the match, the more similar the two tracks are. 

In [None]:
# calculating the average percent difference between the inputted song and TS songs
def calculateAvgDiff(dict, row):
    # dict: {attribute: value} for the inputted song
    # row:  anything indexable by the same attribute names (e.g. a dataframe
    #       row) holding the values of the song being compared
    # returns: the mean over all attributes of |dict[key] - row[key]| / |row[key]|;
    #          0 means the two songs agree on every attribute
    # raises: ZeroDivisionError when `dict` is empty
    total = 0
    for key, target in dict.items():
        actual = row[key]
        if actual == 0:
            # a percent difference relative to 0 is undefined:
            # identical zeros are a perfect match, anything else counts as 100% off
            diff = 0 if target == 0 else 1
        else:
            diff = abs((target - actual) / actual) # percent difference
        total += diff
    return total / len(dict)
# Build a copy of the Taylor Swift data with an extra "match" column:
# the average percent difference (see calculateAvgDiff) between the inputted
# song's attributes and each Taylor Swift song's attributes
taylorDataMatched = taylorData.assign(
    match=taylorData.apply(lambda track: calculateAvgDiff(dict, track), axis=1)
)

## Sorting and cleaning 
To find the top "matched" Taylor Swift songs, we will sort the dataset in ascending order of the match value, so the most similar songs come first. 

In [None]:
# order the songs from lowest to highest "match", then renumber the rows
# so that row 0 is the first song in that order
taylorDataMatched = (
    taylorDataMatched
    .sort_values("match")
    .reset_index(drop=True)
)

If the user inputs a Taylor Swift song, it wouldn't be interesting if the program recommended that very same song. Therefore, we get rid of the suggestion if they're the same. 

In [None]:
firstTSRec = taylorDataMatched["name"][0]
songTitle = song["track_name"][0] # assuming the chosen track exists in the spotify dataset

# Remove the chosen song from the candidates wherever it ranks, not only when
# it happens to be the top match; otherwise it could still be recommended
# from a lower position. Track names were de-duplicated when the data was
# loaded, so at most one row is removed.
taylorDataMatched = taylorDataMatched[taylorDataMatched["name"] != songTitle]

# fix indexing
taylorDataMatched = taylorDataMatched.reset_index(drop=True)
taylorDataMatched.head()

## Final Results
The final recommendations based on your inputted song: 

In [None]:
# take the first `numberOfRecs` rows of the sorted data and keep only the
# columns worth showing
finalData = (
    taylorDataMatched
    .head(numberOfRecs)
    .filter(['name', 'album'])
)
finalData