# Collection and Cleaning
### Research Question: Can we accurately determine whether a song feels happy or sad?

This notebook is designed to outline the tools and methods used to collect data from the [Spotify Web API](https://developer.spotify.com/documentation/web-api/reference/).

In order to get song information from Spotify we will need to make requests to their API. Rather than stumbling through the API to get the information, let's build a class designed to communicate with the API. This will help us get song features like: 'loudness', 'valence', 'key'.

In [2]:
from time import sleep
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests

In [3]:
class Spotify:
    """Small client for the Spotify Web API that caches successful responses.

    Each endpoint has its own cache (any mapping supporting ``in`` and item
    assignment) which is supplied by the caller, so cached responses outlive
    the client instance. Only responses with HTTP status 200 are cached;
    error responses are returned to the caller but never stored, so a
    transient failure (expired token, rate limit, ...) is retried on the
    next call instead of being replayed forever.

    Note: cache keys do not include ``limit``. A repeated call with the same
    query/track but a different ``limit`` returns the first cached response.
    """

    def __init__(self, token, search_cache, recommendation_cache, audio_features_cache):
        """Store the caches and build the bearer-token header.

        token: OAuth access token (sent as ``Authorization: Bearer <token>``)
        search_cache: mapping used by ``search``
        recommendation_cache: mapping used by ``recommend``
        audio_features_cache: mapping used by ``audio_features``
        """
        self.search_cache = search_cache
        self.recommendation_cache = recommendation_cache
        self.audio_features_cache = audio_features_cache
        self.base_url = "https://api.spotify.com"
        self.default_headers = {
            "Authorization": "Bearer " + token
        }

    def search(self, q, type, limit):
        """Search the catalogue.

        q: search query (free text; percent-encoded before it is sent)
        type: search type, e.g. "track"
        limit: limit number of results

        Returns the response object from ``get``.
        """
        cache_key = q + type
        # Encode the query so characters such as '&', '#' or '+' in a song
        # title cannot terminate or alter the query string.
        url = "/v1/search?q=" + quote(q, safe="") + "&type=" + type + "&limit=" + str(limit)
        return self._cached_get(self.search_cache, cache_key, url)

    def recommend(self, track, limit):
        """Fetch recommendations seeded by a single track.

        track: spotify track id
        limit: limit number of results

        Returns the response object from ``get``.
        """
        url = "/v1/recommendations?seed_tracks=" + track + "&limit=" + str(limit)
        return self._cached_get(self.recommendation_cache, track, url)

    def audio_features(self, track):
        """Fetch the audio features of a single track.

        track: spotify track id

        Returns the response object from ``get``.
        """
        url = "/v1/audio-features/" + track
        return self._cached_get(self.audio_features_cache, track, url)

    def _cached_get(self, cache, cache_key, url):
        """Return the cached response for ``cache_key``, fetching it if needed.

        A freshly fetched response is stored only when its status code is
        200; any other response is returned as-is without touching the cache.
        """
        if cache_key in cache:
            return cache[cache_key]
        resp = self.get(url)
        if resp.status_code == 200:
            cache[cache_key] = resp
        return resp

    def get(self, url):
        """Issue a GET for ``url`` (a path relative to ``base_url``).

        Sleeps for one second before every request as a crude rate limit.
        """
        absUrl = self.base_url + url
        sleep(1)
        return requests.get(absUrl, headers=self.default_headers)

In [4]:
# One response cache per Spotify endpoint; these are handed to the client.
# Re-running this cell throws away everything cached so far.

SEARCH_CACHE, RECOMMENDATION_CACHE, AUDIO_FEATURES_CACHE = {}, {}, {}

In [49]:
# Inspect the search cache. Keys are the query text immediately followed by
# the search type (e.g. "...track"); values are the cached response objects.
SEARCH_CACHE

{'Human Sadness The Voidztrack': <Response [200]>,
 'Trailer Trash Modest Mousetrack': <Response [200]>,
 'Heroes David Bowietrack': <Response [200]>,
 'I’m so lonesome I could cry Hank Williamstrack': <Response [200]>,
 'Pale Blue Eyes The Velvet Undergroundtrack': <Response [200]>}

In [5]:
# Get the Bearer access token from spotify API
#
# NOTE: CLIENT_ID and CLIENT_SECRET are not defined anywhere in the visible
# notebook; they must be defined before this cell runs (ideally loaded from
# environment variables rather than written into the notebook).

client_id = CLIENT_ID
client_secret = CLIENT_SECRET

grant_type = 'client_credentials'

#Request based on Client Credentials Flow from https://developer.spotify.com/web-api/authorization-guide/

#Request body parameter: grant_type Value: Required. Set it to client_credentials
body_params = {'grant_type' : grant_type}

url='https://accounts.spotify.com/api/token'
resp=requests.post(url, data=body_params, auth = (client_id, client_secret))
# Fail loudly with the HTTP error if authentication was rejected, instead of
# an opaque KeyError on "access_token" below.
resp.raise_for_status()
token = resp.json()["access_token"]

In [6]:
# Build the shared API client and wire in the three caches created earlier.
spotify = Spotify(
    token,
    search_cache=SEARCH_CACHE,
    recommendation_cache=RECOMMENDATION_CACHE,
    audio_features_cache=AUDIO_FEATURES_CACHE,
)

Let's build some helper functions to navigate the API through the Spotify class. 

In [7]:
def filter_by_search(df, keys):
    """
    Keep only the rows of df for which a Spotify track search returns a result.

    For each row the values of the `keys` columns are joined with spaces and
    used as the search query. If that query finds nothing, the last term is
    dropped and the search is retried, until a result is found or no terms
    remain. Missing (NaN) values are left out of the query and non-string
    values are converted with str().

    Uses the module-level `spotify` client.

    df: dataframe
    keys: list of values in the dataframe to use as the search query

    Returns a new dataframe containing the matching rows, with the original
    index labels and columns (also when no row matches).
    """
    keep = []
    for _, row in df.iterrows():
        terms = [str(row[key]) for key in keys if pd.notna(row[key])]
        items = []
        while not items and terms:
            query = " ".join(terms)
            payload = spotify.search(query, "track", 1).json()
            items = payload["tracks"]["items"]
            # Retry with a less specific query: drop the last term.
            terms = terms[:-1]
        keep.append(len(items) > 0)

    # Boolean selection keeps columns and dtypes even when nothing matched;
    # copy so callers can add columns without touching the input frame.
    return df.loc[keep].copy()

In [8]:
def get_search_results(df, keys):
    """
    Appends search results as new columns to the dataframe

    For each row the values of the `keys` columns are joined with spaces and
    searched as a track query; if nothing is found the last term is dropped
    and the search retried. Every field of the first hit becomes a column.
    Rows with no hit, and fields absent from a particular hit, are left as
    missing values so that every column stays aligned with its row.

    Uses the module-level `spotify` client. df is modified in place; any
    existing column with the same name as a result field is overwritten.

    df: dataframe
    keys: list of values in the dataframe to use as the search query

    Returns df (the same object that was passed in).
    """
    records = []
    for _, row in df.iterrows():
        terms = [str(row[key]) for key in keys if pd.notna(row[key])]
        items = []
        while not items and terms:
            query = " ".join(terms)
            payload = spotify.search(query, "track", 1).json()
            items = payload["tracks"]["items"]
            # Retry with a less specific query: drop the last term.
            terms = terms[:-1]
        # One record per input row keeps the new columns row-aligned.
        records.append(items[0] if items else {})

    results = pd.DataFrame(records)
    for col in results.columns:
        # Assign by position; df may carry a non-default index.
        df[col] = results[col].to_numpy()

    return df

In [9]:
def get_recommendataion_results(df, limit):
    """
    Appends a given number of recommendations for each row in
    the dataframe

    For every row, recommendations seeded by the row's "id" are requested
    and every returned track is appended as a new row. Each new row carries
    the recommended track's own fields plus:
      parent     - "id" of the row that seeded the recommendation
      Name       - copied from the seed row
      Title      - the recommended track's "name"
      Artist     - name of the recommended track's first artist
      Sad/Happy  - label copied from the seed row

    Uses the module-level `spotify` client. The input frame is not modified.

    df: dataframe
    limit: number of recommendataions for each row

    Returns a new dataframe (index reset) with the recommendations appended
    after the original rows, or df itself when nothing was recommended.
    """
    records = []
    for _, row in df.iterrows():
        tracks = spotify.recommend(row["id"], limit).json()["tracks"]
        for track in tracks:
            record = dict(track)
            record["parent"] = row["id"]
            record["Name"] = row["Name"]
            record["Title"] = track["name"]
            record["Artist"] = track["artists"][0]['name']
            record["Sad/Happy"] = row["Sad/Happy"]
            records.append(record)

    if not records:
        return df
    # Concatenate once: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    return pd.concat([df, pd.DataFrame(records)], ignore_index=True)

In [10]:
def get_audio_features(df):
    """
    Append the audio features for each track as new columns
    on the dataframe

    The audio-features payload is fetched for each row's "id" and every
    top-level field of the payload becomes a column. A field missing from a
    particular payload is left as a missing value for that row, so columns
    always stay aligned with their rows.

    Uses the module-level `spotify` client. df is modified in place; any
    existing column with the same name as a payload field is overwritten.

    df: dataframe

    Returns df (the same object that was passed in).
    """
    # One record per input row keeps the new columns row-aligned.
    records = [spotify.audio_features(row["id"]).json() for _, row in df.iterrows()]

    features = pd.DataFrame(records)
    for col in features.columns:
        # Assign by position; df may carry a non-default index.
        df[col] = features[col].to_numpy()

    return df

In [11]:
def get_data(filename, n):
    """
    Build the expanded dataset from a CSV of songs.

    Pipeline:
    1) Read the CSV and keep only the tracks that can be found via search
    2) Attach the search result fields for each track
    3) Extend the data with recommendations, requesting n per track
    4) Attach the audio feature fields for each track

    filename: path, URL or file-like object accepted by pandas.read_csv
    n: recommendation limit passed on for every track

    Returns the resulting dataframe.
    """
    songs = pd.read_csv(filename)

    searchable = filter_by_search(songs, ["Title", "Artist"])
    # Rows loaded from the CSV have no parent track.
    searchable["parent"] = ["" for _ in range(len(searchable))]

    with_search = get_search_results(searchable, ["Title", "Artist"])
    with_recommendations = get_recommendataion_results(with_search, n)
    return get_audio_features(with_recommendations)

In [13]:
# Build the labelled dataset, asking Spotify for up to 2 recommendations per track
data = get_data(
    filename="https://raw.githubusercontent.com/jallen182/Data301/master/songs6.csv",
    n=2,
)
data.head()

Unnamed: 0,Name,Title,Artist,Sad/Happy,parent,album,artists,available_markets,disc_number,duration_ms,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_href,analysis_url,time_signature
0,Stephanie,Distance,Cake,H,,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AR, BE, BG, BH, BO, BR, CA, CL, CO, C...",1,179653,...,0,0.0891,0.00796,0.00219,0.0558,0.888,183.048,https://api.spotify.com/v1/tracks/0xMEF2WiqKWT...,https://api.spotify.com/v1/audio-analysis/0xME...,4
1,Stephanie,Let's Stay Together,Al Green,H,,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AR, AT, BE, BG, BH, BO, BR, CA, CH, C...",1,199396,...,0,0.0522,0.57,0.0115,0.0542,0.507,101.985,https://api.spotify.com/v1/tracks/63xdwScd1Ai1...,https://api.spotify.com/v1/audio-analysis/63xd...,4
2,Stephanie,Blood on your bootheels,Caroline Rose,H,,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AD, AT, BE, BG, CA, CH, CY, CZ, DE, DK, EE, E...",1,195320,...,1,0.125,0.144,0.0,0.0932,0.86,160.176,https://api.spotify.com/v1/tracks/5qfPr1jisW3b...,https://api.spotify.com/v1/audio-analysis/5qfP...,4
3,Stephanie,FM,Steely Dan,H,,"{'album_type': 'compilation', 'artists': [{'ex...",[{'external_urls': {'spotify': 'https://open.s...,"[CA, MX, US]",1,290160,...,0,0.0295,0.135,0.00194,0.0946,0.717,109.383,https://api.spotify.com/v1/tracks/0zw5bUUdOxVL...,https://api.spotify.com/v1/audio-analysis/0zw5...,4
4,Stephanie,Valley of the silver moon,Jonathon Wilson,S,,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[CA, MX, US]",1,632293,...,1,0.0302,0.543,0.789,0.0834,0.134,107.834,https://api.spotify.com/v1/tracks/30pUYzSBJ2tB...,https://api.spotify.com/v1/audio-analysis/30pU...,4


In [None]:
# Songs without an emotion label, used as the test set in the machine learning section.
test_data = get_data(
    filename="https://raw.githubusercontent.com/jallen182/Data301/master/songs_test%202.csv",
    n=1,
)

In [None]:
# Save the expanded test set to disk for later use.
test_data.to_csv("test_data.csv")

In [None]:
# Trim leading/trailing whitespace from the text columns that need it
for column in ("Name", "Sad/Happy"):
    data[column] = data[column].str.strip()

# Write the cleaned dataset to a csv file for later use
data.to_csv("clean_data3.csv")