In [None]:
import glob
import io
import os
import pickle
from shutil import copyfileobj
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.request import urlopen, Request

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import initializers
from tensorflow.keras import optimizers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Conv2D,AveragePooling2D, BatchNormalization,Dropout,Flatten,Dense
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util
import seaborn as sns
from sklearn.metrics import pairwise_distances_argmin_min, pairwise_distances
import matplotlib.cm as cm
import librosa
import librosa.display
import soundfile as sf
import pydub
import requests

import config

plt.style.use('ggplot')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline


## Load in model

In [None]:
# Load the pretrained Keras model from the HDF5 checkpoint; the path is relative,
# so the file has to sit in the notebook's working directory.
model = load_model('more_data_08-2.07.h5')

In [None]:
# Build a truncated model whose output is an intermediate layer, so that calling
# predict() on it yields latent features instead of the final prediction.
# NOTE: this only binds a second name to the same object; `model` is not copied.
new_model =model
# NOTE(review): depending on the Keras version, `layers` may hand back a fresh list,
# in which case this pop() changes nothing. Confirm before removing it, because it
# could affect which layer `layers[-3]` refers to below.
new_model.layers.pop()
# The latent features are taken from the third-from-last entry of `layers`.
new_model_2 = Model(new_model.input, new_model.layers[-3].output)
# Prints the summary of the original `model`, not of the truncated `new_model_2`.
model.summary()

## Process

After trying a number of different approaches, I realised that the most memory-efficient way of doing this is to create each spectrogram, immediately run it through the CNN without saving it, and then store only the latent features of each song. This avoids having to save a large number of spectrograms that would take up a lot of memory.

In [None]:
def get_spotify_quals(music_df):
    '''Look up Spotify audio features for the songs in ``music_df`` and merge them in.

    NOTE: a function with the same name is also defined in a later cell of this
    notebook; whichever of the two cells ran last is the one in effect.

    Parameters
    ----------
    music_df : pandas.DataFrame
        Must contain an ``id`` column holding Spotify track ids.

    Returns
    -------
    pandas.DataFrame
        Outer merge of ``music_df`` with one row of audio features per song
        (joined on ``id``), minus a few bookkeeping columns. ``key``, ``mode``
        and ``time_signature`` are cast to int when they contain no missing
        values. If no features could be fetched at all, a copy of ``music_df``
        is returned unchanged.
    '''
    # ids of the songs to look up on Spotify
    id_list = list(music_df['id'].values)
    # helper defined in another cell: maps song id -> list holding its feature dict
    quals = get_music_quals(id_list)

    # Collect the feature dicts first and build the frame once, instead of
    # growing it row by row.
    records = []
    for features in quals.values():
        if features and isinstance(features[0], dict):
            records.append(features[0])
        else:
            # nothing usable came back for this song; show what we got
            print(features)

    if not records:
        # Without any features there is no 'id' column to merge on.
        return music_df.copy()

    df = pd.DataFrame(records)
    # merge the two dataframes together on id
    orig_and_spotify = pd.merge(music_df, df, on='id', how='outer')

    # drop columns we dont really need (tolerate ones that are already absent)
    orig_and_spotify = orig_and_spotify.drop(
        columns=['track_href', 'analysis_url', 'duration_ms', 'type', 'uri'],
        errors='ignore')

    # Best-effort integer cast: the outer merge can leave NaNs for songs without
    # features, and those cannot be converted to int.
    try:
        for int_col in ('key', 'mode', 'time_signature'):
            orig_and_spotify[int_col] = orig_and_spotify[int_col].astype(int)
    except (KeyError, TypeError, ValueError):
        pass
    return orig_and_spotify
    

In [None]:


def get_songs_from_playlist_search(playlist=None,limit=10,artist_name='techno'):
    '''Collect the tracks of one playlist, or of every playlist matching a search term.

    Relies on the notebook-level ``spotify`` client and ``username``.

    Parameters
    ----------
    playlist : str, optional
        Id of a single playlist. When given, no search is performed.
    limit : int
        Maximum number of playlists requested from the search.
    artist_name : str
        Search term used to find playlists (e.g. ``'techno'``).

    Returns
    -------
    tuple of (list, list, list)
        ``(song_ids, song_names, preview_urls)``. The three lists always have
        the same length and line up index by index. Tracks without an id are
        left out; a preview url may be ``None``.
    '''
    if playlist is not None:
        playlist_ids = [playlist]
    else:
        # find playlists with the specific search term
        results = spotify.search(q=artist_name, limit=limit, type='playlist')
        # guard against empty (None) entries in the result list
        playlist_ids = [info['id'] for info in results['playlists']['items'] if info]

    song_ids, song_names, preview_urls = [], [], []
    skipped = 0

    for playlist_id in playlist_ids:
        page = spotify.user_playlist(username, playlist_id)['tracks']
        items = list(page['items'])
        # follow the pagination links until every track has been fetched
        while page['next']:
            page = spotify.next(page)
            items.extend(page['items'])

        for item in items:
            # Read all three fields before appending anything, so a malformed
            # entry can never leave the result lists with different lengths.
            try:
                track = item['track']
                track_id = track['id']
                if track_id is None:
                    continue
                name = track['name']
                preview_url = track['preview_url']
            except (KeyError, TypeError):
                skipped += 1
                continue
            song_ids.append(track_id)
            song_names.append(name)
            preview_urls.append(preview_url)

    if skipped:
        print(f'skipped {skipped} malformed playlist entries')

    return (song_ids, song_names, preview_urls)

In [None]:
#keep having to rerun this as spotify API sometimes needs reloading
# (presumably because the access token obtained below has expired)
client_id = config.client_id
client_secret = config.client_secret

username = '1143043561'
auth = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret
)

try:
    token = auth.get_access_token()
except Exception:
    # Retry once after clearing the per-user token cache file, if there is one.
    # Removing a file that does not exist would raise inside this handler.
    cache_path = f'.cache-{username}'
    if os.path.exists(cache_path):
        os.remove(cache_path)
    token = auth.get_access_token()

# Some spotipy releases return a token-info dict instead of the bare token string.
if isinstance(token, dict):
    token = token['access_token']

#create spotify object
spotify = spotipy.Spotify(auth=token)

In [None]:
def spectrograms_then_latent(url_list, id_list, names_list, model, latent_dim=40):
    '''Turn preview mp3s into log-mel spectrograms and extract latent features.

    Each preview is downloaded, converted to wav in memory, turned into a
    spectrogram and pushed straight through ``model``; only the resulting
    latent vector is kept.

    Parameters
    ----------
    url_list : list
        Preview urls; ``None`` entries are skipped.
    id_list, names_list : list
        Song ids and names, aligned index by index with ``url_list``.
    model : object
        Anything with a ``predict`` method returning ``latent_dim`` values
        for a single ``(1, 128, 644, 1)`` input.
    latent_dim : int
        Width of the latent vector produced by ``model`` (default 40).

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed song: ``latent_dim`` feature
        columns followed by ``song_names`` and ``id``. Songs that had no url
        or failed anywhere in the pipeline are left out.
    '''
    print(len(url_list))
    print(len(id_list))
    latent_array = np.zeros((len(url_list), latent_dim))
    # Track successes explicitly rather than inferring them from all-zero rows,
    # so the id/name lists always match the rows that are kept.
    succeeded = np.zeros(len(url_list), dtype=bool)
    fin_ids = []
    fin_names = []

    count = 0
    for ind, url in enumerate(url_list):
        if url is None:
            continue
        try:
            wav = io.BytesIO()
            with urlopen(url) as r:
                r.seek = lambda *args: None  # allow pydub to call seek(0)
                pydub.AudioSegment.from_file(r).export(wav, "wav")

            wav.seek(0)
            y, sr = librosa.load(wav)

            # mel-scaled power (energy-squared) spectrogram; `y` is passed by
            # keyword because recent librosa releases no longer accept it
            # positionally
            mel_spec = librosa.feature.melspectrogram(y=y,
                                                      sr=sr,
                                                      n_mels=128,
                                                      hop_length=1024,
                                                      n_fft=2048)
            # Convert to log scale (dB). We'll use the peak power as reference.
            # NOTE(review): amplitude_to_db is applied to a power spectrogram;
            # left unchanged because the pretrained model presumably expects
            # exactly this preprocessing. Confirm against the training code.
            log_mel_spec = librosa.amplitude_to_db(mel_spec, ref=np.max)
            # Force a fixed (128, 644) shape. np.resize works on the flattened
            # data (truncating or repeating it), it does not crop along time.
            # NOTE(review): confirm this matches how the model was trained.
            log_mel_spec = np.resize(log_mel_spec, (128, 644))

            log_mel_spec_arr = log_mel_spec.reshape(
                log_mel_spec.shape[0], log_mel_spec.shape[1], 1)
            pre_process = np.expand_dims(log_mel_spec_arr, axis=0)
            pre_process = pre_process / 255
            latent = model.predict(pre_process)

            # Look these up before writing anything so a failure leaves no
            # partial record behind.
            song_id = id_list[ind]
            song_name = names_list[ind]
            latent_array[ind, :] = latent
        except Exception as err:
            # Download, decoding and inference can fail in many library-specific
            # ways; skip the song but say why.
            count += 1
            print(f'could not process {url}: {err!r}')
            continue

        succeeded[ind] = True
        fin_ids.append(song_id)
        fin_names.append(song_name)

    print(f'missing{count} songs')
    # keep only the rows that were actually filled in
    latent_df = pd.DataFrame(latent_array).loc[succeeded].copy()
    latent_df['song_names'] = fin_names
    latent_df['id'] = fin_ids

    return (latent_df)

In [None]:
# Collect one latent-feature frame per search term and concatenate once at the
# end; DataFrame.append no longer exists in pandas 2.x and copied the whole
# frame on every iteration.
latent_frames = []
search_terms =['soul','disco','nts radio','text records','nina kraviz','kaytranada','instrumental hip hop']

# the credentials do not change between iterations
client_id = config.client_id
client_secret = config.client_secret
username = '1143043561'

for term in search_terms:
    #loop was taking too long so would time out
    # -> authenticate again for every search term
    auth = SpotifyClientCredentials(
        client_id=client_id,
        client_secret=client_secret
    )

    try:
        token = auth.get_access_token()
    except Exception:
        # Retry once after clearing the per-user token cache file, if there is one.
        cache_path = f'.cache-{username}'
        if os.path.exists(cache_path):
            os.remove(cache_path)
        token = auth.get_access_token()

    # Some spotipy releases return a token-info dict instead of the bare token string.
    if isinstance(token, dict):
        token = token['access_token']

    #create spotify object (the helper functions read this notebook-level name)
    spotify = spotipy.Spotify(auth=token)
    ids, names, urls = get_songs_from_playlist_search(artist_name=term)
    latent_df = spectrograms_then_latent(urls, ids, names, new_model_2)
    latent_frames.append(latent_df)

new_df = pd.concat(latent_frames, ignore_index=True) if latent_frames else pd.DataFrame()
    

In [None]:
def get_music_quals(id_list, batch_size=100):
    '''Fetch Spotify audio features (tempo, time signature, ...) for a list of song ids.

    Relies on the notebook-level ``spotify`` client.

    Parameters
    ----------
    id_list : iterable
        Song ids; entries that are not strings (e.g. ``None`` or NaN) are ignored.
    batch_size : int
        Number of ids sent per request. The spotipy documentation gives 100
        as the maximum for ``audio_features``.

    Returns
    -------
    dict
        Maps each song id to a one-element list holding its feature dict, or
        ``[None]`` when Spotify has no features for it. Ids from a failed
        request are absent from the result.
    '''
    # keep only string ids, de-duplicated in first-seen order
    song_ids = list(dict.fromkeys(s for s in id_list if isinstance(s, str)))

    qualities = {}
    for start in range(0, len(song_ids), batch_size):
        batch = song_ids[start:start + batch_size]
        try:
            results = spotify.audio_features(batch)
        except Exception as err:
            # best effort: report the failed request and carry on with the rest
            print(f'audio feature lookup failed for {len(batch)} songs: {err!r}')
            continue
        if not results:
            continue
        # Results come back in request order. Each value is wrapped in a list so
        # it has the same shape as the result of a single-id lookup.
        for song, features in zip(batch, results):
            qualities[song] = [features]
    return qualities

In [None]:
def get_spotify_quals(music_df):
    '''Look up Spotify audio features for the songs in ``music_df`` and merge them in.

    NOTE: a function with the same name is also defined in an earlier cell of
    this notebook; whichever of the two cells ran last is the one in effect.

    Parameters
    ----------
    music_df : pandas.DataFrame
        Must contain an ``id`` column holding Spotify track ids.

    Returns
    -------
    pandas.DataFrame
        Outer merge of ``music_df`` with one row of audio features per song
        (joined on ``id``), minus a few bookkeeping columns. ``key``, ``mode``
        and ``time_signature`` are cast to int when they contain no missing
        values. If no features could be fetched at all, a copy of ``music_df``
        is returned unchanged.
    '''
    # ids of the songs to look up on Spotify
    id_list = list(music_df['id'].values)
    # helper defined in another cell: maps song id -> list holding its feature dict
    quals = get_music_quals(id_list)

    # Collect the feature dicts first and build the frame once, instead of
    # growing it row by row.
    records = []
    for features in quals.values():
        if features and isinstance(features[0], dict):
            records.append(features[0])
        else:
            # nothing usable came back for this song; show what we got
            print(features)

    if not records:
        # Without any features there is no 'id' column to merge on.
        return music_df.copy()

    df = pd.DataFrame(records)
    # merge the two dataframes together on id
    orig_and_spotify = pd.merge(music_df, df, on='id', how='outer')

    # drop columns we dont really need (tolerate ones that are already absent)
    orig_and_spotify = orig_and_spotify.drop(
        columns=['track_href', 'analysis_url', 'duration_ms', 'type', 'uri'],
        errors='ignore')

    # Best-effort integer cast: the outer merge can leave NaNs for songs without
    # features, and those cannot be converted to int.
    try:
        for int_col in ('key', 'mode', 'time_signature'):
            orig_and_spotify[int_col] = orig_and_spotify[int_col].astype(int)
    except (KeyError, TypeError, ValueError):
        pass
    return orig_and_spotify

In [None]:
# Fetch the Spotify audio features for every song in `new_df` and merge them
# onto the latent features (joined on the `id` column).
new_df_with_spotify=get_spotify_quals(new_df)

In [None]:
# Remove, in place, every row that still has a missing value in any column.
new_df_with_spotify.dropna(inplace=True)
# The integer cast is done here, separately, because the rows with NaNs were
# inspected first; with them gone the cast can no longer fail on NaN.
for int_col in ('key', 'mode', 'time_signature'):
    new_df_with_spotify[int_col] = new_df_with_spotify[int_col].astype(int)
# NOTE(review): `cols` is not defined in any visible cell of this notebook;
# presumably it holds the column names of `music_values`. Confirm where it is set.
new_df_with_spotify.columns=cols


In [None]:
# Append the newly scraped songs to the main dataframe.
# NOTE(review): `music_values` is not created in any visible cell, so this only
# works if it already exists in the kernel (presumably loaded from
# 'music_values.csv'; confirm). Re-running the cell appends the same rows again;
# the drop_duplicates in the next cell removes them.
music_values=pd.concat([music_values,new_df_with_spotify], ignore_index=True)

In [None]:
# Keep the first row for each song id (modifies `music_values` in place), then
# write the result to 'music_values.csv' without the index column.
music_values.drop_duplicates(subset='id',inplace=True)
music_values.to_csv('music_values.csv',index=False)