### Import

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import spotipy
import json
from tqdm import tqdm
# from spotipy.oauth2 import SpotifyClientCredentials
from scipy.stats import multivariate_normal
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from mpl_toolkits import mplot3d

### Fonts

In [3]:
# Keyword-argument dicts that select a font family by name; each holds a
# single 'fontname' entry.
csfont = dict(fontname='Georgia')
hfont = dict(fontname='Helvetica')

### Load the Spotify audio-feature CSV files

In [4]:
# Read the two Spotify audio-feature exports and stack them row-wise into a
# single frame. Each file keeps its own row index, so index labels repeat in
# `data`.
data1, data2 = map(pd.read_csv, (
    '../../data/spotify/SpotifyAudioFeaturesApril2019.csv',
    '../../data/spotify/SpotifyAudioFeaturesNov2018.csv',
))
data = pd.concat([data1, data2])

### Get all track IDs and look up each track's artist

In [6]:
# Look up the first-listed artist of every track in `data` and append one
# "batch_index,track_id,artist_id" row per track to track_artists.csv.
#
# NOTE(review): `spotify` is not defined in any cell visible here; presumably
# an authenticated spotipy client created in an earlier cell -- confirm it
# exists before running.
# NOTE(review): the CSV is opened in append mode, so re-running this cell adds
# duplicate rows to the file.
track_ids = data['track_id'].values
batch_size = 50  # ids sent per spotify.tracks() call (presumably the API's per-request cap -- confirm)

# Walk the ids in exact steps of batch_size. The earlier
# np.linspace(0, n, n - 1).astype(int)[::50] offsets used a step slightly
# larger than 1, so batch starts drifted off multiples of 50: a few ids were
# never requested and the last start offset could equal n (an empty request).
for i, si in enumerate(range(0, len(track_ids), batch_size)):
    query = track_ids[si:si + batch_size].tolist()
    results = spotify.tracks(query)['tracks']
    # Re-open per batch so completed batches are on disk if a later request fails.
    with open('../../data/spotify/track_artists.csv','a') as f:
        for track_id, track in zip(query, results):
            # Skip ids for which no usable track object came back instead of
            # crashing halfway through the download.
            if not track or not track.get('artists'):
                continue
            artist_id = track['artists'][0]['id']
            f.write(str(i) + ',' + track_id + ',' + artist_id + '\n')

### Get Unique Artist IDs

In [7]:
# Reload the track -> artist rows written above (the file has no header, so
# columns are numbered 0, 1, 2) and reduce column 2, the artist id, to its
# sorted unique values.
df = pd.read_csv('../../data/spotify/track_artists.csv', header=None)
artist_ids = np.unique(df[2].values)

### Query the genres of every unique artist ID

In [8]:
# Fetch the genre list of every unique artist and append one
# "batch_index,artist_id,genre1@genre2@..." row per artist to artist_genres.csv.
#
# NOTE(review): `spotify` is not defined in any cell visible here; presumably
# an authenticated spotipy client created in an earlier cell -- confirm it
# exists before running.
# NOTE(review): the CSV is opened in append mode, so re-running this cell adds
# duplicate rows to the file.
batch_size = 50  # ids sent per spotify.artists() call (presumably the API's per-request cap -- confirm)

# Iterate over `artist_ids` (the earlier code referenced the undefined name
# `artistids`) in exact steps of batch_size; the previous np.linspace-based
# offsets drifted off multiples of 50 and skipped a few artists.
for i, si in enumerate(tqdm(range(0, len(artist_ids), batch_size))):
    query = artist_ids[si:si + batch_size].tolist()
    results = spotify.artists(query)['artists']
    # Re-open per batch so completed batches are on disk if a later request fails.
    with open('../../data/spotify/artist_genres.csv','a') as f:
        for artist_id, artist in zip(query, results):
            # Skip ids for which no artist object came back instead of crashing.
            if not artist:
                continue
            # Genres are '@'-joined so they fit in a single CSV field.
            genres = '@'.join(artist['genres'])
            f.write(str(i) + ',' + artist_id + ',' + genres + '\n')

### Join Datasets

In [9]:
# Combine the three sources into one frame with one row per track:
#   artist_genres.csv  (batch index, artist id, '@'-joined genres)
#   track_artists.csv  (batch index, track id, artist id)
#   data               (audio features, keyed by track_id)
genre_df = pd.read_csv('../../data/spotify/artist_genres.csv',
                       header=None,
                       names=['id','artist_id','genre_string'])
# Artists with no genres are read back as NaN; store them as empty strings.
genre_df = genre_df.fillna('')
track_df = pd.read_csv('../../data/spotify/track_artists.csv',
                       header=None,
                       names=['0','track_id','artist_id'])

# Join on the single shared key (the key was previously listed twice).
df = pd.merge(genre_df, track_df, on='artist_id', how="outer", indicator=False)
# The outer join re-introduces NaN genre strings for tracks whose artist has
# no row in genre_df, so normalise them again after merging.
df['genre_string'] = df['genre_string'].fillna('')
df = pd.merge(df, data, on='track_id', how="inner", indicator=False)
# The input files can list the same track more than once; keep the first row.
df = df.drop_duplicates(subset=['track_id'])

### Genre Keywords

In [17]:
# High-level genre keywords searched for in each artist's genre string.
# Every entry must be unique: the list becomes DataFrame column names and
# each matching keyword is counted once when tracks are filtered by number of
# matched genres (a second 'edm' entry used to be counted twice).
genre_keywords = ['house','country','rock','pop','blues','indie',
                  'alternative','trap','metal','soul',
                  'k-pop','classical','dance','edm','hip','rap','jazz']

### Create High Level Genre Labeling

In [20]:
# Build the label matrix A with one row per track in df: column 0 holds the
# track id and column k+1 is set when genre_keywords[k] occurs in the track's
# lower-cased genre string.
# A is a NumPy string array (zeros converted with astype(str)), so unset flags
# hold the text '0.0' and set flags hold '1'; later cells convert the flag
# columns back to float.
# NOTE(review): matching is by substring, so 'pop' also fires on 'k-pop' and
# 'rap' on 'trap' -- confirm this is intended.
genre_strings = df['genre_string'].values
trackids_strings = df['track_id'].values
n_rows = df.shape[0]
n_cols = 1 + len(genre_keywords)
A = np.zeros((n_rows, n_cols)).astype(str)
for row, raw_genres in tqdm(enumerate(genre_strings)):
    A[row, 0] = trackids_strings[row]
    genres_lower = str(raw_genres).lower()
    for col, keyword in enumerate(genre_keywords, start=1):
        if keyword in genres_lower:
            A[row, col] = 1

130989it [00:00, 200485.14it/s]


### Join Dataframes 

In [21]:
# Attach the genre flags in A to the track frame and drop the identifier /
# bookkeeping columns, leaving the remaining track columns plus one float
# column per genre keyword.
dfA = pd.DataFrame(A, columns=['track_id'] + genre_keywords)
# A keyword listed more than once would give duplicate column names, which
# turns newdf[keyword] into a multi-column frame; keep the first occurrence.
dfA = dfA.loc[:, ~dfA.columns.duplicated()]
label_columns = [column for column in dfA.columns if column != 'track_id']

# Join on the single shared key (the key was previously listed twice).
newdf = pd.merge(df, dfA, on='track_id', how="inner", indicator=False)
newdf = newdf.drop(columns=['track_id','artist_id','id','0','duration_ms'])
# Flags were stored as text in A ('0.0' / '1'); convert them all to float at once.
newdf = newdf.astype({column: float for column in label_columns})

### Clean up: keep only songs that match exactly one genre keyword

In [22]:
# Strip the track-id column from A, convert the remaining flag columns to
# float, and keep only the rows of newdf whose flags sum to exactly 1.
# Rows are selected by position, so newdf must still be in the same row order
# as A.
# NOTE(review): this cell is not idempotent -- running it again slices another
# column off A and re-indexes the already filtered newdf.
A = A[:, 1:].astype(float)
sumA = A.sum(axis=1)
idx = np.flatnonzero(sumA == 1)
newdf = newdf.iloc[idx]

### Store

In [23]:
# Persist the filtered frame as CSV without writing the row index.
newdf.to_csv(path_or_buf='../../data/spotify/df.csv', index=False)