# Spotify Song-Feature Scraper
Run this script to build a database of Spotify songs and their features, which can then be analysed for similarities.
Because of the limit imposed by the Spotify API, each run only adds the next 1000 songs; running the script again the following day adds another 1000 rows, and so on.

In [11]:
from dotenv import load_dotenv
from requests import post, get
import numpy as np
import pandas as pd
import random as rnd
import os
import main

In [12]:
# Number of rows already stored in the consolidated feature file; it is used
# below as the offset of the next batch of tracks to fetch.
# NOTE(review): the consolidated file has rate-limited rows (error.status 429)
# and duplicates removed before it is written, so this count can be lower than
# the number of tracks already requested - confirm that the resulting overlap
# between batches is acceptable.
try:
    current_iteration = len(pd.read_csv("/Users/pappalardodaniel/Desktop/VSCode/Abschlussarbeit/Data/Spotify_Track_Features.csv"))
except (FileNotFoundError, pd.errors.EmptyDataError):
    # First run: the consolidated file is only written by the last cell of this
    # script, so it does not exist (or has no content) yet. Start at track 0.
    current_iteration = 0

In [13]:
# Load the artist table and keep its second column as the array of artist IDs.
artist_csv_path = "/Users/pappalardodaniel/Desktop/VSCode/Abschlussarbeit/Data/Artist_Data.csv"
artist_df = pd.read_csv(artist_csv_path)
artist_ids = artist_df.iloc[:, 1].values

# Repeat every artist row ten times, keeping the original row order.
# Presumably Artist_Songs.csv holds ten tracks per artist, so that the expanded
# artist rows line up one-to-one with the track rows - TODO confirm.
repeated_row_labels = artist_df.index.repeat(10)
artist_df = artist_df.loc[repeated_row_labels]

In [14]:
# Load the track table; its second column is used as the track IDs whose
# features are requested further down.
track_csv_path = "/Users/pappalardodaniel/Desktop/VSCode/Abschlussarbeit/Data/Artist_Songs.csv"
track_df = pd.read_csv(track_csv_path)
track_id = track_df.iloc[:, 1].values

The following will provide the features of the next 1000 songs in the track dataframe.

In [15]:
# Apply main.get_track_features to every track ID of the next batch.
# otypes=[object] is given explicitly for two reasons:
#   * without it, np.vectorize calls the function one extra time on the first
#     element just to infer the output type, i.e. one wasted API request;
#   * without it, np.vectorize raises on an empty input, which happens once
#     current_iteration has reached the 10000-track cap.
# The results are presumably JSON-like dicts (they are passed to
# pd.json_normalize afterwards), which are stored as objects either way.
vectorized_feature_retriever = np.vectorize(main.get_track_features, otypes=[object])

# Fetch at most 1000 tracks per run and never go past the first 10000 tracks.
max_iteration = min(current_iteration+1000, 10000)
feature_array = vectorized_feature_retriever(main.token, track_id[current_iteration:max_iteration])

In [16]:
def create_feature_df(feature_array):
    """Flatten a sequence of per-track feature records into a DataFrame.

    Args:
        feature_array: Iterable of dict records, e.g. a list or a NumPy object
            array. Nested dicts are flattened by ``pd.json_normalize`` into
            dot-separated column names (``{"error": {"status": 429}}`` becomes
            the column ``error.status``).

    Returns:
        A DataFrame with one row per record and one column per flattened key;
        keys missing from a record are filled with NaN.
    """
    # json_normalize already returns a DataFrame, so wrapping the result in
    # DataFrame.from_dict (which is meant for dict input) added nothing.
    return pd.json_normalize(feature_array)
# Turn the fetched per-track results into a table with one row per track.
feature_df = create_feature_df(feature_array)

In [17]:
def create_spotify_df(current_iteration, artists, tracks, features):
    """Join artists, tracks and freshly fetched features by row position.

    The frames are aligned through a temporary positional ``Index`` column:
    row ``i`` of ``artists`` is paired with row ``i`` of ``tracks``, and row
    ``j`` of ``features`` is paired with track row ``current_iteration + j``.

    Args:
        current_iteration: Position in ``tracks`` of the first ``features`` row.
        artists: Artist rows, in the same order and number as ``tracks``.
        tracks: Track rows.
        features: Feature rows for the tracks starting at ``current_iteration``.

    Returns:
        A new DataFrame containing only the tracks covered by ``features``,
        with the artist, track and feature columns side by side (including the
        ``Index`` column). The input frames are left unmodified.
    """
    # Work on the arguments (not on module-level globals) and on copies, so the
    # caller's frames do not silently gain an "Index" column.
    artists = artists.assign(Index=np.arange(len(artists)))
    tracks = tracks.assign(Index=np.arange(len(tracks)))
    features = features.assign(
        Index=np.arange(current_iteration, current_iteration + len(features))
    )

    artists_songs = pd.merge(artists, tracks, on="Index")
    # Rows before current_iteration cannot match any feature row; dropping
    # them up front keeps the second merge small.
    database = pd.merge(artists_songs[current_iteration:], features, on="Index")

    # NOTE(review): "Spotify ID_y" is the right-hand ("tracks") copy of a
    # column named "Spotify ID" that presumably exists in both inputs; confirm
    # that "Artist ID" is the intended label for it.
    return database.rename(columns={"Name": "Artist", "Spotify ID_y": "Artist ID"})
# Combine artist, track and feature data for the batch fetched in this run.
spotify_df = create_spotify_df(current_iteration, artist_df, track_df, feature_df)

In [18]:
# Save this run's batch as the next numbered file in the Spotify folder.
spotify_dir = "/Users/pappalardodaniel/Desktop/VSCode/Abschlussarbeit/Data/Spotify/"
folder = os.listdir(spotify_dir)
# The number of directory entries is the first candidate for the version
# number (entries such as .DS_Store are counted as well).
current_file_version = len(folder)

# Probe upwards until the file name is unused. A single "+1" bump could still
# hit an existing file when the numbering has gaps, and would overwrite it.
next_file_version = current_file_version
while os.path.isfile(f"{spotify_dir}Spotify_Database{next_file_version}.csv"):
    next_file_version += 1

spotify_df.to_csv(f"{spotify_dir}Spotify_Database{next_file_version}.csv",
                  index=False)

In [19]:
# Rebuild the consolidated database from every batch file saved so far.
folder = os.listdir("/Users/pappalardodaniel/Desktop/VSCode/Abschlussarbeit/Data/Spotify/")
# NOTE: this is a plain string sort, so Spotify_Database10.csv is read before
# Spotify_Database2.csv; that only affects the row order of the result.
folder.sort()

# Collect the batches and concatenate once at the end: calling pd.concat inside
# the loop copies all previously accumulated rows on every iteration.
batches = []
for i in folder:
    if i == ".DS_Store":
        continue
    data = pd.read_csv(rf"/Users/pappalardodaniel/Desktop/VSCode/Abschlussarbeit/Data/Spotify/{i}")
    if 'error.status' in data.columns:
        # Drop the rows whose request was rejected with HTTP 429 (rate limit).
        data = data[data['error.status'] != 429]
    batches.append(data)

# pd.concat raises on an empty list, so keep the empty-frame result of the
# "no batch files yet" case explicit.
spotify_database = pd.concat(batches) if batches else pd.DataFrame()

spotify_database.drop_duplicates(inplace=True)
# NOTE(review): the index is written as the first CSV column (no index=False,
# unlike the per-batch files) and repeats per batch because the concatenation
# keeps each file's own index - confirm consumers expect that column.
spotify_database.to_csv("/Users/pappalardodaniel/Desktop/VSCode/Abschlussarbeit/Data/Spotify_Track_Features.csv")

In [20]:
# Number of rows in the consolidated database after filtering and de-duplication.
len(spotify_database)

9995

The final dataframe is saved as `Spotify_Track_Features.csv` in the `Data` directory used above.