In [1]:
import numpy as np
import pandas as pd
import json
import os
import random
import string
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

pd.set_option("max_rows", 10)

In [2]:
# Reading Spotify web API credentials from settings.env hidden file

with open('settings.env') as f:
    env_vars = json.loads(f.read())

# Set environment variables
os.environ['SPOTIPY_CLIENT_ID'] = env_vars['SPOTIPY_CLIENT_ID']
os.environ['SPOTIPY_CLIENT_SECRET'] = env_vars['SPOTIPY_CLIENT_SECRET']

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

In [36]:
# Getting 10,000 pseudo-random songs to balance the hit/not-hit songs datasets
# Year range same as selected for hit songs: 2000-2020

track_id_list = []

for i in range(0,10000):
    
    wildcard = [random.choice(string.ascii_letters + string.digits) + '%25',
                '%25' + random.choice(string.ascii_letters + string.digits),
                '%25' + random.choice(string.ascii_letters + string.digits) + '%25']
    
    random_char = random.choice(wildcard)
    
    search = sp.search(q=random_char + ' year:2000-2020',
                   type='track',
                   market='US', 
                   offset=random.randint(0,1990))
    
    track_id = search['tracks']['items'][0]['id']
    if track_id is None:
        track_id_list.append(np.nan)
    else:
        track_id_list.append(track_id)

IndexError: list index out of range

In [37]:
len(track_id_list)

3234

In [38]:
# Spotify capped at 3234 songs
# Rinse and repeat, adding more tracks to track_id_list

#track_id_list = []

for i in range(0,10000):
    
    wildcard = [random.choice(string.ascii_letters + string.digits) + '%25',
                '%25' + random.choice(string.ascii_letters + string.digits),
                '%25' + random.choice(string.ascii_letters + string.digits) + '%25']
    
    random_char = random.choice(wildcard)
    
    search = sp.search(q=random_char + ' year:2000-2020',
                   type='track',
                   market='US', 
                   offset=random.randint(0,1990))
    
    track_id = search['tracks']['items'][0]['id']
    if track_id is None:
        track_id_list.append(np.nan)
    else:
        track_id_list.append(track_id)

In [39]:
len(track_id_list)

13234

In [40]:
df = pd.DataFrame()
df['track_id'] = track_id_list

In [41]:
df

Unnamed: 0,track_id
0,2h3OD7psc46VriLAocmNXX
1,6n2q9hCLbIwKesMi4aFXZn
2,2QunlTaeHvNx773hR5mKjR
3,5pPeWvlH7T74gLBmUKtUde
4,70D5WTh5yGvDeRinBNE1Nw
...,...
13229,1wVyjIbWNzd9M8uTC9hKKz
13230,1ZDn83aJISoW0qXomuTg9c
13231,7uHXdPPYhu5TCwWsdtHJuk
13232,1V0zsWLBuuEgb5kbkIjO83


In [42]:
# No NaN's

df[df['track_id'].isnull()]

Unnamed: 0,track_id


In [43]:
# 3759 duplicates

df[df.duplicated()].shape

(3759, 1)

In [44]:
df.drop_duplicates(inplace=True)
df

Unnamed: 0,track_id
0,2h3OD7psc46VriLAocmNXX
1,6n2q9hCLbIwKesMi4aFXZn
2,2QunlTaeHvNx773hR5mKjR
3,5pPeWvlH7T74gLBmUKtUde
4,70D5WTh5yGvDeRinBNE1Nw
...,...
13226,331BqvDc63kvhtNuvvEgHj
13230,1ZDn83aJISoW0qXomuTg9c
13231,7uHXdPPYhu5TCwWsdtHJuk
13232,1V0zsWLBuuEgb5kbkIjO83


In [45]:
df.reset_index(drop=True, inplace=True)

In [46]:
df

Unnamed: 0,track_id
0,2h3OD7psc46VriLAocmNXX
1,6n2q9hCLbIwKesMi4aFXZn
2,2QunlTaeHvNx773hR5mKjR
3,5pPeWvlH7T74gLBmUKtUde
4,70D5WTh5yGvDeRinBNE1Nw
...,...
9470,331BqvDc63kvhtNuvvEgHj
9471,1ZDn83aJISoW0qXomuTg9c
9472,7uHXdPPYhu5TCwWsdtHJuk
9473,1V0zsWLBuuEgb5kbkIjO83


In [47]:
df.to_csv('9475_random_songs_2000-2020.csv', encoding='utf-8', index=False)

In [65]:
# Get audio_features

audio = [sp.audio_features(x) for x in df['track_id']]

In [66]:
audio_df = pd.DataFrame()

empty_row = {'danceability': np.nan,
 'energy': np.nan,
 'key': np.nan,
 'loudness': np.nan,
 'mode': np.nan,
 'speechiness': np.nan,
 'acousticness': np.nan,
 'instrumentalness': np.nan,
 'liveness': np.nan,
 'valence': np.nan,
 'tempo': np.nan,
 'type': np.nan,
 'id': np.nan,
 'uri': np.nan,
 'track_href': np.nan,
 'analysis_url': np.nan,
 'duration_ms': np.nan,
 'time_signature': np.nan}

for i in range(0,len(df['track_id'])):
    if type(audio[i][0]) != type(None):
        audio_df = audio_df.append(pd.json_normalize(audio[i][0]))
    else:
        audio_df = audio_df.append(pd.json_normalize(empty_row))

In [67]:
audio_df.isnull().sum()

danceability      51
energy            51
key               51
loudness          51
mode              51
                  ..
uri               51
track_href        51
analysis_url      51
duration_ms       51
time_signature    51
Length: 18, dtype: int64

In [68]:
audio_df.shape

(9475, 18)

In [69]:
audio_df[audio_df['danceability'].isnull()]

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,,,,,,,,,,,,,,,,,,
0,,,,,,,,,,,,,,,,,,
0,,,,,,,,,,,,,,,,,,
0,,,,,,,,,,,,,,,,,,
0,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,,,,,,,,,,,,,,,,,
0,,,,,,,,,,,,,,,,,,
0,,,,,,,,,,,,,,,,,,
0,,,,,,,,,,,,,,,,,,


In [70]:
audio_df.dropna(inplace=True)

In [71]:
audio_df.reset_index(drop=True, inplace=True)
audio_df

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.557,0.6550,1.0,-11.170,1.0,0.6350,0.571000,0.000000,0.8630,0.587,46.801,audio_features,2h3OD7psc46VriLAocmNXX,spotify:track:2h3OD7psc46VriLAocmNXX,https://api.spotify.com/v1/tracks/2h3OD7psc46V...,https://api.spotify.com/v1/audio-analysis/2h3O...,102373.0,4.0
1,0.550,0.3630,9.0,-12.857,1.0,0.0520,0.456000,0.000020,0.2970,0.565,142.977,audio_features,6n2q9hCLbIwKesMi4aFXZn,spotify:track:6n2q9hCLbIwKesMi4aFXZn,https://api.spotify.com/v1/tracks/6n2q9hCLbIwK...,https://api.spotify.com/v1/audio-analysis/6n2q...,457227.0,4.0
2,0.519,0.3030,4.0,-9.981,1.0,0.0417,0.911000,0.002560,0.0696,0.210,140.406,audio_features,2QunlTaeHvNx773hR5mKjR,spotify:track:2QunlTaeHvNx773hR5mKjR,https://api.spotify.com/v1/tracks/2QunlTaeHvNx...,https://api.spotify.com/v1/audio-analysis/2Qun...,418368.0,4.0
3,0.636,0.8440,1.0,-7.007,1.0,0.0443,0.000018,0.414000,0.2470,0.334,100.030,audio_features,5pPeWvlH7T74gLBmUKtUde,spotify:track:5pPeWvlH7T74gLBmUKtUde,https://api.spotify.com/v1/tracks/5pPeWvlH7T74...,https://api.spotify.com/v1/audio-analysis/5pPe...,169941.0,4.0
4,0.708,0.2870,8.0,-24.331,1.0,0.9050,0.152000,0.000039,0.2570,0.392,126.382,audio_features,70D5WTh5yGvDeRinBNE1Nw,spotify:track:70D5WTh5yGvDeRinBNE1Nw,https://api.spotify.com/v1/tracks/70D5WTh5yGvD...,https://api.spotify.com/v1/audio-analysis/70D5...,212964.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9419,0.718,0.1390,11.0,-20.712,0.0,0.9490,0.947000,0.000000,0.2040,0.598,120.855,audio_features,331BqvDc63kvhtNuvvEgHj,spotify:track:331BqvDc63kvhtNuvvEgHj,https://api.spotify.com/v1/tracks/331BqvDc63kv...,https://api.spotify.com/v1/audio-analysis/331B...,299572.0,4.0
9420,0.631,0.2360,4.0,-22.540,1.0,0.9390,0.676000,0.000000,0.2210,0.636,169.984,audio_features,1ZDn83aJISoW0qXomuTg9c,spotify:track:1ZDn83aJISoW0qXomuTg9c,https://api.spotify.com/v1/tracks/1ZDn83aJISoW...,https://api.spotify.com/v1/audio-analysis/1ZDn...,122100.0,4.0
9421,0.384,0.0821,6.0,-27.902,1.0,0.0499,0.995000,0.920000,0.1070,0.612,149.326,audio_features,7uHXdPPYhu5TCwWsdtHJuk,spotify:track:7uHXdPPYhu5TCwWsdtHJuk,https://api.spotify.com/v1/tracks/7uHXdPPYhu5T...,https://api.spotify.com/v1/audio-analysis/7uHX...,92507.0,4.0
9422,0.769,0.8600,7.0,-4.854,1.0,0.0349,0.000363,0.000004,0.0385,0.971,120.428,audio_features,1V0zsWLBuuEgb5kbkIjO83,spotify:track:1V0zsWLBuuEgb5kbkIjO83,https://api.spotify.com/v1/tracks/1V0zsWLBuuEg...,https://api.spotify.com/v1/audio-analysis/1V0z...,241773.0,4.0


In [72]:
audio_df['success'] = 0.0

In [73]:
audio_df

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,success
0,0.557,0.6550,1.0,-11.170,1.0,0.6350,0.571000,0.000000,0.8630,0.587,46.801,audio_features,2h3OD7psc46VriLAocmNXX,spotify:track:2h3OD7psc46VriLAocmNXX,https://api.spotify.com/v1/tracks/2h3OD7psc46V...,https://api.spotify.com/v1/audio-analysis/2h3O...,102373.0,4.0,0.0
1,0.550,0.3630,9.0,-12.857,1.0,0.0520,0.456000,0.000020,0.2970,0.565,142.977,audio_features,6n2q9hCLbIwKesMi4aFXZn,spotify:track:6n2q9hCLbIwKesMi4aFXZn,https://api.spotify.com/v1/tracks/6n2q9hCLbIwK...,https://api.spotify.com/v1/audio-analysis/6n2q...,457227.0,4.0,0.0
2,0.519,0.3030,4.0,-9.981,1.0,0.0417,0.911000,0.002560,0.0696,0.210,140.406,audio_features,2QunlTaeHvNx773hR5mKjR,spotify:track:2QunlTaeHvNx773hR5mKjR,https://api.spotify.com/v1/tracks/2QunlTaeHvNx...,https://api.spotify.com/v1/audio-analysis/2Qun...,418368.0,4.0,0.0
3,0.636,0.8440,1.0,-7.007,1.0,0.0443,0.000018,0.414000,0.2470,0.334,100.030,audio_features,5pPeWvlH7T74gLBmUKtUde,spotify:track:5pPeWvlH7T74gLBmUKtUde,https://api.spotify.com/v1/tracks/5pPeWvlH7T74...,https://api.spotify.com/v1/audio-analysis/5pPe...,169941.0,4.0,0.0
4,0.708,0.2870,8.0,-24.331,1.0,0.9050,0.152000,0.000039,0.2570,0.392,126.382,audio_features,70D5WTh5yGvDeRinBNE1Nw,spotify:track:70D5WTh5yGvDeRinBNE1Nw,https://api.spotify.com/v1/tracks/70D5WTh5yGvD...,https://api.spotify.com/v1/audio-analysis/70D5...,212964.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9419,0.718,0.1390,11.0,-20.712,0.0,0.9490,0.947000,0.000000,0.2040,0.598,120.855,audio_features,331BqvDc63kvhtNuvvEgHj,spotify:track:331BqvDc63kvhtNuvvEgHj,https://api.spotify.com/v1/tracks/331BqvDc63kv...,https://api.spotify.com/v1/audio-analysis/331B...,299572.0,4.0,0.0
9420,0.631,0.2360,4.0,-22.540,1.0,0.9390,0.676000,0.000000,0.2210,0.636,169.984,audio_features,1ZDn83aJISoW0qXomuTg9c,spotify:track:1ZDn83aJISoW0qXomuTg9c,https://api.spotify.com/v1/tracks/1ZDn83aJISoW...,https://api.spotify.com/v1/audio-analysis/1ZDn...,122100.0,4.0,0.0
9421,0.384,0.0821,6.0,-27.902,1.0,0.0499,0.995000,0.920000,0.1070,0.612,149.326,audio_features,7uHXdPPYhu5TCwWsdtHJuk,spotify:track:7uHXdPPYhu5TCwWsdtHJuk,https://api.spotify.com/v1/tracks/7uHXdPPYhu5T...,https://api.spotify.com/v1/audio-analysis/7uHX...,92507.0,4.0,0.0
9422,0.769,0.8600,7.0,-4.854,1.0,0.0349,0.000363,0.000004,0.0385,0.971,120.428,audio_features,1V0zsWLBuuEgb5kbkIjO83,spotify:track:1V0zsWLBuuEgb5kbkIjO83,https://api.spotify.com/v1/tracks/1V0zsWLBuuEg...,https://api.spotify.com/v1/audio-analysis/1V0z...,241773.0,4.0,0.0


In [74]:
audio_df.to_csv('9424_random_songs_with_audio_features.csv', encoding='utf-8', index=False)

In [9]:
# Getting 10,000 pseudo-random songs to balance the hit/not-hit songs datasets
# Year range same as selected for hit songs: 2000-2020

for i in range(0,10000):
    
    wildcard = [random.choice(string.ascii_letters + string.digits) + '%25',
                '%25' + random.choice(string.ascii_letters + string.digits),
                '%25' + random.choice(string.ascii_letters + string.digits) + '%25']
    
    random_char = random.choice(wildcard)
    
    search = sp.search(q=random_char + ' year:2000-2020',
                   type='track',
                   market='US', 
                   offset=random.randint(0,1990))
    
    track_id = search['tracks']['items'][0]['id']
    if track_id is None:
        track_id_list.append(np.nan)
    else:
        track_id_list.append(track_id)

In [10]:
len(track_id_list)

24290

In [11]:
df = pd.DataFrame()
df['track_id'] = track_id_list

In [12]:
df

Unnamed: 0,track_id
0,3jLui4jomVtV5IIxVdlXc6
1,7A6QJHLybuMSrb6yoXempj
2,4b12WupG6EZjCXUxDSUDg0
3,2HLhjO1RuzdAnbKN1NXasT
4,1K6u8GsGhj1i1YLdw5xW9k
...,...
24285,4G3TChSwzrsMNUBr8nniNl
24286,0hFoS6KHyNsw7ed6N34OHJ
24287,7ciGjTLjdKNGO6ypIhdirf
24288,6WmfR63P8pITWNOXvZCmE9


In [13]:
# No NaN's

df[df['track_id'].isnull()]

Unnamed: 0,track_id


In [14]:
# 9985 duplicates

df[df.duplicated()].shape

(9985, 1)

In [15]:
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,track_id
0,3jLui4jomVtV5IIxVdlXc6
1,7A6QJHLybuMSrb6yoXempj
2,4b12WupG6EZjCXUxDSUDg0
3,2HLhjO1RuzdAnbKN1NXasT
4,1K6u8GsGhj1i1YLdw5xW9k
...,...
14300,3kFljKgbLoEhcK4ywvHoIq
14301,60avJID2555xrhT5db85Kq
14302,0mI4jis8ruYzQHHmxFmKyi
14303,7ciGjTLjdKNGO6ypIhdirf


In [16]:
df.to_csv('14305_random_songs_2000-2020.csv', encoding='utf-8', index=False)

In [18]:
df2 = pd.read_csv('9475_random_songs_2000-2020.csv')

In [19]:
df3 = pd.concat([df, df2], ignore_index=True)

In [20]:
df3

Unnamed: 0,track_id
0,3jLui4jomVtV5IIxVdlXc6
1,7A6QJHLybuMSrb6yoXempj
2,4b12WupG6EZjCXUxDSUDg0
3,2HLhjO1RuzdAnbKN1NXasT
4,1K6u8GsGhj1i1YLdw5xW9k
...,...
23775,331BqvDc63kvhtNuvvEgHj
23776,1ZDn83aJISoW0qXomuTg9c
23777,7uHXdPPYhu5TCwWsdtHJuk
23778,1V0zsWLBuuEgb5kbkIjO83


In [21]:
df3[df3.duplicated()].shape

(4748, 1)

In [22]:
df3.drop_duplicates(inplace=True)
df3.reset_index(drop=True, inplace=True)
df3

Unnamed: 0,track_id
0,3jLui4jomVtV5IIxVdlXc6
1,7A6QJHLybuMSrb6yoXempj
2,4b12WupG6EZjCXUxDSUDg0
3,2HLhjO1RuzdAnbKN1NXasT
4,1K6u8GsGhj1i1YLdw5xW9k
...,...
19027,1tzc7FS2RoO8ywUIDtVTT8
19028,5jeZEby2UbZ68tIJrdOn5c
19029,461cvQ7z4CJ2RmLjGQyfu0
19030,331BqvDc63kvhtNuvvEgHj


In [23]:
df3.to_csv('19032_random_songs_2000-2020.csv', encoding='utf-8', index=False)