In [4]:
import numpy as np
import pandas as pd
import json
import os
import random
import string
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Truncate DataFrame previews to 10 rows.
# The fully qualified key is used on purpose: the bare "max_rows" pattern also
# matches "styler.render.max_rows" in pandas versions that define it, which
# makes pd.set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option("display.max_rows", 10)

In [2]:
# Load the Spotify Web API credentials from the hidden settings.env file.
# The file content is parsed as JSON.
with open('settings.env') as env_file:
    env_vars = json.load(env_file)

# Export both credentials as environment variables.
for credential_name in ('SPOTIPY_CLIENT_ID', 'SPOTIPY_CLIENT_SECRET'):
    os.environ[credential_name] = env_vars[credential_name]

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# The credentials manager is built without arguments; presumably it picks up
# the two environment variables set above (confirm against the spotipy docs).
credentials_manager = SpotifyClientCredentials()
sp = spotipy.Spotify(client_credentials_manager=credentials_manager)

In [5]:
# Getting 8100 random songs to balance the hit/not-hit songs datasets
# Year range same as selected for hit songs: 2000-2020
#
# Each request searches for one random letter/digit at a random offset and
# keeps the first track of the returned page. The same ID can be drawn more
# than once, so track_id_list may contain duplicates.

n_tracks = 8100
# Upper bound on requests so that repeated empty pages cannot loop forever.
# If it is hit, track_id_list ends up shorter than n_tracks.
max_requests = 3 * n_tracks
search_chars = string.ascii_letters + string.digits

track_id_list = []
n_requests = 0

while len(track_id_list) < n_tracks and n_requests < max_requests:
    n_requests += 1
    search = sp.search(q=random.choice(search_chars) + ' year:2000-2020',
                       type='track',
                       market='US',
                       limit=1,  # only the first item of the page is used
                       offset=random.randint(0,1990))
    items = search['tracks']['items']
    if not items:
        # An empty page has no items[0]; indexing it unconditionally would
        # raise IndexError and abort the whole collection, so draw again.
        continue
    track_id = items[0]['id']
    track_id_list.append(track_id)

In [9]:
# Number of track IDs collected by the loop above (duplicates still included)
len(track_id_list)

8100

In [6]:
# Put the collected IDs into a single-column DataFrame ('track_id')
df = pd.DataFrame().assign(track_id=track_id_list)

In [10]:
# Preview the frame (the row display limit is set by pd.set_option at the top)
df

Unnamed: 0,track_id
0,608a1wIsSd5KzMEqm1O7w3
1,33VcQq9PLgJhMllZtPRP4M
2,5mI9HJdkqtqWr7KsA8hIbt
3,4FDG9SHyQkxkJxGLrF1ZIp
4,21kOVEG3bDCVphKhXL8XmQ
...,...
8095,6YXG34IWbilypPwjBTOrUC
8096,08cXy6KUizaAelYXtcew3w
8097,6Hmj7SrLRbreLVfVS7mV1S
8098,30AW1wD8qK1Bqw3Z44xs3f


In [11]:
# Select rows whose track_id is missing; the recorded output is empty (no NaN's)

df.loc[df['track_id'].isna()]

Unnamed: 0,track_id


In [13]:
# Shape of the rows that repeat an earlier row (1837 duplicates in the recorded run)

df.loc[df.duplicated(keep='first')].shape

(1837, 1)

In [14]:
# Keep the first occurrence of every row; the original index labels are kept
df = df.drop_duplicates()
df

Unnamed: 0,track_id
0,608a1wIsSd5KzMEqm1O7w3
1,33VcQq9PLgJhMllZtPRP4M
2,5mI9HJdkqtqWr7KsA8hIbt
3,4FDG9SHyQkxkJxGLrF1ZIp
4,21kOVEG3bDCVphKhXL8XmQ
...,...
8092,6MBPfkgkEpivlswUuTMdj0
8095,6YXG34IWbilypPwjBTOrUC
8096,08cXy6KUizaAelYXtcew3w
8098,30AW1wD8qK1Bqw3Z44xs3f


In [40]:
# Replace the index with a fresh 0..n-1 range, discarding the old labels
df = df.reset_index(drop=True)

In [42]:
# Save the de-duplicated track IDs as UTF-8 CSV without the index column.
# NOTE(review): the recorded run has 8100 fetched IDs and 1837 duplicates,
# i.e. 6263 rows, so the hard-coded "6262" in the filename looks like the last
# index label rather than the row count. The path is left unchanged because
# other code may read this file by name.
df.to_csv('6262_random_songs_2000-2020.csv', encoding='utf-8', index=False)

In [56]:
# Getting 4000 more random songs to balance the hit/not-hit songs datasets
# Year range same as selected for hit songs: 2000-2020
#
# Each request searches for one random letter/digit at a random offset and
# keeps the first track of the returned page. The same ID can be drawn more
# than once, so track_id_list may contain duplicates.
# NOTE(review): track_id_list is rebuilt from scratch here, so this batch is
# not checked against the IDs collected in the first batch.

n_tracks = 4000
# Upper bound on requests so that repeated empty pages cannot loop forever.
# If it is hit, track_id_list ends up shorter than n_tracks.
max_requests = 3 * n_tracks
search_chars = string.ascii_letters + string.digits

track_id_list = []
n_requests = 0

while len(track_id_list) < n_tracks and n_requests < max_requests:
    n_requests += 1
    search = sp.search(q=random.choice(search_chars) + ' year:2000-2020',
                       type='track',
                       market='US',
                       limit=1,  # only the first item of the page is used
                       offset=random.randint(0,1990))
    items = search['tracks']['items']
    if not items:
        # An empty page has no items[0]; indexing it unconditionally would
        # raise IndexError and abort the whole collection, so draw again.
        continue
    track_id = items[0]['id']
    track_id_list.append(track_id)

In [57]:
# Number of track IDs collected for the second batch (duplicates still included)
len(track_id_list)

4000

In [58]:
# Put the second batch of IDs into a new single-column DataFrame ('track_id');
# this rebinds df, replacing the first batch's frame
df = pd.DataFrame().assign(track_id=track_id_list)

In [59]:
# Preview the second batch (the row display limit is set by pd.set_option at the top)
df

Unnamed: 0,track_id
0,4ek3pPdfvIbPxynmlisI0k
1,2aibwv5hGXSgw7Yru8IYTO
2,3LUWWox8YYykohBbHUrrxd
3,1ckLp8lCl8LipXI0ypX72m
4,3fBbfrWZUuWWrMQXFISr7N
...,...
3995,6e7hIhOLH9zvb3zP5O5gt0
3996,0CLibGiioSuyci4NSbYi9q
3997,6otiaV2fagE3s8IvP6WkwG
3998,6mifT2myhvgBlPpf8kyHT7


In [60]:
# Select rows whose track_id is missing; the recorded output is empty (no NaN's)

df.loc[df['track_id'].isna()]

Unnamed: 0,track_id


In [62]:
# Shape of the rows that repeat an earlier row (501 duplicates in the recorded run)

df.loc[df.duplicated(keep='first')].shape

(501, 1)

In [63]:
# Keep the first occurrence of every row, then renumber the index from 0
df = df.drop_duplicates().reset_index(drop=True)
df

Unnamed: 0,track_id
0,4ek3pPdfvIbPxynmlisI0k
1,2aibwv5hGXSgw7Yru8IYTO
2,3LUWWox8YYykohBbHUrrxd
3,1ckLp8lCl8LipXI0ypX72m
4,3fBbfrWZUuWWrMQXFISr7N
...,...
3494,6V81K1OcvYgHEnCA6A2Qx9
3495,4D1eVq5AUILwjg3tAe7o6M
3496,6otiaV2fagE3s8IvP6WkwG
3497,6mifT2myhvgBlPpf8kyHT7


In [64]:
# Save the de-duplicated second batch as UTF-8 CSV without the index column.
# The "3499" in the filename is hard-coded; it matches the recorded run
# (4000 fetched IDs minus 501 duplicates), but the sampling is random, so a
# re-run will likely produce a different row count.
# NOTE(review): this batch is never compared with the IDs written to the first
# CSV, so the two files may share track IDs - confirm this is handled downstream.
df.to_csv('3499_random_songs_2000-2020.csv', encoding='utf-8', index=False)