In [1]:
import base64
import csv
import datetime
import json
import os
import random as rd
from collections import Counter
from urllib.parse import urlencode

import pandas as pd
import requests

## Spotify API Client setup

In [2]:
# Credentials are read from the environment so that secrets never live in the
# notebook; export SPOTIFY_CLIENT_ID / SPOTIFY_CLIENT_SECRET before starting
# the kernel. Unset variables yield None, which
# SpotifyAPI.get_client_credentials rejects with an explicit error.
client_id = os.environ.get('SPOTIFY_CLIENT_ID')
client_secret = os.environ.get('SPOTIFY_CLIENT_SECRET')

In [3]:
class SpotifyAPI(object):
    """Minimal client for the Spotify client-credentials token flow.

    After a successful ``perform_auth()`` the bearer token is available as
    ``access_token`` and its expiry time as ``access_token_expires``.
    """

    access_token = None
    # Class-level defaults; replaced per instance by perform_auth().
    access_token_expires = datetime.datetime.now()
    access_token_did_expire = True
    client_id = None
    client_secret = None
    token_url = 'https://accounts.spotify.com/api/token'

    def __init__(self, client_id, client_secret, *args, **kwargs):
        """Store the application credentials used to request tokens."""
        super().__init__(*args, **kwargs)
        self.client_id = client_id
        self.client_secret = client_secret

    def get_client_credentials(self):
        """Return ``client_id:client_secret`` encoded as a base64 string.

        Raises:
            Exception: if either credential is None.
        """
        client_id = self.client_id
        client_secret = self.client_secret
        if client_secret is None or client_id is None:
            raise Exception('You must set client_id and client_secret')
        client_creds = f'{client_id}:{client_secret}'
        client_creds_b64 = base64.b64encode(client_creds.encode())
        return client_creds_b64.decode()

    def get_token_headers(self):
        """Return the HTTP Basic ``Authorization`` header for the token request."""
        client_creds_b64 = self.get_client_credentials()
        return {
            'Authorization': f'Basic {client_creds_b64}'
        }

    def get_token_data(self):
        """Return the form body for the token request."""
        return {
            'grant_type': 'client_credentials'
        }

    def perform_auth(self):
        """Request an access token and store it on the instance.

        Returns:
            bool: True when a token was obtained, False when the token
            endpoint answered with a non-2xx status (the response is printed).
        """
        token_url = self.token_url
        token_data = self.get_token_data()
        token_headers = self.get_token_headers()
        r = requests.post(token_url, data=token_data, headers=token_headers)
        # Accept the whole 2xx family; range(200, 299) wrongly excluded 299.
        if not 200 <= r.status_code < 300:
            print(r)
            return False
        data = r.json()
        now = datetime.datetime.now()
        access_token = data['access_token']
        expires_in = data['expires_in']
        expires = now + datetime.timedelta(seconds=expires_in)
        self.access_token = access_token
        self.access_token_expires = expires
        self.access_token_did_expire = expires < now
        return True

In [4]:
# Build the API client from the configured credentials, then request a token.
# perform_auth() is the cell's last expression, so its boolean result is shown.
client = SpotifyAPI(client_id=client_id, client_secret=client_secret)
client.perform_auth()

True

In [123]:
# Bearer-token header built from the client's current access token.
bearer_value = f'Bearer {client.access_token}'
headers = {'Authorization': bearer_value}

# Spotify million playlists dataset conversions

In [2]:
# Directory holding the mpd.slice.*.json files. A raw string keeps the Windows
# backslashes literal instead of relying on invalid escape sequences (\s, \d).
path_base = r'D:\schule\diplomarbeit\spotify_million_playlist_dataset\data'
# Number of playlists to convert; slices are read in steps of 1000.
n_playlists = 10000

## Training dataset - URM (Playlists x Tracks)
#### Format:
pid, track_uri, rating, playlist_name, track_name, artist_uri, artist_name

In [25]:
# Build the playlist x track table: one row per track occurrence in a
# playlist, with a constant rating of 1.
song_playlist_rows = []
for i in range(0, n_playlists, 1000):
    path = f'{path_base}\\mpd.slice.{i}-{i+999}.json'
    # The context manager closes each slice file as soon as it is parsed.
    with open(path, 'r', encoding='utf-8') as slice_file:
        d = json.load(slice_file)
    # Iterate the parsed playlists directly; no intermediate DataFrame needed.
    for playlist in d['playlists']:
        for track in playlist['tracks']:
            song_playlist_rows.append([playlist['pid'], track['track_uri'], 1, playlist['name'], track['track_name'], track['artist_uri'], track['artist_name']])
mpd_df = pd.DataFrame(song_playlist_rows, columns=['pid', 'track_uri', 'rating', 'playlist_name', 'track_name', 'artist_uri','artist_name'])
mpd_df.to_csv(f'D:\\schule\\diplomarbeit\\converted_csv\\mpd_slice_0-{n_playlists-1}.csv', index=False)

## Training dataset - URM (Playlists x Artists)
#### Format:
pid, artist_uri, rating, playlist_name, artist_name, track_uri, track_name

In [24]:
# Build the playlist x artist table. The rating grows with the number of
# tracks the artist has in the playlist: rating = 1 - 1 / (1 + count).
artist_playlist_rows = []
for i in range(0, n_playlists, 1000):
    path = f'{path_base}\\mpd.slice.{i}-{i+999}.json'
    # The context manager closes each slice file as soon as it is parsed.
    with open(path, 'r', encoding='utf-8') as slice_file:
        d = json.load(slice_file)
    for playlist in d['playlists']:
        playlist_tracks = playlist['tracks']
        # Count every artist once per playlist instead of rescanning the
        # whole track list for each track.
        artist_counts = Counter(track['artist_uri'] for track in playlist_tracks)
        for track in playlist_tracks:
            count = artist_counts[track['artist_uri']]
            rating = 1 - (1 / (1 + count))
            artist_playlist_rows.append([playlist['pid'], track['artist_uri'], rating, playlist['name'], track['artist_name'], track['track_uri'], track['track_name']])
mpd_df = pd.DataFrame(artist_playlist_rows, columns=['pid', 'artist_uri', 'rating', 'playlist_name', 'artist_name', 'track_uri', 'track_name'])
mpd_df.to_csv(f'D:\\schule\\diplomarbeit\\converted_csv\\playlists_x_artists_0-{n_playlists-1}_occ_playlist.csv', index=False)

## Unique tracks
#### Format:
track_uri, track_name, artist_uri, artist_name, album_uri, album_name, duration_ms

In [3]:
# Collect every track occurrence, then keep the first row per track_uri.
track_rows = []
for i in range(0, n_playlists, 1000):
    path = f'{path_base}\\mpd.slice.{i}-{i+999}.json'
    # The context manager closes each slice file as soon as it is parsed.
    with open(path, 'r', encoding='utf-8') as slice_file:
        d = json.load(slice_file)
    for playlist in d['playlists']:
        for track in playlist['tracks']:
            track_rows.append([track['track_uri'], track['track_name'], track['artist_uri'], track['artist_name'], track['album_uri'], track['album_name'], track['duration_ms']])
mpd_df = pd.DataFrame(track_rows, columns=['track_uri', 'track_name', 'artist_uri', 'artist_name', 'album_uri', 'album_name', 'duration_ms'])
# Reassign instead of inplace=True so the cell stays idempotent and chainable.
mpd_df = mpd_df.drop_duplicates(subset='track_uri', keep='first')
mpd_df.to_csv(f'D:\\schule\\diplomarbeit\\converted_csv\\unique_tracks_0-{n_playlists-1}.csv', index=False)

## Unique artists
#### Format:
artist_uri, artist_name

In [5]:
# Collect (artist_uri, artist_name) for every track occurrence, then keep the
# first row per artist_uri.
artist_rows = []
for i in range(0, n_playlists, 1000):
    path = f'{path_base}\\mpd.slice.{i}-{i+999}.json'
    # The context manager closes each slice file as soon as it is parsed.
    with open(path, 'r', encoding='utf-8') as slice_file:
        d = json.load(slice_file)
    for playlist in d['playlists']:
        for track in playlist['tracks']:
            artist_rows.append([track['artist_uri'], track['artist_name']])
mpd_df = pd.DataFrame(artist_rows, columns=['artist_uri', 'artist_name'])
# Reassign instead of inplace=True so the cell stays idempotent and chainable.
mpd_df = mpd_df.drop_duplicates(subset='artist_uri', keep='first')
mpd_df.to_csv(f'D:\\schule\\diplomarbeit\\converted_csv\\unique_artists_0-{n_playlists-1}.csv', index=False)

## Playlist information
#### Format:
pid, name, num_tracks, num_albums, num_followers, num_artists, duration_ms

In [8]:
# One row of summary fields per playlist.
playlist_rows = []
for i in range(0, n_playlists, 1000):
    path = f'{path_base}\\mpd.slice.{i}-{i+999}.json'
    # The context manager closes each slice file as soon as it is parsed.
    with open(path, 'r', encoding='utf-8') as slice_file:
        d = json.load(slice_file)
    for playlist in d['playlists']:
        playlist_rows.append([playlist['pid'], playlist['name'], playlist['num_tracks'], playlist['num_albums'], playlist['num_followers'], playlist['num_artists'], playlist['duration_ms']])
mpd_df = pd.DataFrame(playlist_rows, columns=['pid', 'name', 'num_tracks', 'num_albums', 'num_followers', 'num_artists', 'duration_ms'])
mpd_df.to_csv(f'D:\\schule\\diplomarbeit\\converted_csv\\playlists_0-{n_playlists-1}.csv', index=False)

## Minimized and combined MPD
#### Format:
<img src="./minimized_mpd_json.png" height="500" width="500">

In [7]:
# Combine all slices into one list of playlists, dropping the slice-level
# 'info' block and three per-playlist fields to shrink the output.
playlists = []
for i in range(0, n_playlists, 1000):
    path = f'{path_base}\\mpd.slice.{i}-{i+999}.json'
    # The context manager closes each slice file as soon as it is parsed.
    with open(path, 'r', encoding='utf-8') as slice_file:
        mpd_slice = json.load(slice_file)
    del mpd_slice['info']
    for playlist in mpd_slice['playlists']:
        del playlist['collaborative']
        del playlist['modified_at']
        del playlist['num_edits']
    playlists.extend(mpd_slice['playlists'])

with open(f'minimized_mpd_0-{n_playlists-1}.json', 'w') as f:
    json.dump(playlists, f, indent=4)

## Unique tracks (JSON)
#### Format:
<img src="./unique_tracks_json.png" height="500" width="500">

In [5]:
# Convert the unique-tracks CSV into a JSON array of records.
unique_tracks_csv_file = "D:\\schule\\diplomarbeit\\converted_csv\\unique_tracks_0-9999.csv"
unique_tracks_json_file = "D:\\schule\\diplomarbeit\\converted_json\\unique_tracks_0-9999.json"

# The file's own header row is skipped and replaced by these names.
track_columns = ['track_uri','track_name','artist_uri','artist_name','album_uri','album_name','duration_ms']
tracks = pd.read_csv(
    unique_tracks_csv_file,
    delimiter=',',
    encoding='utf-8',
    header=None,
    names=track_columns,
    skiprows=1,
    low_memory=False,
)
tracks.to_json(unique_tracks_json_file, orient='records', indent=4)

# Spotify API

**Extract the popularity of each track from the Spotify API**

In [124]:
endpoint = 'https://api.spotify.com/v1/tracks/?ids='
def get_popularities_of_tracks(chunk):
    """Return the popularity of each track URI in ``chunk``, in order.

    Args:
        chunk: sequence of ``spotify:track:<id>`` URIs looked up in one request.

    Returns:
        list: one popularity value per URI; 0 where the API returned no
        track object for that id.

    Raises:
        requests.HTTPError: if the API answers with an error status, so a
        failed request cannot be mistaken for missing data.
    """
    # Nothing to look up: skip the request entirely.
    if len(chunk) == 0:
        return []
    # The id is the third colon-separated field of the URI.
    track_ids = [uri.split(':')[2] for uri in chunk]
    lookup_url = endpoint + ','.join(track_ids)
    response = requests.get(lookup_url, headers=headers)
    response.raise_for_status()
    return [
        0 if track is None else track['popularity']
        for track in response.json()['tracks']
    ]

In [125]:
# CSV of unique tracks that the popularity lookup below reads.
unique_tracks_file = "D:\\schule\\diplomarbeit\\converted_csv\\unique_tracks_0-1999.csv"

In [126]:
# Select the needed columns by header name. Passing three positional names for
# a file with more columns (the unique-tracks export writes seven) would turn
# the leading columns into the index and mislabel the rest.
tracks = pd.read_csv(unique_tracks_file, delimiter=',', encoding='utf-8', low_memory=False,
                     usecols=['track_uri', 'track_name', 'artist_name'])
tracks.head()

Unnamed: 0,track_uri,track_name,artist_name
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Lose Control (feat. Ciara & Fat Man Scoop),Missy Elliott
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Toxic,Britney Spears
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Crazy In Love,Beyoncé
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Rock Your Body,Justin Timberlake
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,It Wasn't Me,Shaggy


In [128]:
# Split the track URIs into consecutive batches of 50 rows.
chunks = [
    tracks.iloc[start:start + 50]['track_uri']
    for start in range(0, len(tracks), 50)
]

In [129]:
# Look up every batch and collect the popularities in track order.
popularity_list = []
for uri_batch in chunks:
    batch_popularities = get_popularities_of_tracks(list(uri_batch))
    popularity_list += batch_popularities
print(len(popularity_list))

57884


In [331]:
# Keep an untouched shallow copy before the zeros are replaced.
backup = popularity_list[:]

In [333]:
# Number of entries whose popularity is 0.
sum(1 for value in popularity_list if value == 0)

43722

In [130]:
# Popularity values that are not 0.
non_zero = [value for value in popularity_list if value != 0]

In [131]:
# Rounded mean of the non-zero popularities; falls back to 0 when there are
# none, instead of raising ZeroDivisionError.
average = round(sum(non_zero) / len(non_zero)) if non_zero else 0

In [345]:
# Display the rounded mean of the non-zero popularity values.
average

32

In [135]:
# Replace every 0 with the average, updating the existing list object in place.
popularity_list[:] = [
    average if value == 0 else value
    for value in popularity_list
]

**Append popularity column to unique tracks csv file**

In [136]:
# Add the popularity values as a new column; the list must hold exactly one
# entry per row of `tracks`.
tracks['popularity'] = popularity_list

In [137]:
# Preview the first rows to check the new popularity column.
tracks.head()

Unnamed: 0,track_uri,track_name,artist_name,popularity
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Lose Control (feat. Ciara & Fat Man Scoop),Missy Elliott,67
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Toxic,Britney Spears,79
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Crazy In Love,Beyoncé,24
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Rock Your Body,Justin Timberlake,72
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,It Wasn't Me,Shaggy,38


In [138]:
# Persist the tracks table including the popularity column. Backslashes are
# escaped explicitly and the placeholder-less f-prefix is dropped.
# NOTE(review): a later cell reads
# "unique_tracks_with_popularity_average_0-1999.csv"; confirm which file name
# is intended.
tracks.to_csv('D:\\schule\\diplomarbeit\\converted_csv\\mpd_slice_0-1999_tracks_with_popularity_average.csv', index=False)

**Modify rating in mpd dataset based on popularity**

In [139]:
# Playlist x track CSV to re-rate; a raw string avoids invalid escape sequences.
mpd_csv_file = r"D:\schule\diplomarbeit\converted_csv\mpd_slice_0-1999.csv"

In [140]:
# Select columns by header name. The playlist x track export also writes an
# artist_uri column, so six positional names would shift every column by one.
playlists = pd.read_csv(mpd_csv_file, delimiter=',', encoding='utf-8', low_memory=False,
                        usecols=['pid', 'track_uri', 'rating', 'playlist_name', 'track_name', 'artist_name'])
playlists.head()

Unnamed: 0,pid,track_uri,rating,playlist_name,track_name,artist_name
0,0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,1,Throwbacks,Lose Control (feat. Ciara & Fat Man Scoop),Missy Elliott
1,0,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,1,Throwbacks,Toxic,Britney Spears
2,0,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,1,Throwbacks,Crazy In Love,Beyoncé
3,0,spotify:track:1AWQoqb9bSvzTjaLralEkT,1,Throwbacks,Rock Your Body,Justin Timberlake
4,0,spotify:track:1lzr43nnXAijIGYnCT8M8H,1,Throwbacks,It Wasn't Me,Shaggy


In [146]:
# Path of the unique-tracks CSV that carries a popularity column (read below).
unique_tracks_population_file = "D:\\schule\\diplomarbeit\\converted_csv\\unique_tracks_with_popularity_average_0-1999.csv"

In [147]:
# Load the tracks table with popularity; the file's header row is skipped and
# replaced by the names below.
popularity_columns = ['track_uri','track_name','artist_name', 'popularity']
tracks = pd.read_csv(
    unique_tracks_population_file,
    delimiter=',',
    encoding='utf-8',
    header=None,
    names=popularity_columns,
    skiprows=1,
    low_memory=False,
)
tracks.head()

Unnamed: 0,track_uri,track_name,artist_name,popularity
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Lose Control (feat. Ciara & Fat Man Scoop),Missy Elliott,67
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Toxic,Britney Spears,79
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Crazy In Love,Beyoncé,24
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Rock Your Body,Justin Timberlake,72
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,It Wasn't Me,Shaggy,38


In [None]:
# Popularity of the track in every playlist row, in row order.
# Fixes over the first draft:
#   * the column is 'popularity' ('population' raised KeyError);
#   * all rows are covered, so the list can be assigned as a column of
#     `playlists` (a 50000-row prefix cannot);
#   * one dict lookup per row replaces a full scan of `tracks` per row.
# The first occurrence of a track_uri wins, as before.
unique_tracks = tracks.drop_duplicates(subset='track_uri', keep='first')
popularity_by_uri = dict(zip(unique_tracks['track_uri'], unique_tracks['popularity']))
popularity_list_mpd = [popularity_by_uri[track_uri] for track_uri in playlists['track_uri']]
len(popularity_list_mpd)

In [148]:
# Row counts of the tracks table and the playlists table.
len(tracks), len(playlists)

(57884, 134125)

In [108]:
# Attach the looked-up popularity values as a column; popularity_list_mpd must
# hold exactly one entry per row of `playlists`.
playlists['popularity'] = popularity_list_mpd

In [None]:
# Show the first 20 rows.
playlists.head(20)

In [89]:
# Set popularity to 1 on every playlist row whose track is among the first
# 1000 rows of `tracks`. A single membership mask replaces one full scan of
# `playlists` per track.
first_track_uris = tracks[:1000]['track_uri']
playlists.loc[playlists['track_uri'].isin(first_track_uris), 'popularity'] = 1

In [149]:
# Overwrite the rating of every playlist row with its track's popularity.
# One vectorised lookup replaces a full scan of `playlists` per track.
# keep='last' mirrors the loop, where a later duplicate track row overwrote an
# earlier one; rows without a matching track keep their rating.
popularity_by_uri = tracks.drop_duplicates(subset='track_uri', keep='last').set_index('track_uri')['popularity']
mapped_popularity = playlists['track_uri'].map(popularity_by_uri)
has_popularity = mapped_popularity.notna()
# map() yields floats when some rows are unmatched; cast back before assigning.
playlists.loc[has_popularity, 'rating'] = mapped_popularity[has_popularity].astype(popularity_by_uri.dtype)

In [150]:
# Persist the re-rated playlist table; backslashes are escaped explicitly
# instead of relying on invalid escape sequences.
playlists.to_csv(f'D:\\schule\\diplomarbeit\\converted_csv\\mpd_slice_0-{n_playlists-1}_popularity_average.csv', index=False)

# Testing

In [3]:
# Slice file used by the test cells; a raw string avoids invalid escape sequences.
path = r"D:\schule\diplomarbeit\MPD\MPD\mpd.slice.0-999.json"

In [13]:
# Read and parse the slice; the context manager closes the file even if
# reading fails.
with open(path) as f:
    js = f.read()
mpd_slice = json.loads(js)

In [40]:
# Number of playlists in the loaded slice.
len(mpd_slice['playlists'])

1000

In [None]:
# Print the name of every playlist in the loaded slice. The names come from
# mpd_slice: iterating the `playlists` DataFrame would yield column labels.
playlist_names = (playlist['name'] for playlist in mpd_slice['playlists'])
for playlist in playlist_names:
    print(playlist)

In [None]:
# Append one (playlist name, track name, 1) row per track of the loaded slice.
# Changes from the first draft:
#   * the loop covers the playlists actually present, so no empty batch is
#     indexed;
#   * the file is opened once, with newline='' so the csv module controls
#     line endings;
#   * the global `playlists` table is no longer overwritten and the leftover
#     debug print is gone.
with open('mpd_file10000.csv', mode='a', encoding='utf-8', newline='') as mpd_file:
    writer = csv.writer(mpd_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for playlist in mpd_slice['playlists']:
        playlist_name = playlist['name']
        for track in playlist['tracks']:
            track_name = track['track_name']
            writer.writerow([playlist_name, track_name, 1])


In [14]:
# Strip fields from the loaded slice and write the result as JSON.
# pop(..., None) keeps the cell re-runnable: `del` raised KeyError once the
# keys had already been removed.
mpd_slice.pop('info', None)
for playlist in mpd_slice['playlists']:
    playlist.pop('collaborative', None)
    playlist.pop('modified_at', None)

with open('shrinked_mpd_test.json', 'w') as f:
    json.dump(mpd_slice, f)

In [20]:
# Count how many tracks of the first playlist belong to one artist.
path = f'{path_base}\\mpd.slice.0-999.json'
# The context manager closes the slice file as soon as it is parsed.
with open(path, 'r', encoding='utf-8') as slice_file:
    d = json.load(slice_file)
tracks = list(d['playlists'][0]['tracks'])
artist_uri = 'spotify:artist:23zg3TcAtWQy7J6upgbUnj'
count_artist = sum(1 for track in tracks if track['artist_uri'] == artist_uri)
count_artist

2

In [23]:
# Test run of the playlist x artist export on the first slice only.
# rating = 1 - 1 / (1 + number of the artist's tracks in the playlist).
artist_playlist_rows = []
for i in range(0, 1000, 1000):
    path = f'{path_base}\\mpd.slice.{i}-{i+999}.json'
    # The context manager closes the slice file as soon as it is parsed.
    with open(path, 'r', encoding='utf-8') as slice_file:
        d = json.load(slice_file)
    for playlist in d['playlists']:
        playlist_tracks = playlist['tracks']
        # Count every artist once per playlist instead of rescanning the
        # whole track list for each track.
        artist_counts = Counter(track['artist_uri'] for track in playlist_tracks)
        for track in playlist_tracks:
            count = artist_counts[track['artist_uri']]
            rating = 1 - (1 / (1 + count))
            artist_playlist_rows.append([playlist['pid'], track['artist_uri'], rating, playlist['name'], track['artist_name'], track['track_uri'], track['track_name']])
mpd_df = pd.DataFrame(artist_playlist_rows, columns=['pid', 'artist_uri', 'rating', 'playlist_name', 'artist_name', 'track_uri', 'track_name'])
mpd_df.to_csv(f'D:\\schule\\diplomarbeit\\converted_csv\\testing_0-{n_playlists-1}.csv', index=False)