In [None]:
!pip install -U networkx
!pip install -U scipy

In [1]:
import math
import json
import scipy
import pandas as pd
import os
import os.path
import matplotlib.pyplot as plt
import numpy as np
import random
import networkx as nx
from pathlib import Path

# How to use this notebook

1. Download it
2. Download and extract the million songs dataset (link in the README.md on the repo) in the same directory as this notebook
3. Set the DIRECTORY constant in the cell below as the directory containing the dataset
4. You should then be able to run all of it

Section 1 is making a graph to figure out how the number of followers over all the playlists is distributed
Section 2 is removing all playlists from the dataset which have N or fewer followers, and creating a new file containing only the playlists with more than N followers
Section 3 is extracting N songs from a given dataset, either uniformly at random or with a weighted random distribution. You can set the constants to change the sampling method, the number of songs, and the dataset you are computing over. If you use the default dataset to pick the songs, it is quite slow (it takes about 10 minutes overall), so you may want to use a smaller dataset to choose these songs. I set the dataset to the one containing playlists with over 5 followers and it is significantly quicker to run. 

# Part 1: Definitions

## Section 1.1: Defining the edges of the network

In this section we explore several different means to quantify how similar one song is to another. We will use these similarity scores to assign weights to the edges of our network. We denote the weight of the edge from i to j as $w_{i,j}$




### Function that selects N songs

In [2]:
def select_N_songs(N, SPECIFIC_FILE, WEIGHTED_RANDOM = False):
    '''
    Function that picks N distinct songs at random from the playlists stored in a .json file.

    Parameters:
        N - the number of songs to pick. If the file contains fewer than N distinct songs, all of them are returned
        SPECIFIC_FILE - path of a .json file whose top-level 'playlists' list holds playlists with a 'tracks' list
        WEIGHTED_RANDOM - if False every song is equally likely to be picked; otherwise songs that appear in
                          more playlist slots are more likely to be picked
    Returns:
        A set of song names of the form '<track_name>, by <artist_name>'
    '''
    with open(SPECIFIC_FILE) as f:  #close the file as soon as it is parsed
        data = json.load(f)

    all_tracks = {} #maps each song name to the number of times it appears (its sampling weight)
    for playlist in data['playlists']:
        for track in playlist['tracks']:
            name = track['track_name']+', by '+ track['artist_name']
            all_tracks[name] = all_tracks.get(name, 0) + 1

    names = list(all_tracks)
    k = max(0, min(N, len(names)))  #we cannot return more distinct songs than exist

    if not WEIGHTED_RANDOM:
        #sample WITHOUT replacement so that we really get k distinct songs
        return set(random.sample(names, k))

    #Weighted sampling without replacement (Efraimidis-Spirakis): give every song the key u**(1/weight)
    #with u uniform in [0,1) and keep the k largest keys. sorted() calls the key function once per song.
    ranked = sorted(names, key=lambda name: random.random() ** (1.0 / all_tracks[name]), reverse=True)
    return set(ranked[:k])

In [3]:
#Load some songs to test the methods defined below
################# Constants #################
SPECIFIC_FILE = 'generated_data/moreThan10followers.json' #path of the .json file the songs are drawn from
WEIGHTED_RANDOM = False   #If we want weighted random sampling or not
N = 1000  #number of songs to consider

#`songs` is whatever select_N_songs returns (a set of song names)
songs = select_N_songs(N,SPECIFIC_FILE, WEIGHTED_RANDOM)

############################################

### Idea 1: Shared Followers
This method assigns a weight to each edge equal to the total number of likes (followers) of all the playlists that contain both songs of the edge.

$w_{i,j} = \sum_{p \in P_{i,j}}{\normalsize \text{# likes of }p}$, where $P_{i,j}$ is the set of playlists containing both songs $i$ and $j$

In [4]:
def method1(directory, songs):
    '''
    Function that takes in a data file and N random songs and produces a network using the method 1 described above:
    the weight of the edge between two songs is the sum of the followers of every playlist containing both songs.
    
    Parameters:
        directory - path of the single .json file containing the data (top-level 'playlists' list)
        songs - the N songs to select out of all the data
    Returns:
        edges - the constructed network, as a dict mapping (song1, song2) to the edge weight. The network is
                undirected, so both (song1, song2) and (song2, song1) are stored with the same weight
        relations - dict mapping every song in songs to the set of songs it shares an edge with
    '''
    with open(directory) as f:  #read the file we were given (not a global) and close it afterwards
        data = json.load(f)

    wanted = set(songs)  #set membership keeps the per-track test O(1) even if songs is a list
    edges = {}
    relations = {song: set() for song in songs}

    for playlist in data['playlists']:  #parse through every playlist
        num_followers = playlist['num_followers']

        #the selected songs of this playlist, each one only once (dict.fromkeys keeps first-seen order)
        names = (track['track_name']+', by '+ track['artist_name'] for track in playlist['tracks'])
        selected = list(dict.fromkeys(name for name in names if name in wanted))

        #visit every UNORDERED pair exactly once so that a playlist contributes its followers once per pair
        for position, track1 in enumerate(selected):
            for track2 in selected[position+1:]:
                relations[track1].add(track2)
                relations[track2].add(track1)

                weight = edges.get((track1,track2), 0) + num_followers
                edges[(track1,track2)] = weight
                edges[(track2,track1)] = weight
    return edges,relations
        
#print(list(method1('generated_data/moreThan10followers.json',songs)[0].items())[0:5])


### Idea 2: Shared Playlists
The weight of each edge is the number of playlists that contain both songs of the edge.

$w_{i,j} = \normalsize |P_{i,j}|$


In [5]:
def method2(directory, songs):
    '''
    Function that takes in a data file and N random songs and produces a network using the method 2 described above:
    the weight of the edge between two songs is the number of playlists containing both songs.
    
    Parameters:
        directory - path of the single .json file containing the data (top-level 'playlists' list)
        songs - the N songs to select out of all the data
    Returns:
        edges - the constructed network, as a dict mapping (song1, song2) to the edge weight. The network is
                undirected, so both (song1, song2) and (song2, song1) are stored with the same weight
        relations - dict mapping every song in songs to the set of songs it shares an edge with
    '''
    with open(directory) as f:  #read the file we were given (not a global) and close it afterwards
        data = json.load(f)

    wanted = set(songs)  #set membership keeps the per-track test O(1) even if songs is a list
    edges = {}
    relations = {song: set() for song in songs}

    for playlist in data['playlists']:  #parse through every playlist
        #the selected songs of this playlist, each one only once (dict.fromkeys keeps first-seen order)
        names = (track['track_name']+', by '+ track['artist_name'] for track in playlist['tracks'])
        selected = list(dict.fromkeys(name for name in names if name in wanted))

        #visit every UNORDERED pair exactly once so that a playlist counts once per pair
        for position, track1 in enumerate(selected):
            for track2 in selected[position+1:]:
                relations[track1].add(track2)
                relations[track2].add(track1)

                weight = edges.get((track1,track2), 0) + 1
                edges[(track1,track2)] = weight
                edges[(track2,track1)] = weight
    return edges,relations
        
#print(list(method2('generated_data/moreThan10followers.json',songs)[0].items())[0:5])


### Idea 3: Hybrid of 1 and 2
Since the follower numbers of the playlists are heavily skewed, ranging from 1 to over 70k, we experiment with weighting the shared playlist number higher than the shared follower number.

$w_{i,j} = \normalsize |P_{i,j}| + \log_2\left({\sum_{p \in P_{i,j}}{\text{# likes of }p}}\right)$

For our purposes, we use $log_2$ to weight the follower numbers as we think it suits our data well.


In [6]:
def method3(directory, songs):
    '''
    Function that takes in a data file and N random songs and produces a network using the method 3 described above:
    the weight of the edge between two songs is the number of playlists containing both songs plus log2 of the
    total followers of those playlists.
    
    Parameters:
        directory - path of the single .json file containing the data (top-level 'playlists' list)
        songs - the N songs to select out of all the data
    Returns:
        edges - the constructed network, as a dict mapping (song1, song2) to the edge weight. The network is
                undirected, so both (song1, song2) and (song2, song1) are stored with the same weight
        relations - dict mapping every song in songs to the set of songs it shares an edge with
    '''
    with open(directory) as f:  #read the file we were given (not a global) and close it afterwards
        data = json.load(f)

    wanted = set(songs)  #set membership keeps the per-track test O(1) even if songs is a list
    totals = {}  #(song1, song2) -> (number of shared playlists, total followers of those playlists)
    relations = {song: set() for song in songs}

    for playlist in data['playlists']:  #parse through every playlist
        num_followers = playlist['num_followers']

        #the selected songs of this playlist, each one only once (dict.fromkeys keeps first-seen order)
        names = (track['track_name']+', by '+ track['artist_name'] for track in playlist['tracks'])
        selected = list(dict.fromkeys(name for name in names if name in wanted))

        #visit every UNORDERED pair exactly once so that a playlist counts once per pair
        for position, track1 in enumerate(selected):
            for track2 in selected[position+1:]:
                relations[track1].add(track2)
                relations[track2].add(track1)

                shared, followers = totals.get((track1,track2), (0, 0))
                updated = (shared + 1, followers + num_followers)
                totals[(track1,track2)] = updated
                totals[(track2,track1)] = updated

    #combine both totals into the final weight. log2 is undefined for 0, so playlists without any
    #followers simply contribute nothing to the follower term.
    edges = {}
    for key, (shared, followers) in totals.items():
        edges[key] = shared + (math.log2(followers) if followers > 0 else 0.0)

    return edges,relations
        
#print(list(method3('generated_data/moreThan10followers.json',songs)[0].items())[0:50])


### Idea 4: Shared Playlist Similarity

Let $p_{i,j}$ denote the fraction of playlists that song $i$ belongs to which also contain song $j$. 
Then, $ w_{i,j} = p_{i,j}$

This method is different from the first three methods since the graph is directed in the sense that $w_{i,j}$ is not necessarily equal to $w_{j,i}$. However, if an edge exists from i to j then one must also exist from j to i.


In [13]:
def method4(SPECIFIC_FILE, Input_Songs):
    '''
    Function that takes in a data file and N random songs and produces a directed network using the method 4
    described above: the weight of the edge from song i to song j is the percentage of the playlists
    containing i that also contain j.

    Parameters:
        SPECIFIC_FILE - path of the single .json file containing the data (top-level 'playlists' list)
        Input_Songs - the N songs to select out of all the data
    Returns:
        edges - dict mapping (song1, song2) to the percentage (0-100] of song1's playlists that contain song2.
                Pairs that share no playlist have no edge
        relations - dict mapping every input song to the set of songs it has an outgoing edge to
    '''
    with open(SPECIFIC_FILE) as f:  #close the file as soon as it is parsed
        data = json.load(f)

    #for each input song, the indices of the playlists it appears in
    song_appearances = {song: set() for song in Input_Songs}
    for playlist_index, playlist in enumerate(data['playlists']):
        for track in playlist['tracks']:
            songname = track['track_name']+', by '+ track['artist_name']
            if songname in song_appearances:
                song_appearances[songname].add(playlist_index)

    #Here we create the big graph
    edges = {}
    relations = {song: set() for song in Input_Songs}  #built from the argument, not from a global

    for track1 in Input_Songs:
        appearances1 = song_appearances[track1]
        if not appearances1:  #a song that is in no playlist cannot share one
            continue
        for track2 in Input_Songs:
            if track1 == track2: #no self edges allowed
                continue
            shared = len(appearances1 & song_appearances[track2])
            if shared: #if they share no playlist then no edge bw them
                relations[track1].add(track2)
                edges[(track1, track2)] = 100*(shared/len(appearances1)) #multiply by 100 for percent

    return edges,relations
                
                
            
                         
#mydata = method4('generated_data/moreThan10followers.json',songs)

#print(mydata[0])

### Idea 5: Shortest Paths

We have $w_{i,j} = d_{i,j}$, where $d_{i,j}$ denotes the shortest distance between nodes i and j.

This method produces good results; however, it is very computationally expensive (complexity of $O(N \cdot |V|^2)$, where $N$ is the final playlist size and $V$ is the set of all vertices), so it is not practical for our applications of this algorithm. 

In [8]:
def method5(directory, songs, unreachable_distance = 10):
    '''
    This method builds the (unweighted) shared-playlist graph of method 2 and uses networkx to compute the
    shortest path length, in number of edges, between every pair of songs. One breadth-first search is run per
    song, which yields the distance to every song reachable from it. Since our graph is not connected, we
    arbitrarily penalize a pair of unconnected nodes by assigning them the distance unreachable_distance
    (from the graph analysis we did, the longest path between two nodes is less than 10 for all |songs|<10000)
    
    Parameters:
        directory - path of the single .json file containing the data
        songs - the N songs to select out of all the data
        unreachable_distance - distance assigned to pairs of songs with no path between them (10 by default)
    Returns:
        edges - dict mapping every ordered pair (song1, song2) of distinct songs to their shortest distance
        relations - dict mapping every song to the set of all the other songs
    '''
    #first we need to call the function for method 2 to get the edges between nodes
    edges, relations = method2(directory, songs)

    G = nx.Graph()
    G.add_nodes_from(relations)  #songs without any edge must still be nodes of the graph
    G.add_edges_from(edges)      #iterating the dict yields its (song1, song2) keys

    out_relations = {song: set() for song in relations}
    out_edges = {}
    for song1 in relations:
        #distances from song1 to every song it can reach; unreachable songs are simply absent
        reachable = nx.single_source_shortest_path_length(G, song1)
        for song2 in relations:
            if song1 != song2:
                out_edges[(song1,song2)] = reachable.get(song2, unreachable_distance)
                out_relations[song1].add(song2)

    return out_edges, out_relations
        

#print(list(method5('generated_data/moreThan10followers.json',songs)[1].items())[0:50]) #test

# Section 1.2: Implementing algorithm to generate playlists

## 1.2.1: Star Graph Algorithm

Here I'll try to create the playlists using the star graph recommendation method. For this I need two things - first is a recommendation algorithm based on the edges, and second is an easy way to iterate over song connections. 

### How this works:

First we consider the one node case. We feed a single node, call it node A, into the algorithm which gives us the subset of the network with one edge at A, which is a star graph. We store this star graph in a dictionary object. The next song to be recommended is one of the edges in this graph, randomly chosen with a weighted probability distribution. Call this song B. Now, we want the playlist that we are generating to be the center of the star graph. Now, we make the center of the star graph represent both nodes A and B. We add all the edges with end to B (except those with end A) from the original dataset to the star graph. If an edge already exists between the center and another node, and B is also connected with that node, the new weight of that edge becomes the sum of the weights of those edges. This is to implement familiarity. Continue this until the desired playlist size is reached. 

Running the two cells below will initialize the network. There are some constants at the start that you need to input

In [9]:
def recommend_song(stargraph, songs, alpha = 1):
    '''
    Function that draws the next song to recommend for a given star graph. Every candidate song gets the
    weight alpha, plus the weight of its edge to the center of the star graph if it has one. Songs that are
    already in the playlist get weight 0. The next song is a weighted random draw over all the candidates.

    Parameters:
        stargraph: the input star graph. The first element is the set of songs already in the playlist and
        the second one is a dictionary of song:weight pairs
        songs: the list of songs over which we are operating
        alpha: base weight given to every song, which allows recommending songs without any edge to the
        center (teleportation). 1 by default

    Returns:
        The predicted song
    '''
    in_playlist = stargraph[0]
    center_edges = stargraph[1]

    weights = [
        0 if candidate in in_playlist
        else (alpha + center_edges[candidate] if candidate in center_edges else alpha)
        for candidate in songs
    ]
    return random.choices(songs, weights, k=1)[0] #return our weighted random choice

def update_graph(stargraph, songToAdd, relations, edges):
    '''
    Function that merges a song into the center of the star graph. Every neighbour of the song that is not
    already in the playlist gets the weight of its edge to the song added to its edge to the center.
    The star graph is modified in place and also returned.

    Parameters:
        stargraph: the input star graph. The first element is the set of songs already in the playlist and
        the second one is a dictionary of song:weight pairs
        songToAdd: the song to add
        relations: dictionary mapping each song to the set of songs it shares an edge with
        edges: the original network, mapping (song1, song2) pairs to edge weights

    Returns:
        The updated star graph
    '''
    playlist_nodes = stargraph[0]
    center_edges = stargraph[1]

    for neighbour in relations[songToAdd]:
        if neighbour in playlist_nodes: #songs already in the playlist are part of the center
            continue
        link = edges[(neighbour, songToAdd)] #value of the edge from the original network
        if neighbour in center_edges:
            center_edges[neighbour] += link #familiarity: weights of repeated neighbours add up
        else:
            center_edges[neighbour] = link

    playlist_nodes.add(songToAdd)
    return stargraph
    
    
def create_playlist(inputSongs, length,  songs, relations, edges,alpha = 1):
    '''
    Function that generates a recommendation playlist from some input songs and an input network.
    The input songs are merged into a star graph one by one, then length songs are recommended and
    merged in turn.

    Parameters:
    inputSongs: The input songs (list)
    length: Number of songs to add on top of the input songs
    songs: List of all songs
    relations: Dictionary of all pair relations in the graph
    edges: The original network
    alpha: The base weight passed on to recommend_song

    Returns:
    The songs of the playlist (input songs and recommended songs) as a list built from a set
    '''
    stargraph = (set(), dict())

    #seed the star graph with the songs we were given
    for seed_song in inputSongs:
        stargraph = update_graph(stargraph, seed_song, relations, edges)

    #grow the playlist one recommendation at a time
    for _ in range(length):
        recommended = recommend_song(stargraph, songs, alpha)
        stargraph = update_graph(stargraph, recommended, relations, edges)

    playlist_nodes = stargraph[0]
    return list(playlist_nodes)
    
    
            
        
        
    
    

## Using the algorithm

The cells below test the algorithm

In [None]:
######################################### Constants

SPECIFIC_FILE = 'generated_data/moreThan10followers.json' #Path of the .json data file to work on. Point this at
                    #another file if you want to consider some other dataset
WEIGHTED_RANDOM = False   #If we want weighted random sampling or not
N = 1000  #number of songs to consider
METHOD = 'method1'  #name of the method to use (a key of METHODS)
#maps the name of each method to the function that builds its network
METHODS = {'method1':method1, 'method2':method2, 'method3':method3}

In [None]:
######################################### LOADING SONGS FROM FILE
#Randomly select N songs from the data file. The result is converted to a list so that the songs can be
#indexed and passed to the recommendation algorithm
songs = list(select_N_songs(N,SPECIFIC_FILE, WEIGHTED_RANDOM))
#print(songs)

In [None]:
#print(songs)

In [None]:
#Testing the algorithm
#Pick 5 distinct random songs as the input playlist OR manually list some songs instead.
#random.sample never picks the same song twice and never indexes past the end of the list.
inputPlaylist = random.sample(songs, min(5, len(songs)))
#NOTE(review): create_playlist treats this as the number of songs to ADD, so the generated playlists
#contain len(inputPlaylist) + length songs - confirm that this is what is wanted
length = len(inputPlaylist)+10
print("Input songs:")
print(inputPlaylist)

#generate one playlist per method, in the order of METHODS
gen = []
for method in METHODS.keys():  
    edges, relations = METHODS[method](SPECIFIC_FILE,set(songs))  #build the network with THIS method
    gen.append(set(create_playlist(inputPlaylist, length, songs, relations, edges)))
    
for g in gen:
    print(sorted(g))

In [None]:
#Pairwise overlap of the generated playlists, ignoring the input songs they all share.
#gen[0], gen[1] and gen[2] are the first three playlists generated in the cell above.
print("Songs common in method 1 and 2:", (gen[0].intersection(gen[1])).difference(set(inputPlaylist)))
print("Songs common in method 1 and 3:", (gen[0].intersection(gen[2])).difference(set(inputPlaylist)))
print("Songs common in method 2 and 3:", (gen[1].intersection(gen[2])).difference(set(inputPlaylist)))

### Thoughts on algorithm

Perhaps in the future using the spotify audio features api (https://developer.spotify.com/documentation/web-api/reference/#/operations/get-audio-features) we could define some kind of metric to evaluate the success of each algorithm

## Section 1.2.2 Implementing Networkx

The purpose of this section is to try to implement method 5 into our playlist generation


In [None]:
path = 'presentation_data/presentation.csv' #file to be read
df = pd.read_csv(path)
#one entry per row of the edge list, so a song appears once for every edge it is the source of
songs = df['Source'].tolist()
df.head()  #call head() - without the parentheses only the bound method itself is shown

In [None]:
G = nx.from_pandas_edgelist(df, "Source", "Target", edge_attr = "Weight")  #generate graph

#for every entry of songs, the shortest path lengths to all the other songs it can reach
out = []
for song1 in songs:
    paths = nx.single_source_shortest_path_length(G, song1)
    out.append([distance for target, distance in paths.items() if target != song1])
print(len(out))
#print(nx.adjacency_matrix(G).shape)
#print(nx.adjacency_matrix(G).shape)


In [None]:
#Sum of the element-wise differences between the distance lists of the second and third entries of out.
#NOTE(review): the subtraction needs both lists to have the same length - confirm this holds for the data
print((np.array(out[1])-np.array(out[2])).sum())

In [None]:
#Playlist Generation using shortest paths method

input_songs = ['Straight Outta Compton, by N.W.A.']
out = list(input_songs)  #copy, so that extending the playlist does not modify input_songs
n = 10 # number of songs to add
sp = {}  #candidate song -> sum of its shortest path lengths to every song of the playlist

for position, s in enumerate(input_songs):
    #path LENGTHS (number of edges) from s to every song it can reach
    lengths = nx.single_source_shortest_path_length(G, s)
    if position == 0:
        sp = dict(lengths)
    else:
        #only keep the candidates that can be reached from every song of the playlist
        sp = {key: total + lengths[key] for key, total in sp.items() if key in lengths}

for s in input_songs:  #the input songs themselves are not candidates
    sp.pop(s, None)

for _ in range(n):
    if not sp:  #no candidate left
        break
    s = min(sp, key = sp.get) #get song with lowest combined shortest path
    out.append(s)
    del sp[s]
    lengths = nx.single_source_shortest_path_length(G, s)
    sp = {key: total + lengths[key] for key, total in sp.items() if key in lengths}

print(out)

In [None]:
def create_playlist_method5(input_songs, G, n = 10):
    '''
    Function that extends a playlist using the shortest paths method: the song added next is always the
    one with the lowest sum of shortest path lengths (in number of edges) to all the songs already in
    the playlist.

    Parameters:
        input_songs - the songs to start from (list). It is not modified
        G - the networkx graph of the songs
        n - the number of songs to add (10 by default). Fewer are added if the candidates run out
    Returns:
        A new list with the input songs followed by the added songs, in the order they were added
    '''
    out = list(input_songs)  #copy, so that the caller's list is left untouched
    sp = {}  #candidate song -> sum of its shortest path lengths to every song of the playlist

    for position, s in enumerate(input_songs):
        #path LENGTHS (number of edges) from s to every song it can reach
        lengths = nx.single_source_shortest_path_length(G, s)
        if position == 0:
            sp = dict(lengths)
        else:
            #only keep the candidates that can be reached from every song of the playlist
            sp = {key: total + lengths[key] for key, total in sp.items() if key in lengths}

    for s in input_songs:  #the input songs themselves are not candidates
        sp.pop(s, None)

    for _ in range(n):
        if not sp:  #no candidate left
            break
        s = min(sp, key = sp.get) #get song with lowest combined shortest path
        out.append(s)
        del sp[s]
        lengths = nx.single_source_shortest_path_length(G, s)
        sp = {key: total + lengths[key] for key, total in sp.items() if key in lengths}

    return out
    

# Part 2. Using the data

### IMPORTANT: Set the directory containing the data below

In [None]:
#Folder containing the dataset files. The cells below build file paths as DIRECTORY+<file name>,
#so keep the trailing slash
DIRECTORY = "spotify_million_playlist_dataset/data/"

## Section 2.1. Figuring out distribution of number of followers over playlists

This is just to figure out what the distribution of the followers looks like

In [None]:
# 1. Followers numbers
# 10,000 songs choose (Quanchi's link)

follower_values = []
directory = sorted(os.listdir(DIRECTORY))  #sorted so that every run reads the files in the same order

for file in directory:
    if not file.endswith('.json'):  #skip anything that is not a data file (e.g. hidden system files)
        continue
    with open(os.path.join(DIRECTORY, file)) as f:  #close each file as soon as it is parsed
        data = json.load(f)

    follower_values.extend(playlist['num_followers'] for playlist in data['playlists'])

#one entry per playlist: its number of followers
num_followers = pd.Series(follower_values)
num_followers.value_counts()


In [None]:
#number of playlists for each follower number, ordered by follower number
follower_counts = num_followers.value_counts().sort_index()
num_likes = follower_counts.to_dict()  #follower number -> number of playlists

#X holds the distinct follower numbers in increasing order
X = follower_counts.index.tolist()

#y_c[i] is the fraction of all playlists that have X[i] followers or fewer. The running sum starts
#from the first value and is divided by the actual number of playlists that were loaded.
y_c = (follower_counts.cumsum() / len(num_followers)).tolist()
y = y_c  #kept as an alias because the plotting cell reads `y`

y_c[:10]


    
    

In [None]:
#Plot the cumulative distribution for the 40 smallest follower numbers

fig, ax = plt.subplots()  #create the figure BEFORE drawing, so that the plot lands on it
ax.plot(X[:40], y_c[:40],linestyle="-", marker="o")
ax.set_ylabel("Fraction of playlists with followers <= than this number")
ax.set_xlabel("Number of Followers")
ax.set_xticks(np.arange(0,41,2))
plt.show()

In [None]:
#y_c holds fractions of 1,000,000 playlists, so this converts the sixth entry back to a count and
#subtracts it from the total.
#NOTE(review): presumably the number of playlists with more followers than X[5] - confirm
print(1000000 - y_c[5]*1000000)


## Section 2.2. Removing the majority of the playlists

Removing all playlists with N or fewer followers 

In [None]:
N = 10  #playlists need MORE than N followers to be kept
directory = sorted(os.listdir(DIRECTORY))  #sorted so that every run reads the files in the same order

moreThanN = []

for file in directory:
    if not file.endswith('.json'):  #skip anything that is not a data file (e.g. hidden system files)
        continue
    with open(os.path.join(DIRECTORY, file)) as f:  #close each file as soon as it is parsed
        data = json.load(f)

    moreThanN.extend(playlist for playlist in data['playlists'] if playlist['num_followers'] > N)

In [None]:
#sanity check: how many playlists were kept by the filter above

len(moreThanN)

In [None]:
#Create results folder

path = os.getcwd()+"/generated_data"
#Only create the folder (and say so) when it is not there yet
if not os.path.exists(path):
    os.makedirs(path)
    print("The new directory is created!")

In [None]:
#SAVING DATA
#wrap the playlists in a dictionary so the file has the same layout as the original data files
moreThanN_dict = {"playlists": moreThanN}

#serialize straight into the file, indented by 4 spaces
with open(path+"/moreThan"+str(N)+"followers.json", "w") as outfile:
    json.dump(moreThanN_dict, outfile, indent=4)



## Section 2.3. Choosing our songs and generating data

Selecting N songs randomly and performing an analysis on them.

Change the constants in the cell below to change the kind of data you create.

The output of these cells will be a .csv file in the generated_data folder.

In [None]:
################# Constants #################
'''Rename this constant to the file containing the data, otherwise it will go through all of the data.
I recommend using a file generated by the previous steps (like moreThan10followers.json) as the specific file (for instance, moreThan10followers.json) since
the computation times will be much much faster'''
SPECIFIC_FILE = 'generated_data/moreThan10followers.json' 

WEIGHTED_RANDOM = False   #If we want weighted random sampling or not
N = 1000  #number of songs to consider
METHOD = 'method1'  #name of the method to use (a key of METHODS)
#maps the name of each method to the function that builds its network
METHODS = {'method1':method1, 'method2':method2, 'method3':method3, 'method4':method4} #method 5 excluded for now
############################################
############################################

In [None]:
'''Randomly selecting N songs'''
#uses the constants set in the cell above
songs = select_N_songs(N,SPECIFIC_FILE, WEIGHTED_RANDOM)

#songs = set(songs) #convert to set

In [None]:
'''Computing the graph using the desired method'''
#every method returns (edges, relations); only the edges are kept here
edges =  METHODS[METHOD](SPECIFIC_FILE, songs)[0]                

In [None]:
#number of entries in the edges dictionary computed above
print(len(edges))

In [None]:
'''SAVING THE DATA AS .CSV'''

#converting to pd: one row of the form (song1, song2, weight) per edge.
#Passing the column names to the constructor also works when there are no edges at all.
data = [(key[0], key[1], weight) for key, weight in edges.items()]
data2 = pd.DataFrame(data, columns=["Source","Target","Weight"])

#Create results folder

path = os.getcwd()+"/generated_data"
if not os.path.exists(path):
    # Create a new directory because it does not exist
    os.makedirs(path, exist_ok=True)
    print("The new directory is created!")

#Writing the file

if (WEIGHTED_RANDOM == True):
    name = str(N) + '_weighted_'+METHOD
else:
    name = str(N)+'_'+METHOD

#Never overwrite an existing file: the first file is <name>_random_songs.csv, the following ones are
#<name>_random_songs2.csv, <name>_random_songs3.csv, ... using the first number that is still free
base = path+'/'+name+"_random_songs"
filename = base+".csv"
suffix = 2
while os.path.exists(filename):
    filename = base+str(suffix)+".csv"
    suffix += 1
filepath = Path(filename)

data2.to_csv(filepath, index = False)
    
    
#The data should be saved in the same folder as this notebook

## Section 2.4 Generating playlists using different methods

In this section, we fix a network generated from S2.3 and we create playlists from songs in that network.

In [15]:
#Load the .csv file holding the network we are working with
SPECIFIC_FILE = 'generated_data/moreThan10followers.json' #remember directory of data
path = 'presentation_data/presentation.csv' #file to be read
#path = 'generated_data/1000_method1_random_songs.csv'
df = pd.read_csv(path)
G = nx.from_pandas_edgelist(df, "Source", "Target", edge_attr = "Weight")  #generate graph
#the songs of the network: every distinct value of the Source column
songs = set(df['Source'])
METHODS = {'method1':method1, 'method2':method2, 'method3':method3, 'method4':method4}

In [16]:
#Pick some song names - you could pick random ones instead by replacing the line below with the commented-out line under it
inputPlaylist = ['Jersey, by Future', 'Love Scars Pt. 2 / Rack City, by Trippie Redd', 'In My Room (feat. Ty Dolla $ign & Tyga), by Yellow Claw']
#inputPlaylist = random.sample(list(songs), 3)   #songs is a set here, so it has to be converted before sampling
gen = []  #one generated playlist (a set of songs) per method, in the order of METHODS
length = 7  #number of songs create_playlist adds on top of the input songs

for method in METHODS.keys():  
    #rebuild the network with this method, then generate a playlist from it
    edges, relations = METHODS[method](SPECIFIC_FILE,set(songs))
    gen.append(set(create_playlist(inputPlaylist, length, list(songs), relations, edges)))

In [17]:
print("The input songs are " + str(inputPlaylist) +'\n')
#the playlists are numbered from 1, in the order they were generated
for method_number, playlist in enumerate(gen, start=1):
    print("Playlist by method "+str(method_number)+': ')
    print(playlist)
    print('\n\n')

The input songs are ['Jersey, by Future', 'Love Scars Pt. 2 / Rack City, by Trippie Redd', 'In My Room (feat. Ty Dolla $ign & Tyga), by Yellow Claw']

Playlist by method 1: 
{'Wowzerz, by Lil Wayne', 'Jersey, by Future', 'New Phone, Who Dis?, by Flatbush Zombies', 'Free Money, by K CAMP', 'Love Scars Pt. 2 / Rack City, by Trippie Redd', 'Swang, by Rae Sremmurd', 'A Tale of 2 Citiez, by J. Cole', 'Sparks (Turn off your Mind) - Atmozfears & Audiotricz Radio, by Fedde Le Grand', 'Get Right Witcha, by Migos', 'In My Room (feat. Ty Dolla $ign & Tyga), by Yellow Claw'}



Playlist by method 2: 
{'Respira - Club Mix Version, by Alkilados', 'Jersey, by Future', 'Love Scars Pt. 2 / Rack City, by Trippie Redd', 'Safe in the Steep Cliffs, by Emancipator', 'Comeback Kid, by Brett Dennen', 'Crossroad, by Alexandre Desplat', 'Hare un Altar para Ti, by Inspiraciòn', 'Bass Cannon - Dubstep Remix, by Swan', 'Let Me Praise Him, by Euclid Gray', 'In My Room (feat. Ty Dolla $ign & Tyga), by Yellow Claw'}
