In [1]:
import requests
import cnfg
import json
import pickle
import time
from datetime import datetime
from collections import defaultdict
from urlparse import urlparse
from pymongo import MongoClient
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF

In [2]:
config = cnfg.load(".genius_config")
client_key = config["client_key"]
client_secret = config["client_secret"]
access_token = 'uK247ank6jF7DY_E7TEMIdbuhT2IBVsVeFXh6AluxswFUfrlFNfXNu2pUFPrGv2c'
auth = 'Bearer '+access_token
req_start = 'https://api.genius.com'

In [3]:
client = MongoClient()
db = client.music
s = db.songs
# counter
c = s.find()

In [4]:
def pickleLoad(filename):
    with open(filename, 'rb') as f:
        data = pickle.load(f) 
    return data

def pickleDump(data, filename):
    with open(filename, 'wb') as f:
        pickle.dump(data, f)

In [5]:
def api_call(validurl):
    call = validurl
    r = requests.get(call, headers={'Authorization':auth})
    if r.status_code==200:
        return json.loads(r.content)
    else:
        print r.content
        return r.content

### Get influencers
Influencers are all the artist ids in these categories: artist_id (primary artist), writers, producers, featured artists, description referents (artist only), annotated referents (artist only), and sampled song (primary artist). The below code gets all of those artists for all songs, ensures they are ints, unique-ifies the set and returns a list of influencers, called "artists"

In [6]:
artists = list(s.find({}, {'artist_id':1, '_id':0}))
artists = [int(x['artist_id']) for x in artists]

writers = list(s.find({}, {'writers':1, '_id':0}))
for writer in writers:
    if writer:
        for w in writer['writers']:
            artists.append(int(w['id']))
            
featured_artists = list(s.find({}, {'featured_artists':1, '_id':0}))
for fa in featured_artists:
    if fa:
        for w in fa['featured_artists']:
            artists.append(int(w['id']))

producers = list(s.find({}, {'producers':1, '_id':0}))
for p in producers:
    if p:
        for w in p['producers']:
            artists.append(int(w['id']))

desc_refs = list(s.find({}, {'description_refs.artist':1, 'description_refs.song':1, '_id':0}))
for dr in desc_refs:
    if 'artist' in dr['description_refs']:
        a = dr['description_refs']['artist']
        for artist in a:
            artists.append(int(artist['id']))
    if 'song' in dr['description_refs']:
        pass
    
anno_refs = list(s.find({}, {'references.artist':1, 'references.song':1, '_id':0}))
for ar in anno_refs:
    if 'artist' in ar['references']:
        a = ar['references']['artist']
        for artist in a:
            artists.append(int(artist['id']))
    if 'song' in ar['references']:
        pass
    
sampled_songs = list(s.find({}, {'sampled_songs':1, '_id':0}))
for samp in sampled_songs:
    if samp:
        for w in samp:
            for song in samp[w]:
                artists.append(song['primary_artist']['id'])

print "before uniquing: ", len(artists)
artists = [int(x) for x in artists]
artists = list(set(artists))
print "after uniquing: ", len(artists)

before uniquing:  2864
after uniquing:  1305


### calculate artist influence level on a given song
The weights are assigned below and can be adjusted. The songid of each song in the MongoDB is then grabbed and put in a list called 'songs'.

For a given song, this function will get artist ids for each of the categories in weights. Based on the number of contributors to each category, the influence of a single artist on that category is weighted evenly. Then the artist influence is divided by the total sum of the weights of given categories.

Returns a pandas Series with the weights for each artist and 0 for all artists that do not appear

In [7]:
weights = {'artist_id': 4., 'producers':10., 'writers': 10., 'featured_artists': 10., \
          'sampled_songs': 10., 'description_refs': 8., 'references': 7.}

In [8]:
songs = [int(x['id']) for x in list(s.find({'id':{'$exists':1}}, {'id':1, '_id':0}))]

In [9]:
def get_info(song, weights=weights, artists=artists):
    contributors = defaultdict(list)
    cont_weights = defaultdict(float)
    total = 0
    info = s.find({'id':song}, {'_id':0, 'description_refs.artist':1, 'description_refs.song':1, \
                       'sampled_songs':1, 'producers':1, 'writers':1, \
                       'featured_artists':1, 'artist_id':1, 'id':1}).next()
    for e in info:
        if e == 'artist_id':
            contributors['artist_id'].append(int(info[e]))
        if e == 'description_refs':
            if 'artist' in e:
                for a in e['artist']:
                    contributors['description_refs'].append(int(a['id']))
        if e == 'references' in info:
            if 'artist' in e:
                for a in e['artist']:
                    contributors['references'].append(int(a['id']))
        if e == 'producers':
            for a in info[e]:
                contributors['producers'].append(int(a['id']))
        if e == 'writers':
            for a in info[e]:
                contributors['writers'].append(int(a['id']))
        if e == 'featured_artists':
            for a in info[e]:
                contributors['featured_artists'].append(int(a['id']))
        if e == 'sampled_songs':
            for samp in info[e]:
                contributors['sampled_songs'].append(int(samp['primary_artist']['id']))
    for artist_type in contributors:
        total += weights[artist_type]
    for artist_type, influencers in contributors.iteritems():
        frac = weights[artist_type]/len(influencers)
        for influencer in influencers:
            cont_weights[influencer] += frac/total
    influences = {(a):(0 if a not in cont_weights.keys() else cont_weights[a]) for a in artists}
    influences_series = pd.Series(data=influences.values(), index=influences.keys())
    return influences_series

### make influencers x songs matrix
Use above equation to calculate the average influence of each artist and fill df. Columns are songs and rows are artists. The columns each sum to 1 because the influence for a given song is normalized.

In [10]:
df = pd.DataFrame(columns = songs)

In [11]:
df.shape

(0, 717)

In [12]:
for song in songs:
    try:
        series = get_info(song)
        df[song] = series
    except:
        print "unable to score songid: ", song
        continue

In [13]:
for col in df:
    thesum = df[col].sum()
    if thesum<.99999 or thesum>1.00001:
        print col, thesum

In [14]:
df.shape

(1305, 717)

Columns are song ids and rows are influencers (artist ids). There are 452 unique artists and 232 unique songs at this point in time. The sum of any given column should be 1 because the influence scores are normalized.

In [14]:
df

Unnamed: 0,653082,2396122,2415289,2369079,2411389,2304247,2396302,599451,2398213,2433137,...,694992,2057145,101077,58072,2388703,107655,493169,2155243,115451,725757
2048,0,0,0,0.000000,0,0,0,0.000000,0.000000,0.000000,...,0.000000,0,0,0,0,0,0,0.000000,0,0
1,0,0,0,0.000000,0,0,0,0.000000,0.000000,0.000000,...,0.000000,0,0,0,0,0,0,0.000000,0,0
2,0,0,0,0.000000,0,0,0,0.000000,0.000000,0.000000,...,0.000000,0,0,0,0,0,0,0.000000,0,0
3,0,0,0,0.000000,0,0,0,0.000000,0.000000,0.000000,...,0.000000,0,0,0,0,0,0,0.000000,0,0
4,0,0,0,0.000000,0,0,0,0.000000,0.000000,0.416667,...,0.000000,0,0,0,0,0,0,0.285714,0,0
5,0,0,0,0.000000,0,0,0,0.000000,0.000000,0.000000,...,0.000000,0,0,0,0,0,0,0.000000,0,0
2057,0,0,0,0.000000,0,0,0,0.000000,0.000000,0.000000,...,0.000000,0,0,0,0,0,0,0.000000,0,0
329739,0,0,0,0.000000,0,0,0,0.000000,0.042017,0.000000,...,0.000000,0,0,0,0,0,0,0.000000,0,0
13,0,0,0,0.000000,0,0,0,0.000000,0.000000,0.000000,...,0.000000,0,0,0,0,0,0,0.000000,0,0
589827,0,0,0,0.000000,0,0,0,0.000000,0.000000,0.000000,...,0.000000,0,0,0,0,0,0,0.000000,0,0


The below code is a little sanity check. By seeing column for songid = 639914 in the above df, we see artistid of 1362 is .1111. (1/9) This validates that.

In [75]:
# first edit the get_info def to return contributors dict
# info = get_info(639914)

print info

#do the rest by hand 
total = 0
cont_weights = defaultdict(float)
for artist_type in info:
    total += weights[artist_type]
for artist_type, influencers in info.iteritems():
    frac = weights[artist_type]/len(influencers)
    for influencer in influencers:
        cont_weights[influencer] += frac/total
        if influencer == 1362:
            print influencer, artist_type, frac, total, frac/total

print cont_weights

defaultdict(<type 'list'>, {'producers': [574528, 290415], 'artist_id': [2358], 'writers': [574528, 290415, 1362]})
1362 writers 3.33333333333 30.0 0.111111111111
defaultdict(<type 'float'>, {574528: 0.2777777777777778, 1362: 0.11111111111111112, 2358: 0.3333333333333333, 290415: 0.2777777777777778})


In [15]:
pickleDump(df, 'songs_by_influencer_1305x717.pkl')

### make influencers x artist df
1. Get artist to song dictionary
2. Average the artists' songs to get an average influence on that artist
3. Pop into a df

In [18]:
artist_to_songs = defaultdict(list)

In [19]:
for x in df.columns:
    artist = int(s.find({"id":x}, {"artist_id":1, "_id":0}).next()['artist_id'])
    artist_to_songs[artist].append(x)

In [20]:
# number of primary artists in collection
len(artist_to_songs)

198

In [21]:
# sanity check that number of songs by artist is equivalent to number of songs
# expecting 355
allsongs = 0
for k, v in artist_to_songs.iteritems():
    allsongs += len(v)
print allsongs

355


In [22]:
artist_df = pd.DataFrame(columns = artist_to_songs)

In [23]:
total = 0
for k, v in artist_to_songs.iteritems():
    total = float(len(v))
    avgsong = df[v[0]]
    if len(v) > 1:
        for song in v[1:]:
            avgsong = avgsong + df[song]
    avgsong = avgsong/total
    artist_df[k] = avgsong

In [24]:
# Expecting (684, 198), e.g. (influencers x primary artists)
artist_df.shape

(684, 198)

### NMF Single Value Decomposition
Use SVD to reduce the number of features in the influencers x song matrix. From the (452, 232) matrix it will yield (452, 20) and (20, 232). These 20 features will correspond to both the influencers and the song. From there I can make recommendations on songs. NMF means non-negative matrix factorization. We want to ensure no negatives.

In [16]:
X = np.array(df)

In [17]:
model = NMF(n_components=20)

In [18]:
model.fit(X)

NMF(alpha=0.0, beta=1, eta=0.1, init=None, l1_ratio=0.0, max_iter=200,
  n_components=20, nls_max_iter=2000, random_state=None, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)

In [19]:
len(model.components_)

20

In [20]:
components = model.components_

In [21]:
len(components[0])

717

In [22]:
nmfdf = pd.DataFrame(components, columns=songs)

In [23]:
nmfdf.shape

(20, 717)

In [24]:
song_by_songdf = pd.DataFrame(columns=songs)

In [25]:
for song in nmfdf:
    song_by_songdf[song] = pd.Series(data = [nmfdf[song].dot(nmfdf[other]) for other in nmfdf], index = songs)

In [26]:
pickleDump(song_by_songdf, 'songbysong_717x717.pkl')

In [55]:
top_ten_bby_one_more = list(song_by_songdf[78169].sort_values(ascending=False, inplace=False)[0:10].index)

In [56]:
top_ten_bby_one_more

[496456,
 1779037,
 1913178,
 2396122,
 501510,
 2268248,
 1800164,
 729457,
 2308425,
 2347642]

In [57]:
for num, related in enumerate(top_ten_bby_one_more):
    r = s.find({"id":related}, {"title":1, "artist":1}).next()
    print num+1, ". ", r['title'], " by ", r['artist']

1 .  ZooWap  by  Fetty Wap
2 .  1Hunnid (Remix)  by  K Camp
3 .  Girls Born in the 90's  by  The Weeknd
4 .  Promise  by  Kid Ink
5 .  Love Me Harder  by  Ariana Grande
6 .  Valet  by  Eric Bellinger
7 .  Zoovier (Remix)  by  Fetty Wap
8 .  In the Night  by  The Weeknd
9 .  On My Mind  by  Ellie Goulding
10 .  Jimmy Choo  by  Fetty Wap


In [24]:
song_by_songdf[107655].sort_values(ascending=False, inplace=False)[0:10]

2422513    0.000032
1016       0.000032
1655       0.000032
2413886    0.000032
1932       0.000015
2045       0.000014
2213109    0.000014
2024226    0.000013
1812       0.000012
240199     0.000012
Name: 107655, dtype: float64

In [27]:
top_ten_unorthodox = list(song_by_songdf[107655].sort_values(ascending=False, inplace=False)[0:10].index)

In [25]:
top_ten_the_fix = song_by_songdf[2275742].sort_values(ascending=False, inplace=False)[0:10]

In [26]:
top_ten_the_fix

1966157    0.070926
35229      0.070926
32995      0.052320
35301      0.009499
2275742    0.009114
35293      0.009109
1913178    0.006619
501510     0.005345
729457     0.004671
542389     0.003498
Name: 2275742, dtype: float64

In [70]:
top_ten_the_fix = list(top_ten_the_fix.index)

In [71]:
top_ten_the_fix

[1966157, 35229, 32995, 35293, 2275742, 496456, 1913178, 1800164, 2443, 496445]

In [28]:
for num, related in enumerate(top_ten_unorthodox):
    r = s.find({"id":related}, {"title":1, "artist":1}).next()
    print num+1, ". ", r['title'], " by ", r['artist']

1 .  OVO Sound Radio Episode 15 Tracklist  by  Drake
2 .  Houstatlantavegas  by  Drake
3 .  Hot 97 Freestyle  by  Drake
4 .  OVO Sound Radio Episode 14 Tracklist  by  Drake
5 .  In The Morning  by  J. Cole
6 .  Moment 4 Life  by  Nicki Minaj
7 .  Where Ya At  by  Future
8 .  R.I.C.O.  by  Meek Mill
9 .  What's My Name?  by  Rihanna
10 .  Heat of the Moment  by  Drake


In [38]:
work = list(song_by_songdf[2398213].sort_values(ascending=False, inplace=False)[0:10].index)

In [39]:
for num, related in enumerate(work):
    r = s.find({"id":related}, {"title":1, "artist":1}).next()
    print num+1, ". ", r['title'], " by ", r['artist']

1 .  OVO Sound Radio Episode 14 Tracklist  by  Drake
2 .  OVO Sound Radio Episode 15 Tracklist  by  Drake
3 .  Houstatlantavegas  by  Drake
4 .  Hot 97 Freestyle  by  Drake
5 .  Heat of the Moment  by  Drake
6 .  Trust Issues  by  Drake
7 .  If You're Reading This It's Too Late [Album Art + Tracklist]  by  Drake
8 .  Back to Back  by  Drake
9 .  Company  by  Drake
10 .  In The Morning  by  J. Cole


In [40]:
work = list(song_by_songdf[2398213].sort_values(ascending=False, inplace=False)[10:20].index)

In [41]:
for num, related in enumerate(work):
    r = s.find({"id":related}, {"title":1, "artist":1}).next()
    print num+1, ". ", r['title'], " by ", r['artist']

1 .  Summer Sixteen  by  Drake
2 .  Up All Night  by  Drake
3 .  Hotline Bling  by  Drake
4 .  Furthest Thing  by  Drake
5 .  No Tellin'  by  Drake
6 .  Started from the Bottom  by  Drake
7 .  Know Yourself  by  Drake
8 .  Worst Behavior  by  Drake
9 .  Moment 4 Life  by  Nicki Minaj
10 .  The Real Her  by  Drake


### Choose related song and generate playlist
The following functions, given a seed song, create a playlist based off of a chain of related songs. The first song in the playlist is always the seed song. The second song is chosen from the absolute value a random normal distribution with a high standard deviation (sigma = 16) such that the probability of choosing a song decreases as its rank increases.<br><br>
The next song in the playlist is chosen from the ranking of the related songs to the second song in the same manner. This iterative approach builds the playlist until the chosen number of songs have been added. The default playlist length is 20 songs.

In [27]:
def choose(seed_song):
    songlist = list(song_by_songdf[seed_song].sort_values(ascending=False, inplace=False).index)
    choose = abs(int(np.random.normal(0, 16)))
    while(choose >= len(songlist)):
        choose = abs(int(np.random.normal(0, 16)))
    choose = songlist[choose]
    return choose

In [49]:
playlist = []
playlistids = []
def generate_playlist(seed_song, playlist=playlist):
    info = s.find({"id":seed_song}, {"title":1, "artist":1}).next()
    playlist.append((info['title'], info['artist']))
    playlistids.append(seed_song)
    ch = choose(seed_song)
    while ch in playlistids:
        ch = choose(seed_song)
    return ch

In [32]:
def format_playlist(playlist):
    for num, track in enumerate(playlist):
        print num+1, ". ", track[0], " by ", track[1]

In [43]:
seed = 2396122
for x in range(20):
    seed = generate_playlist(seed)

In [44]:
format_playlist(playlist)

1 .  Promise  by  Kid Ink
2 .  Trap Queen  by  Fetty Wap
3 .  Zoovier (Remix)  by  Fetty Wap
4 .  ZooWap  by  Fetty Wap
5 .  Jimmy Choo  by  Fetty Wap
6 .  Girls Born in the 90's  by  The Weeknd
7 .  Valet  by  Eric Bellinger
8 .  Focus  by  Ariana Grande
9 .  On My Mind  by  Ellie Goulding
10 .  Love Me Harder  by  Ariana Grande
11 .  Again  by  Fetty Wap
12 .  Player  by  Tinashe
13 .  In the Night  by  The Weeknd
14 .  Hands To Myself  by  Selena Gomez
15 .  1Hunnid (Remix)  by  K Camp
16 .  Confident  by  Demi Lovato
17 .  Where Ya At (Future Ft. Drake Remix)  by  K Camp
18 .  Animals  by  Maroon 5
19 .  Whole Lotta Lovin'  by  DJ Mustard
20 .  Can't Feel My Face  by  The Weeknd


In [45]:
playlist = []
playlistids = []

In [50]:
seed = 78169
for x in range(20):
    seed = generate_playlist(seed)

In [51]:
format_playlist(playlist)

1 .  ...Baby One More Time  by  Britney Spears
2 .  Hands To Myself  by  Selena Gomez
3 .  1Hunnid (Remix)  by  K Camp
4 .  Zoovier (Remix)  by  Fetty Wap
5 .  Love Me Harder  by  Ariana Grande
6 .  Promise  by  Kid Ink
7 .  Trap Queen  by  Fetty Wap
8 .  I'm Different  by  2 Chainz
9 .  That's On You  by  Kid Ink
10 .  Animals  by  Maroon 5
11 .  Valet  by  Eric Bellinger
12 .  Focus  by  Ariana Grande
13 .  ZooWap  by  Fetty Wap
14 .  On My Mind  by  Ellie Goulding
15 .  Again  by  Fetty Wap
16 .  My Way  by  Fetty Wap
17 .  679  by  Fetty Wap
18 .  Player  by  Tinashe
19 .  Jimmy Choo  by  Fetty Wap
20 .  Style  by  Taylor Swift
