In [1]:
import requests
import cnfg
import json
import pickle
import time
from datetime import datetime
from collections import defaultdict
from urlparse import urlparse
from pymongo import MongoClient
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF

In [2]:
config = cnfg.load(".genius_config")
client_key = config["client_key"]
client_secret = config["client_secret"]
access_token = 'uK247ank6jF7DY_E7TEMIdbuhT2IBVsVeFXh6AluxswFUfrlFNfXNu2pUFPrGv2c'
auth = 'Bearer '+access_token
req_start = 'https://api.genius.com'

In [3]:
client = MongoClient()
db = client.music
s = db.songs
# counter
c = s.find()

In [4]:
def pickleLoad(filename):
    with open(filename, 'rb') as f:
        data = pickle.load(f) 
    return data

def pickleDump(data, filename):
    with open(filename, 'wb') as f:
        pickle.dump(data, f)

In [5]:
def api_call(validurl):
    call = validurl
    r = requests.get(call, headers={'Authorization':auth})
    if r.status_code==200:
        return json.loads(r.content)
    else:
        print r.content
        return r.content

### Get influencers
Influencers are all the artist ids in these categories: artist_id (primary artist), writers, producers, featured artists, description referents (artist only), annotated referents (artist only), and sampled song (primary artist). The below code gets all of those artists for all songs, ensures they are ints, unique-ifies the set and returns a list of influencers, called "artists"

In [6]:
artists = list(s.find({}, {'artist_id':1, '_id':0}))
artists = [int(x['artist_id']) for x in artists]

writers = list(s.find({}, {'writers':1, '_id':0}))
for writer in writers:
    if writer:
        for w in writer['writers']:
            artists.append(int(w['id']))
            
featured_artists = list(s.find({}, {'featured_artists':1, '_id':0}))
for fa in featured_artists:
    if fa:
        for w in fa['featured_artists']:
            artists.append(int(w['id']))

producers = list(s.find({}, {'producers':1, '_id':0}))
for p in producers:
    if p:
        for w in p['producers']:
            artists.append(int(w['id']))

desc_refs = list(s.find({}, {'description_refs.artist':1, 'description_refs.song':1, '_id':0}))
for dr in desc_refs:
    if 'artist' in dr['description_refs']:
        a = dr['description_refs']['artist']
        for artist in a:
            artists.append(int(artist['id']))
    if 'song' in dr['description_refs']:
        pass
    
anno_refs = list(s.find({}, {'references.artist':1, 'references.song':1, '_id':0}))
for ar in anno_refs:
    if 'artist' in ar['references']:
        a = ar['references']['artist']
        for artist in a:
            artists.append(int(artist['id']))
    if 'song' in ar['references']:
        pass
    
sampled_songs = list(s.find({}, {'sampled_songs':1, '_id':0}))
for samp in sampled_songs:
    if samp:
        for w in samp:
            for song in samp[w]:
                artists.append(song['primary_artist']['id'])

print "before uniquing: ", len(artists)
artists = [int(x) for x in artists]
artists = list(set(artists))
print "after uniquing: ", len(artists)

before uniquing:  1163
after uniquing:  452


### calculate artist influence level on a given song
The weights are assigned below and can be adjusted. The songid of each song in the MongoDB is then grabbed and put in a list called 'songs'.

For a given song, this function will get artist ids for each of the categories in weights. Based on the number of contributors to each category, the influence of a single artist on that category is weighted evenly. Then the artist influence is divided by the total sum of the weights of given categories.

Returns a pandas Series with the weights for each artist and 0 for all artists that do not appear

In [7]:
weights = {'artist_id': 10., 'producers':10., 'writers': 10., 'featured_artists': 8., \
          'sampled_songs': 7., 'description_refs': 5., 'references': 4.}

In [8]:
songs = [int(x['id']) for x in list(s.find({'id':{'$exists':1}}, {'id':1, '_id':0}))]

In [9]:
def get_info(song, weights=weights, artists=artists):
    contributors = defaultdict(list)
    cont_weights = defaultdict(float)
    total = 0
    info = s.find({'id':song}, {'_id':0, 'description_refs.artist':1, 'description_refs.song':1, \
                       'sampled_songs':1, 'producers':1, 'writers':1, \
                       'featured_artists':1, 'artist_id':1, 'id':1}).next()
    for e in info:
        if e == 'artist_id':
            contributors['artist_id'].append(int(info[e]))
        if e == 'description_refs':
            if 'artist' in e:
                for a in e['artist']:
                    contributors['description_refs'].append(int(a['id']))
        if e == 'references' in info:
            if 'artist' in e:
                for a in e['artist']:
                    contributors['references'].append(int(a['id']))
        if e == 'producers':
            for a in info[e]:
                contributors['producers'].append(int(a['id']))
        if e == 'writers':
            for a in info[e]:
                contributors['writers'].append(int(a['id']))
        if e == 'featured_artists':
            for a in info[e]:
                contributors['featured_artists'].append(int(a['id']))
        if e == 'sampled_songs':
            for samp in info[e]:
                contributors['sampled_songs'].append(int(samp['primary_artist']['id']))
    for artist_type in contributors:
        total += weights[artist_type]
    for artist_type, influencers in contributors.iteritems():
        frac = weights[artist_type]/len(influencers)
        for influencer in influencers:
            cont_weights[influencer] += frac/total
    influences = {(a):(0 if a not in cont_weights.keys() else cont_weights[a]) for a in artists}
    influences_series = pd.Series(data=influences.values(), index=influences.keys())
    return influences_series

### make influencers x songs matrix
Use above equation to calculate the average influence of each artist and fill df. Columns are songs and rows are artists. The columns each sum to 1 because the influence for a given song is normalized.

In [10]:
df = pd.DataFrame(columns = songs)

In [11]:
df.shape

(0, 232)

In [12]:
for song in songs:
    try:
        series = get_info(song)
        df[song] = series
    except:
        print "unable to score songid: ", song
        continue

In [13]:
for col in df:
    thesum = df[col].sum()
    if thesum<.99999 or thesum>1.00001:
        print col, thesum

In [14]:
df.shape

(452, 232)

Columns are song ids and rows are influencers (artist ids). There are 452 unique artists and 232 unique songs at this point in time. The sum of any given column should be 1 because the influence scores are normalized.

In [15]:
df

Unnamed: 0,653082,2396122,2415289,2369079,2411389,2304247,2396302,599451,2398213,2433137,...,217270,1800164,47702,58349,73198,2045,517621,164344,639914,505193
1,0,0,0,0.0,0,0.000000,0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,0,0.000000,0.000000,0.0
2,0,0,0,0.0,0,0.000000,0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,0,0.000000,0.000000,0.0
3,0,0,0,0.0,0,0.000000,0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,0,0.000000,0.000000,0.0
4,0,0,0,0.0,0,0.000000,0,0.0,0.000000,0.285714,...,0,0,0,0,0,0,0,0.000000,0.000000,0.0
2057,0,0,0,0.0,0,0.000000,0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,0,0.000000,0.000000,0.0
329739,0,0,0,0.0,0,0.000000,0,0.0,0.037594,0.000000,...,0,0,0,0,0,0,0,0.000000,0.000000,0.0
13,0,0,0,0.0,0,0.000000,0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,0,0.000000,0.000000,0.0
589827,0,0,0,0.0,0,0.000000,0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,0,0.000000,0.000000,0.0
20503,0,0,0,0.5,0,0.000000,0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,0,0.000000,0.000000,0.0
589828,0,0,0,0.0,0,0.000000,0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,0,0.000000,0.000000,0.0


The below code is a little sanity check. By seeing column for songid = 639914 in the above df, we see artistid of 1362 is .1111. (1/9) This validates that.

In [75]:
# first edit the get_info def to return contributors dict
# info = get_info(639914)

print info

#do the rest by hand 
total = 0
cont_weights = defaultdict(float)
for artist_type in info:
    total += weights[artist_type]
for artist_type, influencers in info.iteritems():
    frac = weights[artist_type]/len(influencers)
    for influencer in influencers:
        cont_weights[influencer] += frac/total
        if influencer == 1362:
            print influencer, artist_type, frac, total, frac/total

print cont_weights

defaultdict(<type 'list'>, {'producers': [574528, 290415], 'artist_id': [2358], 'writers': [574528, 290415, 1362]})
1362 writers 3.33333333333 30.0 0.111111111111
defaultdict(<type 'float'>, {574528: 0.2777777777777778, 1362: 0.11111111111111112, 2358: 0.3333333333333333, 290415: 0.2777777777777778})


In [16]:
pickleDump(df, 'songs_by_influencer_452x232.pkl')

### make influencers x artist df
1. Get artist to song dictionary
2. Average the artists' songs to get an average influence on that artist
3. Pop into a df

In [16]:
artist_to_songs = defaultdict(list)

In [17]:
for x in df.columns:
    artist = int(s.find({"id":x}, {"artist_id":1, "_id":0}).next()['artist_id'])
    artist_to_songs[artist].append(x)

In [18]:
# number of primary artists in collection
len(artist_to_songs)

109

In [19]:
# sanity check that number of songs by artist is equivalent to number of songs
allsongs = 0
for k, v in artist_to_songs.iteritems():
    allsongs += len(v)
print allsongs

232


In [20]:
artist_df = pd.DataFrame(columns = artist_to_songs)

In [21]:
total = 0
for k, v in artist_to_songs.iteritems():
    total = float(len(v))
    avgsong = df[v[0]]
    if len(v) > 1:
        for song in v[1:]:
            avgsong = avgsong + df[song]
    avgsong = avgsong/total
    artist_df[k] = avgsong

In [22]:
# Expecting (452, 109), e.g. (influencers x primary artists)
artist_df.shape

(452, 109)

### NMF Single Value Decomposition
Use SVD to reduce the number of features in the influencers x song matrix. From the (452, 232) matrix it will yield (452, 20) and (20, 232). These 20 features will correspond to both the influencers and the song. From there I can make recommendations on songs. NMF means non-negative matrix factorization. We want to ensure no negatives.

In [23]:
X = np.array(df)

In [24]:
model = NMF(n_components=20)

In [25]:
model.fit(X)

NMF(alpha=0.0, beta=1, eta=0.1, init=None, l1_ratio=0.0, max_iter=200,
  n_components=20, nls_max_iter=2000, random_state=None, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)

In [26]:
len(model.components_)

20

In [27]:
components = model.components_

In [28]:
len(components[0])

232

In [29]:
nmfdf = pd.DataFrame(components, columns=songs)

In [30]:
nmfdf.shape

(20, 232)

In [31]:
song_by_songdf = pd.DataFrame(columns=songs)

In [32]:
for song in nmfdf:
    song_by_songdf[song] = pd.Series(data = [nmfdf[song].dot(nmfdf[other]) for other in nmfdf], index = songs)

In [33]:
song_by_songdf[2396122].sort_values(ascending=False, inplace=False)[0:10]

496456     0.297429
1800164    0.184203
496445     0.166771
2347642    0.166771
2390938    0.120399
2396122    0.096972
696428     0.094260
653082     0.093998
710897     0.037487
332635     0.010447
Name: 2396122, dtype: float64

In [34]:
top_ten_promise = song_by_songdf[2396122].sort_values(ascending=False, inplace=False)[0:10]

In [35]:
top_ten_promise = list(top_ten_promise.index)

In [36]:
top_ten_promise

[496456,
 1800164,
 496445,
 2347642,
 2390938,
 2396122,
 696428,
 653082,
 710897,
 332635]

In [37]:
for num, related in enumerate(top_ten_promise):
    r = s.find({"id":related}, {"title":1, "artist":1}).next()
    print num+1, ". ", r['title'], " by ", r['artist']

1 .  ZooWap  by  Fetty Wap
2 .  Zoovier (Remix)  by  Fetty Wap
3 .  Trap Queen  by  Fetty Wap
4 .  Jimmy Choo  by  Fetty Wap
5 .  Merry Xmas  by  Fetty Wap
6 .  Promise  by  Kid Ink
7 .  679  by  Fetty Wap
8 .  My Way  by  Fetty Wap
9 .  Beautiful  by  Remy Boyz
10 .  Or Nah  by  Ty Dolla $ign


In [38]:
work = list(song_by_songdf[2398213].sort_values(ascending=False, inplace=False)[0:10].index)

In [39]:
for num, related in enumerate(work):
    r = s.find({"id":related}, {"title":1, "artist":1}).next()
    print num+1, ". ", r['title'], " by ", r['artist']

1 .  OVO Sound Radio Episode 14 Tracklist  by  Drake
2 .  OVO Sound Radio Episode 15 Tracklist  by  Drake
3 .  Houstatlantavegas  by  Drake
4 .  Hot 97 Freestyle  by  Drake
5 .  Heat of the Moment  by  Drake
6 .  Trust Issues  by  Drake
7 .  If You're Reading This It's Too Late [Album Art + Tracklist]  by  Drake
8 .  Back to Back  by  Drake
9 .  Company  by  Drake
10 .  In The Morning  by  J. Cole


In [40]:
work = list(song_by_songdf[2398213].sort_values(ascending=False, inplace=False)[10:20].index)

In [41]:
for num, related in enumerate(work):
    r = s.find({"id":related}, {"title":1, "artist":1}).next()
    print num+1, ". ", r['title'], " by ", r['artist']

1 .  Summer Sixteen  by  Drake
2 .  Up All Night  by  Drake
3 .  Hotline Bling  by  Drake
4 .  Furthest Thing  by  Drake
5 .  No Tellin'  by  Drake
6 .  Started from the Bottom  by  Drake
7 .  Know Yourself  by  Drake
8 .  Worst Behavior  by  Drake
9 .  Moment 4 Life  by  Nicki Minaj
10 .  The Real Her  by  Drake


In [42]:
def choose(seed_song):
    songlist = list(song_by_songdf[seed_song].sort_values(ascending=False, inplace=False).index)
    choose = abs(int(np.random.normal(0, 16)))
    while(choose >= len(songlist)):
        choose = abs(int(np.random.normal(0, 16)))
    choose = songlist[choose]
    return choose

In [116]:
playlist = []
playlistids = []
def generate_playlist(seed_song, playlist=playlist):
    info = s.find({"id":seed_song}, {"title":1, "artist":1}).next()
    playlist.append((info['title'], info['artist']))
    playlistids.append(seed_song)
    ch = choose(seed_song)
    while ch in playlistids:
        ch = choose(seed_song)
    return ch

In [59]:
def format_playlist(playlist):
    for num, track in enumerate(playlist):
        print num+1, ". ", track[0], " by ", track[1]

In [91]:
seed = 2398213

In [92]:
for x in range(20):
    seed = generate_playlist(seed)

In [93]:
format_playlist(playlist)

1 .  OVO Sound Radio Episode 14 Tracklist  by  Drake
2 .  OVO Sound Radio Episode 15 Tracklist  by  Drake
3 .  Know Yourself  by  Drake
4 .  Houstatlantavegas  by  Drake
5 .  Trust Issues  by  Drake
6 .  No New Friends  by  DJ Khaled
7 .  FuckWithMeYouKnowIGotIt  by  Jay Z
8 .  Hood Billionaire Tracklist  by  Rick Ross
9 .  Millions  by  Pusha T
10 .  Where Ya At  by  Future
11 .  Fuck Up Some Commas  by  Future
12 .  Rich $ex  by  Future
13 .  My Savages  by  Future
14 .  Dirty Sprite  by  Future
15 .  I'm the Plug  by  Drake & Future
16 .  Plastic Dreams  by  G-Eazy
17 .  Or Nah  by  Ty Dolla $ign
18 .  Jimmy Choo  by  Fetty Wap
19 .  My Way  by  Fetty Wap
20 .  Last Call  by  J. Cole
21 .  Work  by  Rihanna
22 .  In The Morning  by  J. Cole
23 .  Under Ground Kings  by  Drake
24 .  R.I.C.O.  by  Meek Mill
25 .  Summer Sixteen  by  Drake
26 .  Up All Night  by  Drake
27 .  Back to Back  by  Drake
28 .  Heat of the Moment  by  Drake
29 .  Take Care  by  Drake
30 .  If You're Reading T

In [99]:
playlist = []
playlistids = []

In [119]:
seed = 78169
for x in range(20):
    seed = generate_playlist(seed)

In [120]:
format_playlist(playlist)

1 .  ...Baby One More Time  by  Britney Spears
2 .  Body  by  Dreezy
3 .  Crew Love  by  Drake
4 .  Girls Born in the 90's  by  The Weeknd
5 .  Blank Space  by  Taylor Swift
6 .  Love Me Harder  by  Ariana Grande
7 .  Low Life  by  Future
8 .  EVOL [Tracklist + Cover Art]  by  Future
9 .  Rich $ex  by  Future
10 .  Photo Copied  by  Future
11 .  Same Damn Time  by  Future
12 .  DS2 Cover Art + Tracklist  by  Future
13 .  Diamonds Dancing  by  Drake & Future
14 .  Houstatlantavegas  by  Drake
15 .  OVO Sound Radio Episode 14 Tracklist  by  Drake
16 .  Up All Night  by  Drake
17 .  Hotline Bling  by  Drake
18 .  Company  by  Drake
19 .  What's My Name?  by  Rihanna
20 .  Trust Issues  by  Drake
21 .  ...Baby One More Time  by  Britney Spears
22 .  The Hills  by  The Weeknd
23 .  Woo  by  Rihanna
24 .  Work  by  Rihanna
25 .  Summer Sixteen  by  Drake
26 .  Moment 4 Life  by  Nicki Minaj
27 .  OVO Sound Radio Episode 15 Tracklist  by  Drake
28 .  Champion  by  Nicki Minaj
29 .  Started fr