In [1]:
from tswift import Artist, Song, TswiftError
import random

import numpy as np
import torch
from sklearn.mixture import GaussianMixture

import codecs
import pickle
import logging
import nltk
# English stop words from the NLTK corpus reader, collected into a set for
# fast membership tests when filtering lyric tokens.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand -- confirm for a fresh environment.
stopwords = nltk.corpus.stopwords
remove_these = {stop_word for stop_word in stopwords.words('english')}

def load_external_embeddings(params, emb_path):
    """
    Reload pretrained embeddings from a text file.

    Every data line is expected to look like ``<word> <v_1> ... <v_d>`` with
    the word separated from the vector by a single space. A line that splits
    into exactly two whitespace-separated tokens is treated as a header
    (e.g. ``<count> <dim>``) and skipped; blank lines are skipped as well.

    Args:
        params: object exposing ``emb_dim`` (expected vector length) and
            ``cuda`` (truthy to move the result to the GPU when one is
            available).
        emb_path: path of the embeddings text file; it is decoded as UTF-8.

    Returns:
        ``(dico, embeddings)`` where ``dico`` maps each word to its row index
        and ``embeddings`` is a float32 tensor of shape
        ``(len(dico), params.emb_dim)``.

    Raises:
        AssertionError: if a word appears twice or a vector does not have
            ``params.emb_dim`` components (the message is the word, resp. the
            0-based line index).
        ValueError: if the file contains no embedding line at all.
    """
    word2id = {}
    vectors = []

    # load pretrained embeddings
    _emb_dim_file = params.emb_dim
    # Decode explicitly as UTF-8 so that non-ASCII words load identically on
    # every platform instead of depending on the locale's default encoding.
    with open(emb_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            tokens = line.split()
            if not tokens or len(tokens) == 2:
                # blank line or "<count> <dim>" style header
                continue
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            # Validate before touching vect[0] so that a malformed line is
            # reported with its line index rather than as an IndexError.
            assert word not in word2id, word
            assert vect.shape == (_emb_dim_file,), i
            if np.linalg.norm(vect) == 0:  # avoid to have null embeddings
                vect[0] = 0.01
            word2id[word] = len(word2id)
            vectors.append(vect[None])

    if not vectors:
        raise ValueError("No embeddings found in %s" % emb_path)

    logging.info("Loaded %i pre-trained word embeddings", len(vectors))

    dico = word2id

    embeddings = np.concatenate(vectors, 0)
    embeddings = torch.from_numpy(embeddings).float()
    # Only move to the GPU when it was requested and is actually present.
    embeddings = embeddings.cuda() if params.cuda and torch.cuda.is_available() else embeddings
    assert embeddings.size() == (len(word2id), params.emb_dim), ((len(word2id), params.emb_dim, embeddings.size()))

    return dico, embeddings

# Load the saved parameter object and the embeddings it describes.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load parameter files that come from a trusted source.
with open("model/ft_params.pkl", 'rb') as pkl_file:  # closed on exit, even on error
    params = pickle.load(pkl_file)
out_dico, out_emb = load_external_embeddings(params, "data/ft_postspec.txt")

In [11]:
# Build per-song bag-of-words statistics and the lyric vocabulary.
punctuation = ['.', ',', '?', '!', '(', ')']  # removed from lyrics before tokenising

artists = [Artist('Kendrick Lamar'), Artist('Radiohead')]

words_count = 0  # number of kept tokens over all songs

word_set = {}  # word -> overall count
words = []  # word_idx -> word_str
word2idx = {}  # word_str -> word_idx
word_emb_dict = {}  # word_str -> embedding

songs = []  # song_idx -> "<title> --- <artist name>"
song_words = []  # song_idx -> {word: count}; re-bound below to song_idx -> [(word, weight), ...]
song2idx = {}  # song title, and "<title> --- <artist name>" -> song_idx
song_lyric_lengths = {}  # same keys as song2idx -> number of kept tokens

for artist in artists:
    for song in artist.songs:
        try:
            lyrics = song.lyrics
        except TswiftError:
            # lyrics could not be loaded for this song
            continue

        # Reuse the value fetched above; also skips a missing (None) lyric.
        if not lyrics:
            continue

        lyrics = lyrics.replace('\n', ' ')
        for p in punctuation:
            lyrics = lyrics.replace(p, '')
        lyrics = [word.lower() for word in lyrics.split(' ') if word != ""]
        # Stop words are matched before apostrophes are stripped.
        lyrics = [word.replace("'", "") for word in lyrics if word not in remove_these]
        # Keep only words that have an "en_"-prefixed entry in the embedding vocabulary.
        lyrics = [word for word in lyrics if "en_" + word in out_dico]

        song_word_set = {}
        for word in lyrics:
            words_count += 1
            song_word_set[word] = song_word_set.get(word, 0) + 1
            word_set[word] = word_set.get(word, 0) + 1

        display_name = song.title + " --- " + artist.name
        song_idx = len(songs)
        songs.append(display_name)
        song_words.append(song_word_set)
        # Bare titles stay registered for existing lookups, but two artists may
        # share a title (the later song then wins). The display name stored in
        # `songs` is registered as well so every song has an unambiguous key.
        for key in (song.title, display_name):
            song2idx[key] = song_idx
            song_lyric_lengths[key] = len(lyrics)


# Turn raw counts into weights:
#   weight = count_in_song * words_count / (vocabulary_size * overall_count)
# A further per-song factor was tried and is currently disabled:
#   * (len(song_word_set) / song_lyric_lengths[songs[i]])
song_words = [
    [
        (word, count * (words_count / (len(word_set) * word_set[word])))
        for word, count in song_word_set.items()
    ]
    for song_word_set in song_words
]

# Vocabulary tables, indexed in first-seen order.
for i, word in enumerate(word_set):
    words.append(word)
    word2idx[word] = i
    word_emb_dict[word] = out_emb[out_dico["en_" + word]]

# print(len(words))

In [3]:
# Sanity check on a few hand-picked songs: print the mean word weight of each
# song, and that mean divided by the matching entry of `lengths`.
tests = ['Sing About Me Im Dying Of Thirst Explicit', 'For Free Interlude', 'The Blacker The Berry', 'King Kunta', 'Hood Politics']
lengths = [12, 2.1666, 5.5, 4, 5]
for test, length in zip(tests, lengths):
    test_id = song2idx[test]
    ls = [entry[1] for entry in song_words[test_id]]
    mean_weight = sum(ls) / len(ls)
    print(mean_weight, mean_weight / length)
    
    
"""
With division by average frequency of each word in the song
0.7232016032635682 0.060266800271964016
1.1597967578578583 0.5353072823123135
1.295883770505717 0.23561523100103943
0.7244399269128398 0.18110998172820994
0.8766220082344135 0.1753244016468827
"""
"""
Without 
1.3540493672105716 0.11283744726754763
1.6706596154857252 0.7710973947594043
2.265323537754268 0.4118770068644124
1.5919221323880868 0.3979805330970217
1.4771425965524765 0.2954285193104953
"""

1.3540493672105716 0.11283744726754763
1.6706596154857252 0.7710973947594043
2.265323537754268 0.4118770068644124
1.5919221323880868 0.3979805330970217
1.4771425965524765 0.2954285193104953


'\nWithout \n1.3540493672105716 0.11283744726754763\n1.6706596154857252 0.7710973947594043\n2.265323537754268 0.4118770068644124\n1.5919221323880868 0.3979805330970217\n1.4771425965524765 0.2954285193104953\n'

In [12]:
"""with open('artists.txt', 'r') as artists:
    for artist_name in artists:
        artist_name = artist_name.rstrip()
        print(artist_name)
        artist = Artist(artist_name)
        print(len(artist.songs))"""

# Stack the per-word embedding vectors, in `words` order, into one matrix.
tensor_embeddings = torch.cat([torch.unsqueeze(word_emb_dict[word],0) for word in words], 0) # M: len(words_train) by N: embedding length 
np_embeddings = tensor_embeddings.cpu().numpy()

# 80/20 train/test split over word indices. A dedicated seeded generator keeps
# the split (and everything fitted on it) reproducible without touching the
# global `random` state; sorting makes the test order deterministic too.
split_rng = random.Random(0)
train_indices = split_rng.sample(range(len(words)), int(len(words) * 0.8))
test_indices = sorted(set(range(len(words))) - set(train_indices))

print(len(train_indices))
print(len(test_indices))

# Index the CPU copy directly: same rows as selecting on the tensor, but it
# does not require a CUDA device to be present.
np_embeddings_train = np_embeddings[train_indices]
np_embeddings_test = np_embeddings[test_indices]

6566
1642


In [13]:
# For each candidate number of mixture components, fit a Gaussian mixture on
# the training embeddings and record its AIC and score on the held-out ones.
clusters = [4, 8, 12, 16, 20]
aic = []
score = []

for K in clusters:
    gm = GaussianMixture(n_components=K, random_state=0)
    gm.fit(np_embeddings_train)
    aic.append(gm.aic(np_embeddings_test))
    score.append(gm.score(np_embeddings_test))
    print(K)  # progress marker

print("aic =", aic)
print("score =", score)

"""
    clusters = [10, 30, 50, 80, 100, 150, 200]
    aic = [-416948.73002689006, 1874017.8289749706, 87632445.04233673, 340288144.228958, 435794698.2793229, 530364033.43955195, 591455053.0331199]
    score = [443.1706985383991, 285.10700903242963, -27769.835241422705, -111302.13443481216, -142615.14046768815, -172703.45435813902, -191602.4916554545]
    
    clusters = [4, 8, 12, 16, 20]
    aic = [-1120051.801835596, -664663.3885352733, -159108.9880303305, 575654.1920049496, 1211183.7582672983]
    score = [495.87493376858157, 465.1996619436074, 417.75768316521743, 293.7084919769554, 202.82561555237356]
    
    clusters = [1, 2, 3, 4, 5]
    aic = [-1372253.9477600188, -1303254.5138943293, -1215141.8425588657, -1120051.801835596, -1018620.5975386647]
    score = [489.02204136364264, 496.3424177454309, 497.2746799996209, 495.87493376858157, 492.3558146853826]
   
"""

"""F"""

4
8
12
16
20
aic = [-1274290.1788476622, -846039.6055416595, -326186.0471678823, 321565.92780026444, 846240.6611338152]
score = [498.7503589670104, 479.0662623452069, 431.48844310836853, 344.96469920820203, 295.91879989835104]


'F'

In [6]:
# print(song_words[178])

# List every song next to its index, for looking up a song_idx by eye.
print([(idx, name) for idx, name in enumerate(songs)])

[(0, '67 Freestyle'), (1, 'A Tale Of 2 Citiez Remix'), (2, 'Adhd'), (3, 'All Day'), (4, 'All The Stars'), (5, 'All The Stars Og'), (6, 'Alright'), (7, 'Alright Og'), (8, 'Btch Dont Kill My Vibe'), (9, 'Backseat Freestyle'), (10, 'Backseat Freestyle Explicit'), (11, 'Backwards'), (12, 'Bad Blood Remix'), (13, 'Bet Backroom Freestyle'), (14, 'Bet Cypher'), (15, 'Beyonce'), (16, 'Big Shot'), (17, 'Biggie Freestyle'), (18, 'Bitch Dont Kill My Vibe Remix'), (19, 'Bitch Dont Kill My Vibe'), (20, 'Bitch Dont Kill My Vibe Remix Explicit'), (21, 'Bitch Dont Kill My Vibe Explicit'), (22, 'Bitch Donat Kill My Vibe'), (23, 'Black Boy Fly'), (24, 'Black Boy Fly Explicit'), (25, 'Black Friday'), (26, 'Black Friday A Tale Of 2 Citiez Remix'), (27, 'Black Panther'), (28, 'Blood'), (29, 'Blood Explicit'), (30, 'Blue Faces'), (31, 'Cartoon Cereal'), (32, 'Chapter Six'), (33, 'Cloud 10'), (34, 'Collard Greens'), (35, 'Collect Calls'), (36, 'Complexion'), (37, 'Complexion A Zulu Love'), (38, 'Compton'), (

In [14]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
K = 39  # number of mixture components used for the final clustering

# Fit the final mixture on all word embeddings.
gm = GaussianMixture(n_components=K, random_state=0).fit(np_embeddings)
# Keep the assignments on the same device as the embeddings instead of
# hard-coding "cuda", so the cell also runs on a CPU-only machine.
classes = torch.tensor(gm.predict(np_embeddings), device=tensor_embeddings.device) # (M,)
probs = torch.tensor(gm.predict_proba(np_embeddings), device=tensor_embeddings.device) # (M, K)


# groups[k] lists the words whose predicted component is k.
groups = [[] for _ in range(K)]
for i,group in enumerate(classes.tolist()):
    groups[group].append(words[i])

pp.pprint(groups)

[   [   'rectum',
        'ovaries',
        'diseases',
        'semen',
        'animal',
        'inside',
        'dna',
        'endorphins',
        'pills',
        'vicodin',
        'marijuana',
        'cough',
        'adhd',
        'babies',
        'freebase',
        'painkillers',
        'scrotum',
        'remedy',
        'adderall',
        'cocaine',
        'drug',
        'spinal',
        'amphetamines',
        'yolk',
        'adrenaline',
        'chromosome',
        'birthing',
        'narcotics',
        'detox',
        'infants',
        'metabolic',
        'pill',
        'jugular',
        'pelvis',
        'stomach',
        'arthritis',
        'condom',
        'rabies',
        'dental',
        'gums',
        'nerves',
        'poison',
        'hormones',
        'cancer',
        'actin',
        'breathing',
        'potion',
        'swallowing',
        'anus',
        'nicotine',
        'tylenol',
        'addiction',
        'sucking',


        'jimi',
        'hoochie',
        'coochie',
        'whoa',
        'kunta',
        'burnin',
        'nah',
        'uh',
        'trippin',
        'woah',
        'imma',
        'dro',
        'parfait',
        'gettin',
        'lookin',
        'djs',
        'talkin',
        'nothin',
        'comin',
        'fo',
        'sho',
        'guitar',
        'lovin',
        'screamin',
        'hangin',
        'tryin',
        'ima',
        'freestyles',
        'ive',
        'oj',
        'maad',
        'jokes',
        'killa',
        'jordin',
        'moms',
        'ballin',
        'sya',
        'beyonce',
        'ay',
        'thinkin',
        'wyclef',
        'alot',
        'latoya',
        'doo',
        'sayin',
        'takin',
        'shoulda',
        'nas',
        'blige',
        'dat',
        'diddy',
        'haters',
        'mmm',
        'iggy',
        'js',
        'ye',
        'hes',
        'commercials',
        'lotta',
       

        'lid',
        'furniture',
        'velvet',
        'boots',
        'robes',
        'dressing',
        'coated',
        'tooth',
        'pins',
        'skull',
        'cupboard',
        'feather',
        'stretchy',
        'wallpaper',
        'cloth',
        'armor',
        'feathers',
        'hairdo',
        'knives',
        'appliances',
        'seal',
        'curling',
        'keyhole',
        'stitched',
        'pigskin',
        'dentures',
        'polyethylene',
        'gloves',
        'fingernails',
        'jaws',
        'pane',
        'seams'],
    [   'rap',
        'jaw',
        'hit',
        'strip',
        'rock',
        'stumble',
        'pack',
        'spike',
        'beat',
        'bore',
        'scrape',
        'breeze',
        'grip',
        'shoot',
        'stick',
        'rush',
        'stomp',
        'crush',
        'thrust',
        'clip',
        'ding',
        'ring',
        'shot',
        'shake',
       

        'sick',
        'ridiculous',
        'haunted',
        'sentimental',
        'crazy',
        'corrupted',
        'false',
        'complicated',
        'afraid',
        'misusing',
        'scared',
        'hating',
        'embarrassed',
        'supposed',
        'mad',
        'annoyed',
        'crackers',
        'sensitive',
        'nuts',
        'irrational',
        'jealous',
        'bogus',
        'emotional',
        'shocked',
        'lying',
        'alarmed',
        'facetious',
        'terrified',
        'stricken',
        'hysterical',
        'warm',
        'funny',
        'maniac',
        'nervous',
        'unfaithful',
        'selfish',
        'mistaken',
        'nutty',
        'rocky',
        'hooked',
        'babysitting',
        'hungry',
        'thirsty',
        'joking',
        'crooked',
        'allergic',
        'questioning',
        'enthused',
        'wondering',
        'tender',
        'accused',
        'bats',

In [15]:
# Obtain class matrices from tensors
#
# K: # of clustered classes
# N: # of dims in embedding
# S: # of words in the song's bag-of-words
# count: S x 1 # weight stored for each word of the song
# lyrics_to_cluster_probs: S x K  Produce by filtering the rows of probs down to only the words in the song
# lyrics_to_embs: S x N  Produce by filtering rows of tensor_embeddings down to only words in the song
# Then the desired K x N class matrix should be = lyrics_to_cluster_probs.T @ (lyrics_to_embs * count)

test_id = song2idx['The Blacker The Berry']
print(test_id)

# Work on whatever device the embeddings live on rather than hard-coding
# "cuda"; `.to` is a no-op when `probs` is already there.
device = tensor_embeddings.device
cluster_probs = probs.to(device)

song_class_matrices = []
for song_idx in range(len(songs)):
    indices = []
    count = []
    for tup in song_words[song_idx]:
        indices.append(word2idx[tup[0]])
        count.append(tup[1])

    # An explicit integer dtype keeps the index tensor valid for index_select
    # even when a song has no kept words (an empty list would otherwise
    # become a float tensor).
    indices = torch.tensor(indices, dtype=torch.long, device=device)
    count = torch.tensor(count, device=device)
    lyrics_to_cluster_probs = torch.index_select(cluster_probs, 0, indices)
    lyrics_to_embs = torch.index_select(tensor_embeddings, 0, indices)
    class_matrix = torch.mm(torch.transpose(lyrics_to_cluster_probs, 0, 1).double(), (lyrics_to_embs * count.view(-1, 1).double()))

    song_class_matrices.append(class_matrix)

# song_class_matrices = torch.stack(song_class_matrices, 0)
# print(song_class_matrices.size())

167


In [16]:
def matrix_similarity(A, B):
    """Return the norm of ``A - B`` (``torch.norm`` default) as a Python number.

    Despite the name this is a distance: 0.0 means the two tensors are
    identical and larger values mean they differ more.
    """
    # Alternative that was tried: torch.sum(torch.abs(A - B)).item()
    return torch.norm(A - B).item()

# Distance of every song's class matrix to the reference song's (test_id),
# printed from closest to farthest.
norms = []
for idx, matrix in enumerate(song_class_matrices):
    norms.append((songs[idx], matrix_similarity(song_class_matrices[test_id], matrix)))
norms_sort = sorted(norms, key=lambda pair: pair[1])
pp.pprint(norms_sort)

[   ('The Blacker The Berry --- kendrick-lamar', 0.0),
    ('Blue Faces --- kendrick-lamar', 76.92169462813334),
    ('Untitled 2 Blue Faces --- kendrick-lamar', 76.92559972366277),
    ('Untitled --- kendrick-lamar', 77.43227208624205),
    ('Untitled 2 --- kendrick-lamar', 77.43227208624205),
    ('The Heart Pt 1 --- kendrick-lamar', 78.26580091272989),
    ('The Heart Pt 2 --- kendrick-lamar', 78.26580091272989),
    ('The Heart Pt 3 --- kendrick-lamar', 78.26580091272989),
    (   'The Heart Pt 3 Will You Let It Die --- kendrick-lamar',
        78.26580091272989),
    ('Hiiiipower --- kendrick-lamar', 78.37711109373573),
    ('Hiiipower --- kendrick-lamar', 78.43868703318167),
    ('I Am --- kendrick-lamar', 78.5116231030984),
    ('Black Friday --- kendrick-lamar', 79.08322864597251),
    (   'Black Friday A Tale Of 2 Citiez Remix --- kendrick-lamar',
        79.08322864597251),
    ('Real --- kendrick-lamar', 79.32932292454609),
    ('Real Explicit --- kendrick-lamar', 79.3293229

In [32]:
# Print the three highest-weighted words of every song (weights rounded to 2 dp).
for idx in range(len(songs)):
    ranked = sorted(song_words[idx], key=lambda pair: pair[1], reverse=True)
    ranked = [(word, round(weight, 2)) for word, weight in ranked]
    print(songs[idx] + ":", ranked[:3])
    # pp.pprint(ranked[-10:])

67 Freestyle --- kendrick-lamar: [('monitors', 10.31), ('tracy', 10.31), ('veronica', 10.31)]
A Tale Of 2 Citiez Remix --- kendrick-lamar: [('cobra', 10.31), ('valedictorian', 5.16), ('radius', 5.16)]
Adhd --- kendrick-lamar: [('vicodin', 10.31), ('loner', 10.31), ('cough', 10.31)]
All Day --- kendrick-lamar: [('parfait', 10.31), ('jeter', 10.31), ('swish', 10.31)]
All The Stars --- kendrick-lamar: [('endorsing', 10.31), ('hoped', 5.16), ('confrontation', 5.16)]
All The Stars Og --- kendrick-lamar: [('hoped', 5.16), ('confrontation', 5.16), ('morgue', 5.16)]
Alright --- kendrick-lamar: [('twilight', 5.16), ('preliminary', 5.16), ('hearings', 5.16)]
Alright Og --- kendrick-lamar: [('twilight', 5.16), ('preliminary', 5.16), ('hearings', 5.16)]
Btch Dont Kill My Vibe --- kendrick-lamar: [('painless', 1.72), ('awoke', 1.72), ('scar', 1.72)]
Backseat Freestyle --- kendrick-lamar: [('vacation', 5.16), ('maserati', 5.16), ('judas', 5.16)]
Backseat Freestyle Explicit --- kendrick-lamar: [('vac