## Goal of notebook 
- Create word model for pop and rock
- Compute embeddings for words in pop and rock
- Apply MUSE matching

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
filepath = "lyrics_final_clean.csv"
lyrics_df = pd.read_csv(filepath)
lyrics_df = lyrics_df.dropna(axis = 0, how='any', subset=['lyrics'])
lyrics_df.head()

Unnamed: 0,artist,genre,lyrics,song
0,beyonce-knowles,pop,Oh baby how you doing\nYou know I'm gonna cut ...,ego-remix
1,beyonce-knowles,pop,playin' everything so easy\nit's like you seem...,then-tell-me
2,beyonce-knowles,pop,If you search\nFor tenderness\nIt isn't hard t...,honesty
3,beyonce-knowles,pop,Oh oh oh I oh oh oh I\n\nIf I wrote a book abo...,you-are-my-rock
4,beyonce-knowles,pop,Party the people the people the party it's pop...,black-culture


## Word Embeddings (Fasttext)

In [195]:
#Extrat specific genre lyrics
def extract_genre_lyrics(g):
    f = open("lyrics_"+g+".txt", "w+")
    for i, r in lyrics_df[lyrics_df['genre'] == g].iterrows():
        text = r['lyrics']+ '\n'
        text = text.lower()
        text = text.replace("'", ' ')
        f.write(text)

    f.close()

In [196]:
extract_genre_lyrics('pop')

In [197]:
extract_genre_lyrics('rock')

In [198]:
# Generate embeddings pop
! ./fasttext cbow -input lyrics_pop.txt -output model_lyrics_pop

Read 16M words
Number of words:  29549
Number of labels: 0


Progress: 48.0%  words/sec/thread: 285192  lr: 0.026017  loss: 1.994570  eta: 0h0m   words/sec/thread: 12029  lr: 0.049895  loss: 4.074791  eta: 0h9m   lr: 0.049740  loss: 3.411579  eta: 0h4m   lr: 0.049619  loss: 3.076413  eta: 0h2m   loss: 2.893054  eta: 0h2m m   eta: 0h1m hread: 110384  lr: 0.048597  loss: 2.544579  eta: 0h1m /thread: 112257  lr: 0.048561  loss: 2.536121  eta: 0h1m hread: 114529  lr: 0.048517  loss: 2.522033  eta: 0h0m 2.460153  eta: 0h0m 3.7%  words/sec/thread: 131223  lr: 0.048157  loss: 2.455158  eta: 0h0m %  words/sec/thread: 136387  lr: 0.048032  loss: 2.434868  eta: 0h0m hread: 137878  lr: 0.047993  loss: 2.431771  eta: 0h0m hread: 147843  lr: 0.047732  loss: 2.391124  eta: 0h0m hread: 149125  lr: 0.047695  loss: 2.383582  eta: 0h0m hread: 160388  lr: 0.047370  loss: 2.348550  eta: 0h0m hread: 163663  lr: 0.047266  loss: 2.333563  eta: 0h0m ec/thread: 174601  lr: 0.046859  loss: 2.290448  eta: 0h0m ec/thread: 180164  lr: 0.046634  loss: 2.269885  eta: 0h0m h0m

Progress: 72.9%  words/sec/thread: 287960  lr: 0.013561  loss: 1.945256  eta: 0h0m   words/sec/thread: 285260  lr: 0.025892  loss: 1.993766  eta: 0h0m   lr: 0.025831  loss: 1.993454  eta: 0h0m ad: 285247  lr: 0.025769  loss: 1.993389  eta: 0h0m 0h0m   eta: 0h0m   words/sec/thread: 285514  lr: 0.025478  loss: 1.991567  eta: 0h0m   words/sec/thread: 285638  lr: 0.025331  loss: 1.990626  eta: 0h0m h0m %  words/sec/thread: 285694  lr: 0.025261  loss: 1.990116  eta: 0h0m   words/sec/thread: 285757  lr: 0.025222  loss: 1.990181  eta: 0h0m /thread: 285780  lr: 0.025193  loss: 1.990182  eta: 0h0m ead: 285843  lr: 0.025152  loss: 1.990350  eta: 0h0m   words/sec/thread: 285854  lr: 0.025120  loss: 1.990456  eta: 0h0m h0m   words/sec/thread: 285940  lr: 0.024995  loss: 1.989883  eta: 0h0m loss: 1.989595  eta: 0h0m   words/sec/thread: 286000  lr: 0.024913  loss: 1.989151  eta: 0h0m   words/sec/thread: 285996  lr: 0.024879  loss: 1.989102  eta: 0h0m   words/sec/thread: 286029  lr: 0.024782  loss: 1

Progress: 93.2%  words/sec/thread: 287667  lr: 0.003399  loss: 1.901590  eta: 0h0m loss: 1.944730  eta: 0h0m 4631  eta: 0h0m 0.013181  loss: 1.944548  eta: 0h0m 0.013108  loss: 1.944481  eta: 0h0m ad: 288036  lr: 0.013073  loss: 1.944258  eta: 0h0m thread: 288033  lr: 0.013037  loss: 1.943786  eta: 0h0m   words/sec/thread: 288037  lr: 0.013027  loss: 1.943631  eta: 0h0m ad: 288035  lr: 0.012957  loss: 1.943119  eta: 0h0m 0.012882  loss: 1.943012  eta: 0h0m   words/sec/thread: 288041  lr: 0.012874  loss: 1.943024  eta: 0h0m   words/sec/thread: 288050  lr: 0.012792  loss: 1.942820  eta: 0h0m %  words/sec/thread: 288057  lr: 0.012764  loss: 1.942747  eta: 0h0m ad: 288080  lr: 0.012716  loss: 1.942714  eta: 0h0m   words/sec/thread: 288097  lr: 0.012632  loss: 1.942446  eta: 0h0m ad: 288081  lr: 0.012584  loss: 1.941552  eta: 0h0m   words/sec/thread: 288081  lr: 0.012505  loss: 1.941574  eta: 0h0m   words/sec/thread: 288087  lr: 0.012377  loss: 1.941373  eta: 0h0m ad: 288081  lr: 0.012361  

0.003398  loss: 1.901586  eta: 0h0m Progress: 93.2%  words/sec/thread: 287663  lr: 0.003395  loss: 1.901584  eta: 0h0m Progress: 93.2%  words/sec/thread: 287663  lr: 0.003395  loss: 1.901587  eta: 0h0m Progress: 93.2%  words/sec/thread: 287663  lr: 0.003395  loss: 1.901589  eta: 0h0m Progress: 93.2%  words/sec/thread: 287662  lr: 0.003394  loss: 1.901587  eta: 0h0m Progress: 93.2%  words/sec/thread: 287662  lr: 0.003394  loss: 1.901586  eta: 0h0m Progress: 93.2%  words/sec/thread: 287662  lr: 0.003394  loss: 1.901582  eta: 0h0m Progress: 93.2%  words/sec/thread: 287662  lr: 0.003394  loss: 1.901580  eta: 0h0m Progress: 93.2%  words/sec/thread: 287662  lr: 0.003394  loss: 1.901584  eta: 0h0m Progress: 93.2%  words/sec/thread: 287662  lr: 0.003394  loss: 1.901580  eta: 0h0m Progress: 93.2%  words/sec/thread: 287662  lr: 0.003393  loss: 1.901572  eta: 0h0m Progress: 93.2%  words/sec/thread: 287660  lr: 0.003393  loss: 1.901572  eta: 0h0m Progress: 93.2%  words/sec/thread: 2876

Progress: 100.0%  words/sec/thread: 287569  lr: 0.000000  loss: 1.895366  eta: 0h0m  words/sec/thread: 287579  lr: 0.003264  loss: 1.901560  eta: 0h0m 0.003168  loss: 1.901727  eta: 0h0m   words/sec/thread: 287498  lr: 0.003106  loss: 1.901904  eta: 0h0m   words/sec/thread: 287489  lr: 0.003038  loss: 1.901999  eta: 0h0m ad: 287430  lr: 0.002932  loss: 1.901759  eta: 0h0m thread: 287413  lr: 0.002876  loss: 1.901792  eta: 0h0m ad: 287392  lr: 0.002859  loss: 1.901695  eta: 0h0m ad: 287392  lr: 0.002792  loss: 1.901673  eta: 0h0m   words/sec/thread: 287373  lr: 0.002785  loss: 1.901708  eta: 0h0m ad: 287337  lr: 0.002721  loss: 1.901892  eta: 0h0m ad: 287300  lr: 0.002592  loss: 1.901672  eta: 0h0m 1.901633  eta: 0h0m   words/sec/thread: 287283  lr: 0.002514  loss: 1.901352  eta: 0h0m thread: 287268  lr: 0.002436  loss: 1.901234  eta: 0h0m 0.002392  loss: 1.901105  eta: 0h0m   words/sec/thread: 287266  lr: 0.002367  loss: 1.900847  eta: 0h0m   words/sec/thread: 287267  lr: 0.002288  los

In [199]:
# Generate embeddings rock
! ./fasttext cbow -input lyrics_rock.txt -output model_lyrics_rock

Read 22M words
Number of words:  33997
Number of labels: 0


Progress: 40.4%  words/sec/thread: 300284  lr: 0.029815  loss: 2.069616  eta: 0h0m m   lr: 0.049699  loss: 3.160851  eta: 0h3m   loss: 2.906526  eta: 0h2m 0.9%  words/sec/thread: 60493  lr: 0.049527  loss: 2.845055  eta: 0h2m   loss: 2.760958  eta: 0h1m   lr: 0.049282  loss: 2.750509  eta: 0h1m hread: 120324  lr: 0.048799  loss: 2.589778  eta: 0h1m hread: 127099  lr: 0.048687  loss: 2.558353  eta: 0h1m hread: 129066  lr: 0.048655  loss: 2.548236  eta: 0h1m  loss: 2.503049  eta: 0h1m hread: 139799  lr: 0.048459  loss: 2.495817  eta: 0h1m 89  eta: 0h1m %  words/sec/thread: 146902  lr: 0.048309  loss: 2.460682  eta: 0h1m hread: 154832  lr: 0.048136  loss: 2.440478  eta: 0h0m   lr: 0.048069  loss: 2.432017  eta: 0h0m hread: 160221  lr: 0.048012  loss: 2.418918  eta: 0h0m 2.393095  eta: 0h0m 0h0m 181636  lr: 0.047428  loss: 2.355186  eta: 0h0m hread: 183594  lr: 0.047365  loss: 2.352024  eta: 0h0m hread: 184317  lr: 0.047339  loss: 2.350730  eta: 0h0m 0.047167  loss: 2.344465  eta: 0h0m 0m 

Progress: 67.0%  words/sec/thread: 309370  lr: 0.016494  loss: 1.991949  eta: 0h0m   words/sec/thread: 300443  lr: 0.029719  loss: 2.069431  eta: 0h0m   words/sec/thread: 300532  lr: 0.029680  loss: 2.069059  eta: 0h0m   words/sec/thread: 300590  lr: 0.029651  loss: 2.068648  eta: 0h0m   loss: 2.067190  eta: 0h0m 5440  eta: 0h0m   words/sec/thread: 300982  lr: 0.029316  loss: 2.065063  eta: 0h0m h0m 2.064018  eta: 0h0m ad: 301349  lr: 0.028697  loss: 2.059445  eta: 0h0m 0.028641  loss: 2.058813  eta: 0h0m   words/sec/thread: 301410  lr: 0.028618  loss: 2.058542  eta: 0h0m   words/sec/thread: 301415  lr: 0.028588  loss: 2.058530  eta: 0h0m 0.028139  loss: 2.057700  eta: 0h0m 302167  lr: 0.028083  loss: 2.057727  eta: 0h0m ad: 302213  lr: 0.027977  loss: 2.056631  eta: 0h0m 44.1%  words/sec/thread: 302247  lr: 0.027931  loss: 2.056175  eta: 0h0m 7896  loss: 2.055769  eta: 0h0m ess: 44.8%  words/sec/thread: 302376  lr: 0.027599  loss: 2.053337  eta: 0h0m s/sec/thread: 302422  lr: 0.027566

Progress: 90.2%  words/sec/thread: 311205  lr: 0.004900  loss: 1.951695  eta: 0h0m   words/sec/thread: 309434  lr: 0.016413  loss: 1.991734  eta: 0h0m ad: 309442  lr: 0.016377  loss: 1.991649  eta: 0h0m   words/sec/thread: 309573  lr: 0.016248  loss: 1.991548  eta: 0h0m 0.016223  loss: 1.991288  eta: 0h0m 1.990965  eta: 0h0m ad: 309717  lr: 0.016131  loss: 1.990813  eta: 0h0m   words/sec/thread: 309754  lr: 0.016087  loss: 1.990520  eta: 0h0m 1.990336  eta: 0h0m 0.015994  loss: 1.990606  eta: 0h0m   words/sec/thread: 309883  lr: 0.015889  loss: 1.990425  eta: 0h0m   words/sec/thread: 309888  lr: 0.015826  loss: 1.990164  eta: 0h0m   words/sec/thread: 309852  lr: 0.015713  loss: 1.989577  eta: 0h0m ad: 309831  lr: 0.015618  loss: 1.989069  eta: 0h0m   words/sec/thread: 309751  lr: 0.015472  loss: 1.988670  eta: 0h0m   words/sec/thread: 309721  lr: 0.015447  loss: 1.988567  eta: 0h0m ad: 309675  lr: 0.015338  loss: 1.987903  eta: 0h0m   words/sec/thread: 309600  lr: 0.015005  loss: 1.986

Progress: 100.0%  words/sec/thread: 312334  lr: 0.000000  loss: 1.939480  eta: 0h0m  words/sec/thread: 311261  lr: 0.004838  loss: 1.951455  eta: 0h0m   words/sec/thread: 311268  lr: 0.004832  loss: 1.951380  eta: 0h0m thread: 311326  lr: 0.004764  loss: 1.951257  eta: 0h0m ad: 311328  lr: 0.004709  loss: 1.950978  eta: 0h0m   words/sec/thread: 311347  lr: 0.004595  loss: 1.950639  eta: 0h0m  1.950390  eta: 0h0m 1.4%  words/sec/thread: 311424  lr: 0.004284  loss: 1.949726  eta: 0h0m thread: 311425  lr: 0.004264  loss: 1.949452  eta: 0h0m ad: 311464  lr: 0.004197  loss: 1.949258  eta: 0h0m ad: 311484  lr: 0.004140  loss: 1.949018  eta: 0h0m   words/sec/thread: 311486  lr: 0.004133  loss: 1.948943  eta: 0h0m thread: 311515  lr: 0.004071  loss: 1.948986  eta: 0h0m   words/sec/thread: 311522  lr: 0.004016  loss: 1.948842  eta: 0h0m   words/sec/thread: 311524  lr: 0.004007  loss: 1.948870  eta: 0h0m   words/sec/thread: 311552  lr: 0.003937  loss: 1.948857  eta: 0h0m ad: 311597  lr: 0.003879

In [3]:
import sys
import codecs

In [4]:
# Load embeddings
def load_embeddings(file_name):
    with codecs.open(file_name, 'r', 'utf-8') as f_in:
        lines = f_in.readlines()
        lines = lines[1:]
        vocabulary, wv = zip(*[line.strip().split(' ', 1) for line in lines])
    wv = np.loadtxt(wv)
    return wv, vocabulary

In [200]:
word_embeddings_pop, vocabulary_pop = load_embeddings('model_lyrics_pop.vec')

In [201]:
word_embeddings_rock, vocabulary_rock = load_embeddings('model_lyrics_rock.vec')

In [202]:
voc_pop = list(vocabulary_pop)
voc_rock = list(vocabulary_rock)

In [203]:
len(voc_rock)

33997

In [204]:
print(len(voc_pop))

29549


In [205]:
common_words = set(voc_pop).intersection(set(voc_rock))
print(len(common_words))
common_words = list(common_words)
common_words[:10]

21951


['29',
 'designer',
 'late',
 'clicks',
 'nursed',
 'desmond',
 '1998',
 'mangled',
 'lecture',
 'greg']

In [206]:
import re

In [209]:
for w in voc_rock:
    if re.match('[a-z]+[A-Z]+[a-z]+',w):
        print(w)

In [210]:
#Percentage of common words 
print(len(common_words)/len(voc_pop))
print(len(common_words)/len(voc_rock))

0.7428677789434499
0.6456746183486778


## TF-IDF
Source : https://www.kaggle.com/adamschroeder/countvectorizer-tfidfvectorizer-predict-comments

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [211]:
corpus_pop = list(lyrics_df[lyrics_df['genre'] == 'pop']['lyrics'])
corpus_pop = [x.replace('\n', ' ').replace("'", ' ').lower() for x in corpus_pop]

In [212]:
corpus_rock = list(lyrics_df[lyrics_df['genre'] == 'rock']['lyrics'])
corpus_rock = [x.replace('\n', ' ').replace("'", ' ').lower() for x in corpus_rock]

In [213]:
def get_tfidf(corpus, max_freq, min_freq):
    tf = TfidfVectorizer(stop_words='english', max_df=max_freq, min_df=min_freq)
    X = tf.fit(corpus)
    X_transformed = X.transform(corpus)
    # find maximum value for each of the features over all of dataset:
    max_val = X_transformed.max(axis=0).toarray().ravel()

    feature_names = np.array(tf.get_feature_names())
    sorted_by_idf = np.argsort(tf.idf_)

    #sort weights from smallest to biggest and extract their indices 
    sort_by_tfidf = max_val.argsort()
    return feature_names[sort_by_tfidf]

In [214]:
tfidf_pop = get_tfidf(corpus_pop, 1.0, 0.0)
print("Features with lowest tfidf:\n{}".format(tfidf_pop[:10]))

print("\nFeatures with highest tfidf: \n{}".format(tfidf_pop[-10:]))

Features with lowest tfidf:
['youven' 'snsd' 'naegenaege' 'gakka' 'soljikan' 'jeulgyeobwa' 'tteugeopke'
 'kkeuteopneun' '2pm' 'eumeul']

Features with highest tfidf: 
['intrumental' 'instrumental' 'stand' 'coming' 'turn' 'silhouette'
 'stingray' 'embed' 'lyrics' 'hey']


In [215]:
tfidf_rock = get_tfidf(corpus_rock, 1.0, 0.0)
print("Features with lowest tfidf:\n{}".format(tfidf_rock[:10]))

print("\nFeatures with highest tfidf: \n{}".format(tfidf_rock[-10:]))

Features with lowest tfidf:
['heighho' 'aalley' 'greivin' 'chinchara' 'saftey' 'yayara' 'yeshot'
 'acelandine' 'hivoltage' 'scoundel']

Features with highest tfidf: 
['work' 'paid' 'awolnation' 'raining' 'comes' 'oohooh' 'fm' 'ground' 'gun'
 'hey']


## MUSE
- Supervised: using a train bilingual dictionary (or identical character strings as anchor points), learn a mapping from the source to the target space using (iterative) Procrustes alignment.
- Unsupervised: without any parallel data or anchor point, learn a mapping from the source to the target space using adversarial training and (iterative) Procrustes refinement.

In [216]:
N_EPOCH = 5
BATCH_SIZE = 32
N_ITERATION = round(len(voc_rock)/BATCH_SIZE)
REFINEMENT = 5

In [217]:
! python MUSE-master/unsupervised.py --src_lang rock --tgt_lang pop --src_emb model_lyrics_rock.vec --tgt_emb model_lyrics_pop.vec --n_epochs $N_EPOCH --epoch_size $N_ITERATION --batch_size $BATCH_SIZE --n_refinement $REFINEMENT

Impossible to import Faiss-GPU. Switching to FAISS-CPU, this will be slower.

INFO - 11/28/18 17:40:46 - 0:00:00 - adversarial: True
                                     batch_size: 32
                                     cuda: False
                                     dico_build: S2T
                                     dico_eval: default
                                     dico_max_rank: 15000
                                     dico_max_size: 0
                                     dico_method: csls_knn_10
                                     dico_min_size: 0
                                     dico_threshold: 0
                                     dis_clip_weights: 0
                                     dis_dropout: 0.0
                                     dis_hid_dim: 2048
                                     dis_input_dropout: 0.1
                                     dis_lambda: 1
                                     dis_layers: 2
                                     dis_most_

INFO - 11/28/18 17:45:07 - 0:04:21 - 000000 - Discriminator loss: 0.4655 - 211 samples/s
INFO - 11/28/18 17:45:28 - 0:04:42 - Building the train dictionary ...
INFO - 11/28/18 17:45:28 - 0:04:42 - New train dictionary of 4733 pairs.
INFO - 11/28/18 17:45:28 - 0:04:42 - Mean cosine (nn method, S2T build, 10000 max size): 0.54008
INFO - 11/28/18 17:46:08 - 0:05:22 - Building the train dictionary ...
INFO - 11/28/18 17:46:08 - 0:05:22 - New train dictionary of 4451 pairs.
INFO - 11/28/18 17:46:08 - 0:05:22 - Mean cosine (csls_knn_10 method, S2T build, 10000 max size): 0.51886
INFO - 11/28/18 17:46:08 - 0:05:22 - __log__:{"n_epoch": 4, "mean_cosine-nn-S2T-10000": 0.5400819420348558, "mean_cosine-csls_knn_10-S2T-10000": 0.5188626367954479}
INFO - 11/28/18 17:46:08 - 0:05:22 - * Best value for "mean_cosine-csls_knn_10-S2T-10000": 0.51886
INFO - 11/28/18 17:46:08 - 0:05:22 - * Saving the mapping to /Users/emma/Cours/Sem_3/Lyrix/MUSE/MUSE-master/dumped/debug/va2dinkpoy/best_mapping.pth ...
INF

INFO - 11/28/18 17:53:49 - 0:13:04 - Map source embeddings to the target space ...
INFO - 11/28/18 17:53:49 - 0:13:04 - Writing source embeddings to /Users/emma/Cours/Sem_3/Lyrix/MUSE/MUSE-master/dumped/debug/va2dinkpoy/vectors-rock.txt ...
INFO - 11/28/18 17:53:55 - 0:13:10 - Writing target embeddings to /Users/emma/Cours/Sem_3/Lyrix/MUSE/MUSE-master/dumped/debug/va2dinkpoy/vectors-pop.txt ...


## Project

In [27]:
import random

In [235]:
rand_idx = random.randint(1,len(corpus_rock))
input_lyrics = corpus_rock[rand_idx]
input_lyrics

'swollen eyes that bleed for you cold steel bars i m watching thru you ve been baptized in a lake of tears crucified yourself with your own fears but you learn from what s killing you and this time it s real beyond your prayers too numb to feel beyond your prayers deepest darkest thoughts you dream curing s harder than it seems slave to no one but your misery broken man lies where you used to be'

In [236]:
words_in_tfidf = []
idx_of_tfidf = []
for w in input_lyrics.split(' '):
    idx = np.where(tfidf_rock==w)[0]
    if len(idx)!= 0:
        words_in_tfidf.append(w)
        idx_of_tfidf.append(idx[0])

In [237]:
ordered_terms = np.array(words_in_tfidf)[np.argsort(idx_of_tfidf)]

In [238]:
WORDS_TO_SWAP = ordered_terms[:10]

In [239]:
WORDS_TO_SWAP

array(['curing', 'swollen', 'fears', 'prayers', 'prayers', 'thoughts',
       'darkest', 'baptized', 'lake', 'crucified'],
      dtype='<U9')

In [218]:
muse_emb_rock = "vectors-rock.txt"
muse_emb_pop = "vectors-pop.txt"

In [219]:
muse_emb_rock, muse_voc_rock = load_embeddings(muse_emb_rock)

In [220]:
muse_emb_pop, muse_voc_pop = load_embeddings(muse_emb_pop)

In [36]:
def get_dict(embed, voc):
    voc_embeds_dict = {}
    embeds_voc_dict = {}

    for v, emb in zip(voc, embed):
        voc_embeds_dict[v] = tuple(emb)
        embeds_voc_dict[tuple(emb)] = v
    return voc_embeds_dict, embeds_voc_dict

In [221]:
voc2embed_rock, embed2voc_rock = get_dict(word_embeddings_rock, vocabulary_rock)
voc2embed_pop, embed2voc_pop = get_dict(word_embeddings_pop, vocabulary_pop)

In [108]:
def see_movement(w):
    print("Movement in rock : ")
    print(sum(np.array(muse_voc2embed_rock[w]) - voc2embed_rock[w]))
    print("Distance in fastext between 2 words : ")
    print(sum(np.array(voc2embed_rock[w])- voc2embed_pop[w]))
    print("Distance in muse between 2 words : ")
    print(sum(np.array(muse_voc2embed_rock[w])- muse_voc2embed_pop[w]))

In [241]:
muse_voc2embed_rock, muse_embed2voc_rock = get_dict(muse_emb_rock, muse_voc_rock) 

In [242]:
muse_voc2embed_pop, muse_embed2voc_pop = get_dict(muse_emb_pop, muse_voc_pop) 

In [229]:
see_movement('music')

Movement in rock : 
8.153664
Distance in fastext between 2 words : 
-11.97545775
Distance in muse between 2 words : 
-6.58084


In [41]:
from sklearn.neighbors import NearestNeighbors

In [230]:
neigh_pop = NearestNeighbors(n_neighbors=3)
neigh_pop.fit(muse_emb_pop)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=3, p=2, radius=1.0)

In [231]:
neigh_rock = NearestNeighbors(n_neighbors=3)
neigh_rock.fit(muse_emb_rock)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=3, p=2, radius=1.0)

In [232]:
def get_nearest_embed(emd, genre):
    if genre == 'pop':
        idx = neigh_pop.kneighbors([emb],return_distance=False)
        return muse_emb_pop[idx][0]
    elif genre == 'rock':
        idx = neigh_rock.kneighbors([emb],return_distance=False)
        return muse_emb_rock[idx]

In [243]:
swap = {}
for w in WORDS_TO_SWAP:
    #Read original embedding
    """idx = vocabulary_rock.index(w)
    emb = word_embeddings_rock[idx]
    print(w, emb)"""
    #Read mapped embedding
    emb = muse_voc2embed_rock[w]
    nearest_pop_emb = get_nearest_embed(emb, 'pop')
    words = []
    for i in range(3):
        words.append(muse_embed2voc_pop[tuple(list(nearest_pop_emb[i]))])
    swap[w] = words
    print(w,' --> ',swap[w])

curing  -->  ['gossiping', 'f**king', 'hawking']
swollen  -->  ['murmuring', 'melbourne', 'whippoorwill']
fears  -->  ['sheltering', 'sheltered', 'enlightened']
prayers  -->  ['saddens', 'downfalls', 'willyes']
prayers  -->  ['saddens', 'downfalls', 'willyes']
thoughts  -->  ['imaginings', 'belongings', 'enlightened']
darkest  -->  ['meltdown', 'rundown', 'breakthrough']
baptized  -->  ['youhung', 'funktafied', 'hellafied']
lake  -->  ['sunk', 'roooll', 'floozy']
crucified  -->  ['funktafied', 'advertised', 'amplified']


In [246]:
#Print new lyrics
words = input_lyrics.split(' ')
for i, w in enumerate(words):
    if w in WORDS_TO_SWAP:
        words[i] = swap[w]

words

[['murmuring', 'melbourne', 'whippoorwill'],
 'eyes',
 'that',
 'bleed',
 'for',
 'you',
 'cold',
 'steel',
 'bars',
 'i',
 'm',
 'watching',
 'thru',
 'you',
 've',
 'been',
 ['youhung', 'funktafied', 'hellafied'],
 'in',
 'a',
 ['sunk', 'roooll', 'floozy'],
 'of',
 'tears',
 ['funktafied', 'advertised', 'amplified'],
 'yourself',
 'with',
 'your',
 'own',
 ['sheltering', 'sheltered', 'enlightened'],
 'but',
 'you',
 'learn',
 'from',
 'what',
 's',
 'killing',
 'you',
 'and',
 'this',
 'time',
 'it',
 's',
 'real',
 'beyond',
 'your',
 ['saddens', 'downfalls', 'willyes'],
 'too',
 'numb',
 'to',
 'feel',
 'beyond',
 'your',
 ['saddens', 'downfalls', 'willyes'],
 'deepest',
 ['meltdown', 'rundown', 'breakthrough'],
 ['imaginings', 'belongings', 'enlightened'],
 'you',
 'dream',
 ['gossiping', 'f**king', 'hawking'],
 's',
 'harder',
 'than',
 'it',
 'seems',
 'slave',
 'to',
 'no',
 'one',
 'but',
 'your',
 'misery',
 'broken',
 'man',
 'lies',
 'where',
 'you',
 'used',
 'to',
 'be']