## Helper notebook
The goal of this notebook is to provide a pipeline of functions to create all the necessary files to run the demo of the MUSE procedure with the given pair of genres. <br>

In order to use the following code, please follow these steps:
- Make sure you have all the necessary libraries installed so that the import cell works
- Choose your pair of genres from the following list: ['pop', 'rock', 'hiphop', 'metal', 'jazz', 'country']
- Enter the first genre as the 'src' variable
- Enter the second genre as the 'trg' variable
- You can then run all the cells in order: you will have to update the path to the MUSE-master dump repository in the 'analyse_run' function
- Once this is done, you can launch the demo and explore the results

In [1]:
#Important imports
import numpy as np
import pandas as pd
import pickle
#MUSE Part
from sklearn.feature_extraction.text import TfidfVectorizer

import sys
import codecs

In [2]:
#Genre of the lyrics (source genre)
#Pick it from the supported list: 'pop', 'rock', 'hiphop', 'metal', 'jazz', 'country'
src = "pop"

In [3]:
#Genre from which we want the replacement words (target genre)
#Pick it from the supported list: 'pop', 'rock', 'hiphop', 'metal', 'jazz', 'country'
trg = "metal"

In [4]:
#Read the cleaned lyrics dataset and drop, in the same expression, every row whose 'lyrics' value is missing
filepath = "data/lyrics_final_clean.csv"
lyrics_df = pd.read_csv(filepath).dropna(axis=0, how='any', subset=['lyrics'])

In [5]:
#Function to load the embeddings from muse and fasttext
def load_embeddings(file_name):
    """Load word embeddings stored in the textual .vec/.txt format.

    The first line of the file is treated as a header and skipped. Every
    following line is expected to look like ``word v1 v2 ... vd``.

    Parameters
    ----------
    file_name : str
        Path to the UTF-8 encoded embedding file.

    Returns
    -------
    wv : numpy.ndarray
        2-D array with one row per word, even when the file holds a single word.
    vocabulary : tuple of str
        The words, in the same order as the rows of ``wv``.

    Raises
    ------
    ValueError
        If the file does not contain any embedding line.
    """
    vocabulary = []
    vectors = []
    #The built-in open only splits on real newlines, so a word containing an exotic
    #Unicode separator cannot cut a line in two
    with open(file_name, 'r', encoding='utf-8') as f_in:
        next(f_in, None)  #Skip the header line
        for line in f_in:
            parts = line.strip().split(' ', 1)
            #Blank lines (e.g. a trailing newline) or lines without a vector are ignored
            if len(parts) != 2:
                continue
            vocabulary.append(parts[0])
            vectors.append(parts[1])
    if not vocabulary:
        raise ValueError("No embedding found in '{}'".format(file_name))
    #ndmin=2 keeps a (n_words, dim) matrix even when there is only one word
    wv = np.loadtxt(vectors, ndmin=2)
    return wv, tuple(vocabulary)

In [6]:
#Load pre-computed FastText embeddings from the 2 relevant genres
#Each call returns the embedding matrix and the matching vocabulary;
#the files are looked up as data/MUSE/model_lyrics_<genre>.vec
word_embeddings_src, vocabulary_src = load_embeddings('data/MUSE/model_lyrics_'+src+'.vec')
word_embeddings_trg, vocabulary_trg = load_embeddings('data/MUSE/model_lyrics_'+trg+'.vec')

In [7]:
#Extract list of vocabularies to compute specific vocabulary
#(load_embeddings hands the vocabulary back as a tuple, hence the conversion to a list)
voc_src = list(vocabulary_src)
voc_trg = list(vocabulary_trg)

In [8]:
#Words that belong to both the source and the target vocabulary, kept as a list
common_words = list(set(voc_src) & set(voc_trg))

In [9]:
#Source-specific words: the source vocabulary minus the words shared with the target genre
src_specific_from_trg = set(voc_src) - set(common_words)

In [90]:
#Save the specific vocabulary (a pickled set; the target file keeps its .py extension)
#The context manager makes sure the file is flushed and closed as soon as the dump is done
with open("webpage/"+src+"/specific_from_"+trg+".py", "wb") as f_out:
    pickle.dump(src_specific_from_trg, f_out)

In [10]:
#Build the corpus of the source genre for the TF-IDF ranking: one string per row of the dataset
#(the .txt files cannot be used here: they hold all the songs concatenated, while we need them separated)
#Line breaks inside each lyrics are turned into spaces
corpus_src = [
    song_lyrics.replace('\n', ' ')
    for song_lyrics in lyrics_df.loc[lyrics_df['genre'] == src, 'lyrics']
]

In [11]:
#Function giving the TF-IDF ranking of the vocabulary of one genre
def get_tfidf(corpus, max_freq, min_freq):
    """Rank the vocabulary of a corpus by its highest TF-IDF weight.

    Parameters
    ----------
    corpus : iterable of str
        The documents (here, one string per song).
    max_freq : float or int
        Forwarded to ``max_df`` of the TfidfVectorizer.
    min_freq : float or int
        Forwarded to ``min_df`` of the TfidfVectorizer.

    Returns
    -------
    numpy.ndarray of str
        The feature names, sorted from the smallest to the biggest maximum
        TF-IDF weight observed over the whole corpus.
    """
    #We use the stop_words parameter to remove the common English words from the computations
    tf = TfidfVectorizer(stop_words='english', max_df=max_freq, min_df=min_freq)
    tfidf_matrix = tf.fit_transform(corpus)
    #Find maximum value for each of the features over all of dataset
    max_val = tfidf_matrix.max(axis=0).toarray().ravel()

    #get_feature_names() was removed in scikit-learn 1.2; fall back to it only on old versions
    if hasattr(tf, 'get_feature_names_out'):
        names = tf.get_feature_names_out()
    else:
        names = tf.get_feature_names()
    #Rebuilding the array from a list keeps the unicode dtype the old API used to give
    feature_names = np.array(list(names))
    #sort weights from smallest to biggest and extract their indices
    sort_by_tfidf = max_val.argsort()
    return feature_names[sort_by_tfidf]

In [12]:
#Compute the ordering (parameters can limit the max and min frequency)
#1.0 and 0.0 are passed as max_freq and min_freq, which get_tfidf forwards to max_df and min_df
#NOTE(review): these values presumably disable any frequency filtering - confirm against the scikit-learn documentation
tfidf_src = get_tfidf(corpus_src, 1.0, 0.0)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [93]:
#Save the tf-idf file (should not depend on target genre, so you might already have it)
#The ranking is pickled (the file keeps its .py extension); the context manager closes the file after the dump
with open("webpage/"+src+"/tfidf.py", "wb") as f_out:
    pickle.dump(tfidf_src, f_out)

In [13]:
#Set all parameters to compute the MUSE embedding --> The important and time-consuming part
#These constants are interpolated by name in the shell command that launches unsupervised.py
N_EPOCH = 50 #Given to --n_epochs
BATCH_SIZE = 32 #Given to --batch_size
N_ITERATION = round(len(voc_src)/BATCH_SIZE) #Given to --epoch_size: size of the source vocabulary divided by the batch size, rounded
REFINEMENT = 50 #Refinement is for the Procrustes Iterations, involved in the rotation of the embeddings
INPUT_GENRE = src
OUTPUT_GENRE = trg
#NOTE: despite their names, MODEL_ROCK and MODEL_POP are not tied to rock or pop:
#they hold the embedding files of the source genre and of the target genre respectively
MODEL_ROCK = 'data/MUSE/model_lyrics_'+ INPUT_GENRE +'.vec'
MODEL_POP = 'data/MUSE/model_lyrics_'+ OUTPUT_GENRE+'.vec'
#Language names given to MUSE (--src_lang / --tgt_lang); they are reused later to build the names of the vectors-<lang>.txt files
SRC_LANG = 'MUSE_'+ INPUT_GENRE
TGT_LANG = 'MUSE_'+ OUTPUT_GENRE

In the following cell, you will need to change the path to the data/MUSE/MUSE-master/dumped/debug repository.

In [14]:
#We will use the following method to spare us the very long output and directly extract the result
def analyse_run(data):
    """Summarise the log of a MUSE unsupervised run and return the name of its dump folder.

    The dump name is the last component of the ``exp_path`` written in the log,
    so the function works wherever the MUSE-master folder is located.

    Parameters
    ----------
    data : iterable of str
        The output lines of ``unsupervised.py``.

    Returns
    -------
    str
        Name of the folder of the run inside MUSE-master/dumped/debug.

    Raises
    ------
    ValueError
        If no usable ``exp_path`` entry is found in the log.
    """
    log = ' '.join(data)

    path_marker = 'exp_path: '
    if path_marker not in log:
        raise ValueError("No 'exp_path' entry found in the MUSE log: did the run start correctly?")
    exp_path = log.split(path_marker)[1].split(' ')[0]
    #Keep only the last path component, whatever the absolute location (and separator) of the repository
    dump = exp_path.replace('\\', '/').rstrip('/').split('/')[-1]
    if not dump:
        raise ValueError("The 'exp_path' entry of the MUSE log is empty")

    #The summary is informative only: a missing metric must not prevent returning the dump name
    metric_marker = '* Best value for "mean_cosine-csls_knn_10-S2T-10000": '
    best_mean_cosine = 'unknown'
    iteration = 'unknown'
    if metric_marker in log:
        #The last occurrence of the marker carries the overall best value
        substring = log.split(metric_marker)[-1]
        best_mean_cosine = substring.split(' INFO')[0]
        if 'End of ' in substring:
            iteration = substring.split('End of ')[1].split('. ')[0]
    print("For dump ", dump, " the best mean cosine was ", best_mean_cosine, " reached at ", iteration)
    return dump

In [95]:
#Time consuming cell
#Launches the unsupervised MUSE script through the shell with the parameters defined above;
#the output lines of the command are captured in 'data' instead of being displayed
data = ! python data/MUSE/MUSE-master/unsupervised.py --src_lang $SRC_LANG --tgt_lang $TGT_LANG --src_emb $MODEL_ROCK --tgt_emb $MODEL_POP --n_epochs $N_EPOCH --epoch_size $N_ITERATION --batch_size $BATCH_SIZE --n_refinement $REFINEMENT
#analyse_run prints a one-line summary of the run and returns the name of its dump folder
dump = analyse_run(data)

For dump  vlrd6t95c2  the best mean cosine was  0.59400  reached at  refinement iteration 29


In [26]:
#Get the files containing the MUSE embeddings previously computed
#They are named vectors-<language>.txt and are looked up in the dump folder of the run
src_muse_emb = "data/MUSE/MUSE-master/dumped/debug/"+ dump + "/vectors-"+ SRC_LANG + ".txt"
trg_muse_emb = "data/MUSE/MUSE-master/dumped/debug/"+ dump + "/vectors-"+ TGT_LANG + ".txt"

In [27]:
#Load the embeddings
#Each call returns the embedding matrix followed by the vocabulary, for the source and the target genre
muse_emb_src, muse_voc_src = load_embeddings(src_muse_emb)
muse_emb_trg, muse_voc_trg = load_embeddings(trg_muse_emb)

In [None]:
#Save the target embeddings (pickled; the file keeps its .py extension)
#The context manager makes sure the file is flushed and closed as soon as the dump is done
with open("webpage/"+trg+"/muse_emb_with_"+src+".py", "wb") as f_out:
    pickle.dump(muse_emb_trg, f_out)

In [28]:
#Function needed to map the embeddings to the vocabulary in a nicer way
def get_dict(embed, voc):
    """Build the two lookup tables linking words and embeddings.

    Parameters
    ----------
    embed : iterable
        The embedding vectors, in the same order as ``voc``.
    voc : iterable
        The words.

    Returns
    -------
    voc_embeds_dict : dict
        Maps each word to its embedding, stored as a tuple.
    embeds_voc_dict : dict
        Maps each embedding tuple back to its word.

    When a word (or an embedding) appears several times, the last occurrence wins.
    """
    #Turn every vector into a (hashable) tuple once, then derive both dictionaries from the same pairs
    pairs = [(word, tuple(vector)) for word, vector in zip(voc, embed)]
    voc_embeds_dict = dict(pairs)
    embeds_voc_dict = {vector: word for word, vector in pairs}
    return voc_embeds_dict, embeds_voc_dict

In [29]:
#Compute the necessary dictionaries
#Only the word -> embedding mapping is kept for the source genre,
#and only the embedding -> word mapping for the target genre
muse_voc2embed_src, _ = get_dict(muse_emb_src, muse_voc_src) 
_, muse_embed2voc_trg = get_dict(muse_emb_trg, muse_voc_trg) 

In [101]:
#Save the dictionaries (pickled; the files keep their .py extension)
#Each file is opened in a context manager so that it is flushed and closed as soon as its dump is done
with open("webpage/"+src+"/muse_voc2embed_with_"+trg+".py", "wb") as f_out:
    pickle.dump(muse_voc2embed_src, f_out)
with open("webpage/"+trg+"/muse_embed2voc_with_"+src+".py", "wb") as f_out:
    pickle.dump(muse_embed2voc_trg, f_out)