# ML Project: chord embedding

In [1]:
# Notebook setup: show matplotlib figures inline and load the autoreload
# extension so that edits to imported modules are picked up on the next cell run.
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Load the data
When loading the key dataset, we can choose whether to drop one-word sentences.  
When loading the chord dataset, we can choose whether to keep sections in major or minor key, or both.

In [2]:
from load_data import load_chord_data, load_key_data, all_composers

In [3]:
# Load the key dataset for `all_composers`.
# Optional parameter for load_key_data: drop_one_worders = True/False
# NOTE(review): the result names say "bach" although `all_composers` is passed in —
# confirm which composers are actually loaded.
bach_key = load_key_data(all_composers)

# Load the chord dataset, keeping sections in both major and minor keys.
# Optional parameter for load_chord_data: key_mode = 'both'/'major'/'minor'
bach_chord_both = load_chord_data(all_composers, key_mode='both')

## Apply Word2Vec

In [4]:
from gensim.models import Word2Vec

In [5]:
# Word2Vec hyper-parameters (kept as notebook-level variables).
# Ignore words with a lower frequency than this
min_count = 10
# Size of the embedding space
# NOTE(review): `size` is the pre-4.0 gensim keyword (gensim 4.x calls it
# `vector_size`) — confirm the gensim version this notebook is pinned to.
size = 20 
# Neighborhood of the focus word to study
window = 2
# 0 for CBOW, 1 for skip-gram
sg = 0 

# Train the embedding on the chord dataset loaded earlier.
# The first argument has to be a list of lists of words
model_bach_both = Word2Vec(bach_chord_both, min_count=min_count, size=size, window=window, sg=sg)

## Reduce dimensionality and visualise

### Reduce dimensionality: PCA

In [6]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from reduce_dim import reduce_dim_keyed_vec

In [7]:
# Two-component reducers. Only `pca` is used in this cell; `tsne` is created
# here but not passed to anything below.
pca = PCA(n_components=2)
tsne = TSNE(n_components=2)
# Hand the model's keyed vectors and PCA's fit_transform to the project helper.
wv_red = reduce_dim_keyed_vec(model_bach_both.wv, pca.fit_transform)

### Visualise

In [8]:
from visual import visual_reduced_chord_vectors

In [None]:
# Plot the reduced vectors; the method label 'PCA' matches the reducer used to build `wv_red`.
visual_reduced_chord_vectors(wv_red, dimred_method='PCA', plot_title='W2V. Bach: both keys')

## Investigate similarities

In [None]:
# Similarity score between two chord tokens in the trained embedding.
model_bach_both.wv.similarity('MINOR;I:MIN', 'MINOR;V:DIM')

In [None]:
# The 10 tokens most similar to 'MINOR;I:MIN'.
model_bach_both.wv.most_similar('MINOR;I:MIN', topn = 10)

In [None]:
# The 100 tokens most similar to 'MAJOR;I:MIN'.
# NOTE(review): the query token pairs 'MAJOR' with 'I:MIN' — confirm this is the
# intended token and that it survives the min_count filter used for training.
model_bach_both.wv.most_similar('MAJOR;I:MIN', topn = 100)

In [29]:
def all_in_one(composers, key_mode='both', min_count=10, size=20, window=2, sg=0,
               dimred_method='PCA', plot_title='W2V. Bach: both keys', topn=4,
               draw_grath=True, print_similarities=True):
    """Load chord data, train Word2Vec on it, then optionally plot and print neighbours.

    Parameters
    ----------
    composers
        Forwarded unchanged to ``load_chord_data``.
    key_mode : str
        'both', 'major' or 'minor'; forwarded to ``load_chord_data``.
    min_count, size, window, sg
        Forwarded as the same-named keyword arguments to ``Word2Vec``.
    dimred_method : str
        'PCA' or 'TSNE'. Selects the two-component reducer whose
        ``fit_transform`` is handed to ``reduce_dim_keyed_vec``.
    plot_title : str
        Forwarded to ``visual_reduced_chord_vectors``.
    topn : int
        Number of neighbours printed per chord.
    draw_grath : bool
        When true, plot the reduced vectors. (The spelling is kept as-is so
        existing keyword callers keep working.)
    print_similarities : bool
        When true, print one line per vocabulary entry, in sorted order, listing
        its ``topn`` most similar entries.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a plot is requested and ``dimred_method`` is neither 'PCA' nor 'TSNE'.
    """
    # Fail before loading/training: previously an unknown method silently left
    # the reduced vectors as None and passed that to the plotting helper.
    if draw_grath and dimred_method not in ('PCA', 'TSNE'):
        raise ValueError(
            f"Unsupported dimred_method {dimred_method!r}; expected 'PCA' or 'TSNE'."
        )

    # key_mode is passed by keyword, matching the other load_chord_data call in this notebook.
    chords = load_chord_data(composers, key_mode=key_mode)

    # The first argument has to be a list of lists of words
    model = Word2Vec(chords, min_count=min_count, size=size, window=window, sg=sg)

    # Build only the reducer that was actually requested.
    if dimred_method == 'PCA':
        reducer = PCA(n_components=2)
    elif dimred_method == 'TSNE':
        reducer = TSNE(n_components=2)
    else:
        reducer = None  # only reachable when no plot was requested

    # NOTE(review): the reduced vectors are only consumed by the plot below; if
    # reduce_dim_keyed_vec has no side effects this call could be skipped when
    # draw_grath is False — confirm before changing.
    wv_red = None
    if reducer is not None:
        wv_red = reduce_dim_keyed_vec(model.wv, reducer.fit_transform)

    if draw_grath:
        visual_reduced_chord_vectors(wv_red, dimred_method=dimred_method, plot_title=plot_title)

    if print_similarities:
        for chord in sorted(model.wv.vocab.keys()):
            # Each entry keeps its trailing comma so the printed line is unchanged.
            neighbours = ''.join(
                f' ({neighbour}, {similarity:.3f}),'
                for neighbour, similarity in model.wv.most_similar(chord, topn=topn)
            )
            print(chord + ':' + neighbours)
