In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Reload imported modules before each cell runs so edits to local modules are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
import itertools
import math
from collections import defaultdict
from operator import itemgetter

import numpy as np
import pandas as pd
# import seaborn as sns
import torch
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import normalize
from torch import nn
from torch.nn import functional as F
from tqdm.notebook import tqdm

from phoneme_lm import (
    END, PAD, START, PhonemeLM, build_data_loader, build_vocab, encode_pronunciation,
)
from utils import load_data, split_data

In [3]:
# Let wide/long frames display up to 100 rows and 100 columns before truncating.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

## Training the Model ##

In [4]:
# Load the corpus; later cells read its `pronunciation` and `word` columns.
# NOTE(review): the meaning of the positional `True` flag lives in utils.load_data — confirm.
df = load_data(True)
# Displayed as the cell output: number of rows loaded.
len(df)

124996

In [5]:
# Build the two vocabulary mappings from every pronunciation in the corpus.
phoneme_to_idx, idx_to_phoneme = build_vocab(df['pronunciation'].values)
# Encode each pronunciation with that vocabulary (extra positional arg goes through `args`).
df['phoneme_ids'] = df['pronunciation'].apply(encode_pronunciation, args=(phoneme_to_idx,))

In [6]:
# Space-joined form of each pronunciation, used later for exact-match lookups.
df['pronunciation_string'] = df['pronunciation'].apply(lambda phonemes: ' '.join(phonemes))
# Number of phonemes in each pronunciation.
df['length'] = df['pronunciation'].apply(lambda phonemes: len(phonemes))

In [7]:
# Partition the corpus into train/dev/test frames, requesting 20% for dev and 1% for test.
train_df, dev_df, test_df = split_data(df, dev_proportion=.2, test_proportion=.01)

In [8]:
# Row counts of the three partitions, shown as a tuple.
tuple(len(split) for split in (train_df, dev_df, test_df))

(98746, 25000, 1250)

In [9]:
%%time

# Small GRU language model trained for two epochs on the CPU with dropout disabled.
lm = PhonemeLM(
    phoneme_to_idx,
    device='cpu',
    rnn_type='gru',
    embedding_dimension=5,
    rnn_hidden_dimension=20,
    max_epochs=2,
    early_stopping_rounds=2,
    dropout=0,
)

# Last expression: whatever fit() returns is shown as the cell output.
lm.fit(train_df['pronunciation'], dev_df['pronunciation'])

Epoch 1: train loss: 0.8302	assess loss: 1.2645
	 N B <PAD> <PAD> <PAD> <PAD> <PAD> S
	 AY0 <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
	 IY1 ER1 T
	 
	 L
Epoch 2: train loss: 0.7349	assess loss: 1.1323
	 N AH0 D IY0 S N
	 AA1 B M N
	 B AA0 T AH0 N L AE1 IH0 K Z
	 N M AW1 AW0 N N EY0
	 AA1 F AO2 NG
CPU times: user 9min 43s, sys: 30.1 s, total: 10min 13s
Wall time: 1min 47s


([0.8301621675491333, 0.734900176525116],
 [1.264542818069458, 1.1322858333587646])

In [10]:
%%time

# Same small GRU configuration, but with dropout=1.
# NOTE(review): presumably a sanity check of how the model behaves with full dropout — confirm.
lm = PhonemeLM(
    phoneme_to_idx,
    device='cpu',
    rnn_type='gru',
    embedding_dimension=5,
    rnn_hidden_dimension=20,
    max_epochs=2,
    early_stopping_rounds=2,
    dropout=1,
)

# Last expression: whatever fit() returns is shown as the cell output.
lm.fit(train_df['pronunciation'], dev_df['pronunciation'])

Epoch 1: train loss: 3.4306	assess loss: 3.5711
	 AA0 UW1 V K EH1 G R IY1 AO2 EH1
	 UW1 EY2 UW1 Y
	 R <PAD> ER2 AE0 EH1 D T IY2 IH2 AW0
	 IH1 OY2 IH0 UH1 OY0 AW2 S AY1 OW1 AA1
	 OY2 Z Z AO0 V UW2 AH1 V AE0 AA0
Epoch 2: train loss: 2.9008	assess loss: 3.1189
	 AY1 UH1 IH2 UH2 OW1 UH2 EY1 EY1 S
	 UW OY2 AO1 TH <PAD> V <PAD>
	 S AA1 UW0 UH0 IY1 V EY1 UW0
	 IY2 F AY1 AY1 Y HH B UW1 OW0 F
	 UH1 IH1 TH AW0 EH2 IY0
CPU times: user 9min 45s, sys: 27.1 s, total: 10min 12s
Wall time: 1min 48s


([3.4306066036224365, 2.9007530212402344],
 [3.571077346801758, 3.1188955307006836])

In [None]:
%%time

# Same small GRU configuration with dropout disabled and a learning rate of 100.
# NOTE(review): presumably a sanity check of training with an extreme learning rate — confirm.
lm = PhonemeLM(
    phoneme_to_idx,
    device='cpu',
    rnn_type='gru',
    embedding_dimension=5,
    rnn_hidden_dimension=20,
    max_epochs=2,
    early_stopping_rounds=2,
    dropout=0,
    lr=100,
)

# Last expression: whatever fit() returns is shown as the cell output.
lm.fit(train_df['pronunciation'], dev_df['pronunciation'])

Epoch 1: train loss: 373.4124	assess loss: 577.9097
	 R R R R R R R R R R
	 R R R R R R R R R R
	 R R R R R R R R R R
	 R R R R R R R R R R
	 R R R R R R R R R R
Epoch 2; Batch 150 of 386; loss: 450.5679

In [None]:
# Timing note: the 16-model grid search took about 63 minutes, i.e. roughly 4 minutes per model.

In [None]:
%%time

# Grid search over embedding size x hidden size (4 x 4 = 16 GRU models).
param_grid = ParameterGrid({
    'rnn_type': ['gru'],
    'embedding_dimension': [10, 50, 100, 200],
    'rnn_hidden_dimension': [50, 100, 200, 400],
})

# Use the GPU when there is one, otherwise fall back to the CPU so the cell still runs.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# One record per (configuration, epoch) holding that epoch's train/dev loss.
records = []
for params in tqdm(param_grid):
    lm = PhonemeLM(phoneme_to_idx, device=device, batch_size=1024, max_epochs=200, **params)
    print('Model Params:', params)
    # fit() is unpacked as (per-epoch train losses, per-epoch dev losses).
    train_losses, dev_losses = lm.fit(train_df.pronunciation, dev_df.pronunciation, early_stopping_rounds=10)
    for epoch, (train_loss, dev_loss) in enumerate(zip(train_losses, dev_losses), start=1):
        records.append({
            **params,
            'epoch': epoch,
            'train_loss': train_loss,
            'dev_loss': dev_loss,
        })

models_df = pd.DataFrame.from_records(records)

In [None]:
# Distinct embedding sizes present in the grid-search results.
models_df.embedding_dimension.unique()

In [None]:
# Train/dev loss curves for one configuration: embedding_dimension=10, rnn_hidden_dimension=200.
models_df = pd.DataFrame.from_records(records)
t = models_df[(models_df.embedding_dimension == 10) & (models_df.rnn_hidden_dimension == 200)]

# Draw both curves on one explicit axes so they share a legend and labelled axes.
fig, ax = plt.subplots()
t.set_index('epoch')[['train_loss', 'dev_loss']].plot(ax=ax)
ax.set(title='embed dim=10, hidden=200', xlabel='epoch', ylabel='loss');

In [None]:
%%time

# Grid search over embedding size x hidden size (3 x 3 = 9 GRU models).
param_grid = ParameterGrid({
    'rnn_type': ['gru'],
    'embedding_dimension': [10, 100, 400],
    'rnn_hidden_dimension': [50, 200, 400],
})

# Use the GPU when there is one, otherwise fall back to the CPU so the cell still runs.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# One record per (configuration, epoch) holding that epoch's train/dev loss.
records = []
for params in tqdm(param_grid):
    lm = PhonemeLM(phoneme_to_idx, device=device, batch_size=1024, max_epochs=200, **params)
    print('Model Params:', params)
    # fit() is unpacked as (per-epoch train losses, per-epoch dev losses).
    train_losses, dev_losses = lm.fit(train_df.pronunciation, dev_df.pronunciation, early_stopping_rounds=3)
    for epoch, (train_loss, dev_loss) in enumerate(zip(train_losses, dev_losses), start=1):
        records.append({
            **params,
            'epoch': epoch,
            'train_loss': train_loss,
            'dev_loss': dev_loss,
        })

models_df = pd.DataFrame.from_records(records)

In [None]:
# Rebuild the results frame from `records` (same construction the grid-search cell ends with).
models_df = pd.DataFrame.from_records(records)

In [None]:
# Number of (configuration, epoch) rows recorded by the grid search.
len(models_df)

In [None]:
# Results ordered by training loss, lowest first.
models_df.sort_values('train_loss')

In [None]:
# One panel per (embedding_dimension, rnn_hidden_dimension) pair, showing dev and train loss by epoch.
g = models_df.groupby(['embedding_dimension', 'rnn_hidden_dimension'])

columns = 3
rows = int(math.ceil(len(g) / columns))
# plt.subplots takes (nrows, ncols) in that order; squeeze=False keeps `axs` 2-D
# even when there is a single row, so axs[row][column] is always valid.
fig, axs = plt.subplots(rows, columns, figsize=(15, 10), sharey=True, squeeze=False)
for idx, ((embedding_dimension, rnn_hidden_dimension), t) in enumerate(g):
    row, column = divmod(idx, columns)
    ax = axs[row][column]
    by_epoch = t.set_index('epoch')
    by_epoch.dev_loss.plot(ax=ax)
    by_epoch.train_loss.plot(ax=ax)
    ax.set_title(f'embed dim={embedding_dimension}, hidden={rnn_hidden_dimension}')
    ax.legend()

# Hide the panels left over when the number of groups is not a multiple of `columns`.
for unused_ax in axs.flat[len(g):]:
    unused_ax.set_visible(False)

# Lay the figure out once, after every panel has been drawn.
fig.tight_layout()

In [None]:
%%time

# GRU model (embedding size 10, hidden size 20) trained on the CPU for at most 10 epochs.
lm = PhonemeLM(
    phoneme_to_idx,
    device='cpu',
    rnn_type='gru',
    embedding_dimension=10,
    rnn_hidden_dimension=20,
    max_epochs=10,
)

# Last expression: whatever fit() returns is shown as the cell output.
lm.fit(train_df['pronunciation'], dev_df['pronunciation'], early_stopping_rounds=10)

In [None]:
%%time

# Call fit() again on the same `lm`, passing max_epochs and early_stopping_rounds to fit() itself.
# NOTE(review): presumably this continues training from the current weights — confirm in PhonemeLM.fit.
lm.fit(train_df.pronunciation, dev_df.pronunciation, max_epochs=5, early_stopping_rounds=5)

## Probability of Real Words ##

In [None]:
%%time

# Score every pronunciation in the corpus with the current model.
df['probability'] = df['pronunciation'].apply(lm.calculate_probability)

In [None]:
# Order the corpus from most to least probable. Rebinding `df` (instead of inplace=True)
# avoids mutating the frame object that earlier cells created.
df = df.sort_values('probability', ascending=False)
# Histogram of the probabilities; the trailing `;` hides the Axes repr.
df.probability.hist(bins=10);

In [None]:
# Per-pronunciation model scores for the train (`ta`) and dev (`da`) partitions.
ta, da = (
    split['pronunciation'].apply(lm.calculate_probability)
    for split in (train_df, dev_df)
)

In [None]:
# Mean lm.calculate_probability score on train vs. dev pronunciations.
ta.mean(), da.mean()

In [None]:
# Summary statistics of the train-set scores.
ta.describe()

In [None]:
# Summary statistics of the dev-set scores.
da.describe()

In [None]:
# Rows whose pronunciation has exactly three phonemes (`length` is len(pronunciation)).
df[df.length == 3]

### Comparing Multiple Pronunciations ###

In [None]:
# Every listed pronunciation of "with", most probable first.
df.loc[df['word'] == 'with'].sort_values(by='probability', ascending=False)

In [None]:
# Every listed pronunciation of "tomato", most probable first.
df.loc[df['word'] == 'tomato'].sort_values(by='probability', ascending=False)

In [None]:
# Every listed pronunciation of "pajamas", most probable first.
df.loc[df['word'] == 'pajamas'].sort_values(by='probability', ascending=False)

In [None]:
# Score a hand-written phoneme sequence ending in voiceless S (compare with the Z variant).
lm.calculate_probability(['P', 'R', 'IH1', 'F', 'S'])

In [None]:
# Same hand-written sequence, but ending in voiced Z instead of S.
lm.calculate_probability(['P', 'R', 'IH1', 'F', 'Z'])

### Generate Pronunciations ###

In [None]:
# Sample ten pronunciations from the model and report whether each matches a known word.
# NOTE(review): the meaning of the positional 100 and of temperature=None is defined by
# PhonemeLM.generate — confirm.
for _ in range(10):
    pronunciation = lm.generate(100, temperature=None)
    pronunciation_string = ' '.join(pronunciation)
    # Exact match against the space-joined pronunciations in the corpus.
    matches = df.loc[df['pronunciation_string'] == pronunciation_string]

    print(pronunciation_string)
    if not matches.empty:
        # Show the first matching word and how many corpus rows share this pronunciation.
        print('\t', matches['word'].iloc[0], len(matches), 'total')
    print()

In [None]:
# Shapes of two phoneme embeddings. Looked up directly so this cell does not depend on
# `u`/`v`, which are only assigned in a later cell (NameError on Restart & Run All).
lm.embedding_for('UW').shape, lm.embedding_for('V').shape

In [None]:
# Cosine similarity between the embeddings of the phonemes 'UW' and 'V'.
# NOTE(review): sklearn's cosine_similarity takes 2-D inputs, so embedding_for presumably
# returns an array of shape (1, dim) — confirm.
u = lm.embedding_for('UW')
v = lm.embedding_for('V')
cosine_similarity(u, v)

### Play with Embeddings ###

In [None]:


def most_similar_phonemes(lm, embedding, topn=10, exclude=()):
    """Print the phonemes whose embeddings are most cosine-similar to `embedding`.

    Args:
        lm: model object providing `embedding_for(phoneme)`.
        embedding: query vector, in the same form `lm.embedding_for` returns.
        topn: maximum number of phonemes to print.
        exclude: a phoneme, or an iterable of phonemes, to leave out of the ranking
            (for example the phoneme the query embedding was taken from).

    Prints one `[similarity]<TAB>phoneme` line per result, most similar first.
    Candidates are the keys of the notebook-level `phoneme_to_idx` mapping.
    """
    # A bare string would otherwise be split into characters by set().
    excluded = {exclude} if isinstance(exclude, str) else set(exclude)

    # Filter before ranking so that `topn` results are printed whenever enough remain.
    other_to_sim = {
        phoneme: cosine_similarity(lm.embedding_for(phoneme), embedding).item()
        for phoneme in phoneme_to_idx
        if phoneme not in excluded
    }

    ranked = sorted(other_to_sim.items(), key=itemgetter(1), reverse=True)
    for other_phoneme, similarity in ranked[:topn]:
        print(f'[{similarity:.3f}]\t{other_phoneme}')


# Nearest neighbours of 'V', leaving out 'V' itself.
most_similar_phonemes(lm, lm.embedding_for('V'), exclude='V')

In [None]:
# Cluster the phoneme embeddings and list which phonemes land in each cluster.
embeddings = lm.embeddings
# embeddings = lm.embedding.weight.cpu().detach().numpy()
# Rescale each row to unit L2 norm (sklearn's `normalize` default) before clustering.
normed_embeddings = normalize(embeddings)

num_clusters = 15
# Fixed seed so the cluster assignment is the same on every run.
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
kmeans.fit(normed_embeddings)

# cluster label -> set of phonemes.
# NOTE(review): assumes row `idx` of `embeddings` belongs to `lm.vocab[idx]` — confirm.
grouped = defaultdict(set)
for idx, label in enumerate(kmeans.labels_):
    phoneme = lm.vocab[idx]
    grouped[label].add(phoneme)
grouped

In [None]:
# Corpus entry or entries for "fish" (.loc gives a Series for one match, a DataFrame for several).
df.set_index('word').loc['fish']

In [None]:
# For consonants
# - voicing
# - place: bilabial, dental, alveolar, palatal, velar
# - manner: stop, fricative, nasal

# For vowels
# - front/back
# - closed/open
# - rounding

# General
# - syllabic

# TODO: combine multiple vectors together, e.g. {B, G, V, DH, D} for voiced
# Candidate "feature directions", each the difference between two phoneme embeddings.
voicing = lm.embedding_for('B') - lm.embedding_for('P')      # B vs. P
forwarding = lm.embedding_for('P') - lm.embedding_for('K')   # P vs. K
frication = lm.embedding_for('F') - lm.embedding_for('P')    # F vs. P

# Query vector: shift TH along the voicing direction. Other experiments to try:
# new = lm.embedding_for('K') + voicing
# new = lm.embedding_for('T') + frication
# new = lm.embedding_for('K') + forwarding
new = lm.embedding_for('TH') + voicing

# Cosine similarity of every phoneme's embedding to the query vector.
phoneme_to_sim = {}
for phoneme in phoneme_to_idx:
    phoneme_to_sim[phoneme] = cosine_similarity(new, lm.embedding_for(phoneme)).item()

# Five best matches, highest similarity first.
ranked_by_sim = sorted(phoneme_to_sim.items(), key=lambda pair: -pair[1])
ranked_by_sim[:5]

In [None]:
# Estimate the voicing direction from several pairs at once: the mean embedding of a set
# of voiced consonants minus the mean embedding of their voiceless counterparts.
voiced_phonemes = ['B', 'V', 'G', 'Z', 'ZH', 'DH', 'JH']
voiceless_phonemes = ['P', 'F', 'K', 'S', 'SH', 'TH', 'CH']

voiced = np.mean([lm.embedding_for(phoneme) for phoneme in voiced_phonemes], axis=0)
voiceless = np.mean([lm.embedding_for(phoneme) for phoneme in voiceless_phonemes], axis=0)
voicing = voiced - voiceless

# Which phonemes sit closest to S shifted along the voicing direction?
most_similar_phonemes(lm, voicing + lm.embedding_for('S'))

In [None]:
# Phonemes whose embeddings are closest to the `voicing` direction itself.
most_similar_phonemes(lm, voicing)

In [None]:
def analogy(lm, a, b, c):
    """Score candidate phonemes `w` for the analogy "a is to b as c is to w".

    Solves a - b = c - w for w, i.e. w = c - a + b, and scores each candidate by the
    cosine similarity between its embedding and that target vector.

    Args:
        lm: model object providing `embedding_for(phoneme)` and `vocab`.
        a, b, c: the three phonemes of the analogy.

    Returns:
        dict mapping each candidate phoneme to its similarity. The special START, END
        and PAD tokens and the phonemes a, b and c themselves are not candidates.
    """
    emb_a = lm.embedding_for(a)
    emb_b = lm.embedding_for(b)
    emb_c = lm.embedding_for(c)
    # The target does not depend on the candidate, so build it once up front.
    target = emb_c - emb_a + emb_b

    all_phonemes = set(lm.vocab) - {START, END, PAD}
    given = {a, b, c}

    return {
        phoneme: cosine_similarity(lm.embedding_for(phoneme), target).item()
        for phoneme in all_phonemes
        if phoneme not in given
    }

# "P is to B as K is to ?": print every candidate, best match first.
analogies = analogy(lm, 'P', 'B', 'K')
ranked_analogies = sorted(analogies.items(), key=lambda pair: -pair[1])
for phoneme, sim in ranked_analogies:
    print(f'[{sim:.4}] {phoneme}')


In [None]:
# Shape of the embeddings array taken from lm.embeddings.
embeddings.shape

In [None]:
# Project the phoneme embeddings down to three dimensions with t-SNE.
# Fixed seed so the projection is the same on every run.
tsne = TSNE(n_components=3, random_state=0).fit_transform(embeddings)
tsne.shape