In [1]:
%matplotlib inline

%load_ext autoreload
%autoreload 2

In [2]:
import itertools
import math
from collections import defaultdict
from operator import itemgetter

import numpy as np
import pandas as pd
import torch
# import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import normalize
from torch import nn
from torch.nn import functional as F
from tqdm.notebook import tqdm

from phoneme_lm import PhonemeLM, build_data_loader, build_vocab, encode_pronunciation
from phoneme_lm import END, PAD, START
from utils import load_data, split_data

In [3]:
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

## Loading the Data ##

In [4]:
df = load_data(True)
len(df)

124996

In [5]:
# Build the vocabulary from every pronunciation in the corpus, then encode each
# pronunciation with it (encode_pronunciation receives the vocabulary as its second argument).
phoneme_to_idx, idx_to_phoneme = build_vocab(df.pronunciation.values)
df['phoneme_ids'] = df.pronunciation.apply(encode_pronunciation, args=(phoneme_to_idx,))

In [6]:
df['pronunciation_string'] = df.pronunciation.apply(' '.join)
df['length'] = df.pronunciation.apply(len)

In [7]:
train_df, dev_df, test_df = split_data(df, dev_proportion=.2, test_proportion=.01)
len(train_df), len(dev_df), len(test_df)

(98746, 25000, 1250)

## Training the Model ##

In [17]:
# %%time

# lm = PhonemeLM(
#     phoneme_to_idx, device='cuda', rnn_type='gru',
#     embedding_dimension=20, hidden_dimension=100, num_layers=3,
#     max_epochs=2000, early_stopping_rounds=3,
#     lr=1e-3, batch_size=1024
# )


# train_loss, dev_loss = lm.fit(train_df.pronunciation.values.tolist(), dev_df.pronunciation.values.tolist())

In [16]:
%%time

# GRU language model trained with L2 regularisation; all settings are passed by keyword.
lm = PhonemeLM(
    phoneme_to_idx,
    # architecture
    rnn_type='gru',
    num_layers=3,
    embedding_dimension=20,
    hidden_dimension=100,
    # optimisation
    lr=1e-3,
    l2_strength=1e-4,
    batch_size=1024,
    max_epochs=2000,
    early_stopping_rounds=3,
    # hardware
    device='cuda',
)

# fit() is given plain Python lists rather than pandas Series.
train_loss, dev_loss = lm.fit(
    train_df.pronunciation.values.tolist(),
    dev_df.pronunciation.values.tolist(),
)

Epoch 1: train loss: 0.7981	assess loss: 1.2296
	Generated: in train: 2%, assess: 0%, novel: 98%
	 AA0 R IH1 EY0 N T N EY1
	 UH2 EY1 HH ER1 D
	 IH2 IY1 EH0 OW2 TH
	 JH AE2 TH N AH0 D N AH0 EY1
	 AE1 M AW1 AH0 B
Epoch 2: train loss: 0.7184	assess loss: 1.1110
	Generated: in train: 2%, assess: 0%, novel: 98%
	 AA0 D K AH0 ER0
	 K ER0 F K AH0 L L IY0 K
	 EY2 T AE1 Z IH0 T AH0
	 AA1 K R M S
	 L JH IH0 N
Epoch 3: train loss: 0.6996	assess loss: 1.0826
	Generated: in train: 5%, assess: 0%, novel: 95%
	 N K AH1 S AH0 Z
	 N ER1 N ER1 AH0 V N AH0 AO0
	 S IH0 M IY0 AA2 R L G
	 IY1 M D Z Z IY0
	 AH1 G R EY1 S
Epoch 4: train loss: 0.6702	assess loss: 1.0372
	Generated: in train: 2%, assess: 1%, novel: 97%
	 S T ER1 R ER0
	 P R AH0 AA1 Z G Z
	 K IH1 OW1 D ER0 AH0 R
	 K S CH L
	 NG EH1 Y S T N IH0 T L
Epoch 5: train loss: 0.6359	assess loss: 0.9843
	Generated: in train: 7%, assess: 1%, novel: 92%
	 R AE0 B AO1 B AH0 N S T AH0 K
	 K AA0 NG AH0 D EH2 L T D T IY0
	 M AO1 ER1 AH1 EY2
	 N IH1 K IH0 S AH0

Epoch 42: train loss: 0.5332	assess loss: 0.8283
	Generated: in train: 16%, assess: 1%, novel: 83%
	 HH AE1 M AH0 K AA1 N
	 B OW1 L D Z
	 B EH1 L
	 P IH1 L AH0
	 T AY2 P AE1 S T AH0 K S
Epoch 43: train loss: 0.5320	assess loss: 0.8263
	Generated: in train: 12%, assess: 3%, novel: 85%
	 V IH0 L AA1 N Z
	 OW1 M AA0 F
	 P AH0 M EH1 L
	 N AA1 L AH0 N S AH2 N
	 EH1 N W IY0 M
Epoch 44: train loss: 0.5311	assess loss: 0.8251
	Generated: in train: 9%, assess: 0%, novel: 91%
	 B R IY1 NG M AH0 N
	 OW1 R Z EH2 D
	 AA1 R W IH2 T IH0 K
	 B R AA1 DH IY0
	 M OW1 T R ER2 K ER0
Epoch 45: train loss: 0.5307	assess loss: 0.8245
	Generated: in train: 15%, assess: 2%, novel: 83%
	 EH1 G L IY0
	 AA2 M AH0 M OW1 N IY0 AH0 N
	 K R AO1 Z
	 B OY1 T AH0 N
	 K AE1 S IY1
Epoch 46: train loss: 0.5303	assess loss: 0.8241
	Generated: in train: 13%, assess: 2%, novel: 85%
	 R IH1 S T R IH0 S T
	 M IH1 NG K AH0 L
	 P AA1 R B AH0 L OW0
	 G R EY1 B
	 G R AO1 S K AE2 T ER0 Z
Epoch 47: train loss: 0.5317	assess loss: 0.82

Epoch 84: train loss: 0.5232	assess loss: 0.8144
	Generated: in train: 13%, assess: 0%, novel: 87%
	 AH1 N D AH0 L EY2 T IH0 NG
	 D ER1 K
	 D IY0 B AY1 T
	 Y IY1 B AH0 L
	 AH0 N B EH1 T S
Epoch 85: train loss: 0.5229	assess loss: 0.8139
	Generated: in train: 9%, assess: 3%, novel: 88%
	 JH OW1 P IH0 D
	 HH AW2 M AH0 R EH1 L AH0 D EY2 T AH0 D
	 HH AE1 B AH0 L ER0
	 B AW1 AH0 N T IH0 M
	 V EH1 L OW0
Epoch 86: train loss: 0.5236	assess loss: 0.8147
	Generated: in train: 16%, assess: 2%, novel: 82%
	 HH EH1 R IY0 Z
	 S AH0 N T EY0 S EH1 M
	 T AA1 R JH IY0
	 AE1 M AY2 AH0 JH IH0 S
	 T AH1 F
Epoch 87: train loss: 0.5226	assess loss: 0.8134
	Generated: in train: 14%, assess: 3%, novel: 83%
	 HH AO1 R JH S
	 L EY1 P IH0 NG
	 D R AA2 R G R Y UW1
	 F ER1 Z
	 M AE1 T ER0
Epoch 88: train loss: 0.5221	assess loss: 0.8126
	Generated: in train: 17%, assess: 2%, novel: 81%
	 S AH1 R D
	 M AH0 K N OW1 T IH0 D
	 B R AY1 ER0
	 S EH1 L
	 K AO1 R M AH0 N
Epoch 89: train loss: 0.5225	assess loss: 0.8133
	Ge

Epoch 125: train loss: 0.5185	assess loss: 0.8079
	Generated: in train: 18%, assess: 3%, novel: 79%
	 K AA1 W IY0
	 S AE1 M
	 F R AE1 K AH0 L
	 W EH1 N S IH0 Z
	 S T EH1 L CH ER0 Z
Epoch 126: train loss: 0.5190	assess loss: 0.8087
	Generated: in train: 14%, assess: 1%, novel: 85%
	 F AA1 R D M B OW2 T
	 JH EH1 P T AH0 N
	 L IH1 K Y AH0 N Z
	 V EH1 CH IH0 B OW2 AH0
	 R EH1 M B K IH2 T
Epoch 127: train loss: 0.5196	assess loss: 0.8096
	Generated: in train: 11%, assess: 3%, novel: 86%
	 B AA1 L ER0
	 B ER1 ER0 IY0
	 D IH0 M B L UW1 SH AO0
	 OW2 G OW0 N T EH2 R IH0 T AH0 G AH1 N JH AH0 L Z
	 P R IH0 K AH1 N S IY0
Epoch 128: train loss: 0.5190	assess loss: 0.8087
	Generated: in train: 18%, assess: 2%, novel: 80%
	 K EH1 R IY0
	 SH EH2 S T W AH0 Z IH1 K F AH0 L IH0 S
	 R IY0 P EH0 K Y UW1 N AH0 L IY0
	 AE2 S K AA0 R Z AE1 L IH0 JH IH0 S T
	 G OW1 L Y UW2 Z
Epoch 129: train loss: 0.5186	assess loss: 0.8080
	Generated: in train: 13%, assess: 1%, novel: 86%
	 G ER1 Z AH0 B IH0 M Z
	 AY1 D L AY2

Epoch 166: train loss: 0.5168	assess loss: 0.8058
	Generated: in train: 12%, assess: 5%, novel: 83%
	 S AE1 P S
	 AE1 L S IH0 N B ER0 G ER0
	 T ER1 AH0
	 K IY1 Z AE2 N
	 P R IY2 Z IY0 Y AH0 S EH1 L ER0 Z
Epoch 167: train loss: 0.5168	assess loss: 0.8059
	Generated: in train: 15%, assess: 3%, novel: 82%
	 K AA0 M EH1 D AH0
	 B UW1 Z
	 D IH1 K AH0 B ER2 CH D
	 AH0 L EH1 R IY0
	 IH0 N G R AH1 S IH0 L Z
Epoch 168: train loss: 0.5164	assess loss: 0.8051
	Generated: in train: 14%, assess: 1%, novel: 85%
	 N UH1 N F ER2 ER0
	 P R AA1 M AH0 L
	 AE0 K T AE1 M IH0 K S
	 M AH0 S IY1 T M AH0 N T
	 Y AA2 R IY0 N UW1 T OW0
Epoch 169: train loss: 0.5166	assess loss: 0.8054
	Generated: in train: 16%, assess: 1%, novel: 83%
	 CH EY1 N
	 B OW1 N IY0
	 P AA1 M AH0 N AY0 Z IH0 NG
	 AH0 N B AO1 R M IH0 NG
	 P IH1 Z AH0 L D Z
Epoch 170: train loss: 0.5162	assess loss: 0.8048
	Generated: in train: 11%, assess: 2%, novel: 87%
	 HH AE1 D ER0 D
	 K L AW1 S T IY0
	 D IH2 S K W IY0 ZH OW1 N OW0
	 K AE1 L AH0 N AH

Epoch 207: train loss: 0.5154	assess loss: 0.8038
	Generated: in train: 9%, assess: 0%, novel: 91%
	 K AA2 CH IY0 AA1 SH AH0
	 S UH0 R P AA1 L OW0
	 IH0 L IH1 T AH0 B AH0 L
	 K Y OW1 N Y AH0
	 L AE1 S K AH0 N D TH
Epoch 208: train loss: 0.5152	assess loss: 0.8036
	Generated: in train: 11%, assess: 1%, novel: 88%
	 W OW1 K AH0 M
	 EY1 G HH EY2 L D Z
	 B EH1 N AH0 G EH2 S T S
	 F L IY0 AA1 T AH0 K
	 P ER1 K L AW2 IH0 NG
Epoch 209: train loss: 0.5152	assess loss: 0.8036
	Generated: in train: 11%, assess: 1%, novel: 88%
	 L AA0 N T AA1 N D
	 Y UW2 L AH0 Z EH1 L
	 HH OW1 K IY0
	 SH OW1 K ER0
	 F EY1 T S
Epoch 210: train loss: 0.5165	assess loss: 0.8058
	Generated: in train: 18%, assess: 2%, novel: 80%
	 M EY1 DH D IH0 NG
	 T EY1 D IH0 NG
	 P IH0 K AE1 N T IH0 NG
	 AE1 M AH0 L ER0
	 K ER0 M AA1 N IH0 S T
Epoch 211: train loss: 0.5156	assess loss: 0.8040
	Generated: in train: 15%, assess: 2%, novel: 83%
	 L EY1 D ER0 L IY0
	 V ZH IY1 D AH0 N
	 K AH1 D ER0 Z
	 G R EY1 N
	 G AH0 L OW1
Epoch 212

Early stopping because of no decrease in 3 epochs.


In [None]:
# TODO: try to fit a model whose train and validation losses both reach about 0.7

In [None]:
# models_df.sort_values('train_loss')

In [None]:
# models_df.sort_values('dev_loss')

In [None]:
# Plot train/dev loss curves, one panel per (batch_size, lr) combination.
# NOTE(review): this needs a `models_df` with `batch_size` and `lr` columns; the grid-search
# cell visible in this notebook varies other hyperparameters — confirm which run this refers to.
loss_groups = models_df.groupby(['batch_size', 'lr'])

columns = 3
rows = int(math.ceil(len(loss_groups) / columns))
# squeeze=False keeps `axs` two-dimensional even with a single row, so axs[row][column] always works.
fig, axs = plt.subplots(rows, columns, figsize=(20, 10), sharey=True, squeeze=False)
for idx, ((batch_size, lr), group_df) in enumerate(loss_groups):
    row, column = divmod(idx, columns)
    ax = axs[row][column]
    curves = group_df.set_index('epoch')
    curves.dev_loss.plot(ax=ax)
    curves.train_loss.plot(ax=ax)
    ax.set_title(f'batch_size={batch_size}, lr={lr}')
    # Set the scale on this panel explicitly instead of on pyplot's implicit "current" axes.
    ax.set_yscale('log')
    ax.legend()
# Lay the figure out once, after every panel has been drawn.
fig.tight_layout()

In [None]:
# Show which batch sizes and learning rates occur in the results.
for column_name in ('batch_size', 'lr'):
    print(models_df[column_name].unique())

In [None]:
models_df[models_df.batch_size==16384].sort_values('train_loss')

In [None]:
def plot(df, batch_size, lr):
    """Plot train and dev loss per epoch for one (batch_size, lr) setting.

    Parameters
    ----------
    df : pandas.DataFrame
        Results frame with `batch_size`, `lr`, `epoch`, `train_loss` and `dev_loss` columns.
    batch_size, lr
        Values used to select the rows to plot.

    Both curves are drawn on pyplot's current axes; nothing is returned.
    """
    # Filter the frame that was passed in (the original ignored `df` and read the global models_df).
    curves = df[(df.batch_size == batch_size) & (df.lr == lr)].set_index('epoch')
    curves.train_loss.plot()
    curves.dev_loss.plot()


# Pass the results frame explicitly; the corpus frame `df` has no loss columns to plot.
plot(models_df, 1024, .1)

In [None]:
# Timing note: 63 minutes for 16 models, i.e. about 4 minutes per model.

In [None]:
%%time

# Grid search over embedding size, hidden size and depth. Each parameter combination
# trains one model, and every epoch of that run becomes one row of `models_df`.
param_grid = ParameterGrid({
    'embedding_dimension': [20, 50, 100, 200],
    'rnn_hidden_dimension': [50, 100, 200, 400],
    'num_layers': [1, 2, 3],
})

records = []
for params in tqdm(param_grid):
    lm = PhonemeLM(
        phoneme_to_idx,
        rnn_type='gru',
        device='cuda',
        batch_size=1024,
        max_epochs=2000,
        early_stopping_rounds=3,
        **params
    )
    print('Model Params:', params)
    train_losses, dev_losses = lm.fit(
        train_df.pronunciation.values.tolist(),
        dev_df.pronunciation.values.tolist(),
    )
    # Epochs are numbered from 1; the hyperparameters are repeated on every row.
    for epoch, train_loss, dev_loss in zip(itertools.count(1), train_losses, dev_losses):
        record = dict(params, epoch=epoch, train_loss=train_loss, dev_loss=dev_loss)
        records.append(record)

models_df = pd.DataFrame.from_records(records)

In [None]:
models_df.embedding_dimension.unique()

In [None]:
# Rebuild the results frame and plot the loss curves of a single configuration.
models_df = pd.DataFrame.from_records(records)
# NOTE(review): embedding_dimension == 10 is not among the values in the grid-search cell
# visible in this notebook, so this selection may be empty — confirm the intended value.
curves = (
    models_df[(models_df.embedding_dimension == 10) & (models_df.rnn_hidden_dimension == 200)]
    .set_index('epoch')
)
# One call draws both series on shared axes with a legend.
curves[['train_loss', 'dev_loss']].plot()

In [None]:
# %%time

# param_grid = ParameterGrid({
#     'rnn_type': ['gru'],
#     'embedding_dimension': [10, 100, 400],
#     'rnn_hidden_dimension': [50, 200, 400],
# })

# records = []
# for params in tqdm(param_grid):
#     lm = PhonemeLM(phoneme_to_idx, device='cuda', batch_size=1024,  max_epochs=200, **params)
#     print('Model Params:', params)
#     train_losses, dev_losses = lm.fit(train_df.pronunciation, dev_df.pronunciation, early_stopping_rounds=3)
#     for epoch, (train_loss, dev_loss) in enumerate(zip(train_losses, dev_losses), start=1):
#         record = params.copy()
#         record['epoch'] = epoch
#         record['train_loss'] = train_loss
#         record['dev_loss'] = dev_loss
    
#         records.append(record)

# models_df = pd.DataFrame.from_records(records)

In [None]:
models_df = pd.DataFrame.from_records(records)

In [None]:
len(models_df)

In [None]:
models_df.sort_values('train_loss')

In [None]:
%%time

# Small CPU-only model trained for at most 10 epochs.
# NOTE(review): this cell passes `rnn_hidden_dimension` to the constructor and
# `early_stopping_rounds` to fit(), whereas the executed training cell earlier in the notebook
# passes `hidden_dimension` and gives `early_stopping_rounds` to the constructor — confirm
# which signature PhonemeLM currently has.
lm = PhonemeLM(
    phoneme_to_idx, device='cpu',
    rnn_type='gru', embedding_dimension=10, rnn_hidden_dimension=20,
    max_epochs=10
)

lm.fit(train_df.pronunciation, dev_df.pronunciation, early_stopping_rounds=10)

In [None]:
%%time

lm.fit(train_df.pronunciation, dev_df.pronunciation, max_epochs=5, early_stopping_rounds=5)

In [None]:
# Reload a previously saved model, mapping its tensors onto the CPU.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code —
# only load checkpoints from a trusted source.
lm = torch.load('lm_1.pt', map_location=torch.device('cpu'))
# Point the model's own `device` attribute at the CPU as well.
lm.device = torch.device('cpu')

## Probability of Real Words ##

In [None]:
%%time

# Score every pronunciation in the corpus with the language model.
df['probability'] = df.pronunciation.apply(lm.calculate_probability)

In [None]:
# Rebind to a sorted copy rather than sorting in place, so the frame object that earlier
# cells built is never mutated behind their back.
df = df.sort_values('probability', ascending=False)
df.probability.hist(bins=10)

In [None]:
# Model probability of each pronunciation in the train (ta) and dev (da) splits.
ta = train_df.pronunciation.apply(lm.calculate_probability)
da = dev_df.pronunciation.apply(lm.calculate_probability)

In [None]:
ta.mean(), da.mean()

In [None]:
ta.describe()

In [None]:
da.describe()

In [None]:
df[df.length == 3]

In [None]:
# NOTE(review): `lm8` is not defined in any cell visible in this notebook, so a fresh
# Restart & Run All would presumably fail here — confirm where it is meant to come from.
lm = lm8

### Comparing Multiple Pronunciations ###

In [None]:
df[df.word == 'with'].sort_values('probability', ascending=False)

In [None]:
df[df.word == 'tomato'].sort_values('probability', ascending=False)

In [None]:
df[df.word=='pajamas'].sort_values('probability', ascending=False)

In [None]:
df[df.word == 'february'].sort_values('probability', ascending=False)

In [None]:
df.word.value_counts()

In [None]:
lm.calculate_probability(['P', 'R', 'IY1', 'M'])

In [None]:
lm.calculate_probability(['P', 'R', 'IH1', 'M'])

In [None]:
lm.calculate_probability(['S', 'T', 'R', 'UW1', 'Z'])

### Generate Pronunciations ###

In [None]:
# Sample ten pronunciations; for each one, say whether it already exists in the corpus
# and, if so, show the first matching word and the number of matches.
for _ in range(10):
    pronunciation = lm.generate(100, temperature=None)
    pronunciation_string = ' '.join(pronunciation)
    print(pronunciation_string)

    matches = df.loc[df.pronunciation_string == pronunciation_string]
    if not matches.empty:
        print('\t', matches['word'].iloc[0], len(matches), 'total')
    print()

### Probability of the Next Phoneme ###

In [None]:
# Prefixes to experiment with; exactly one is active at a time.
# pronunciation = ['CH', 'EH0', 'N', 'V', 'AY2', 'R', 'AH0', 'N', 'M', 'EH1', 'N', 'T', 'AH0', 'L', 'IH2']
# pronunciation = ['M', 'EH1', 'N', 'T', 'AH0', 'L', 'IH2']
# pronunciation = ['S', 'EH1', 'N', 'T', 'AH0', 'L', 'IH2']
pronunciation = ['F', 'EH1', 'B', 'Y', 'AH0']


next_probs = lm.next_probabilities(pronunciation)

# Most probable continuation first; the stable sort keeps ties in their original order.
for phoneme, probability in sorted(next_probs.items(), key=itemgetter(1), reverse=True):
    print(f'[{probability:.4f}] {phoneme}')

In [None]:
def probability_next(lm, pronunciation):
    """Return the candidate next phonemes for `pronunciation`, most probable first.

    The cell originally held only the `def` line, which is a SyntaxError. This body
    packages the ranking that the notebook otherwise performs inline on
    `lm.next_probabilities(...)`.

    Parameters
    ----------
    lm
        Object exposing `next_probabilities(pronunciation)`, which returns a mapping
        from phoneme to probability.
    pronunciation
        The phoneme prefix to continue.

    Returns
    -------
    list of (phoneme, probability) pairs sorted by descending probability; ties keep
    the order in which the model returned them.
    """
    next_probs = lm.next_probabilities(pronunciation)
    return sorted(next_probs.items(), key=itemgetter(1), reverse=True)

In [None]:
lm.calculate_probability(['S'])

In [None]:
encode_pronunciation(['S'], lm.phoneme_to_idx)

In [None]:
lm

In [None]:
torch.save(lm, 'lm.pt')

### Play with Embeddings ###

In [None]:
def most_similar_phonemes(lm, embedding, topn=10):
    """Print the `topn` phonemes whose embeddings are most cosine-similar to `embedding`.

    Parameters
    ----------
    lm
        Model exposing `phoneme_to_idx` and `embedding_for(phoneme)`.
    embedding
        Query vector, in the same form `lm.embedding_for` returns
        (presumably a single-row 2-D array, as cosine_similarity expects — TODO confirm).
    topn : int
        Number of phonemes to print.

    Prints one `[similarity]<TAB>phoneme` line per result and returns None.
    """
    # Use the model's own vocabulary rather than the notebook-level `phoneme_to_idx`,
    # so the function also works for a model whose vocabulary differs from the global one.
    similarities = {
        phoneme: cosine_similarity(lm.embedding_for(phoneme), embedding).item()
        for phoneme in lm.phoneme_to_idx
    }

    ranked = sorted(similarities.items(), key=itemgetter(1), reverse=True)
    for other_phoneme, similarity in ranked[:topn]:
        print(f'[{similarity:.3f}]\t{other_phoneme}')
        
most_similar_phonemes(lm, lm.embedding_for('DH'))

In [None]:
# Cluster the L2-normalised phoneme embeddings and list which phonemes share a cluster.
embeddings = lm.embeddings
# embeddings = lm.embedding.weight.cpu().detach().numpy()
normed_embeddings = normalize(embeddings)

num_clusters = 15
# Fix the seed: k-means initialisation is random, so without it the grouping
# changes every time the cell is re-run.
kmeans = KMeans(num_clusters, random_state=0)
kmeans.fit(normed_embeddings)

# Row `idx` of the embedding matrix is taken to be the phoneme lm.vocab[idx].
grouped = defaultdict(set)
for idx, label in enumerate(kmeans.labels_):
    phoneme = lm.vocab[idx]
    grouped[label].add(phoneme)
grouped

In [None]:
# For consonants
# - voicing
# - place: bilabial, dental, alveolar, palatal, velar
# - manner: stop, fricative, nasal

# For vowels
# - front/back
# - closed/open
# - rounding

# General
# - syllabic

# TODO: combine multiple vectors together, e.g. {B, G, V, DH, D} for voiced
# Candidate feature directions, each taken as the difference between two phoneme embeddings.
voicing = lm.embedding_for('B') - lm.embedding_for('P')
forwarding = lm.embedding_for('P') - lm.embedding_for('K')
frication = lm.embedding_for('F') - lm.embedding_for('P')

# Query vector: a base phoneme shifted along one direction (alternatives kept for experimenting).
# new = lm.embedding_for('TH') + voicing
new = lm.embedding_for('K') + voicing
# new = lm.embedding_for('T') + frication
# new = lm.embedding_for('G') + forwarding

# Cosine similarity between the query vector and every phoneme's embedding.
phoneme_to_sim = {
    phoneme: cosine_similarity(new, lm.embedding_for(phoneme)).item()
    for phoneme in phoneme_to_idx
}

# Ten closest phonemes, best match first.
sorted(phoneme_to_sim.items(), key=itemgetter(1), reverse=True)[:10]

In [None]:
# Estimate the voicing direction from several voiced/voiceless pairs instead of a single
# pair. `np` comes from the notebook's top-level import cell.
# np.mean([embedding_for('B'), embedding_for('V')], axis=0)
voiced_phonemes = ['B', 'V', 'G', 'Z', 'ZH', 'DH', 'JH']
voiceless_phonemes = ['P', 'F', 'K', 'S', 'SH', 'TH', 'CH']

voiced = np.mean([lm.embedding_for(phoneme) for phoneme in voiced_phonemes], axis=0)
voiceless = np.mean([lm.embedding_for(phoneme) for phoneme in voiceless_phonemes], axis=0)
voicing = voiced - voiceless

# Adding the voicing direction to voiceless 'S' should land near its voiced counterpart.
most_similar_phonemes(lm, voicing + lm.embedding_for('S'))

In [None]:
most_similar_phonemes(lm, voicing)

In [None]:
def analogy(lm, a, b, c):
    """
    Score candidate answers w to the analogy  a - b = c - w.

    Rearranged, w = c - a + b, so every candidate phoneme is scored by
    sim(w, c - a + b) using cosine similarity.

    Parameters
    ----------
    lm
        Model exposing `vocab` and `embedding_for(phoneme)`.
    a, b, c
        The three phonemes that define the analogy.

    Returns
    -------
    dict mapping each candidate phoneme to its cosine similarity with c - a + b.
    START, END, PAD and the three input phonemes are excluded. Entries follow
    vocabulary order.
    """
    # The target vector is the same for every candidate, so build it once.
    target = lm.embedding_for(c) - lm.embedding_for(a) + lm.embedding_for(b)

    # START / END / PAD come from the notebook's top-level import cell.
    excluded = {START, END, PAD, a, b, c}

    return {
        phoneme: cosine_similarity(lm.embedding_for(phoneme), target).item()
        for phoneme in lm.vocab
        if phoneme not in excluded
    }

# Solve P - K = B - w and list every candidate w, best match first.
analogies = analogy(lm, 'P', 'K', 'B')
for phoneme, sim in sorted(analogies.items(), key=itemgetter(1), reverse=True):
    print(f'[{sim:.4f}] {phoneme}')


In [None]:
# Vocabulary index of the START token (START comes from the notebook's top-level import cell).
phoneme_idx = lm.phoneme_to_idx[START]

In [None]:
output, hidden_state = lm(torch.LongTensor([phoneme_idx]).unsqueeze(0))

In [None]:
hidden_state.shape

In [None]:
# Run the model on the same one-token input, this time with an explicit all-zero hidden state.
# NOTE(review): the (1, 1, 10) shape is hard-coded; presumably it has to match the loaded
# model's layer count and hidden size — compare with `hidden_state.shape` from the cell
# above. TODO confirm.
hidden = torch.zeros(1, 1, 10)

lm(torch.LongTensor([phoneme_idx]).unsqueeze(0), hidden)