In [1]:
%matplotlib inline

%load_ext autoreload
%autoreload 2

In [42]:
import math
import itertools
from collections import defaultdict
from operator import itemgetter

import torch
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import normalize
from torch import nn
from torch.nn import functional as F

from phoneme_lm import PhonemeLM, build_data_loader, build_vocab, encode_pronunciation
from utils import load_data, split_data

In [3]:
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

## Training the Model ##

In [4]:
df = load_data(True)
len(df)

124996

In [5]:
phoneme_to_idx, idx_to_phoneme = build_vocab(df.pronunciation.values)
df['phoneme_ids'] = df.pronunciation.apply(lambda pronunciation: encode_pronunciation(pronunciation, phoneme_to_idx))

In [6]:
df['pronunciation_string'] = df.pronunciation.apply(' '.join)
df['length'] = df.pronunciation.apply(len)

In [7]:
train_df, dev_df, test_df = split_data(df, dev_proportion=.2, test_proportion=.01)

In [8]:
len(train_df), len(dev_df), len(test_df)

(98746, 25000, 1250)

In [None]:
param_grid = ParameterGrid({
    'rnn_type': ['rnn', 'lstm', 'gru'],
    'embedding_dimension': [5, 10, 20, 50],
    'rnn_hidden_dimension': [5, 10, 20, 50, 100],
    'epochs': [10, 20, 40]
})

records = []
for params in param_grid:
    lm = PhonemeLM(phoneme_to_idx, idx_to_phoneme, **params)
    lm.fit(train_df.pronunciation, dev_df.pronunciation)
    
    params['model'] = lm
    records.append(params)

models_df = pd.DataFrame.from_records(records)

In [None]:
train_loader = build_data_loader(train_df.pronunciation, phoneme_to_idx, batch_size=1028)
dev_loader = build_data_loader(dev_df.pronunciation, phoneme_to_idx, batch_size=1028)

models_df['train_loss'] = models_df.model.apply(lambda model: model.evaluate(train_loader))
models_df['dev_loss'] = models_df.model.apply(lambda model: model.evaluate(dev_loader))

In [None]:
models_df

In [10]:
%%time

rnn_lm = PhonemeLM(phoneme_to_idx, idx_to_phoneme, 'rnn', embedding_dimension=10, rnn_hidden_dimension=20, batch_size=1028)
rnn_lm.fit(train_df.pronunciation, dev_df.pronunciation, epochs=10)

Epoch 1: train loss: 1.28310	assess loss: 1.7726
	 <W> <PAD> HH <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> OW2
	 JH D Y EH0 EY2 AA0 UH2 OY2 K
	 AE2 UW2 AE1 <PAD> <PAD> <PAD> <PAD> <PAD>
	 F JH IY2 IY1 AO1 S <PAD> AH0 <PAD> <PAD>
	 UW ER2 IY2 N <PAD> G <PAD> <PAD> <PAD> <PAD>
Epoch 2: train loss: 0.97544	assess loss: 1.4411
	 SH
	 UW2 <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
	 SH AA0 AY0 IY1 <PAD> <PAD> OY0 <PAD> <PAD> <PAD>
	 F N G JH
	 OY2 L
Epoch 3: train loss: 0.85462	assess loss: 1.2931
	 IY0 B AW1 IY0 N UH1 <PAD> <PAD> <PAD> <PAD>
	 UW OW1
	 AA2
	 L T R
	 EY0 IY0 AY0 EY0 D
Epoch 4: train loss: 0.79675	assess loss: 1.2167
	 N CH Z K N
	 D DH T G N N <PAD> <PAD> <PAD> <PAD>
	 G F R ER2 Z Z IH0 M
	 S L K
	 N N
Epoch 5: train loss: 0.75958	assess loss: 1.1655
	 L AH2 T K AY1
	 
	 AE1 OY1 D EY2 Z OW1
	 EY1 IY0 L T D
	 AE1 K G T Z IH0
Epoch 6: train loss: 0.73292	assess loss: 1.1265
	 P S B T AH0 N AH0 T EH1 NG
	 OY1 N D
	 D EY2 UW R AA1 SH
	 K L ER0 N T EH1
	 M AH0 AO0 IH1 UW2 D 

## Probability of Real Words ##

In [None]:
%%time

df['probability'] = df.pronunciation.apply(lambda pronunciation: lm.calculate_probability(pronunciation))

In [None]:
df.sort_values('probability', ascending=False, inplace=True)
df.probability.hist(bins=100)

In [None]:
df[df.length == 3].head(20)

### Comparing Multiple Pronunciations ###

In [None]:
df[df.word == 'with']

In [None]:
df[df.word == 'tomato']

In [None]:
df[df.word=='pajamas']

In [None]:
lm.calculate_probability(['P', 'R', 'IH1', 'F', 'P'])

In [None]:
lm.calculate_probability(['P', 'R', 'IH1', 'F', 'B'])

### GENERATE PRONUNCIATIONS ###

In [49]:
for _ in range(10):
    pronunciation = rnn_lm.generate(100, temperature=None)
    pronunciation_string = ' '.join(pronunciation)
    matches = df[df.pronunciation_string == pronunciation_string]
    
    print(pronunciation_string)
    if len(matches) > 0:
        print('\t', matches.iloc[0]['word'], len(matches), 'total')
    print()

AE1 L
	 al 1 total

AO1 R T Z

L AA0 S IH1 SH AH0 L AH0 N

L IH1 S T AW2 AH0 Z

EH2 S IY1 N IH0 NG

AH0 L IH1 D W ER0 G IY0

AE1 B S

R AY1 AH0 N
	 rion 4 total

AA1 M L S

L EH2 K M T EH1 T ER0 Z



### Play with Embeddings ###

In [61]:
def most_similar_phonemes(lm, phoneme, topn=10):
    emb_one = lm.embedding_for(phoneme)

    other_to_sim = {
        phoneme: cosine_similarity(lm.embedding_for(phoneme), emb_one).item()
        for phoneme in phoneme_to_idx

    }

    for other_phoneme, similarity in sorted(other_to_sim.items(), key=lambda p: -p[1])[:topn]:
        if other_phoneme != phoneme:
            print(f'[{similarity:.3f}]\t{other_phoneme}')
        
most_similar_phonemes(lstm_lm, 'DH')

[0.769]	D
[0.621]	L
[0.564]	F
[0.463]	OW0
[0.454]	W
[0.450]	T
[0.443]	K
[0.439]	HH
[0.331]	B


In [66]:
embeddings = lstm_lm.embeddings
normed_embeddings = normalize(embeddings)

num_clusters = 3
kmeans = KMeans(num_clusters)
kmeans.fit(normed_embeddings)

grouped = defaultdict(set)
for idx, label in enumerate(kmeans.labels_):
    phoneme = lm.vocab[idx]
    grouped[label].add(phoneme)
grouped

defaultdict(set,
            {2: {'</W>',
              '<PAD>',
              'AA1',
              'AE1',
              'AH1',
              'AW0',
              'AW1',
              'AY0',
              'AY1',
              'EH1',
              'EH2',
              'ER1',
              'ER2',
              'EY0',
              'EY1',
              'IY0',
              'IY1',
              'IY2',
              'OW1',
              'OW2',
              'OY1',
              'OY2',
              'P',
              'UH1',
              'UW',
              'UW0',
              'UW2'},
             1: {'<W>',
              'AA0',
              'AA2',
              'AE0',
              'AE2',
              'AH0',
              'AH2',
              'AO0',
              'AO1',
              'AO2',
              'AW2',
              'AY2',
              'EH0',
              'EY2',
              'IH0',
              'IH1',
              'IH2',
              'UH0',
              'UH2',
          

In [71]:
# For consonants
# - voicing
# - place: bilabial, dental, alveolar, palatal, velar
# - manner: stop, fricative, nasal

# For vowels
# - front/back
# - closed/open
# - rounding

# General
# - syllabic

# TODO: combine multiple vectors together, e.g. {B, G, V, DH, D} for voiced
lm = lstm_lm
voicing = lm.embedding_for('B') - lm.embedding_for('P')
forwarding = lm.embedding_for('P') - lm.embedding_for('K')
frication = lm.embedding_for('F') - lm.embedding_for('P')

new = lm.embedding_for('TH') + voicing
# new = lm.embedding_for('K') + voicing
# new = lm.embedding_for('T') + frication

phoneme_to_sim = {}
for phoneme in phoneme_to_idx:
    this_embs = lm.embedding_for(phoneme)
    sim = cosine_similarity(new, this_embs).item()
    phoneme_to_sim[phoneme] = sim

sorted(phoneme_to_sim.items(), key=lambda p: -p[1])[:3]

[('T', 0.7106951475143433),
 ('TH', 0.6230912804603577),
 ('V', 0.5945394039154053)]

In [75]:
from phoneme_lm import START, END, PAD
# TODO: finish this
def analogy(lm, a, b, c):
    emb_a = lm.embedding_for(a)
    emb_b = lm.embedding_for(b)
    emb_c = lm.embedding_for(c)
    
    all_phonemes = set(lm.vocab) - {START, END, PAD}
    
    phoneme_to_diff = {}
    for phoneme in all_phonemes:
        if phoneme in {a, b, c}:
            continue
        
        emb = lm.embedding_for(phoneme)
        first = emb_a - emb_b
        second = emb_c - emb
        diff = abs(first - second)
        phoneme_to_diff[phoneme] = diff
    return phoneme_to_diff

p2d = analogy(lstm_lm, 'P', 'B', 'K')