In [1]:
%matplotlib inline

%load_ext autoreload
%autoreload 2

In [2]:
import math
import itertools
from collections import defaultdict
from operator import itemgetter

import torch
import pandas as pd
# import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import normalize
from torch import nn
from torch.nn import functional as F
from tqdm.notebook import tqdm

from phoneme_lm import PhonemeLM, build_data_loader, build_vocab, encode_pronunciation
from utils import load_data, split_data

In [3]:
# Let pandas show up to 100 rows and 100 columns before truncating a frame.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)

## Loading the Data ##

In [4]:
# Load the pronunciation table; later cells read its `pronunciation` and `word`
# columns.
# NOTE(review): the meaning of the positional `True` is not visible from this
# notebook — confirm against `utils.load_data`.
df = load_data(True)
len(df)

124996

In [5]:
# Build the vocabulary (`phoneme_to_idx`, `idx_to_phoneme`) from every
# pronunciation, then store each pronunciation's encoded form next to it.
phoneme_to_idx, idx_to_phoneme = build_vocab(df['pronunciation'].values)
df['phoneme_ids'] = df['pronunciation'].apply(encode_pronunciation, args=(phoneme_to_idx,))

In [6]:
# Derived columns: the space-joined pronunciation (used later for exact-match
# lookups) and the number of phonemes in it.
df['pronunciation_string'] = df['pronunciation'].map(' '.join)
df['length'] = df['pronunciation'].map(len)

In [7]:
# Carve the table into train / dev / test, asking for 20% dev and 1% test.
train_df, dev_df, test_df = split_data(df, dev_proportion=.2, test_proportion=.01)
tuple(len(part) for part in (train_df, dev_df, test_df))

(98746, 25000, 1250)

## Training the Model ##

In [8]:
%%time

lm = PhonemeLM(
    phoneme_to_idx, device='cpu', rnn_type='gru',
    embedding_dimension=100, hidden_dimension=100, num_layers=1,
    max_epochs=2, early_stopping_rounds=5,
    lr=5e-3, batch_size=1024, l2_strength=0
)


lm.fit(train_df.pronunciation, dev_df.pronunciation)

Epoch 1: train loss: 0.5828	assess loss: 0.9029
	 N P IH1 L D AH0 N AH0 B L
	 S R R S Z UW1
	 V OW1 N N IY0
	 M ER1 EH0 L IY0
	 P R AY1 K L AH0
Epoch 2: train loss: 0.5515	assess loss: 0.8553
	 UW2 M AO1 R S AH0 L
	 S P AE1 P T IH0 D
	 N AO1 B AH2 T S
	 S AO1 R S T
	 D AH1 M K S
CPU times: user 10min 13s, sys: 27.7 s, total: 10min 41s
Wall time: 2min 25s


([0.582791268825531, 0.5515130162239075],
 [0.9029074311256409, 0.8552573323249817])

In [9]:
%%time

lm = PhonemeLM(
    phoneme_to_idx, device='cpu', rnn_type='gru',
    embedding_dimension=100, hidden_dimension=100, num_layers=1,
    max_epochs=2, early_stopping_rounds=5,
    lr=5e-3, batch_size=1024, l2_strength=10
)


lm.fit(train_df.pronunciation, dev_df.pronunciation)

Epoch 1: train loss: 4.2299	assess loss: 4.2390
	 V SH AE2 AY1 AE1 <W> EY2 DH AE0 AE2
	 IY1 T N AY0 Y AW0 ER0 UH0 UH2 IY2
	 AY2 IY1 EH0 NG UH0 OW1 UW2 AY0 AH1
	 UW0 F UH1 M ER2 L CH IH0 OW0 UW1
	 W AE0 AH2 OY1 UH1 EY0 ZH UW1 B IY1
Epoch 2: train loss: 4.2299	assess loss: 4.2390
	 AA0 IH2 OY1 L
	 P AO2 IY1 F UW2 IY2 Z AH0 AY0 N
	 IY1 AE1 OY2 AO0 OY1 K OY1 D AY0 B
	 AH0 HH N ER1 N P UW Y UH0 M
	 AY1 Z NG EY1 UH0 JH OW0 HH AO1 S
CPU times: user 10min 14s, sys: 27.4 s, total: 10min 42s
Wall time: 2min 24s


([4.229948997497559, 4.22994327545166], [4.239027976989746, 4.238986015319824])

In [None]:
# %%time

# param_grid = ParameterGrid({
#     'batch_size': [2**7, 2**10, 2**12, 2**14],
#     'lr': [1e-4, 1e-3, 1e-2, 1e-1]
# })

# records = []
# for params in tqdm(param_grid):
#     lm = PhonemeLM(
#         phoneme_to_idx, device='cuda', rnn_type='gru', embedding_dimension=50, hidden_dimension=50,
#          max_epochs=200, early_stopping_rounds=3,
#         **params
#     )
    
#     print('Model Params:', params)
#     train_losses, dev_losses = lm.fit(train_df.pronunciation, dev_df.pronunciation)
#     for epoch, (train_loss, dev_loss) in enumerate(zip(train_losses, dev_losses), start=1):
#         record = params.copy()
#         record['epoch'] = epoch
#         record['train_loss'] = train_loss
#         record['dev_loss'] = dev_loss
    
#         records.append(record)

# models_df = pd.DataFrame.from_records(records)

In [None]:
# models_df.sort_values('train_loss')

In [None]:
# models_df.sort_values('dev_loss')

In [None]:
# One panel per (batch_size, lr) combination, showing dev and train loss by epoch.
g = models_df.groupby(['batch_size', 'lr'])

columns = 3
rows = int(math.ceil(len(g) / columns))
# squeeze=False keeps `axs` two-dimensional even when the grid has a single row,
# so `axs[row][column]` always yields an Axes.
fig, axs = plt.subplots(rows, columns, figsize=(20, 10), sharey=True, squeeze=False)
for idx, ((batch_size, lr), t) in enumerate(g):
    row, column = divmod(idx, columns)
    ax = axs[row][column]
    losses = t.set_index('epoch')
    losses.dev_loss.plot(ax=ax)
    losses.train_loss.plot(ax=ax)
    ax.set_title(f'batch_size={batch_size}, lr={lr}')
    # Set the scale on this panel explicitly instead of on pyplot's "current" axes.
    ax.set_yscale('log')
    ax.legend()

# Hide grid slots that have no group to show.
for ax in axs.flat[len(g):]:
    ax.set_visible(False)

# Lay the figure out once, after every panel has been drawn.
fig.tight_layout()

In [None]:
# Distinct values of the two grid-searched hyperparameters.
for hyperparameter in ('batch_size', 'lr'):
    print(models_df[hyperparameter].unique())

In [None]:
# Epoch records for batch_size 16384, lowest train loss first.
is_batch_16384 = models_df['batch_size'] == 16384
models_df.loc[is_batch_16384].sort_values(by='train_loss')

In [None]:
def plot(df, batch_size, lr):
    """Plot train and dev loss per epoch for one (batch_size, lr) run.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-epoch records with `batch_size`, `lr`, `epoch`, `train_loss` and
        `dev_loss` columns.
    batch_size, lr
        Values selecting which run's rows to plot.
    """
    # Filter the frame that was passed in; the body used to ignore `df` and
    # always read the global `models_df`.
    t = df[(df.batch_size == batch_size) & (df.lr == lr)].set_index('epoch')
    t.train_loss.plot()
    t.dev_loss.plot()


# Pass the grid-search results, not the pronunciation frame `df`.
plot(models_df, 1024, .1)

In [None]:
# 63 minutes for 16 models. 4 minutes each

In [None]:
# %%time

# param_grid = ParameterGrid({
#     'rnn_type': ['gru'],
#     'embedding_dimension': [10, 50, 100, 200],
#     'rnn_hidden_dimension': [50, 100, 200, 400],
# })

# records = []
# for params in tqdm(param_grid):
#     lm = PhonemeLM(phoneme_to_idx, device='cuda', batch_size=1024,  max_epochs=200, **params)
#     print('Model Params:', params)
#     train_losses, dev_losses = lm.fit(train_df.pronunciation, dev_df.pronunciation, early_stopping_rounds=10)
#     for epoch, (train_loss, dev_loss) in enumerate(zip(train_losses, dev_losses), start=1):
#         record = params.copy()
#         record['epoch'] = epoch
#         record['train_loss'] = train_loss
#         record['dev_loss'] = dev_loss
    
#         records.append(record)

# models_df = pd.DataFrame.from_records(records)

In [None]:
models_df.embedding_dimension.unique()

In [None]:
# NOTE(review): in the visible notebook `records` is only assigned inside the
# commented-out grid-search cells, so one of them has to be re-enabled and run
# before this cell.
models_df = pd.DataFrame.from_records(records)

# Loss curves for the embedding_dimension=10, rnn_hidden_dimension=200 run.
# (A bare `models_df.sort_values('dev_loss')` used to sit here; mid-cell it was
# neither displayed nor stored, so it has been dropped.)
t = models_df[(models_df.embedding_dimension==10) & (models_df.rnn_hidden_dimension==200)]
curves = t.set_index('epoch')
curves.train_loss.plot()
curves.dev_loss.plot()

In [None]:
# %%time

# param_grid = ParameterGrid({
#     'rnn_type': ['gru'],
#     'embedding_dimension': [10, 100, 400],
#     'rnn_hidden_dimension': [50, 200, 400],
# })

# records = []
# for params in tqdm(param_grid):
#     lm = PhonemeLM(phoneme_to_idx, device='cuda', batch_size=1024,  max_epochs=200, **params)
#     print('Model Params:', params)
#     train_losses, dev_losses = lm.fit(train_df.pronunciation, dev_df.pronunciation, early_stopping_rounds=3)
#     for epoch, (train_loss, dev_loss) in enumerate(zip(train_losses, dev_losses), start=1):
#         record = params.copy()
#         record['epoch'] = epoch
#         record['train_loss'] = train_loss
#         record['dev_loss'] = dev_loss
    
#         records.append(record)

# models_df = pd.DataFrame.from_records(records)

In [None]:
# NOTE(review): in the visible notebook `records` is only assigned inside the
# commented-out grid-search cells; one of them must be run first or this cell
# has nothing to read.
models_df = pd.DataFrame.from_records(records)

In [None]:
len(models_df)

In [None]:
models_df.sort_values('train_loss')

In [None]:
%%time

lm = PhonemeLM(
    phoneme_to_idx, device='cpu',
    rnn_type='gru', embedding_dimension=10, rnn_hidden_dimension=20,
    max_epochs=10
)

lm.fit(train_df.pronunciation, dev_df.pronunciation, early_stopping_rounds=10)

In [None]:
%%time

# Call `fit` again on the existing `lm` (presumably to train further — verify
# that `fit` does not reinitialise the weights).
# NOTE(review): the executed cells pass `max_epochs` / `early_stopping_rounds`
# to the PhonemeLM constructor rather than to `fit` — confirm that `fit` still
# accepts them as keyword arguments.
lm.fit(train_df.pronunciation, dev_df.pronunciation, max_epochs=5, early_stopping_rounds=5)

In [None]:
# Replace the in-memory model with one read from disk, mapped onto the CPU.
# SECURITY: `torch.load` unpickles the file, which can execute arbitrary code —
# only load checkpoints you created yourself.
# NOTE(review): the notebook saves to 'lm.pt' but loads 'lm_1.pt' here — confirm
# this is the intended file. Recent PyTorch releases also default to
# `weights_only=True`, which may refuse a pickled model object — verify against
# the installed version.
lm = torch.load('lm_1.pt', map_location=torch.device('cpu'))
lm.device = torch.device('cpu')

## Probability of Real Words ##

In [None]:
%%time

df['probability'] = df.pronunciation.apply(lambda pronunciation: lm.calculate_probability(pronunciation))

In [None]:
# Reorder the table in place from most to least probable, then draw a
# 10-bin histogram of the scores.
df.sort_values(by='probability', ascending=False, inplace=True)
df['probability'].hist(bins=10)

In [None]:
# Per-pronunciation probabilities on the train (`ta`) and dev (`da`) splits.
ta, da = (
    split.pronunciation.apply(lm.calculate_probability)
    for split in (train_df, dev_df)
)

In [None]:
ta.mean(), da.mean()

In [None]:
ta.describe()

In [None]:
da.describe()

In [None]:
# Rows whose pronunciation has exactly three phonemes.
df.loc[df['length'] == 3]

### Comparing Multiple Pronunciations ###

In [None]:
# Every listed pronunciation of "with", most probable first.
df.loc[df['word'] == 'with'].sort_values(by='probability', ascending=False)

In [None]:
# Every listed pronunciation of "tomato", most probable first.
df.loc[df['word'] == 'tomato'].sort_values(by='probability', ascending=False)

In [None]:
# Every listed pronunciation of "pajamas", most probable first.
df.loc[df['word'] == 'pajamas'].sort_values(by='probability', ascending=False)

In [None]:
# Every listed pronunciation of "february", most probable first.
df.loc[df['word'] == 'february'].sort_values(by='probability', ascending=False)

In [None]:
df.word.value_counts()

In [None]:
lm.calculate_probability(['P', 'R', 'IH1', 'F', 'S'])

In [None]:
lm.calculate_probability(['P', 'R', 'IH1', 'F'])

In [None]:
lm.calculate_probability(['S', 'T', 'R', 'UW1', 'Z'])

### Generating Pronunciations ###

In [None]:
# Sample ten pronunciations and flag any that exactly match a known word.
for _ in range(10):
    sampled = lm.generate(100, temperature=None)
    sampled_string = ' '.join(sampled)
    print(sampled_string)

    # Exact string match against the table's space-joined pronunciations.
    known = df[df.pronunciation_string == sampled_string]
    if len(known):
        print('\t', known.iloc[0]['word'], len(known), 'total')
    print()

### Probability of the Next Phoneme ###

In [None]:
# Prefix whose next-phoneme distribution is inspected. Only one assignment is
# live; uncomment an alternative (and comment the active one) to try another.
# The first alternative used to be an active assignment that the line below
# immediately overwrote.
# pronunciation = ['CH', 'EH0', 'N', 'V', 'AY2', 'R', 'AH0', 'N', 'M', 'EH1', 'N', 'T', 'AH0', 'L', 'IH2']
# pronunciation = ['M', 'EH1', 'N', 'T', 'AH0', 'L', 'IH2']
# pronunciation = ['S', 'EH1', 'N', 'T', 'AH0', 'L', 'IH2']
pronunciation = ['F', 'EH1', 'B', 'Y', 'AH0']

next_probs = lm.next_probabilities(pronunciation)

# Most likely continuation first.
for phoneme, probability in sorted(next_probs.items(), key=lambda p: -p[1]):
    print(f'[{probability:.4f}] {phoneme}')

In [None]:
def probability_next(lm, pronunciation):
    """Return the candidate next phonemes for `pronunciation`, most likely first.

    NOTE(review): the original cell was a `def` line with no body (a
    SyntaxError). This body mirrors the ranking done inline in the preceding
    cell — confirm it matches the intended behaviour.

    Parameters
    ----------
    lm
        Model exposing `next_probabilities(pronunciation)`, expected to return
        a mapping of phoneme -> probability.
    pronunciation : list of str
        Phoneme prefix to condition on.

    Returns
    -------
    list of (phoneme, probability) tuples, sorted by descending probability.
    """
    next_probs = lm.next_probabilities(pronunciation)
    return sorted(next_probs.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
lm.calculate_probability(['S'])

In [None]:
encode_pronunciation(['S'], lm.phoneme_to_idx)

In [None]:
lm

In [None]:
# Persist the whole `lm` object (not just a state dict) to 'lm.pt'.
# NOTE(review): the load cell earlier in the notebook reads 'lm_1.pt', not this
# file — confirm which name is intended.
torch.save(lm, 'lm.pt')

### Play with Embeddings ###

In [None]:
def most_similar_phonemes(lm, embedding, topn=10):
    """Print the `topn` phonemes whose embeddings are closest to `embedding`.

    Parameters
    ----------
    lm
        Model providing `phoneme_to_idx` and `embedding_for(phoneme)`.
    embedding
        Query vector. Together with `lm.embedding_for(...)` it must give a
        single cosine-similarity value, because the result is read with
        `.item()`.
    topn : int
        How many of the best matches to print.
    """
    # Iterate the vocabulary of the model that was passed in rather than the
    # notebook-level `phoneme_to_idx`, so the function does not depend on
    # global state matching `lm`.
    other_to_sim = {
        phoneme: cosine_similarity(lm.embedding_for(phoneme), embedding).item()
        for phoneme in lm.phoneme_to_idx
    }

    # Highest similarity first.
    ranked = sorted(other_to_sim.items(), key=lambda p: -p[1])
    for other_phoneme, similarity in ranked[:topn]:
        print(f'[{similarity:.3f}]\t{other_phoneme}')


most_similar_phonemes(lm, lm.embedding_for('DH'))

In [None]:
embeddings = lm.embeddings
# embeddings = lm.embedding.weight.cpu().detach().numpy()
normed_embeddings = normalize(embeddings)

num_clusters = 15
# Fix the seed: KMeans chooses its starting centroids at random, so without
# `random_state` the cluster assignments can change on every run.
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
kmeans.fit(normed_embeddings)

# Collect the phonemes assigned to each cluster label; row `idx` of the
# embeddings is looked up as `lm.vocab[idx]`.
grouped = defaultdict(set)
for idx, label in enumerate(kmeans.labels_):
    phoneme = lm.vocab[idx]
    grouped[label].add(phoneme)
grouped

In [None]:
# Phonological features worth looking for in the embedding space.
#
# Consonants
# - voicing
# - place: bilabial, dental, alveolar, palatal, velar
# - manner: stop, fricative, nasal
#
# Vowels
# - front/back
# - closed/open
# - rounding
#
# General
# - syllabic

# TODO: combine multiple vectors together, e.g. {B, G, V, DH, D} for voiced
emb = lm.embedding_for

# Candidate feature directions, each the difference of one phoneme pair.
voicing = emb('B') - emb('P')
forwarding = emb('P') - emb('K')
frication = emb('F') - emb('P')

# new = lm.embedding_for('TH') + voicing
# new = lm.embedding_for('K') + voicing
# new = lm.embedding_for('T') + frication
new = emb('G') + forwarding

# Cosine similarity of every vocabulary phoneme to the shifted vector.
phoneme_to_sim = {
    phoneme: cosine_similarity(new, emb(phoneme)).item()
    for phoneme in phoneme_to_idx
}

# Five closest phonemes, best first.
sorted(phoneme_to_sim.items(), key=lambda p: -p[1])[:5]

In [None]:
import numpy as np
# np.mean([embedding_for('B'), embedding_for('V')], axis=0)
voiced_phonemes = ['B', 'V', 'G', 'Z', 'ZH', 'DH', 'JH']
voiceless_phonemes = ['P', 'F', 'K', 'S', 'SH', 'TH', 'CH']

# Average each group's embeddings; the difference is used as a voicing direction.
voiced = np.mean([lm.embedding_for(p) for p in voiced_phonemes], axis=0)
voiceless = np.mean([lm.embedding_for(p) for p in voiceless_phonemes], axis=0)
voicing = voiced - voiceless

# Nearest phonemes to S shifted along that direction.
most_similar_phonemes(lm, voicing + lm.embedding_for('S'))

In [None]:
most_similar_phonemes(lm, voicing)

In [None]:
from phoneme_lm import START, END, PAD
def analogy(lm, a, b, c):
    """Score candidate answers to "a is to b as c is to w".

    Rearranging a - b = c - w gives w = c - a + b, so every candidate phoneme
    is compared against c - a + b by cosine similarity.

    Returns a dict mapping each candidate phoneme to its similarity. The
    START/END/PAD tokens and a, b, c themselves are never candidates.
    """
    emb_a, emb_b, emb_c = (lm.embedding_for(p) for p in (a, b, c))
    # The query vector is the same for every candidate, so build it once.
    target = emb_c - emb_a + emb_b

    given = {a, b, c}
    return {
        candidate: cosine_similarity(lm.embedding_for(candidate), target).item()
        for candidate in set(lm.vocab) - {START, END, PAD}
        if candidate not in given
    }

# P is to K as B is to ...? Print every candidate, best match first.
analogies = analogy(lm, 'P', 'K', 'B')
ranked_analogies = sorted(analogies.items(), key=lambda p: -p[1])
for candidate, score in ranked_analogies:
    print(f'[{score:.4f}] {candidate}')


In [None]:
from phoneme_lm import START
phoneme_idx = lm.phoneme_to_idx[START]

In [None]:
# Run the model on the START token alone, packed as a 1x1 index tensor.
start_batch = torch.LongTensor([[phoneme_idx]])
output, hidden_state = lm(start_batch)

In [None]:
hidden_state.shape

In [None]:
# Build the all-zero hidden state from the tensor the model itself returned
# for the same input, instead of hard-coding the shape (1, 1, 10), which ties
# the cell to one particular model size.
# NOTE(review): relies on `hidden_state` from the earlier forward-pass cell
# being a single tensor (its `.shape` is inspected there) — confirm for
# non-GRU models.
hidden = torch.zeros_like(hidden_state)

lm(torch.LongTensor([phoneme_idx]).unsqueeze(0), hidden)