In [12]:
import panphon
import numpy as np

ft = panphon.FeatureTable()

def ipa_to_feature_vector(ipa_string):
    segments = ft.ipa_segs(ipa_string)

    mapping = {'+': 1, '-': -1, '0': 0}

    vectors = [
        [mapping[feat] for feat in ft.segment_to_vector(seg)]
        for seg in segments
    ]

    return np.array(vectors, dtype=float)

ipa = 'ynivɛʁsalizəʁɔ̃'  # bonjour
vec = ipa_to_feature_vector(ipa)

print(f"IPA: {ipa}")
print(f"Feature vector: {vec}")


IPA: ynivɛʁsalizəʁɔ̃
Feature vector: [[ 1.  1. -1.  1. -1. -1. -1. -1.  1. -1. -1.  0. -1.  0.  1.  1. -1. -1.
   1. -1.  1. -1.  0.  0.]
 [-1.  1.  1. -1. -1. -1.  1. -1.  1. -1. -1.  1.  1. -1. -1. -1. -1. -1.
  -1. -1.  0. -1.  0.  0.]
 [ 1.  1. -1.  1. -1. -1. -1. -1.  1. -1. -1.  0. -1.  0. -1.  1. -1. -1.
  -1. -1.  1. -1.  0.  0.]
 [-1. -1.  1.  1. -1. -1. -1.  1.  1. -1. -1.  1. -1.  0.  1. -1. -1. -1.
  -1. -1.  0. -1.  0.  0.]
 [ 1.  1. -1.  1. -1. -1. -1. -1.  1. -1. -1.  0. -1.  0. -1. -1. -1. -1.
  -1. -1. -1. -1.  0.  0.]
 [-1. -1.  1.  1. -1. -1. -1.  1.  1. -1. -1. -1. -1.  0. -1. -1. -1.  1.
  -1. -1.  0. -1.  0.  0.]
 [-1. -1.  1.  1. -1. -1. -1.  1. -1. -1. -1.  1.  1. -1. -1. -1. -1. -1.
  -1. -1.  0. -1.  0.  0.]
 [ 1.  1. -1.  1. -1. -1. -1. -1.  1. -1. -1.  0. -1.  0. -1. -1.  1.  1.
  -1. -1.  1. -1.  0.  0.]
 [-1.  1.  1.  1. -1.  1. -1. -1.  1. -1. -1.  1.  1. -1. -1. -1. -1. -1.
  -1. -1.  0. -1.  0.  0.]
 [ 1.  1. -1.  1. -1. -1. -1. -1.  1. -1. -1.  0. -1. 

In [1]:
import panphon
import numpy as np

ft = panphon.FeatureTable()

# (1) from the paper
def phoneme_feature_set(segment):
    features = ft.names
    vector = ft.segment_to_vector(segment)
    return {features[i] for i, val in enumerate(vector) if val == '+'}

#(2)
def bigram_feature_set(p1, p2):
    return phoneme_feature_set_extended(p1).union(phoneme_feature_set_extended(p2))

#(3) bigram similarity (jaccard similarity of bigram features)
def bigram_similarity(bg1, bg2):
    set1 = bigram_feature_set(*bg1)
    set2 = bigram_feature_set(*bg2)
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union else 0

def ipa_to_bigrams(ipa):
    phonemes = ft.ipa_segs(ipa)
    phonemes = ['BEG'] + phonemes + ['END']
    return list(zip(phonemes[:-1], phonemes[1:]))

def phoneme_feature_set_extended(segment):
    if segment == 'BEG':
        return {'beg'}
    elif segment == 'END':
        return {'end'}
    else:
        return phoneme_feature_set(segment)

#(5) from the paper
def word_similarity_bigrams(ipa1, ipa2):
    seq1 = ipa_to_bigrams(ipa1)
    seq2 = ipa_to_bigrams(ipa2)
    n, m = len(seq1), len(seq2)
    dp = np.zeros((n+1, m+1))

    for i in range(1, n+1):
        dp[i, 0] = dp[i-1, 0]
    for j in range(1, m+1):
        dp[0, j] = dp[0, j-1]

    for i in range(1, n+1):
        for j in range(1, m+1):
            sim = bigram_similarity(seq1[i-1], seq2[j-1])
            dp[i, j] = max(
                dp[i-1, j-1] + sim,  # match/substitution
                dp[i-1, j],          # deletion
                dp[i, j-1]           # insertion
            )

    # (6) from the paper normalize by the length of the longest sequence
    max_len = max(n, m)
    return dp[n, m] / max_len

ipa1 = 'tɥa'
ipa2 = 'tɥa'

similarity_score = word_similarity_bigrams(ipa1, ipa2)
print(f"similarity: {similarity_score:.3f}")
print(ipa_to_bigrams(ipa1))
print(ipa_to_bigrams(ipa2))

similarity: 1.000
[('BEG', 't'), ('t', 'ɥ'), ('ɥ', 'a'), ('a', 'END')]
[('BEG', 't'), ('t', 'ɥ'), ('ɥ', 'a'), ('a', 'END')]


In [2]:
import pandas as pd

df = pd.read_csv("converted_phrases.csv")
words = df["word"].tolist()
ipas = df["ipa"].tolist()

word_to_idx = {word: idx for idx, word in enumerate(words)}

In [3]:
from collections import Counter
from tqdm import tqdm

char_counter = Counter("".join(ipas))
phoneme_vocab = {ch: i + 1 for i, ch in enumerate(char_counter)}  # reserve 0 for PAD
phoneme_vocab['<PAD>'] = 0
vocab_size = len(phoneme_vocab)

In [4]:
def ipa_to_ids(ipa, max_len=30):
    ids = [phoneme_vocab.get(ch, 0) for ch in ipa]
    return ids[:max_len] + [0] * (max_len - len(ids))

ipa_ids = [ipa_to_ids(ipa) for ipa in ipas]

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BiLSTMEncoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        emb = self.embed(x)
        out, _ = self.lstm(emb)
        out = out.mean(dim=1)  # mean pooling
        return self.fc(out)

In [6]:
batch_size = 128
embed_dim = 32
hidden_dim = 64
output_dim = 50
encoder = BiLSTMEncoder(vocab_size, embed_dim, hidden_dim, output_dim)
optimizer = torch.optim.Adam(encoder.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

for epoch in range(10):
    total_loss = 0
    for i in tqdm(range(0, len(ipa_ids), batch_size)):
        batch_indices = list(range(i, min(i + batch_size, len(ipa_ids))))
        batch_x = [ipa_ids[idx] for idx in batch_indices]
        batch_y = np.random.choice(len(ipa_ids), len(batch_x), replace=False)

        x_tensor = torch.tensor(batch_x)
        y_tensor = torch.tensor([ipa_ids[j] for j in batch_y])

        emb_x = encoder(x_tensor)
        emb_y = encoder(y_tensor)

        emb_x = F.normalize(emb_x, dim=1)
        emb_y = F.normalize(emb_y, dim=1)

        pred_sim = torch.sum(emb_x * emb_y, dim=1)
        pred_sim = torch.clamp(pred_sim, 0, 1)

        target_sim = torch.tensor([
            word_similarity_bigrams(ipas[i + k], ipas[j])
            for k, j in enumerate(batch_y)
        ], dtype=torch.float)

        loss = loss_fn(pred_sim, target_sim)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

100%|██████████| 1922/1922 [24:57<00:00,  1.28it/s]


Epoch 1, Loss: 25.1928


100%|██████████| 1922/1922 [24:41<00:00,  1.30it/s]


Epoch 2, Loss: 11.3533


100%|██████████| 1922/1922 [24:58<00:00,  1.28it/s] 


Epoch 3, Loss: 7.3932


100%|██████████| 1922/1922 [24:42<00:00,  1.30it/s]


Epoch 4, Loss: 5.9039


100%|██████████| 1922/1922 [24:38<00:00,  1.30it/s]


Epoch 5, Loss: 5.3056


100%|██████████| 1922/1922 [24:34<00:00,  1.30it/s]


Epoch 6, Loss: 4.7384


100%|██████████| 1922/1922 [24:35<00:00,  1.30it/s]


Epoch 7, Loss: 4.4024


100%|██████████| 1922/1922 [24:42<00:00,  1.30it/s]


Epoch 8, Loss: 4.2182


100%|██████████| 1922/1922 [24:35<00:00,  1.30it/s]


Epoch 9, Loss: 3.9805


100%|██████████| 1922/1922 [24:35<00:00,  1.30it/s]

Epoch 10, Loss: 3.8504





In [7]:
def encode_all_words_batched(ipa_ids, batch_size=128):
    encoder.eval()
    all_embeds = []

    with torch.no_grad():
        for i in tqdm(range(0, len(ipa_ids), batch_size), desc="Encoding IPA embeddings"):
            batch = ipa_ids[i:i+batch_size]
            batch_tensor = torch.tensor(batch)
            emb = encoder(batch_tensor)
            emb = F.normalize(emb, dim=1)
            all_embeds.append(emb.cpu().numpy())

    return np.vstack(all_embeds)

embedding_matrix = encode_all_words_batched(ipa_ids)

Encoding IPA embeddings:   0%|          | 0/1922 [00:00<?, ?it/s]

Encoding IPA embeddings: 100%|██████████| 1922/1922 [00:17<00:00, 106.89it/s]


In [8]:
import faiss
faiss.normalize_L2(embedding_matrix)
index = faiss.IndexFlatIP(embedding_matrix.shape[1])
index.add(embedding_matrix)

: 

In [9]:
word_to_idx = {word: i for i, word in enumerate(words)}

In [10]:
def retrieve_similar_words(query_word, top_k=10):
    if query_word not in word_to_idx:
        raise ValueError(f"'{query_word}' not found in vocabulary.")

    query_idx = word_to_idx[query_word]
    query_emb = embedding_matrix[query_idx].reshape(1, -1)

    faiss.normalize_L2(query_emb)  

    similarities, indices = index.search(query_emb, top_k + 1)

    results = []
    for j, i in enumerate(indices[0]):
        if i != query_idx: 
            results.append((words[i], similarities[0][j]))

    return results[:top_k]

In [27]:
results = retrieve_similar_words("bonjour", top_k=10)
for word, score in results:
    print(f"  {word} (score: {score:.3f})")

  bonjours (score: 1.000)
  bûcheur (score: 0.918)
  bûcheurs (score: 0.918)
  bougèrent (score: 0.915)
  bouchèrent (score: 0.909)
  bouchères (score: 0.909)
  bouchère (score: 0.909)
  brochures (score: 0.902)
  brochure (score: 0.902)
  bougeoirs (score: 0.897)


In [14]:
torch.save(encoder.state_dict(), "bilstm_encoder.pth")

In [15]:
np.save("embedding_matrix.npy", embedding_matrix)