In [None]:
!pip install panphon
!pip install faiss-cpu


Collecting panphon
  Downloading panphon-0.21.2-py2.py3-none-any.whl.metadata (15 kB)
Collecting unicodecsv (from panphon)
  Downloading unicodecsv-0.14.1.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting munkres (from panphon)
  Downloading munkres-1.1.4-py2.py3-none-any.whl.metadata (980 bytes)
Downloading panphon-0.21.2-py2.py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.4/75.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Building wheels for collected packages: unicodecsv
  Building wheel for unicodecsv (setup.py) ... [?25l[?25hdone
  Created wheel for unicodecsv: filename=unicodecsv-0.14.1-py3-none-any.whl size=10744 sha256=0d0a09415dff6ab98141fab4b083fae99ea16da407f9898d0e1c137679d72f06
  Stored in directory: /root/.cache/pip/wheels/ec/03/6f/d2e0162d94c0d451556fa43dd4d5531457245c34a36b41ef4a
Successfully built unicodecsv
Installing collected packa

In [None]:
import panphon
import numpy as np

# Shared panphon feature table; the helper below uses it to split IPA strings
# into segments and to look up each segment's feature values.
ft = panphon.FeatureTable()

def ipa_to_feature_vector(ipa_string):
    """Convert an IPA string into a numeric feature matrix.

    The string is split into segments with ``ft.ipa_segs``; each segment is
    looked up with ``ft.segment_to_vector`` and its feature values are mapped
    ``'+' -> 1``, ``'-' -> -1``, ``'0' -> 0``.

    Args:
        ipa_string: IPA transcription to convert.

    Returns:
        A float ``numpy`` array with one row per segment and one column per
        feature. When no segment is found the result has shape
        ``(0, len(ft.names))`` so it is always two-dimensional.

    Raises:
        KeyError: if a feature value other than '+', '-' or '0' is encountered.
    """
    mapping = {'+': 1, '-': -1, '0': 0}

    segments = ft.ipa_segs(ipa_string)
    vectors = [
        [mapping[feat] for feat in ft.segment_to_vector(seg)]
        for seg in segments
    ]

    if not vectors:
        # np.array([]) would be 1-D with shape (0,); keep the column axis so
        # callers can stack or index results uniformly.
        return np.empty((0, len(ft.names)), dtype=float)

    return np.array(vectors, dtype=float)

# Demo: convert one transcription and print the resulting feature matrix.
# NOTE(review): this string was previously labelled "bonjour", but the
# transcription does not match that word; it looks like "universaliserons" — confirm.
ipa = 'ynivɛʁsalizəʁɔ̃'
vec = ipa_to_feature_vector(ipa)

print(f"IPA: {ipa}")
print(f"Feature vector: {vec}")


IPA: ynivɛʁsalizəʁɔ̃
Feature vector: [[ 1.  1. -1.  1. -1. -1. -1. -1.  1. -1. -1.  0. -1.  0.  1.  1. -1. -1.
   1. -1.  1. -1.  0.  0.]
 [-1.  1.  1. -1. -1. -1.  1. -1.  1. -1. -1.  1.  1. -1. -1. -1. -1. -1.
  -1. -1.  0. -1.  0.  0.]
 [ 1.  1. -1.  1. -1. -1. -1. -1.  1. -1. -1.  0. -1.  0. -1.  1. -1. -1.
  -1. -1.  1. -1.  0.  0.]
 [-1. -1.  1.  1. -1. -1. -1.  1.  1. -1. -1.  1. -1.  0.  1. -1. -1. -1.
  -1. -1.  0. -1.  0.  0.]
 [ 1.  1. -1.  1. -1. -1. -1. -1.  1. -1. -1.  0. -1.  0. -1. -1. -1. -1.
  -1. -1. -1. -1.  0.  0.]
 [-1. -1.  1.  1. -1. -1. -1.  1.  1. -1. -1. -1. -1.  0. -1. -1. -1.  1.
  -1. -1.  0. -1.  0.  0.]
 [-1. -1.  1.  1. -1. -1. -1.  1. -1. -1. -1.  1.  1. -1. -1. -1. -1. -1.
  -1. -1.  0. -1.  0.  0.]
 [ 1.  1. -1.  1. -1. -1. -1. -1.  1. -1. -1.  0. -1.  0. -1. -1.  1.  1.
  -1. -1.  1. -1.  0.  0.]
 [-1.  1.  1.  1. -1.  1. -1. -1.  1. -1. -1.  1.  1. -1. -1. -1. -1. -1.
  -1. -1.  0. -1.  0.  0.]
 [ 1.  1. -1.  1. -1. -1. -1. -1.  1. -1. -1.  0. -1. 

In [None]:
import panphon
import numpy as np

# Feature table used by the similarity helpers in this cell (segmentation,
# feature names and per-segment feature values). An identical table is also
# created in an earlier cell; this line simply rebinds `ft`.
ft = panphon.FeatureTable()

# (1) from the paper
def phoneme_feature_set(segment):
    """Return the set of feature names whose value is '+' for `segment`.

    Feature names come from ``ft.names`` and the values from
    ``ft.segment_to_vector(segment)``; the two are matched by position.
    """
    names = ft.names
    positive = set()
    for position, value in enumerate(ft.segment_to_vector(segment)):
        if value == '+':
            positive.add(names[position])
    return positive

#(2)
def bigram_feature_set(p1, p2):
    """Return the combined feature set of the two members of a bigram.

    Each member is resolved with ``phoneme_feature_set_extended``, so the
    'BEG' / 'END' boundary markers are accepted as well as IPA segments.
    """
    first = phoneme_feature_set_extended(p1)
    second = phoneme_feature_set_extended(p2)
    return first | second

#(3) bigram similarity (jaccard similarity of bigram features)
def bigram_similarity(bg1, bg2):
    """Jaccard similarity between the feature sets of two bigrams.

    `bg1` and `bg2` are 2-tuples that are unpacked into
    ``bigram_feature_set``. Returns 0 when both feature sets are empty.
    """
    feats_a = bigram_feature_set(*bg1)
    feats_b = bigram_feature_set(*bg2)

    total = len(feats_a | feats_b)
    if not total:
        return 0
    return len(feats_a & feats_b) / total

def ipa_to_bigrams(ipa):
    """Split `ipa` into segments and return consecutive segment pairs.

    The segment list is wrapped in the boundary markers 'BEG' and 'END', so a
    string with k segments yields k + 1 bigrams.
    """
    padded = ['BEG', *ft.ipa_segs(ipa), 'END']
    return [(padded[k], padded[k + 1]) for k in range(len(padded) - 1)]

def phoneme_feature_set_extended(segment):
    """Feature set for a segment, with support for the boundary markers.

    'BEG' maps to ``{'beg'}`` and 'END' to ``{'end'}``; anything else is
    delegated to ``phoneme_feature_set``.
    """
    if segment == 'BEG':
        return {'beg'}
    if segment == 'END':
        return {'end'}
    return phoneme_feature_set(segment)

#(5) from the paper
def _jaccard(set1, set2):
    """Jaccard similarity of two sets (0 when both are empty)."""
    union = len(set1 | set2)
    return len(set1 & set2) / union if union else 0


def word_similarity_bigrams(ipa1, ipa2):
    """Alignment-based similarity between two IPA transcriptions.

    Both strings are turned into bigram sequences with ``ipa_to_bigrams``.
    A dynamic-programming alignment then maximises the summed Jaccard
    similarity of aligned bigram feature sets; skipping a bigram on either
    side contributes nothing. The best total is divided by the length of the
    longer bigram sequence.

    Args:
        ipa1: first IPA transcription.
        ipa2: second IPA transcription.

    Returns:
        The normalised alignment score (1.0 for identical bigram sequences).
        Returns 0.0 if both bigram sequences are empty.
    """
    seq1 = ipa_to_bigrams(ipa1)
    seq2 = ipa_to_bigrams(ipa2)
    n, m = len(seq1), len(seq2)

    # (6) from the paper: normalise by the length of the longest sequence.
    max_len = max(n, m)
    if max_len == 0:
        return 0.0

    # Build each bigram's feature set once, instead of rebuilding both sets
    # for every one of the n * m cells visited below.
    feats1 = [bigram_feature_set(*bg) for bg in seq1]
    feats2 = [bigram_feature_set(*bg) for bg in seq2]

    # Row 0 and column 0 stay at zero: aligning against an empty prefix
    # scores nothing, so no explicit border initialisation is needed.
    dp = np.zeros((n + 1, m + 1))

    for i in range(1, n + 1):
        set1 = feats1[i - 1]
        for j in range(1, m + 1):
            sim = _jaccard(set1, feats2[j - 1])
            dp[i, j] = max(
                dp[i - 1, j - 1] + sim,  # match/substitution
                dp[i - 1, j],            # deletion
                dp[i, j - 1]             # insertion
            )

    return dp[n, m] / max_len

# Sanity check: both inputs are the same transcription, so their bigram
# sequences are identical and the printed similarity should be 1.000.
ipa1 = 'tɥa'
ipa2 = 'tɥa'

similarity_score = word_similarity_bigrams(ipa1, ipa2)
print(f"similarity: {similarity_score:.3f}")
# Show the bigram sequences that were aligned (including BEG/END markers).
print(ipa_to_bigrams(ipa1))
print(ipa_to_bigrams(ipa2))

similarity: 1.000
[('BEG', 't'), ('t', 'ɥ'), ('ɥ', 'a'), ('a', 'END')]
[('BEG', 't'), ('t', 'ɥ'), ('ɥ', 'a'), ('a', 'END')]


In [None]:
import pandas as pd

# Load the word list and the matching IPA transcriptions from the "word" and
# "ipa" columns of the CSV.
# NOTE(review): the path is absolute and points at the filesystem root —
# confirm that is where the file lives in the target environment.
df = pd.read_csv("/converted_phrases.csv")
words = df["word"].tolist()
ipas = df["ipa"].tolist()

# Map each word to its row position. If a word appears more than once, the
# later row overwrites the earlier one.
word_to_idx = {word: idx for idx, word in enumerate(words)}

In [None]:
from collections import Counter
from tqdm import tqdm

# Count every character that occurs in the concatenated transcriptions and
# assign each distinct character an id starting at 1.
# NOTE(review): this iterates characters, not panphon segments, so a base
# letter and a combining diacritic get separate ids — confirm that is intended.
# NOTE(review): "".join(ipas) assumes every entry of `ipas` is a str (no
# missing values in the CSV column) — verify.
char_counter = Counter("".join(ipas))
phoneme_vocab = {ch: i + 1 for i, ch in enumerate(char_counter)}  # reserve 0 for PAD
phoneme_vocab['<PAD>'] = 0
# Number of distinct characters plus the PAD entry.
vocab_size = len(phoneme_vocab)

In [None]:
def ipa_to_ids(ipa, max_len=30):
    """Encode `ipa` character by character as a list of exactly `max_len` ids.

    Characters missing from ``phoneme_vocab`` are encoded as 0. Longer inputs
    are truncated to `max_len`; shorter ones are right-padded with 0.
    """
    encoded = []
    for symbol in ipa:
        encoded.append(phoneme_vocab.get(symbol, 0))

    kept = encoded[:max_len]
    padding = [0] * (max_len - len(encoded))  # empty when nothing to pad
    return kept + padding

# Encode every transcription as a fixed-length id list (default max_len of 30),
# in the same order as `ipas`.
ipa_ids = [ipa_to_ids(ipa) for ipa in ipas]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BiLSTMEncoder(nn.Module):
    """Encode right-padded id sequences into fixed-size vectors.

    Pipeline: embedding -> bidirectional LSTM -> mean over the real (non-pad)
    timesteps -> linear projection to `output_dim`.

    Padding (id 0) is excluded from both the LSTM pass and the pooling, so the
    encoding of a sequence does not depend on how much padding follows it.

    Args:
        vocab_size: number of rows in the embedding table (including PAD).
        embed_dim: embedding width.
        hidden_dim: LSTM hidden size per direction.
        output_dim: width of the returned vectors.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        """Encode a batch of id sequences.

        Args:
            x: integer tensor of shape (batch, seq_len); id 0 marks padding.

        Returns:
            Float tensor of shape (batch, output_dim).
        """
        seq_len = x.size(1)

        # Effective length = position of the last non-pad id. Using the last
        # position (rather than a count) keeps any id-0 symbol that sits
        # between real symbols inside the sequence. Rows that are entirely
        # padding are given length 1 because packing rejects zero lengths.
        positions = torch.arange(1, seq_len + 1, device=x.device)
        lengths = ((x != self.embed.padding_idx) * positions).max(dim=1).values.clamp(min=1)

        emb = self.embed(x)

        # Pack so neither LSTM direction consumes trailing padding; otherwise
        # the backward direction starts by reading the pad steps.
        packed = nn.utils.rnn.pack_padded_sequence(
            emb, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed)
        out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, total_length=seq_len
        )

        # Unpacking fills padded steps with zeros, so summing and dividing by
        # the true length is a mean over real timesteps only.
        pooled = out.sum(dim=1) / lengths.unsqueeze(1).to(out.dtype)
        return self.fc(pooled)

In [None]:
# Hyperparameters for the encoder and its training run.
batch_size = 128
embed_dim = 32
hidden_dim = 64
output_dim = 50

# Seed weight initialisation and the random pairing below so that a fresh
# kernel reproduces the same run.
seed = 0
torch.manual_seed(seed)
pair_rng = np.random.default_rng(seed)

encoder = BiLSTMEncoder(vocab_size, embed_dim, hidden_dim, output_dim)
optimizer = torch.optim.Adam(encoder.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

# Train the encoder so that the cosine similarity of two word embeddings
# regresses onto the bigram-alignment similarity of their transcriptions.
encoder.train()
for epoch in range(5):
    total_loss = 0
    for i in tqdm(range(0, len(ipa_ids), batch_size)):
        # Left side of each pair: a contiguous slice of the dataset.
        batch_indices = list(range(i, min(i + batch_size, len(ipa_ids))))
        batch_x = [ipa_ids[idx] for idx in batch_indices]
        # Right side of each pair: distinct indices drawn at random.
        batch_y = pair_rng.choice(len(ipa_ids), len(batch_x), replace=False)

        x_tensor = torch.tensor(batch_x)
        y_tensor = torch.tensor([ipa_ids[j] for j in batch_y])

        emb_x = encoder(x_tensor)
        emb_y = encoder(y_tensor)

        emb_x = F.normalize(emb_x, dim=1)
        emb_y = F.normalize(emb_y, dim=1)

        # Dot product of unit vectors = cosine similarity, in [-1, 1].
        # It is deliberately NOT clamped to [0, 1]: clamp has zero gradient
        # outside its range, so any pair whose cosine was negative would
        # contribute no learning signal.
        pred_sim = torch.sum(emb_x * emb_y, dim=1)

        target_sim = torch.tensor([
            word_similarity_bigrams(ipas[idx], ipas[j])
            for idx, j in zip(batch_indices, batch_y)
        ], dtype=torch.float)

        loss = loss_fn(pred_sim, target_sim)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Reported value is the sum of the per-batch losses for the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

100%|██████████| 1922/1922 [13:12<00:00,  2.42it/s]


Epoch 1, Loss: 25.3110


100%|██████████| 1922/1922 [13:01<00:00,  2.46it/s]


Epoch 2, Loss: 11.3486


100%|██████████| 1922/1922 [13:03<00:00,  2.45it/s]


Epoch 3, Loss: 7.4746


100%|██████████| 1922/1922 [12:59<00:00,  2.47it/s]


Epoch 4, Loss: 5.8989


100%|██████████| 1922/1922 [13:05<00:00,  2.45it/s]

Epoch 5, Loss: 5.0179





In [None]:
import faiss

# Encode every transcription with the trained encoder.
# The previous version read `embeddings.weight` and `embedding_dim`, neither
# of which is defined by any cell of this notebook, so it only ran on leftover
# kernel state and never used the model trained above.
encoder.eval()
encode_batch_size = 1024
with torch.no_grad():
    embedding_matrix = np.concatenate([
        encoder(torch.tensor(ipa_ids[start:start + encode_batch_size])).numpy()
        for start in range(0, len(ipa_ids), encode_batch_size)
    ])

# FAISS expects a C-contiguous float32 matrix. This is a fresh array, so the
# in-place normalisation below cannot modify any model parameter.
embedding_matrix = np.ascontiguousarray(embedding_matrix, dtype=np.float32)

# Normalize rows to unit length so inner product equals cosine similarity.
faiss.normalize_L2(embedding_matrix)

# Build an exact inner-product index; row i corresponds to words[i].
index = faiss.IndexFlatIP(embedding_matrix.shape[1])
index.add(embedding_matrix)

# Retrieval function
def retrieve_similar_words(query_word, top_k=10):
    """Return the words whose embeddings are closest to `query_word`.

    Args:
        query_word: a word present in ``word_to_idx``.
        top_k: maximum number of neighbours to return.

    Returns:
        A list of ``(word, similarity)`` tuples ordered as returned by the
        index search, excluding the query word's own row. The list is shorter
        than `top_k` when the index holds fewer other entries.

    Raises:
        KeyError: if `query_word` is not in ``word_to_idx``.
    """
    idx = word_to_idx[query_word]
    if top_k <= 0:
        return []

    # Copy the row: normalize_L2 works in place, and a reshaped view would
    # write straight back into the stored embedding matrix.
    query_emb = embedding_matrix[idx].reshape(1, -1).copy()
    faiss.normalize_L2(query_emb)

    # Ask for one extra hit because the query's own row is normally returned.
    similarities, indices = index.search(query_emb, top_k + 1)

    # FAISS fills missing neighbours with -1 when the index holds fewer than
    # k vectors; without the check, words[-1] would silently return the last
    # word of the list.
    results = [
        (words[i], similarities[0][j])
        for j, i in enumerate(indices[0])
        if i >= 0 and i != idx
    ]

    return results[:top_k]

# Example retrieval: print the nearest neighbours of "bonjour" with the
# default top_k. The lookup raises KeyError if the word is not in word_to_idx.
print(retrieve_similar_words('bonjour'))


[('daigneras', np.float32(0.60518396)), ('dureriez', np.float32(0.54355437)), ('branches', np.float32(0.5393211)), ('cafte', np.float32(0.5342453)), ('aiguisait', np.float32(0.52750987)), ('redoutas', np.float32(0.5220129)), ('corsages', np.float32(0.5125512)), ('tantouze', np.float32(0.51016873)), ('ajustaient', np.float32(0.50593585)), ('grognassions', np.float32(0.5027506))]
