In [2]:
import itertools
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from tqdm import tqdm

# Load the phonetic dictionary from the uploaded CSV file
# Assuming the delimiter is a tab ('\t') and the file has a header row
df = pd.read_csv('/content/en_UK.txt', delimiter='\t', header=0)
# If the column names are 'aah' and '/ˈɑː/' as in your example:
phonetic_dict = dict(zip(df['aah'], df['/ˈɑː/']))
# If the column names are different, adjust accordingly

# Define articulatory features for phonemes
articulatory_features = {
    't': {'place': 'alveolar', 'manner': 'stop', 'voicing': 'voiceless'},
    'ʔ': {'place': 'glottal', 'manner': 'stop', 'voicing': 'voiceless'},
    'd': {'place': 'alveolar', 'manner': 'stop', 'voicing': 'voiced'},
    's': {'place': 'alveolar', 'manner': 'fricative', 'voicing': 'voiceless'},
    'z': {'place': 'alveolar', 'manner': 'fricative', 'voicing': 'voiced'},
    'ə': {'place': 'central', 'manner': 'vowel', 'voicing': 'voiced'},
    'ʃ': {'place': 'postalveolar', 'manner': 'fricative', 'voicing': 'voiceless'},
    'ʒ': {'place': 'postalveolar', 'manner': 'fricative', 'voicing': 'voiced'},
    'p': {'place': 'bilabial', 'manner': 'stop', 'voicing': 'voiceless'},
    'b': {'place': 'bilabial', 'manner': 'stop', 'voicing': 'voiced'},
    'k': {'place': 'velar', 'manner': 'stop', 'voicing': 'voiceless'},
    'g': {'place': 'velar', 'manner': 'stop', 'voicing': 'voiced'},
    'f': {'place': 'labiodental', 'manner': 'fricative', 'voicing': 'voiceless'},
    'v': {'place': 'labiodental', 'manner': 'fricative', 'voicing': 'voiced'},
    'θ': {'place': 'dental', 'manner': 'fricative', 'voicing': 'voiceless'},
    'ð': {'place': 'dental', 'manner': 'fricative', 'voicing': 'voiced'},
    'm': {'place': 'bilabial', 'manner': 'nasal', 'voicing': 'voiced'},
    'n': {'place': 'alveolar', 'manner': 'nasal', 'voicing': 'voiced'},
    'ŋ': {'place': 'velar', 'manner': 'nasal', 'voicing': 'voiced'},
    'h': {'place': 'glottal', 'manner': 'fricative', 'voicing': 'voiceless'},
    'j': {'place': 'palatal', 'manner': 'approximant', 'voicing': 'voiced'},
    'w': {'place': 'labio-velar', 'manner': 'approximant', 'voicing': 'voiced'},
    'r': {'place': 'alveolar', 'manner': 'approximant', 'voicing': 'voiced'},
    'l': {'place': 'alveolar', 'manner': 'lateral approximant', 'voicing': 'voiced'},
    # Add more phonemes...
}

# Phoneme mapping between similar sounds
phoneme_mapping = {
    'i': ['i', 'ɪ'],
    'ɪ': ['i', 'ɪ'],
    'eɪ': ['e', 'eɪ'],
    'ɛ': ['ɛ', 'e'],
    'æ': ['a', 'æ'],
    'ɑ': ['a', 'ɑ'],
    'ɔ': ['o', 'ɔ'],
    'ə': ['ə', 'ʌ'],
    'ʌ': ['ə', 'ʌ'],
    't': ['t', 'ʔ', 'ɾ'],
    'd': ['t', 'ɾ'],
    's': ['s', 'z'],
    'z': ['s', 'z'],
    'ʃ': ['ʃ', 'ʒ'],
    'ʒ': ['ʃ', 'ʒ'],
    'n': ['n', 'ŋ'],
    'ŋ': ['n', 'ŋ'],
    'l': ['l', 'ɫ'],
    'r': ['r', 'ɹ'],
    'p': ['p', 'b'],
    'b': ['p', 'b'],
    'k': ['k', 'g'],
    'g': ['k', 'g'],
    'f': ['f', 'v'],
    'v': ['f', 'v'],
    'θ': ['θ', 'ð'],
    'ð': ['θ', 'ð'],
    'h': ['h', 'ʔ'],
    'w': ['w', 'v'],
    # Add more based on phonetic similarities
}

# Function to extract phonemes from a word using the phonetic dictionary
def get_phonemes(word):
    return phonetic_dict.get(word.lower(), '')

# Function to save embeddings to a text file
def save_embeddings_to_file(model, output_path):
    with open(output_path, 'w', encoding='utf-8') as file:
        for word in model.wv.index_to_key:
            embedding = model.wv[word]
            embedding_str = ','.join(map(str, embedding))
            file.write(f"{word}\t{embedding_str}\n")

# Function to train Word2Vec on phoneme sequences and save embeddings
def train_and_save_embeddings(vocabulary, output_path):
    phoneme_sequences = [list(get_phonemes(word)) for word in tqdm(vocabulary, desc="Generating phoneme sequences")]
    phoneme_sequences = [seq for seq in phoneme_sequences if seq]  # Filter empty sequences
    model = Word2Vec(phoneme_sequences, vector_size=100, window=5, min_count=1, workers=4)
    save_embeddings_to_file(model, output_path)

# Vocabulary for embedding training
vocabulary = [str(word) for word in phonetic_dict.keys()]  # Convert keys to strings

# Train embeddings and save to file
embedding_output_path = '/content/phoneme_embeddings.txt'
train_and_save_embeddings(vocabulary, embedding_output_path)

# Notify completion
print(f"Embeddings saved to {embedding_output_path}")



Generating phoneme sequences:   0%|          | 0/65117 [00:00<?, ?it/s][A
Generating phoneme sequences:  13%|█▎        | 8660/65117 [00:00<00:01, 54242.98it/s][A
Generating phoneme sequences: 100%|██████████| 65117/65117 [00:00<00:00, 208499.59it/s]


Embeddings saved to /content/phoneme_embeddings.txt


In [4]:
import itertools
from gensim.models import Word2Vec
import pandas as pd
from tqdm import tqdm

# Load the English phonetic dictionary
english_df = pd.read_csv('/content/en_UK.txt', delimiter='\t', header=0)
english_phonetic_dict = dict(zip(english_df['aah'], english_df['/ˈɑː/']))

# Load the French phonetic dictionary
french_df = pd.read_csv('/content/fr_FR.txt', delimiter='\t', header=0)
# The column names in the French dataframe were 'a' and '/a/' respectively instead of 'word' and 'phonetic'
french_phonetic_dict = dict(zip(french_df['a'], french_df['/a/']))  # Changed to the actual column names

# Combine dictionaries into a single dataset for embedding training
def combine_phonetic_dicts(en_dict, fr_dict):
    combined = []
    for word, phonemes in en_dict.items():
        combined.append(list(phonemes))
    for word, phonemes in fr_dict.items():
        combined.append(list(phonemes))
    return combined

# Train Word2Vec on phoneme sequences and save embeddings
def train_and_save_embeddings(phoneme_sequences, output_path):
    model = Word2Vec(phoneme_sequences, vector_size=100, window=5, min_count=1, workers=4)
    with open(output_path, 'w', encoding='utf-8') as file:
        for word in model.wv.index_to_key:
            embedding = model.wv[word]
            embedding_str = ','.join(map(str, embedding))
            file.write(f"{word}\t{embedding_str}\n")

# Generate phoneme sequences from the combined dataset
phoneme_sequences = combine_phonetic_dicts(english_phonetic_dict, french_phonetic_dict)

# Output file for embeddings
embedding_output_path = '/content/phoneme_embeddings_combined.txt'

# Train and save embeddings
train_and_save_embeddings(phoneme_sequences, embedding_output_path)

# Notify completion
print(f"Embeddings for English and French saved to {embedding_output_path}")


Generating phoneme sequences:  57%|█████▋    | 37235/65117 [09:14<06:55, 67.13it/s]


Embeddings for English and French saved to /content/phoneme_embeddings_combined.txt


In [7]:
import itertools
from gensim.models import Word2Vec
import pandas as pd
from tqdm import tqdm

# Load linguistic rules and phoneme mappings
with open('/content/linguistic_rules_and_mappings (1).txt', 'r') as file:
    exec(file.read())

# Load the English phonetic dictionary
english_df = pd.read_csv('/content/en_UK.txt', delimiter='\t', header=0)
english_phonetic_dict = dict(zip(english_df['aah'], english_df['/ˈɑː/']))

# Load the French phonetic dictionary
french_df = pd.read_csv('/content/fr_FR.txt', delimiter='\t', header=0)
# The column names in the French dataframe were 'a' and '/a/' respectively instead of 'word' and 'phonetic'
# Changed to the actual column names: 'a' and '/a/'
french_phonetic_dict = dict(zip(french_df['a'], french_df['/a/']))

# Expand phoneme sequences using phoneme_mapping
def expand_phoneme_sequence(sequence):
    expanded = []
    for phoneme in sequence:
        variants = phoneme_mapping.get(phoneme, [phoneme])
        expanded.extend(variants)
    return expanded

# Combine dictionaries into a single dataset for embedding training
def combine_phonetic_dicts(en_dict, fr_dict):
    combined = []
    for word, phonemes in en_dict.items():
        expanded_phonemes = expand_phoneme_sequence(phonemes)
        combined.append(expanded_phonemes)
    for word, phonemes in fr_dict.items():
        expanded_phonemes = expand_phoneme_sequence(phonemes)
        combined.append(expanded_phonemes)
    return combined

# Train Word2Vec on phoneme sequences and save embeddings
def train_and_save_embeddings(phoneme_sequences, output_path):
    model = Word2Vec(phoneme_sequences, vector_size=100, window=5, min_count=1, workers=4)
    with open(output_path, 'w', encoding='utf-8') as file:
        for word in model.wv.index_to_key:
            embedding = model.wv[word]
            embedding_str = ','.join(map(str, embedding))
            file.write(f"{word}\t{embedding_str}\n")

# Generate phoneme sequences from the combined dataset
phoneme_sequences = combine_phonetic_dicts(english_phonetic_dict, french_phonetic_dict)

# Output file for embeddings
embedding_output_path = '/content/phoneme_embeddings_combined_expanded.txt'

# Train and save embeddings
train_and_save_embeddings(phoneme_sequences, embedding_output_path)

# Notify completion
print(f"Embeddings for English and French with expanded mappings saved to {embedding_output_path}")


Embeddings for English and French with expanded mappings saved to /content/phoneme_embeddings_combined_expanded.txt


In [8]:
import numpy as np
from tqdm import tqdm

# Load embeddings from file
embedding_file_path = "/content/phoneme_embeddings_combined_expanded.txt"
embeddings = {}

with open(embedding_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        parts = line.strip().split('\t')
        if len(parts) == 2:
            phoneme, embedding_str = parts
            embedding = np.array([float(value) for value in embedding_str.split(',')])
            embeddings[phoneme] = embedding

# Function to calculate cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot_product / (norm1 * norm2)

# Generate iterative phoneme strings based on embeddings
def generate_iterative_strings(embeddings, threshold=0.85):
    phoneme_list = list(embeddings.keys())
    results = []

    for i, phoneme1 in enumerate(tqdm(phoneme_list, desc="Generating strings")):
        for phoneme2 in phoneme_list[i+1:]:
            similarity = cosine_similarity(embeddings[phoneme1], embeddings[phoneme2])
            if similarity >= threshold:
                results.append((phoneme1, phoneme2, similarity))

    return results

# Run iterative string generation
results = generate_iterative_strings(embeddings)

# Save results to a file
output_file_path = "C:\\Users\\Lenovo\\Downloads\\phoneme_string_matches.txt"
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for phoneme1, phoneme2, similarity in results:
        output_file.write(f"{phoneme1}\t{phoneme2}\t{similarity:.4f}\n")

print(f"Generated phoneme string matches saved to {output_file_path}")


Generating strings: 100%|██████████| 74/74 [00:00<00:00, 4075.72it/s]

Generated phoneme string matches saved to C:\Users\Lenovo\Downloads\phoneme_string_matches.txt



