In [None]:
import csv 
import random
from random import choice

import nltk
from nltk.util import ngrams
from nltk.probability import *
import numpy as np

from collections import Counter

In [None]:
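# Make sure you upload the required data files so that Colab can use them (see the Files sidebar):
# training_sentences.csv
# cmu_in_childes_small.csv
# testing_sentences.csv
# Later cells additionally read complete_transcriptions.csv and cmu_in_childes_22000.csv.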

In [None]:
# DIANA'S ORIGINAL CODE 

def trigrams_in_sent(sent):
    """Return the trigrams of ``sent`` as a list.

    ``sent`` is handed to ``ngrams`` with n=3 and padding enabled on both
    the left and the right side.
    """
    padded_trigrams = ngrams(sent, 3, pad_left=True, pad_right=True)
    return list(padded_trigrams)

def all_trigrams(sents):
    """Return one flat list holding the padded trigrams of every sentence in ``sents``."""
    collected = []
    for sent in sents:
        # Same padding settings as trigrams_in_sent.
        collected.extend(ngrams(sent, 3, pad_left=True, pad_right=True))
    return collected

def trigram_freq_dist(all_trigrams):
    """Wrap the given trigrams in an ``nltk.FreqDist`` and return it."""
    freq_dist = nltk.FreqDist(all_trigrams)
    return freq_dist

def kneser_ney_prob_dist(sents):
    """Build a ``KneserNeyProbDist`` from the trigram frequencies of ``sents``.

    ``sents`` is a collection of token sequences; their trigrams are
    collected with all_trigrams and counted with trigram_freq_dist.
    """
    return KneserNeyProbDist(trigram_freq_dist(all_trigrams(sents)))

def kneser_ney_prob(trigram, kn_prob_dist):
    """Return ``kn_prob_dist.prob(trigram)``."""
    probability = kn_prob_dist.prob(trigram)
    return probability

def process_word(word):
    """Strip list/quote residue from ``word`` and lowercase it.

    Removes every '[', ']', single quote, space and double quote, then
    returns the lowercased remainder.
    """
    cleaned = word
    for unwanted in ('[', ']', "'", ' ', '"'):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned.lower()

def create_kneser_ney_prob_dist():
    """Build the Kneser-Ney trigram distribution from 'training_sentences.csv'.

    The first CSV row is skipped. For every other row, column index 29 is
    split on commas and each piece is cleaned with process_word; the
    resulting word lists are passed to kneser_ney_prob_dist.
    """
    training_sents = []
    with open ('training_sentences.csv') as csv_file:
        for row_number, row in enumerate(csv.reader(csv_file, delimiter=",")):
            if row_number == 0:
                continue  # first row is not data (presumably the header)
            training_sents.append([process_word(piece) for piece in row[29].split(",")])
    return kneser_ney_prob_dist(training_sents)

def create_english_cmu_dict():
    """Load 'cmu_in_childes_small.csv' into a word -> list-of-7-fields dict.

    The first row is skipped. The key is column 1; the value is the list of
    columns 8, 10, 11, 12, 13, 14 and 15 (in that order). english_to_ipa in
    this cell reads these seven entries as alternative IPA forms.
    """
    value_columns = (8, 10, 11, 12, 13, 14, 15)
    english_cmu_dict = {}
    with open ('cmu_in_childes_small.csv') as csv_file:
        rows = csv.reader(csv_file, delimiter=",")
        next(rows, None)  # drop the first row
        for row in rows:
            english_cmu_dict[row[1]] = [row[col] for col in value_columns]
    return english_cmu_dict

def create_english_words(english_dict):
    """Return the keys of ``english_dict`` as a list, in dictionary order."""
    return [word for word in english_dict.keys()]

def init_random_sentence(english_words, num_words):
    """Return ``num_words`` words drawn uniformly (with replacement) from ``english_words``.

    One ``random.randint`` draw is made per word.
    """
    last_index = len(english_words) - 1
    return [english_words[random.randint(0, last_index)] for _ in range(num_words)]

def perturb_sentence(english_words, sent):
    """Return a copy of ``sent`` with one uniformly chosen position replaced.

    The replacement word is drawn uniformly from ``english_words``; ``sent``
    itself is not modified.
    """
    perturbed = sent.copy()
    # Draw order matters for reproducibility: position first, then word.
    position = random.randint(0, len(sent) - 1)
    replacement = english_words[random.randint(0, len(english_words) - 1)]
    perturbed[position] = replacement
    return perturbed

def english_to_ipa(english_dict, sent):
    """Return seven parallel IPA renderings of ``sent``.

    Entry ``k`` of the result is the list ``english_dict[word][k]`` for every
    word in ``sent``, for k = 0..6.
    """
    return [[english_dict[word][variant] for word in sent] for variant in range(7)]

def trigram_prior_for_sent(sent, kn_prob_dist):
    """Return the product of the Kneser-Ney probabilities of all padded trigrams in ``sent``."""
    sent_trigrams = trigrams_in_sent(sent)
    priors = np.zeros(len(sent_trigrams))
    for idx, trigram in enumerate(sent_trigrams):
        priors[idx] = kneser_ney_prob(trigram, kn_prob_dist)
    return np.prod(priors)

def calculate_likelihood(guess, actual, match_multiplier=0.99):
    """Score how well the guessed pronunciations agree with ``actual``.

    Args:
        guess: list of alternative IPA renderings of the guessed sentence
            (as produced by english_to_ipa); ``guess[k][i]`` is the k-th
            alternative for word position ``i``.
        actual: list of observed IPA words.
        match_multiplier: factor contributed by each matching position; each
            non-matching position contributes ``1 - match_multiplier``.

    Returns:
        ``match_multiplier**matches * (1 - match_multiplier)**no_matches``,
        where a position matches when any alternative equals the observed word.
    """
    matches = 0
    no_matches = 0
    for i, actual_word in enumerate(actual):
        # A position counts exactly once: as a match if any alternative agrees,
        # otherwise as a mismatch. (The previous for/else ran the mismatch
        # branch for every position, including matched ones.)
        if any(variant[i] == actual_word for variant in guess):
            matches += 1
        else:
            no_matches += 1
    no_match_multiplier = 1 - match_multiplier
    return (match_multiplier**matches) * (no_match_multiplier**no_matches)

def calculate_prob(english_dict, kn_prob_dist, guess, actual, match_multiplier=0.99):
    """Return the score of ``guess`` given the observed IPA ``actual``.

    The score is trigram prior * likelihood + 0.00001; the added constant
    keeps the value strictly positive so callers can divide by it.
    """
    # NOTE(review): when prior * likelihood is far below 1e-5 the constant
    # dominates and score ratios approach 1 -- confirm this is intended.
    guess_ipa = english_to_ipa(english_dict, guess)
    prior = trigram_prior_for_sent(guess, kn_prob_dist)
    likelihood = calculate_likelihood(guess_ipa, actual, match_multiplier)
    return prior * likelihood + 0.00001

def find_n_most_common_transcriptions(n, plausible_transcriptions):
    """Print and return the ``n`` most frequent transcriptions.

    Each transcription (a list of words) is joined with single spaces before
    counting. The result is a list of ``(sentence, count)`` tuples.
    """
    joined_sentences = [' '.join(words) for words in plausible_transcriptions]
    common_list = Counter(joined_sentences).most_common(n)
    print(common_list)
    return common_list

def transcribe(actual, num_iterations=500000):
    """Sample English transcriptions for the IPA word list ``actual``.

    Starts from a random sentence of the same length and repeatedly proposes
    a one-word change, accepting it when the ratio of the new score to the
    current score exceeds a uniform random number. The current sentence is
    recorded after every iteration.

    Args:
        actual: list of observed IPA words.
        num_iterations: number of proposals to evaluate (default 500000, the
            value previously hard-coded).

    Returns:
        The 10 most frequent recorded sentences, as returned by
        find_n_most_common_transcriptions.
    """
    plausible_transcriptions = []
    num_words = len(actual)
    english_dict = create_english_cmu_dict()
    english_words = create_english_words(english_dict)
    kn_prob_dist = create_kneser_ney_prob_dist()
    old_guess = init_random_sentence(english_words, num_words)
    # The current guess's score only changes on acceptance, so compute it once
    # here and update it on acceptance instead of rescoring every iteration.
    old_guess_prob = calculate_prob(english_dict, kn_prob_dist, old_guess, actual)
    for _ in range(num_iterations):
        new_guess = perturb_sentence(english_words, old_guess)
        new_guess_prob = calculate_prob(english_dict, kn_prob_dist, new_guess, actual)
        if new_guess_prob/old_guess_prob > random.random():
            old_guess = new_guess
            old_guess_prob = new_guess_prob
        plausible_transcriptions.append(old_guess)
    return find_n_most_common_transcriptions(10, plausible_transcriptions)

def get_all_transcriptions(path='testing_sentences.csv'):
    """Read the word lists of the test sentences from a CSV file.

    Args:
        path: CSV file to read. Defaults to 'testing_sentences.csv', the test
            file named in the upload note and read elsewhere in this notebook
            (the previous hard-coded 'test_sentences.csv' matched neither).

    Returns:
        One list of cleaned words per data row: column index 29 is split on
        commas and each piece is passed through process_word. The first row
        is skipped.
    """
    all_transcriptions = []
    with open(path) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=",")
        next(csv_reader, None)  # skip the first row
        for row in csv_reader:
            all_transcriptions.append([process_word(word) for word in row[29].split(",")])
    return all_transcriptions

In [None]:
# Example invocations; only the last one is active, the others are kept commented out.
#transcribe(['mɑmi'])
#transcribe(['pɑp', 'pɑp', 'pɑp', 'pɑp', 'pɑp'])
#transcribe(['wəts', 'ðɪs', 'fɔɹ'])
#transcribe(['oʊ'])
# Run the sampler on a two-word IPA utterance; the returned list is the cell's output.
transcribe(['θæŋk', 'ju'])

[('digging plate', 4), ('name drinking', 3), ("cow's gail", 3), ('aislinn were', 3), ("nothing's bet", 3), ('inside fox', 3), ('to love', 3), ('meal card', 3), ('choke peeping', 3), ('carry paws', 3)]


In [None]:
import pandas as pd
# Load the test set, then work on a copy so `df` stays as read from disk.
df = pd.read_csv('testing_sentences.csv')
new = df.copy()

In [None]:
def get_mcmc_translation(ipa):
    """Return the top-ranked transcription for a space-separated IPA string.

    The string is split on single spaces and passed to transcribe; the
    sentence of the first (most frequent) result is printed and returned.
    """
    ranked = transcribe(ipa.split(" "))
    best_sentence = ranked[0][0]
    print(best_sentence)
    return best_sentence

# Transcribe every row's adjusted model phonology and store the best sentence per row.
new['mcmc sentences'] = new["adjusted_model_phonology"].apply(get_mcmc_translation)

[('knives henry', 4), ('eaten dolly', 3), ('pole drives', 3), ('loft especially', 3), ('knows err', 3), ('girl filling', 3), ('messed light', 3), ('thomas vase', 3), ('ears rocket', 3), ('groceries hens', 3)]
knives henry
[('wife jen', 3), ('punch themselves', 3), ('night sandwiches', 3), ('choo shell', 3), ('nah nobody', 3), ('he able', 3), ('cheat bottles', 3), ('otter skating', 3), ('spray enough', 3), ('shaped toot', 3)]
wife jen
[('love doodle wanted hoop police', 2), ('program parade curious dining peep', 2), ('clap scared door nine builder', 2), ("smells robin junk children's crisp", 2), ('buy millions horses caravan castle', 2), ('up kim eyes nicer american', 2), ('donald pole rice calf outfit', 2), ('donna neighbor rest pickles fingers', 2), ('dinosaurs spilt stir patrick diamond', 2), ('neck bib soldiers oy lock', 2)]
love doodle wanted hoop police
[('build basket comes', 2), ('tuck candy smelly', 2), ('fix wicked danny', 2), ('brushes nail magnet', 2), ('doughnut unless know

In [None]:
# Display the frame, which now includes the 'mcmc sentences' column.
new

Unnamed: 0,id,gloss,stem,actual_phonology,model_phonology,type,language,num_morphemes,num_tokens,utterance_order,...,collection_id,corpus_id,speaker_id,target_child_id,transcript_id,ipa_sentence,adjusted_model_phonology,transcripted_from_ipa_sentence,length,mcmc sentences
0,17498348,uhoh,uhoh,əhæ,ʌ oʊ,declarative,eng,1,1,193,...,21,336,23495,23495,43379,"['ə', 'oʊ']",ə oʊ,"['a', 'oh']",2,knives henry
1,17306905,where blue,where blue,wɛ bu,wɛɹ bluː,question,eng,2,2,1414,...,21,336,23471,23471,43236,"['wɛɹ', 'blu']",wɛɹ blu,"['where', 'blue']",2,wife jen
2,17608013,grizzly+bear a grizzly+bear,a,hʊði beɪ^ə ə ɡwɪðli beɪə,ɡɹɪzliː bɛɹ ə ɡɹɪzliː bɛɹ,declarative,eng,1,3,1060,...,21,336,23510,23510,43468,"['ɡɹɪzli', 'bɛɹ', 'ə', 'ɡɹɪzli', 'bɛɹ']",ɡɹɪzli bɛɹ ə ɡɹɪzli bɛɹ,"['grizzly', 'bear', 'a', 'grizzly', 'bear']",5,love doodle wanted hoop police
3,17466822,what's his name,what his name,wəts hɪz neɪm,wʌts hɪz neɪm,question,eng,4,3,8,...,21,336,23495,23495,43370,"['wəts', 'hɪz', 'neɪm']",wəts hɪz neɪm,"[""what's"", 'his', 'name']",3,build basket comes
4,17305483,Gigi try,Gigi try,titi twaɪ,ʤiːʤiː tɹaɪ,declarative,eng,2,2,477,...,21,336,23471,23471,43246,"['ʤiʤi', 'tɹaɪ']",ʤiʤi tɹaɪ,"['gigi', 'try']",2,cat's dragon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,17352007,crack a bottle,crack a bottle,kʌk ə bɑbɑ,kɹæk ə bɑtəl,declarative,eng,3,3,298,...,21,336,23487,23487,43275,"['kɹæk', 'ə', 'bɑtəl']",kɹæk ə bɑtəl,"['crack', 'a', 'bottle']",3,henry's another others
496,17489283,it's Santa,it Santa,ɪd sæ̃ndɑ,ɪts sæntə,declarative,eng,3,2,588,...,21,336,23495,23495,43375,"['ɪts', 'sæntə']",ɪts sæntə,"[""it's"", 'santa']",2,friend's fits
497,17453154,with auntie,with aunt,wɪs ɑːn^tʰi,wɪθ æntiː,declarative,eng,3,2,109,...,21,336,23495,23495,43356,"['wɪθ', 'ænti']",wɪθ ænti,"['with', 'auntie']",2,bakery promise
498,17611350,what's Mommy doing,what Mommy do,wʌz mɑmi duɪn,wʌts mɑmiː duːɪŋ,question,eng,5,3,959,...,21,336,23510,23510,43447,"['wəts', 'mɑmi', 'duɪŋ']",wəts mɑmi duɪŋ,"[""what's"", 'mommy', 'doing']",3,barry lessons bull


In [None]:
# Snapshot the annotated frame and write it to CSV (to_csv is called with its default options).
final = new.copy()
final.to_csv('final_testing_sentences_sheets.csv')

In [None]:
# CODE FOR TESTING PREDICTION OF INTERPRETATIONS FOR ONE-WORD UTTERANCES

def trigrams_in_sent(sent):
    """List the left- and right-padded trigrams of a single sentence."""
    result = []
    for trigram in ngrams(sent, 3, pad_left=True, pad_right=True):
        result.append(trigram)
    return result

def all_trigrams(sents):
    """List the padded trigrams of all sentences in ``sents``, in sentence order."""
    trigram_list = []
    for sent in sents:
        for trigram in ngrams(sent, 3, pad_left=True, pad_right=True):
            trigram_list.append(trigram)
    return trigram_list

def trigram_freq_dist(all_trigrams):
    """Count the supplied trigrams with ``nltk.FreqDist``."""
    counts = nltk.FreqDist(all_trigrams)
    return counts

def kneser_ney_prob_dist(sents):
    """Return a ``KneserNeyProbDist`` over the padded trigrams found in ``sents``."""
    freq_dist = trigram_freq_dist(all_trigrams(sents))
    return KneserNeyProbDist(freq_dist)

def kneser_ney_prob(trigram, kn_prob_dist):
    """Probability of ``trigram`` according to ``kn_prob_dist.prob``."""
    trigram_probability = kn_prob_dist.prob(trigram)
    return trigram_probability

def process_word(word):
    """Delete brackets, quotes and spaces from ``word`` and return it lowercased."""
    # Characters removed: '[', ']', single quote, space, double quote.
    removal_table = str.maketrans('', '', '[]\' "')
    return word.translate(removal_table).lower()

def create_kneser_ney_prob_dist():
    """Build the Kneser-Ney trigram distribution from 'complete_transcriptions.csv'.

    Every row after the first contributes one sentence: column index 29 split
    on commas, with each piece cleaned by process_word.
    """
    sents_for_dist = []
    with open ('complete_transcriptions.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=",")
        next(csv_reader, None)  # skip the first row
        for row in csv_reader:
            words = []
            for raw_word in row[29].split(","):
                words.append(process_word(raw_word))
            sents_for_dist.append(words)
    return kneser_ney_prob_dist(sents_for_dist)

def create_english_cmu_dict():
    """Load 'cmu_in_childes_22000.csv' into a dict mapping column 1 to column 8.

    The first row is skipped; a later row with a repeated key overwrites the
    earlier entry.
    """
    english_cmu_dict = {}
    with open ('cmu_in_childes_22000.csv') as csv_file:
        for row_number, row in enumerate(csv.reader(csv_file, delimiter=",")):
            if row_number > 0:
                english_cmu_dict[row[1]] = row[8]
    return english_cmu_dict

def create_english_words(english_dict):
    """Return a list of the dictionary's keys (the candidate English words)."""
    words = list(english_dict)
    return words

def init_random_sentence(english_words, num_words):
    """Build a sentence of ``num_words`` words picked uniformly at random from ``english_words``."""
    sentence = []
    while len(sentence) < num_words:
        picked_index = random.randint(0, len(english_words) - 1)
        sentence.append(english_words[picked_index])
    return sentence

def perturb_sentence(english_words, sent):
    """Copy ``sent`` and overwrite one random position with a random word from ``english_words``."""
    candidate = sent.copy()
    # Position is drawn before the replacement word.
    slot = random.randint(0, len(sent) - 1)
    word_index = random.randint(0, len(english_words) - 1)
    candidate[slot] = english_words[word_index]
    return candidate

def english_to_ipa(english_dict, sent):
    """Map each word of ``sent`` to its ``english_dict`` entry, preserving order."""
    return [english_dict[word] for word in sent]

def trigram_prior_for_sent(sent, kn_prob_dist):
    """Multiply together the Kneser-Ney probabilities of the sentence's padded trigrams."""
    trigrams = trigrams_in_sent(sent)
    trigram_priors = np.zeros(len(trigrams))
    position = 0
    for trigram in trigrams:
        trigram_priors[position] = kneser_ney_prob(trigram, kn_prob_dist)
        position += 1
    return np.prod(trigram_priors)

def calculate_likelihood(guess, actual, match_multiplier=0.99):
    """Return ``match_multiplier**matches * (1 - match_multiplier)**mismatches``.

    Positions ``0..len(actual)-1`` are compared between ``guess`` and
    ``actual``; equal entries count as matches, all others as mismatches.
    """
    matches = sum(1 for i in range(len(actual)) if guess[i] == actual[i])
    no_matches = len(actual) - matches
    no_match_multiplier = 1-match_multiplier
    return (match_multiplier**matches)*(no_match_multiplier**no_matches)

def calculate_prob(english_dict, kn_prob_dist, guess, actual, match_multiplier=0.99):
    """Score ``guess`` against ``actual``: trigram prior * likelihood + 0.00001.

    The constant keeps the score strictly positive.
    """
    guess_ipa = english_to_ipa(english_dict, guess)
    prior = trigram_prior_for_sent(guess, kn_prob_dist)
    likelihood = calculate_likelihood(guess_ipa, actual, match_multiplier)
    score = prior * likelihood + 0.00001
    return score

def find_n_most_common_transcriptions(n, plausible_transcriptions):
    """Print and return the ``n`` most frequent transcriptions.

    Args:
        n: how many entries to report.
        plausible_transcriptions: iterable of word lists; each is joined with
            single spaces before counting.

    Returns:
        A list of ``(sentence, count)`` tuples, most frequent first. The list
        is returned as well as printed so callers can use the result, matching
        the version of this function in the first cell.
    """
    counter = Counter(' '.join(transcription) for transcription in plausible_transcriptions)
    common_list = counter.most_common(n)
    print(common_list)
    return common_list

def transcribe(actual, num_iterations=1000000):
    """Sample English transcriptions for the IPA word list ``actual``.

    Starts from a random sentence of the same length and repeatedly proposes
    a one-word change, accepting it when new_score / current_score exceeds a
    uniform random number. The current sentence is recorded every iteration.

    Args:
        actual: list of observed IPA words.
        num_iterations: number of proposals to evaluate (default 1000000, the
            value previously hard-coded).

    Returns:
        Whatever find_n_most_common_transcriptions returns for the 10 most
        frequent recorded sentences (previously the result was discarded).
    """
    plausible_transcriptions = []
    num_words = len(actual)
    english_dict = create_english_cmu_dict()
    english_words = create_english_words(english_dict)
    kn_prob_dist = create_kneser_ney_prob_dist()
    old_guess = init_random_sentence(english_words, num_words)
    # Score the current guess once; it only changes when a proposal is accepted.
    old_guess_prob = calculate_prob(english_dict, kn_prob_dist, old_guess, actual)
    for _ in range(num_iterations):
        new_guess = perturb_sentence(english_words, old_guess)
        new_guess_prob = calculate_prob(english_dict, kn_prob_dist, new_guess, actual)
        if new_guess_prob/old_guess_prob > random.random():
            old_guess = new_guess
            old_guess_prob = new_guess_prob
        plausible_transcriptions.append(old_guess)
    return find_n_most_common_transcriptions(10, plausible_transcriptions)

In [None]:
# NEW PERTURBATION ATTEMPT (ANNIE)

def trigrams_in_sent(sent):
    """Return ``sent``'s trigrams (padding on both ends) as a list."""
    return list(ngrams(sent, 3, pad_left=True, pad_right=True))

def all_trigrams(sents):
    """Concatenate the padded trigram lists of all sentences in ``sents``."""
    flat = []
    for sent in sents:
        flat += list(ngrams(sent, 3, pad_left=True, pad_right=True))
    return flat

def trigram_freq_dist(all_trigrams):
    """Return an ``nltk.FreqDist`` built from ``all_trigrams``."""
    distribution = nltk.FreqDist(all_trigrams)
    return distribution

def kneser_ney_prob_dist(sents):
    """Construct a ``KneserNeyProbDist`` from the trigram counts of ``sents``."""
    trigram_counts = trigram_freq_dist(all_trigrams(sents))
    kn_dist = KneserNeyProbDist(trigram_counts)
    return kn_dist

def kneser_ney_prob(trigram, kn_prob_dist):
    """Delegate to ``kn_prob_dist.prob`` for the given trigram."""
    value = kn_prob_dist.prob(trigram)
    return value

def process_word(word):
    """Return ``word`` lowercased, with '[', ']', quotes and spaces removed."""
    stripped = (
        word.replace('[', '')
            .replace(']', '')
            .replace("'", "")
            .replace(' ', '')
            .replace('"', '')
    )
    return stripped.lower()

def create_kneser_ney_prob_dist():
    """Build the Kneser-Ney trigram distribution from 'training_sentences.csv'.

    Skips the first row, then treats column index 29 of each row as a
    comma-separated word list whose pieces are cleaned with process_word.
    """
    with open ('training_sentences.csv') as csv_file:
        data_rows = list(csv.reader(csv_file, delimiter=","))[1:]
        sents_for_dist = [
            [process_word(piece) for piece in row[29].split(",")]
            for row in data_rows
        ]
    return kneser_ney_prob_dist(sents_for_dist)

def create_english_cmu_dict():
    """Load 'cmu_in_childes_small.csv' into a dict mapping column 1 to column 8.

    The first row is skipped.
    """
    with open ('cmu_in_childes_small.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=",")
        next(csv_reader, None)  # skip the first row
        return {row[1]: row[8] for row in csv_reader}

def create_english_words(english_dict):
    """List every key of ``english_dict``."""
    return [*english_dict.keys()]

def init_random_sentence(english_words, num_words):
    """Return a list of ``num_words`` uniformly random picks from ``english_words``."""
    vocab_size = len(english_words)
    random_indices = (random.randint(0, vocab_size - 1) for _ in range(num_words))
    return [english_words[index] for index in random_indices]

def perturb_sentence(english_words, sent, matches_indices):
    """Replace one word of ``sent`` at a position not listed in ``matches_indices``.

    If ``matches_indices`` has as many entries as ``sent`` has words, ``sent``
    itself is returned unchanged. Otherwise a copy is returned in which one
    randomly chosen non-listed position holds a random word from
    ``english_words``.
    """
    num_words = len(sent)
    if len(matches_indices) == num_words:
        return sent
    # Position is drawn first, then the replacement word.
    open_positions = [pos for pos in range(0, num_words) if pos not in matches_indices]
    change_ind = choice(open_positions)
    replacement = english_words[random.randint(0, len(english_words) - 1)]
    new_sent = sent.copy()
    new_sent[change_ind] = replacement
    return new_sent

def english_to_ipa(english_dict, sent):
    """Look up every word of ``sent`` in ``english_dict`` and return the values in order."""
    return list(map(english_dict.__getitem__, sent))

def trigram_prior_for_sent(sent, kn_prob_dist):
    """Return the product over the sentence's padded trigrams of their Kneser-Ney probabilities."""
    sentence_trigrams = trigrams_in_sent(sent)
    probabilities = np.zeros(len(sentence_trigrams))
    for slot, trigram in enumerate(sentence_trigrams):
        probabilities[slot] = kneser_ney_prob(trigram, kn_prob_dist)
    return np.prod(probabilities)

def calculate_likelihood(guess, actual, match_multiplier=0.99):
    """Return ``(likelihood, matches_indices)`` for a guessed IPA sentence.

    ``matches_indices`` lists, in increasing order, the positions where
    ``guess`` equals ``actual``. The likelihood is
    ``match_multiplier**matches * (1 - match_multiplier)**mismatches``.
    """
    matches_indices = [i for i in range(len(actual)) if guess[i] == actual[i]]
    matches = len(matches_indices)
    no_matches = len(actual) - matches
    no_match_multiplier = 1-match_multiplier
    likelihood = (match_multiplier**matches)*(no_match_multiplier**no_matches)
    return likelihood, matches_indices

def calculate_prob(english_dict, kn_prob_dist, guess, actual, match_multiplier=0.99):
    """Return ``(score, match_indices)`` for ``guess``.

    ``score`` is trigram prior * likelihood + 0.00001; ``match_indices`` is the
    list of matching positions reported by calculate_likelihood.
    """
    guess_ipa = english_to_ipa(english_dict, guess)
    likelihood_value, match_indices = calculate_likelihood(guess_ipa, actual, match_multiplier)
    prob = trigram_prior_for_sent(guess, kn_prob_dist)*likelihood_value + 0.00001
    return prob, match_indices

def find_n_most_common_transcriptions(n, plausible_transcriptions):
    """Print and return the ``n`` most frequent transcriptions.

    Args:
        n: how many entries to report.
        plausible_transcriptions: iterable of word lists; each is joined with
            single spaces before counting.

    Returns:
        A list of ``(sentence, count)`` tuples, most frequent first. Returning
        the list (in addition to printing it) lets callers consume the result,
        as the first cell's version of this function does.
    """
    joined = [' '.join(transcription) for transcription in plausible_transcriptions]
    common_list = Counter(joined).most_common(n)
    print(common_list)
    return common_list

def transcribe(actual, num_iterations=100000):
    """Sample English transcriptions for ``actual``, freezing matched positions.

    Works like the basic sampler, except that proposals only change positions
    that were not reported as matches for the most recently accepted guess.

    Args:
        actual: list of observed IPA words.
        num_iterations: number of proposals to evaluate (default 100000, the
            value previously hard-coded).

    Returns:
        Whatever find_n_most_common_transcriptions returns for the 10 most
        frequent recorded sentences (previously the result was discarded).
    """
    plausible_transcriptions = []
    num_words = len(actual)
    english_dict = create_english_cmu_dict()
    english_words = create_english_words(english_dict)
    kn_prob_dist = create_kneser_ney_prob_dist()
    old_guess = init_random_sentence(english_words, num_words)
    # Starts empty, as before: matches in the initial random guess are not frozen.
    matches_indices = []
    # Score the current guess once; it only changes when a proposal is accepted.
    old_guess_prob = calculate_prob(english_dict, kn_prob_dist, old_guess, actual)[0]
    for _ in range(num_iterations):
        new_guess = perturb_sentence(english_words, old_guess, matches_indices)
        new_guess_prob, possible_matches_indices = calculate_prob(english_dict, kn_prob_dist, new_guess, actual)
        if new_guess_prob/old_guess_prob > random.random():
            old_guess = new_guess
            old_guess_prob = new_guess_prob
            matches_indices = possible_matches_indices
        plausible_transcriptions.append(old_guess)
    return find_n_most_common_transcriptions(10, plausible_transcriptions)

def get_all_transcriptions():
    """Read word lists from rows 1..49 of 'testing_sentences.csv'.

    For each of those rows, column index 1 is split on single spaces and every
    piece is cleaned with process_word. Row 0 and rows from 50 onward are
    read but ignored.
    """
    all_transcriptions = []
    with open ('testing_sentences.csv') as csv_file:
        for row_number, row in enumerate(csv.reader(csv_file, delimiter=",")):
            if not (0 < row_number < 50):
                continue
            all_transcriptions.append([process_word(piece) for piece in row[1].split(" ")])
    return all_transcriptions

In [None]:
# NEW PERTURBATION ATTEMPT (DIANA)

def trigrams_in_sent(sent):
    """Return a list of the sentence's trigrams, generated with left and right padding."""
    return [*ngrams(sent, 3, pad_left=True, pad_right=True)]

def all_trigrams(sents):
    """Return the padded trigrams of each sentence in ``sents`` as a single list."""
    gathered = []
    for sentence in sents:
        sentence_trigrams = ngrams(sentence, 3, pad_left=True, pad_right=True)
        gathered.extend(sentence_trigrams)
    return gathered

def trigram_freq_dist(all_trigrams):
    """Return the ``nltk.FreqDist`` of the given trigrams."""
    trigram_counts = nltk.FreqDist(all_trigrams)
    return trigram_counts

def kneser_ney_prob_dist(sents):
    """Turn a collection of sentences into a ``KneserNeyProbDist`` over their padded trigrams."""
    return KneserNeyProbDist(
        trigram_freq_dist(
            all_trigrams(sents)
        )
    )

def kneser_ney_prob(trigram, kn_prob_dist):
    """Return the value of ``kn_prob_dist.prob`` for ``trigram``."""
    result = kn_prob_dist.prob(trigram)
    return result

def process_word(word):
    """Drop '[', ']', single/double quotes and spaces from ``word``, then lowercase."""
    dropped = {'[', ']', "'", ' ', '"'}
    kept = ''.join(ch for ch in word if ch not in dropped)
    return kept.lower()

def create_kneser_ney_prob_dist():
    """Build the Kneser-Ney trigram distribution from 'complete_transcriptions.csv'.

    Row 0 is skipped; for each later row, column index 29 is split on commas
    and the pieces are cleaned with process_word to form one sentence.
    """
    sentences = []
    with open ('complete_transcriptions.csv') as csv_file:
        for row_index, row in enumerate(csv.reader(csv_file, delimiter=",")):
            if row_index != 0:
                sentences.append(list(map(process_word, row[29].split(","))))
    return kneser_ney_prob_dist(sentences)

def create_english_cmu_dict():
    """Load 'cmu_in_childes_22000.csv' into a dict mapping column 1 to column 8.

    The first row is skipped.
    """
    with open ('cmu_in_childes_22000.csv') as csv_file:
        all_rows = list(csv.reader(csv_file, delimiter=","))
    return {row[1]: row[8] for row in all_rows[1:]}

def create_english_words():
    """Return column 1 of every row after the first in 'cmu_in_childes_22000.csv'.

    Unlike a dict's keys, the list keeps one entry per file row, including
    repeated words.
    """
    with open ('cmu_in_childes_22000.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=",")
        next(csv_reader, None)  # skip the first row
        return [row[1] for row in csv_reader]

def init_random_sentence(english_words, num_words):
    """Pick ``num_words`` words uniformly at random (with replacement) from ``english_words``."""
    def _random_word():
        return english_words[random.randint(0, len(english_words) - 1)]

    return [_random_word() for _ in range(num_words)]

def determine_dict_probabilities(english_words):
    """Return each row's share of the total of column 9 in 'cmu_in_childes_22000.csv'.

    ``english_words`` is used only for its length, which sizes the result
    array; entry ``k`` corresponds to the k-th row after the first. Column 9
    is parsed with ``int`` for the running total.
    """
    vocab_size = len(english_words)
    word_dist = np.zeros(vocab_size)
    total_count = 0
    with open ('cmu_in_childes_22000.csv') as csv_file:
        for row_number, row in enumerate(csv.reader(csv_file, delimiter=",")):
            if row_number == 0:
                continue  # first row is skipped
            total_count += int(row[9])
            word_dist[row_number - 1] = row[9]
    divisor = np.full((vocab_size), total_count)
    return np.divide(word_dist, divisor)
    

def perturb_sentence(english_words, word_dist, sent, rng=None):
    """Return a copy of ``sent`` with one position replaced by a frequency-weighted word.

    Args:
        english_words: list of candidate words.
        word_dist: per-word sampling probabilities, passed as ``p`` to
            ``Generator.choice``; same length and order as ``english_words``.
        sent: current sentence (list of words); it is not modified.
        rng: optional ``numpy.random.Generator`` to draw the word with. When
            omitted, a new default generator is created.

    Returns:
        A new list equal to ``sent`` except at one uniformly chosen position.
    """
    new_sent = sent.copy()
    num_words = len(sent)
    change_ind = random.randint(0, num_words-1)
    # `default_rng` was never imported in this notebook; reach it through the
    # already-imported numpy namespace instead.
    if rng is None:
        rng = np.random.default_rng()
    # Without `size`, choice returns a scalar index, so no array-to-int
    # conversion is needed.
    new_word_ind = rng.choice(len(english_words), p=word_dist)
    new_sent[change_ind] = english_words[int(new_word_ind)]
    return new_sent

def english_to_ipa(english_dict, sent):
    """Return the ``english_dict`` value of each word in ``sent``, in sentence order."""
    converted = []
    for position in range(len(sent)):
        converted.append(english_dict[sent[position]])
    return converted

def trigram_prior_for_sent(sent, kn_prob_dist):
    """Return the sentence prior: the product of its padded trigrams' Kneser-Ney probabilities."""
    trigram_list = trigrams_in_sent(sent)
    count = len(trigram_list)
    priors = np.zeros(count)
    for k in range(count):
        priors[k] = kneser_ney_prob(trigram_list[k], kn_prob_dist)
    product = np.prod(priors)
    return product

def calculate_likelihood(guess, actual, match_multiplier=0.99):
    """Likelihood of ``actual`` given ``guess`` under a per-word match model.

    Each position where the two agree contributes ``match_multiplier``; each
    other position contributes ``1 - match_multiplier``.
    """
    agreement = [guess[i] == actual[i] for i in range(len(actual))]
    matches = agreement.count(True)
    no_matches = agreement.count(False)
    no_match_multiplier = 1-match_multiplier
    likelihood = (match_multiplier**matches)*(no_match_multiplier**no_matches)
    return likelihood

def calculate_prob(english_dict, kn_prob_dist, guess, actual, match_multiplier=0.99):
    """Return trigram prior * likelihood + 0.00001 for ``guess`` against ``actual``."""
    ipa_of_guess = english_to_ipa(english_dict, guess)
    sentence_prior = trigram_prior_for_sent(guess, kn_prob_dist)
    match_likelihood = calculate_likelihood(ipa_of_guess, actual, match_multiplier)
    # The constant keeps the score strictly positive.
    return sentence_prior * match_likelihood + 0.00001

def find_n_most_common_transcriptions(n, plausible_transcriptions):
    """Print and return the ``n`` most frequent transcriptions.

    Args:
        n: how many entries to report.
        plausible_transcriptions: iterable of word lists; each is joined with
            single spaces before counting.

    Returns:
        A list of ``(sentence, count)`` tuples, most frequent first. The list
        is now returned as well as printed, consistent with the first cell's
        version of this function.
    """
    counter = Counter()
    for transcription in plausible_transcriptions:
        counter[' '.join(transcription)] += 1
    common_list = counter.most_common(n)
    print(common_list)
    return common_list

def transcribe(actual, num_iterations=1000000):
    """Sample English transcriptions for ``actual`` using frequency-weighted proposals.

    Starts from a random sentence of the same length; each proposal replaces
    one word with a word drawn according to ``word_dist`` and is accepted when
    new_score / current_score exceeds a uniform random number. The current
    sentence is recorded every iteration.

    Args:
        actual: list of observed IPA words.
        num_iterations: number of proposals to evaluate (default 1000000, the
            value previously hard-coded).

    Returns:
        Whatever find_n_most_common_transcriptions returns for the 10 most
        frequent recorded sentences (previously the result was discarded).
    """
    plausible_transcriptions = []
    num_words = len(actual)
    english_dict = create_english_cmu_dict()
    english_words = create_english_words()
    word_dist = determine_dict_probabilities(english_words)
    kn_prob_dist = create_kneser_ney_prob_dist()
    old_guess = init_random_sentence(english_words, num_words)
    # Score the current guess once; it only changes when a proposal is accepted.
    old_guess_prob = calculate_prob(english_dict, kn_prob_dist, old_guess, actual)
    for _ in range(num_iterations):
        new_guess = perturb_sentence(english_words, word_dist, old_guess)
        new_guess_prob = calculate_prob(english_dict, kn_prob_dist, new_guess, actual)
        if new_guess_prob/old_guess_prob > random.random():
            old_guess = new_guess
            old_guess_prob = new_guess_prob
        plausible_transcriptions.append(old_guess)
    return find_n_most_common_transcriptions(10, plausible_transcriptions)

def get_all_transcriptions():
    """Read word lists from rows 1..49 of 'complete_transcriptions.csv'.

    Column index 1 of each such row is split on single spaces and each piece
    is cleaned with process_word. Other rows are read but ignored.
    """
    collected = []
    with open ('complete_transcriptions.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=",")
        row_number = 0
        for row in csv_reader:
            if 1 <= row_number <= 49:
                collected.append([process_word(token) for token in row[1].split(" ")])
            row_number += 1
    return collected