In [1]:
import re
import string
import nltk
import math

In [2]:
# Quick check of NLTK's word tokenizer on a short sample sentence.
s = "I am a boy"
nltk.word_tokenize(s)

['I', 'am', 'a', 'boy']

In [3]:
def pre_process_text(text):
    """Normalise raw text ahead of tokenisation.

    The text is lower-cased, every character listed in
    ``string.punctuation`` is replaced by a single space, and every ASCII
    digit (0-9) is deleted outright (no space is inserted in its place).

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The normalised text.
    """
    text = text.lower()
    # Escape the punctuation before embedding it in a character class.
    # Unescaped, the backslash-plus-bracket pair inside string.punctuation is
    # parsed as an escaped closing bracket, so a literal backslash was never
    # matched and survived pre-processing.
    text = re.sub('[' + re.escape(string.punctuation) + ']', ' ', text)
    # Digits are removed without a replacement character, so "a1b" -> "ab".
    text = re.sub('[0-9]', '', text)
    return text

In [4]:
def token_dict(file_name):
    """Count how often each token occurs in a UTF-8 encoded text file.

    Each line is first normalised with ``pre_process_text`` and then split
    into tokens with NLTK's ``word_tokenize``; the occurrences of every token
    are tallied across the whole file.

    Parameters
    ----------
    file_name : str
        Path of the text file to read.

    Returns
    -------
    dict
        Maps each distinct token to the number of times it occurs.
    """
    counts = {}
    with open(file_name, encoding="utf-8") as handle:
        for raw_line in handle:
            # Case, punctuation and digits are irrelevant here, so normalise
            # the line before handing it to the tokenizer.
            for tok in nltk.word_tokenize(pre_process_text(raw_line)):
                counts[tok] = counts.get(tok, 0) + 1
    return counts

In [5]:
# The Holy Bible in English: build its token-count table and report the
# vocabulary size (number of distinct tokens).
en_dict1 = token_dict("English.txt")
print('Vocab length of this English Text:',len(en_dict1))

Vocab length of this English Text: 12497


In [6]:
# Same as above for the German translation of The Holy Bible: build the
# token-count table and report the number of distinct tokens.
de_dict = token_dict("German.txt")
print("Vocab length in German text:",len(de_dict))

Vocab length in German text: 20310


In [7]:
def calculate_entropy(lang_dict):
    """Compute the Shannon entropy, in bits, of a token-frequency table.

    Each token's probability is its count divided by the total count, and the
    entropy is the sum of ``-p * log2(p)`` over all tokens.

    Parameters
    ----------
    lang_dict : dict
        Maps token -> occurrence count.

    Returns
    -------
    float
        Entropy in bits; ``0.0`` when the table is empty or its counts sum
        to zero.
    """
    # Total number of tokens (sum of all counts).
    num_toks = sum(lang_dict.values())
    if num_toks == 0:
        # Nothing to normalise by; avoids a division by zero below.
        return 0.0
    entropy = 0.0
    for count in lang_dict.values():
        if count == 0:
            # By convention 0 * log2(0) contributes 0; math.log2(0) would raise.
            continue
        prob = count / num_toks
        entropy += -(prob) * math.log2(prob)
    return entropy

In [8]:
# Entropy (in bits) of the token distribution of the English Bible text.
entropy_eng1 = calculate_entropy(en_dict1)
print("Entropy is",entropy_eng1,"bits")

Entropy is 8.658593690721707 bits


In [9]:
# Entropy (in bits) of the token distribution of the German Bible text.
entropy_de = calculate_entropy(de_dict)
print("Entropy is",entropy_de,"bits")

Entropy is 9.600328729633267 bits


In [10]:
# Treasure Island text
en_dict2 = token_dict("treasure_island.txt")
print('Vocab length of this English Text:',len(en_dict2))

Vocab length of this English Text: 5840


In [11]:
# Entropy (in bits) of the token distribution of the Treasure Island text.
entropy_eng2 = calculate_entropy(en_dict2)
print("Entropy is",entropy_eng2,"bits")

Entropy is 9.053948694862521 bits


In [12]:
def lidstone_smoothing(keys,lang_dict,alpha=0.1):
    """Return Lidstone (additive) smoothed probabilities for ``keys``.

    For every token in ``keys`` the estimate is
    ``(count + alpha) / (N + alpha * V)``, where ``count`` is the token's
    count in ``lang_dict`` (0 if absent), ``N`` is the sum of all counts in
    ``lang_dict`` and ``V`` is ``len(keys)``.

    Parameters
    ----------
    keys : sized iterable of tokens
        Vocabulary to produce probabilities for.
    lang_dict : dict
        Maps token -> occurrence count.
    alpha : float, optional
        Additive smoothing constant (default 0.1).

    Returns
    -------
    dict
        Maps each token in ``keys`` to its smoothed probability.
    """
    # N: total token count over the whole table, not only over ``keys``.
    total = 0
    for count in lang_dict.values():
        total += count
    # The denominator is the same for every token, so compute it once.
    denominator = total + (alpha * len(keys))
    return {
        tok: (lang_dict.get(tok, 0) + alpha) / denominator
        for tok in keys
    }

In [13]:
def kl_divergence(text1,text2):
    """Compute the KL divergence (in bits) of ``text1`` from ``text2``.

    Both files are tokenised with ``token_dict`` and their counts are turned
    into probabilities with ``lidstone_smoothing`` (default ``alpha``). The
    vocabulary used for both distributions is the set of tokens that occur
    in ``text1``; tokens appearing only in ``text2`` do not enter the sum.

    Parameters
    ----------
    text1 : str
        Path of the file supplying the first distribution P.
    text2 : str
        Path of the file supplying the second distribution Q.

    Returns
    -------
    float
        ``sum(P(w) * log2(P(w) / Q(w)))`` over the tokens ``w`` of ``text1``.
    """
    counts_p = token_dict(text1)
    counts_q = token_dict(text2)
    # Smooth both tables over text1's vocabulary so Q(w) is never zero.
    vocab = counts_p.keys()
    probs_p = lidstone_smoothing(vocab, counts_p)
    probs_q = lidstone_smoothing(vocab, counts_q)
    divergence = 0
    for tok, p_tok in probs_p.items():
        divergence += p_tok * math.log2(p_tok / probs_q[tok])
    return divergence

In [14]:
# English to German: prints D(English.txt || German.txt), i.e. the first
# argument supplies P and the second supplies Q.
text1 = "English.txt"
text2 = "German.txt"
print(kl_divergence(text1,text2))

12.602883864092231


In [15]:
# German to English: prints D(German.txt || English.txt), the reverse
# direction of the previous cell.
text1 = "German.txt"
text2 = "English.txt"
print(kl_divergence(text1,text2))

11.867298419162317


In [16]:
# English Bible to Treasure Island: prints
# D(English.txt || treasure_island.txt).
text1 = "English.txt"
text2 = "treasure_island.txt"
print(kl_divergence(text1, text2))

1.8667675782720918


In [17]:
# Treasure Island to English Bible: prints
# D(treasure_island.txt || English.txt).
text1 = "treasure_island.txt"
text2 = "English.txt"
print(kl_divergence(text1, text2))

2.074006389589388


In [18]:
# Treasure Island to German Bible: prints
# D(treasure_island.txt || German.txt).
text1 = "treasure_island.txt"
text2 = "German.txt"
print(kl_divergence(text1, text2))

12.276553588895016


In [19]:
# German Bible to Treasure Island: prints
# D(German.txt || treasure_island.txt).
text1 = "German.txt"
text2 = "treasure_island.txt"
print(kl_divergence(text1, text2))

9.063221605075231
