In [1]:
from math import log2
import numpy as np
import pandas as pd

In [2]:
# Path prefix and file extension that read_file joins around a sample name
# (root_path + name + extension).
root_path = './data/lesson01/'
extension = '.txt'

def read_file(name):
    """Read one text sample and return its contents lower-cased.

    The path is built as ``root_path + name + extension`` and the file is
    decoded as UTF-8.

    Parameters
    ----------
    name : str
        Base name of the sample, without directory or extension.

    Returns
    -------
    str
        The whole file content converted to lower case.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises
    # (e.g. on a decoding error), which the manual open/close pair did not.
    with open(root_path + name + extension, encoding='utf8') as file:
        text = file.read()

    return text.lower()
    
# Load the 'czech' and 'english' samples; read_file returns them lower-cased.
cz = read_file('czech')
en = read_file('english')

In [3]:
def get_character_probabilities(text):
    """Return the relative frequency of every distinct character in ``text``.

    Parameters
    ----------
    text : str
        The text to analyse.

    Returns
    -------
    dict
        Maps each character occurring in ``text`` to its count divided by
        ``len(text)``. An empty text yields an empty dict.
    """
    # Start every distinct character at zero, then tally occurrences.
    counts = dict.fromkeys(set(text), 0)
    for symbol in text:
        counts[symbol] = counts[symbol] + 1

    # Normalise the counts by the text length to obtain probabilities.
    total = len(text)
    return {symbol: count / total for symbol, count in counts.items()}

# Character probability tables for the two samples loaded into cz and en.
probs_cz = get_character_probabilities(cz)
probs_en = get_character_probabilities(en)

In [4]:
def get_entropy(probabilities):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Computes ``sum(-p * log2(p))`` over the values of ``probabilities``.

    Parameters
    ----------
    probabilities : dict
        Maps each outcome to its probability. Only the values are used.

    Returns
    -------
    float
        The entropy in bits (``0`` for an empty mapping).

    Raises
    ------
    ValueError
        If any probability is negative (``log2`` is undefined there).
    """
    # Outcomes with probability 0 contribute nothing (p * log2(p) -> 0 as
    # p -> 0); skipping them also keeps log2(0) from raising ValueError.
    return sum(-(p * log2(p)) for p in probabilities.values() if p != 0)

# Display the entropy (in bits, since get_entropy uses log2) of probs_cz.
get_entropy(probs_cz)

4.837293313173991

In [5]:
def get_similarity(probs1, probs2):
    """Return the summed absolute difference between two distributions.

    Despite the name, the result behaves like a distance: it is 0 when both
    mappings assign the same probability to every character and grows as
    they diverge.

    Parameters
    ----------
    probs1, probs2 : dict
        Map characters to probabilities. A character missing from one of
        the mappings is treated as having probability 0 there.

    Returns
    -------
    float
        Sum of ``abs(p1 - p2)`` over every character present in either
        mapping (``0`` when both are empty).
    """
    alphabet = set(probs1) | set(probs2)

    total = 0
    for symbol in alphabet:
        # Absent characters fall back to probability 0.
        total += abs(probs1.get(symbol, 0) - probs2.get(symbol, 0))

    return total

# Display the summed absolute difference between probs_cz and probs_en.
get_similarity(probs_cz, probs_en)

0.39720266896567047

In [6]:
def do_all(name):
    """Print the character entropy of the sample called ``name``.

    Reads the sample with ``read_file``, turns it into character
    probabilities, computes their entropy and prints one line of the form
    ``<name> entropy: <value>``. Returns nothing.
    """
    entropy = get_entropy(get_character_probabilities(read_file(name)))

    print(name, 'entropy:', entropy)

# Print the character entropy of each language sample, in this order.
for language in ('czech', 'german', 'english', 'french', 'hungarian'):
    do_all(language)

czech entropy: 4.837293313173991
german entropy: 4.463335366955405
english entropy: 4.237850921075281
french entropy: 4.509041198519109
hungarian entropy: 4.6839356222824104


### Pairwise similarity between languages

In [7]:
# Samples to compare, and one character-probability table per sample.
files = ['czech', 'german', 'english','french','hungarian']
probs = [get_character_probabilities(read_file(name)) for name in files]

# Square matrix: entry (i, j) is get_similarity of sample i and sample j.
sim = np.zeros((len(files), len(files)))

# Fill the matrix one row at a time.
for i, p1 in enumerate(probs):
    sim[i] = [get_similarity(p1, other) for other in probs]

In [8]:
# Display the matrix with the sample names as both row and column labels.
pd.DataFrame(sim, index=files, columns=files)

Unnamed: 0,czech,german,english,french,hungarian
czech,0.0,0.482689,0.397203,0.405121,0.413543
german,0.482689,0.0,0.358214,0.322057,0.507416
english,0.397203,0.358214,0.0,0.383859,0.454767
french,0.405121,0.322057,0.383859,0.0,0.458979
hungarian,0.413543,0.507416,0.454767,0.458979,0.0
