In [69]:
from collections import defaultdict

In [70]:
# File ids of the NLTK `udhr` corpus texts used as language references.
# Each id doubles as the label returned by the classifier.
languages = [
    "English-Latin1",
    "Italian-Latin1",
    "German_Deutsch-Latin1",
    "Spanish-Latin1",
    "Chinese_Mandarin-GB2312",
    "Ukrainian-UTF8",
]

In [71]:
from nltk.corpus import udhr

In [72]:
# Peek at the first 100 characters of the Mandarin UDHR text.
udhr.raw('Chinese_Mandarin-GB2312')[:100]

u'\u4e16\u754c\u4eba\u6743\u5ba3\u8a00\n\u8054\u5408\u56fd\u5927\u4f1a\u4e00\u4e5d\u56db\u516b\u5e74\u5341\u4e8c\u6708\u5341\u65e5\u7b2c217A(III)\u53f7\u51b3\u8bae\u901a\u8fc7\u5e76\u9881\u5e03\n\n1948 \u5e74 12 \u6708 10 \u65e5\uff0c \u8054 \u5408 \u56fd \u5927 \u4f1a \u901a \u8fc7 \u5e76 \u9881 \u5e03\u300a \u4e16 \u754c \u4eba \u6743 \u5ba3 \u8a00\u300b\u3002 \u8fd9 \u4e00 \u5177'

In [73]:
# Kendall-tau-style distance between two rank tables
def distance(cfr1, cfr2, chars):
    """Count pairwise ranking disagreements between ``cfr1`` and ``cfr2``.

    Every ordered pair ``(x, y)`` drawn from ``chars`` is examined, so each
    unordered pair is visited twice and each character is also paired with
    itself.

    A pair adds 1 to the result when either

    * one of its characters is missing from ``cfr1`` or ``cfr2``, or
    * both tables rank both characters but disagree on whether ``x`` is
      ranked before ``y``.

    Parameters:
        cfr1, cfr2: mappings from character to rank.
        chars: the characters to compare; iterated in a nested loop.

    Returns:
        int: the number of disagreeing ordered pairs (0 for empty ``chars``).
    """
    def ranked_in_both(symbol):
        # Membership tests only; never indexes, so defaultdicts are not grown.
        return symbol in cfr1 and symbol in cfr2

    disagreements = 0
    for first in chars:
        for second in chars:
            if not (ranked_in_both(first) and ranked_in_both(second)):
                # A character unknown to either table counts as a mismatch.
                disagreements += 1
                continue
            before_in_1 = cfr1[first] < cfr1[second]
            before_in_2 = cfr2[first] < cfr2[second]
            if before_in_1 != before_in_2:
                disagreements += 1
    return disagreements

def text_filter(text):
    """Return ``text`` with whitespace, digits and common ASCII punctuation removed.

    Only the characters listed in the literal below are dropped; anything
    else (letters, apostrophes, ``+``, ``<``, ``{`` ...) is kept unchanged.
    """
    unwanted = " \n\t\r.,:;[]()\\/\"0123456789~!`?@#$%^&*_-=-"
    cleaned = text
    for symbol in unwanted:
        cleaned = cleaned.replace(symbol, "")
    return cleaned

def gen_ranked_cfd(sorted_cfd):
    """Map each character in ``sorted_cfd`` to its 1-based position.

    Parameters:
        sorted_cfd: sequence of characters, most frequent first.

    Returns:
        defaultdict(int): character -> rank (1 for the first element).
        If a character appears more than once, its last position wins.
    """
    ranks = defaultdict(int)
    for position, symbol in enumerate(sorted_cfd, 1):
        ranks[symbol] = position
    return ranks

def gen_sorted_cfd(cfd):
    """Return the keys of ``cfd`` ordered from highest to lowest count.

    The sort is stable, so keys with equal counts keep the order in which
    ``cfd`` iterates them.
    """
    ranking = list(cfd)
    ranking.sort(key=cfd.get, reverse=True)
    return ranking

def gen_cfd(text):
    """Count how often each character occurs in ``text``.

    Returns:
        defaultdict(int): character -> number of occurrences.
    """
    tally = defaultdict(int)
    for symbol in text:
        tally[symbol] = tally.get(symbol, 0) + 1
    return tally

def guess(text):
    """Return the entry of ``languages`` whose character ranking is closest to ``text``.

    The text is filtered with ``text_filter``, lower-cased, and turned into a
    character rank table. That table is compared with each reference table in
    the module-level ``cfr`` using ``distance`` over the module-level
    ``chars``.

    Parameters:
        text: the string to classify.

    Returns:
        The name from ``languages`` with the smallest distance. On a tie the
        name listed first wins. An empty string is returned only when
        ``languages`` is empty.
    """
    text = text_filter(text)
    mystery_cfd = gen_cfd(text.lower().strip())
    mystery_cfs = gen_sorted_cfd(mystery_cfd)
    mystery_cfr = gen_ranked_cfd(mystery_cfs)
    # Start from infinity rather than a fixed cap: distance() can return up to
    # len(chars) ** 2, which a constant such as 2**16 stops covering once
    # `chars` grows, and then no language would ever be selected.
    min_dist = float("inf")
    candidate_lang = ""
    for lang in languages:
        dist = distance(mystery_cfr, cfr[lang], chars)
        if dist < min_dist:
            min_dist = dist
            candidate_lang = lang
    return candidate_lang

In [11]:
# Per-language lookup tables, keyed by the names in `languages`.
cfd = dict()  # character -> occurrence count
cfs = dict()  # characters ordered from most to least frequent
cfr = dict()  # character -> 1-based frequency rank

n = 30         # how many top characters each language contributes
chars = set()  # union of every language's top-n characters

In [66]:
# Build the reference profile (counts, ordering, ranks) for every language
# and collect the comparison alphabet used by distance().
#
# Reset the alphabet first so this cell is idempotent: without the reset,
# re-running it after lowering `n` or editing `languages` would leave
# characters from the previous run in `chars`.
chars = set()
for lang in languages:
    # Lower-case before filtering, mirroring what guess() does to its input.
    udhr_text = text_filter(udhr.raw(lang).lower())
    cfd[lang] = gen_cfd(udhr_text)
    cfs[lang] = gen_sorted_cfd(cfd[lang])
    cfr[lang] = gen_ranked_cfd(cfs[lang])
    # Only the n most frequent characters of each language join the alphabet.
    top_n_chars = cfs[lang][:n]
    chars = chars.union(set(top_n_chars))



In [56]:
# Show the rank table of languages[2] as a plain dict (1 = most frequent).
dict(cfr[languages[2]])

{u'a': 7,
 u'b': 17,
 u'c': 13,
 u'd': 6,
 u'e': 1,
 u'f': 14,
 u'g': 12,
 u'h': 8,
 u'i': 4,
 u'j': 24,
 u'k': 18,
 u'l': 11,
 u'm': 16,
 u'n': 2,
 u'o': 15,
 u'p': 25,
 u'r': 3,
 u's': 9,
 u't': 5,
 u'u': 10,
 u'v': 21,
 u'w': 20,
 u'x': 29,
 u'y': 28,
 u'z': 19,
 u'\xdf': 27,
 u'\xe4': 22,
 u'\xf6': 26,
 u'\xfc': 23}

In [57]:
# Hand-picked sample passages for trying out guess().

# English prose describing the Kendall tau distance.
english_text = u"""The Kendall tau distance is a metric that counts the number of
pairwise disagreements between two lists. The larger the distance,
the more dissimilar the two lists are. Kendall tau distance is also
called bubble-sort distance since it is equivalent to the number of
swaps that the bubble sort algorithm would make to place one list in
the same order as the other list. The Kendall tau distance was
created by Maurice Kendall."""

# Spanish passage; it mentions the "Declaración Universal de Derechos Humanos",
# so it presumably overlaps the UDHR reference corpus -- TODO confirm.
spanish_text = u"""Proclama
la presente Declaración Universal de Derechos Humanos como ideal común
por el que todos los pueblos y naciones deben esforzarse, a fin de que
tanto los individuos como las instituciones, inspirándose
constantemente en ella, promuevan, mediante la enseñanza y la
educación, el respeto a estos derechos y libertades, y aseguren, por
medidas progresivas de carácter nacional e internacional, su
reconocimiento y aplicación universales y efectivos, tanto entre los
pueblos de los Estados Miembros como entre los de los territorios
colocados bajo su jurisdicción. """

# Chinese passage with characters separated by spaces (text_filter removes them).
chinese_text = u"""因 此 现 在, 大 会, 发 布 这 一 世 界 人 权 宣 言
, 作 为 所 有 人 民 和 所 有 国 家 努 力 实 现 的 共 同 标 准, 以 期 每 一 个 人 和 社 会 机 构 经 常 铭
念 本 宣 言, 努 力 通 过 教 诲 和 教 育 促 进 对 权 利 和 自 由 的 尊 重, 并 通 过 国 家 的 和 国 际 的 渐
进 措 施, 使 这 些 权 利 和 自 由 在 各 会 员 国 本 身 人 民 及 在 其 管 辖 下 领 土 的 人 民 中 得 到 普
遍 和 有 效 的 承 认 和 遵 行"""

# Italian verse written without accented letters; apostrophes are kept by
# text_filter and therefore take part in the character ranking.
dante_text = u"""Nel mezzo del cammin di nostra vita
mi ritrovai per una selva oscura
che la diritta via era smarrita.
Ahi quanto a dir qual era e cosa dura
esta selva selvaggia e aspra e forte
che nel pensier rinova la paura!
Tant'e amara che poco e piu morte;
ma per trattar del ben ch'i' vi trovai,
diro de l'altre cose ch'i' v'ho scorte.
Io non so ben ridir com'i' v'intrai,
tant'era pien di sonno a quel punto
che la verace via abbandonai."""

In [67]:
# Classify the Chinese sample passage and print the guessed language.
print guess(chinese_text)

Chinese_Mandarin-GB2312


In [59]:
import urllib2
from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup

In [74]:
# Italian Wikipedia article used to try the classifier on live web text.
url = 'http://it.wikipedia.org/wiki/Monica_Bellucci'

def get_text(url):
    """Download ``url`` and return the text found inside its ``<p>`` elements.

    Only paragraphs under the document's ``<body>`` are considered. All text
    nodes are joined with single spaces.

    Parameters:
        url: address of the HTML page to fetch.

    Returns:
        The paragraph text as one string; an empty string when the page has
        no ``<body>`` or no paragraphs.

    Raises:
        Whatever ``urllib2.urlopen`` raises when the page cannot be fetched.
    """
    content = urllib2.urlopen(url)
    try:
        # The soup is built while the response is still open; the parsed tree
        # is all that is needed afterwards.
        soup = BeautifulSoup(content, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    finally:
        # Always release the HTTP connection, even if parsing fails.
        content.close()
    body = soup.find('body')
    # find() returns None when there is no <body>; treat that as "no paragraphs"
    # instead of failing on None.findAll.
    paragraphs = body.findAll('p') if body is not None else []
    text = [t for p in paragraphs for t in p.findAll(text=True)]
    return " ".join(text)

# Fetch the page at `url` and print the guessed language.
print guess(get_text(url))

Italian-Latin1




In [75]:
# One Wikipedia article per language edition (it, es, uk, zh, en).
urls = [
    'http://it.wikipedia.org/wiki/Monica_Bellucci',
    'http://es.wikipedia.org/wiki/Pen%C3%A9lope_Cruz',
    'http://uk.wikipedia.org/wiki/%D0%9A%D1%83%D1%80%D0%B8%D0%BB%D0%B5%D0%BD%D0%BA%D0%BE_%D0%9E%D0%BB%D1%8C%D0%B3%D0%B0_%D0%9A%D0%BE%D1%81%D1%82%D1%8F%D0%BD%D1%82%D0%B8%D0%BD%D1%96%D0%B2%D0%BD%D0%B0',
    'http://zh.wikipedia.org/wiki/%E7%AB%A0%E5%AD%90%E6%80%A1',
    'http://en.wikipedia.org/wiki/Megan_Fox' ]

# Print the guessed language for each page, in list order.
# Note: the loop variable rebinds the module-level `url` defined earlier.
for url in urls:
    print guess(get_text(url))

Italian-Latin1
Spanish-Latin1
Ukrainian-UTF8
Chinese_Mandarin-GB2312
English-Latin1


