In [16]:
import pandas as pd

In [22]:
# Reference table of letter-frequency orderings: one row per language, with the
# language name and its letters in a single ordered string.
lf_df = pd.read_csv("ordered-letter-sequences.csv")
lf_df.head(26)

Unnamed: 0,language,ordered_letters
0,english,etaoinsrhldcumfpgwybvkxjqz
1,spanish,eaosrnidlctumpbgyívqóhfzjéáñxúüwk
2,german,enisratdhulcgmobwfkzvüpäßjöyqx
3,french,esaitnrulodcmpévqfbghjàxèyêzçôùâûî
4,italian,eaionlrtscdupmvghfbqzòàùì
5,dutch,enatirodslghvkmubpwjczfxy
6,turkish,aeinrlıdkmuytsboüşzgçhğvcöpfjwxq
7,polish,iaeoznscrwyłdkmtpujlgębąhżśóćńfźvqx
8,esperanto,aieonlsrtkjudmpvgfbcĝĉŭzŝhĵĥwyxq
9,swedish,eantrslidomgkvähfupåöbcjyxwzéq


In [7]:
def process_file(textfile):
    """Return the relative frequency of each tracked letter in a UTF-8 text file.

    Every character is lower-cased and counted when it is one of the tracked
    letters; everything else (digits, punctuation, whitespace, characters
    outside the tracked set) is ignored.

    Args:
        textfile: Path of the text file to read; it is decoded as UTF-8.

    Returns:
        dict mapping each tracked letter to its share of all counted letters
        (a float between 0 and 1). Keys follow the order of ``all_letters``;
        letters that never occur map to 0.0. When the file contains no tracked
        letter at all, every value is 0.0.
    """
    all_letters = 'esaitnrulodcmpévqfbghjàxèyêzçôùâûîøöœwkäßïëüæñ'
    # Seed every tracked letter with 0 so the result always has the same keys, in this order.
    dic = {letter: 0 for letter in all_letters}
    total = 0
    with open(textfile, encoding="utf8") as myfile:
        # Stream the file line by line instead of loading all lines into memory first.
        for line in myfile:
            for letter in line:
                letter = letter.lower()
                # Every tracked letter is already a key, so one membership test suffices.
                if letter in dic:
                    dic[letter] += 1
                    total += 1

    if total == 0:
        # Nothing countable (empty file, or text without any tracked letter):
        # report all-zero frequencies instead of dividing by zero.
        return {letter: 0.0 for letter in dic}

    # normalize
    return {letter: count / total for letter, count in dic.items()}

# Path of the document whose letter frequencies are analysed in the following cells.
textfile='eng.txt'

# Relative frequency of every tracked letter in that document, keyed by letter.
text_lf_dict = process_file(textfile)

In [8]:
# Tabulate the frequency dict with one row per letter (the letter is the index),
# and mirror the index into a regular 'letter' column for later selection.
text_lf = (
    pd.DataFrame
    .from_dict(text_lf_dict, orient='index', columns=['frequency'])
    .assign(letter=lambda frame: frame.index)
)
text_lf.head(10)

Unnamed: 0,frequency,letter
e,0.122573,e
s,0.06973,s
a,0.087707,a
i,0.080617,i
t,0.094401,t
n,0.07101,n
r,0.064332,r
u,0.028554,u
l,0.042,l
o,0.071349,o


In [9]:
# Letters that actually occur in the document, most frequent first, as one string.
''.join(
    text_lf
    .loc[lambda frame: frame['frequency'] > 0]
    .sort_values(by=['frequency'], ascending=False)['letter']
)

'etaionsrlhcdumpfgwbvykxqzj'

In [10]:
# A function that computes the Levenshtein (edit) distance between two sequences using a dynamic-programming matrix
# https://stackabuse.com/levenshtein-distance-and-text-similarity-in-python/ 
import numpy as np

def levenshtein(seq1, seq2):
    """Return the Levenshtein (edit) distance between two indexable sequences.

    The distance is the minimum number of single-element insertions, deletions
    and substitutions needed to turn ``seq1`` into ``seq2``.

    Args:
        seq1: First sequence (e.g. a string).
        seq2: Second sequence (e.g. a string).

    Returns:
        The distance as a NumPy float (the table is a float array), e.g. 2.0.
    """
    rows = len(seq1) + 1
    cols = len(seq2) + 1
    # dist[i, j] holds the distance between the first i items of seq1
    # and the first j items of seq2.
    dist = np.zeros((rows, cols))
    # Turning a prefix into the empty sequence costs one deletion per item.
    dist[:, 0] = np.arange(rows)
    dist[0, :] = np.arange(cols)

    for i in range(1, rows):
        for j in range(1, cols):
            # Matching items can be carried over for free; otherwise substitute.
            substitution_cost = 0 if seq1[i - 1] == seq2[j - 1] else 1
            dist[i, j] = min(
                dist[i - 1, j] + 1,                      # deletion
                dist[i - 1, j - 1] + substitution_cost,  # match / substitution
                dist[i, j - 1] + 1,                      # insertion
            )
    return dist[rows - 1, cols - 1]

In [11]:
# Sanity check: distance from 'bright' to itself and to a handful of other strings.
for candidate in ('bright', 'freight', 'sleight', 'bride', 'plight', 'pride', 'donald duck'):
    print(levenshtein('bright', candidate))

0.0
2.0
3.0
3.0
2.0
4.0
11.0


In [23]:
# Letters occurring in the document, ordered from most to least frequent.
document_letter_sequence = ''.join(
    text_lf
    .loc[lambda frame: frame['frequency'] > 0]
    .sort_values(by=['frequency'], ascending=False)['letter']
)
# Compare that ordering with each language's reference ordering; the smallest
# Levenshtein distance wins (strict '<' keeps the first language on a tie).
best_score = 999
best_matching_language = None
for language, ordered_letters in zip(lf_df['language'], lf_df['ordered_letters']):
    ld = levenshtein(document_letter_sequence, ordered_letters)
    print(language, ': ', ld)
    if ld < best_score:
        best_score, best_matching_language = ld, language
print("We have a winner: ", best_matching_language)

english :  11.0
spanish :  23.0
german :  25.0
french :  24.0
italian :  16.0
dutch :  19.0
turkish :  28.0
polish :  26.0
esperanto :  24.0
swedish :  23.0
danish :  31.0
portuguese :  29.0
norwegian :  22.0
icelandic :  32.0
hungarian :  30.0
slovak :  22.0
finnish :  22.0
czech :  34.0
hawaiian :  21.0
maori :  22.0
latin :  20.0
irish :  21.0
welsh :  26.0
gaelic :  23.0
japanese :  26.0
chinese :  26.0
We have a winner:  english


In [24]:
def inspect_file(textfilename):
    """Print the best-matching language(s) for a text file by letter-frequency order.

    The file's letters are ranked by frequency (via ``process_file``) and that
    ranking is compared with every reference ordering in the notebook-level
    ``lf_df`` table using ``levenshtein``. The ranking, each language's distance
    and the winner are printed; languages tied for the lowest distance are all
    listed, comma-separated.

    Args:
        textfilename: Path of the UTF-8 text file to inspect.

    Returns:
        None; all results are printed.
    """
    frequencies = pd.DataFrame.from_dict(
        process_file(textfilename), orient='index', columns=['frequency']
    )
    frequencies['letter'] = frequencies.index
    occurring = frequencies[frequencies['frequency'] > 0]
    document_letter_sequence = ''.join(
        occurring.sort_values(by=['frequency'], ascending=False)['letter']
    )
    print(document_letter_sequence)

    # Score every reference language against the document's letter ranking.
    best_score = 999
    best_matching_language = None
    for language, ordered_letters in zip(lf_df['language'], lf_df['ordered_letters']):
        ld = levenshtein(document_letter_sequence, ordered_letters)
        print(language, ': ', ld)
        if ld < best_score:
            # Strictly better: this language replaces any previous winner(s).
            best_score = ld
            best_matching_language = language
        elif ld == best_score:
            # Tie with the current best: append to the list of winners.
            best_matching_language = best_matching_language + ', ' + language
    print("We have a winner: ", best_matching_language)
    
# Run the detector on 'dictionary-de.txt'; the ranking, per-language distances and winner are printed.
inspect_file('dictionary-de.txt')    

enirtsalhuogkmcbdfpzävwüöyxjqßéñ
english :  24.0
spanish :  29.0
german :  20.0
french :  29.0
italian :  25.0
dutch :  24.0
turkish :  28.0
polish :  32.0
esperanto :  27.0
swedish :  22.0
danish :  31.0
portuguese :  32.0
norwegian :  25.0
icelandic :  32.0
hungarian :  31.0
slovak :  24.0
finnish :  23.0
czech :  33.0
hawaiian :  27.0
maori :  28.0
latin :  23.0
irish :  26.0
welsh :  28.0
gaelic :  26.0
japanese :  32.0
chinese :  32.0
We have a winner:  german


In [25]:
inspect_file('sub001.txt')
# Known limitation: this approach cannot detect Japanese or other Asian-script text,
# because process_file only counts the Latin-based letters it tracks.

slok
english :  23.0
spanish :  30.0
german :  26.0
french :  31.0
italian :  24.0
dutch :  22.0
turkish :  30.0
polish :  33.0
esperanto :  30.0
swedish :  26.0
danish :  36.0
portuguese :  34.0
norwegian :  26.0
icelandic :  35.0
hungarian :  34.0
slovak :  21.0
finnish :  25.0
czech :  38.0
hawaiian :  17.0
maori :  13.0
latin :  20.0
irish :  27.0
welsh :  31.0
gaelic :  24.0
japanese :  20.0
chinese :  20.0
We have a winner:  maori


In [27]:
# Sample Chinese strings; NOTE(review): not referenced by any of the cells visible here.
query1 = '你好世界'

query2 = '你好'

In [32]:
# Show the UTF-8 byte representation of a two-character Chinese string.
print(u'哈哈'.encode('utf-8'))

b'\xe5\x93\x88\xe5\x93\x88'


In [42]:
import re

In [47]:
def cjk_detect(texts):
    """Classify a string as Korean, Japanese or Chinese by Unicode character ranges.

    The checks run in a fixed order and the first match wins: Hangul syllables,
    then hiragana/katakana, then the CJK ideograph range. Text that contains
    only ideographs (no kana) is therefore reported as "chinese".

    Args:
        texts: The string to examine.

    Returns:
        "korean", "japanese" or "chinese", or None when no character falls in
        any of the three ranges.
    """
    # (label, character class) pairs, listed in order of precedence.
    script_patterns = (
        ("korean", "[\uac00-\ud7a3]"),
        ("japanese", "[\u3040-\u30ff]"),
        ("chinese", "[\u4e00-\u9FFF]"),
    )
    for script, pattern in script_patterns:
        if re.search(pattern, texts):
            return script
    return None


In [50]:
# Ideograph-only sample with no kana: cjk_detect reports it as "chinese".
texts = "日産自動車、営業益45%減　前期下方修正"
print(cjk_detect(texts))
# Traditional Chinese with Japanese hiragana
texts = "健康の油切 好吃の涼麵"
print(cjk_detect(texts))
# Traditional Chinese with Japanese katakana punctuation
texts = "鐵腕・都鐸王朝（五）：文藝復興最懂穿搭的高富帥——亨利八世"
print(cjk_detect(texts))
texts = "이건 한국어가 아니야"
print(cjk_detect(texts))
# Read the subtitle file through a context manager so the handle is closed, and
# keep the text in its own name: binding it to `str` would shadow the built-in
# for the rest of the kernel session.
with open('sub001.txt', 'r', encoding="utf8") as subtitle_file:
    subtitle_text = subtitle_file.read()
print(cjk_detect(subtitle_text))

chinese
japanese
japanese
korean
japanese
