<h2>Library for language detection</h2>

*`By Fábio Bif Goularte`
(fabio.goularte@gmail.com)*

<h4>Import modules</h4>

In [12]:
import collections
import math
import os
import pandas as pd
import numpy as np

<h4>Detecting the language</h4>

Set the language of the document to detecting (line in[6]) with a code from the table below (column Language code).

| Language | Language code | Language | Language code |
| :--- | :--- | :--- | :--- |
| Afrikaans | af | Italian | it |
| German | de | Japanese | ja |
| English | en | Korean | ko |
| Spanish | es | Portuguese (Brazil) | pt-BR |
| Hindi | hi |  Chinese, Mandarin (Simplified) | zh-Hans |

`Note:` It is possible to check other languages from those listed in the table above. Thus, provide a document in the desired language and run the Training Testing file to create the test document.

In [18]:
#Code of language tested, e.g. 'pt-Br' to Portuguese
languageDoc = 'pt-BR'

#Load the n-gram model selected to test the language document (testing folder)
df_test = pd.read_json('testing/'+languageDoc+'.json', orient='columns')
df_test.columns=['n-gramas','freq_doc']

languages = os.listdir('testing')

#Load the n-grams models used to train the classifier according to the files in testing (training folder)
for langTrain in languages:
    langTrain  = langTrain.split('.')
    df_train   = pd.read_json('training/'+langTrain[0]+'.json', orient='columns')
    df_test    = df_test.merge(df_train,how='left',on='n-gramas')
    print(df_test)
    


      freq_af n-gramas
0        1686        e
1         795        n
10        371        g
100        19       pe
1000        1      els
1001        1      atv
1002        1      old
1003        1      oen
1004        1      dei
1005        1      svi
1006        1      dhe
1007        1      idw
1008        1      ynv
1009        1      yde
101        19       ne
1010        1      tir
1011        1      vid
1012        1      iei
1013        1      skl
1014        1      uis
1015        1      med
1016        1      ldt
1017        1      e‐o
1018        1      fbu
1019        1      ‐eg
102        19      and
1020        1      onl
1021        1      iko
1022        1      pme
1023        1      ikb
...       ...      ...
972         1      kom
973         1      sed
974         1      syn
975         1      ’nd
976         1      sse
977         1      sra
978         1      amh
979         1      ‐sa
98         19       nt
980         1      wed
981         1      apo
982        

<h4>Shows the n-gram models based on the documents in the testing folder</h4>

In [14]:
df_test.head()

Unnamed: 0,n-gramas,freq_doc,freq_af,freq_de,freq_en,freq_es,freq_hi,freq_it,freq_ja,freq_ko,freq_pt-BR,freq_zh-Hans
0,445,a,,,,,,,,,,
1,391,e,,,,,,,,,,
2,150,t,,,,,,,,,,
3,8,x,,,,,,,,,,
4,1,nos,,,,,,,,,,


In [9]:
df_new=df_test.copy()

<h4>TF-IDF</h4>

In [10]:
coll = list(df_test.columns) 
coll.pop(0)

#Calculating TF-IDF per n-grams on the selected test document and on the documents from training folder
for w in coll:
    df_new[w]=df_test[w]/len(df_test)*np.log10(len(coll)/df_test.count(axis='columns'))

TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [None]:
df_new.head()

<h4>Function used to calculate the cosine similarity</h4>

In [10]:
#Calculating cosine similarity
def similarity(docA,docB):
    numerator = 0
    docA      = docA.replace(np.nan,0)
    docB      = docB.replace(np.nan,0)
    den_1     = math.sqrt(sum([docA[i]**2 for i in range(0,len(docA))]))
    den_2     = math.sqrt(sum([docB[i]**2 for i in range(0,len(docB))]))
    
    for i in range(0,len(docA)): numerator=numerator+docA[i]*docB[i]
    
    denumerator = den_1*den_2
    if denumerator == 0 : denumerator = 0.0001
    
    return numerator/denumerator    

In [11]:
coll.pop(0)
classifier = {}

#Calculating the cosine similarity between 'languageDoc' and the other documents 
for w in coll:
    classifier[w] = similarity(df_new['freq_doc'],df_new[w])

<h4>Shown the results</h4>

In [12]:
print('Doc tested: '+languageDoc)
print('\nClassification by similarity:')

for key, value in sorted(classifier.items(), key=lambda item: item[1], reverse=True):
    print("%s: %s" % (key, value))

Doc tested: pt-BR

Classification by similarity:
freq_pt-BR: 0.9512008768057516
freq_es: 0.9180541189217364
freq_it: 0.8732348939396601
freq_en: 0.8575734253085008
freq_de: 0.7841373356480694
freq_af: 0.7555665053842838
freq_zh-Hans: 0.21549506976710367
freq_hi: 0.0
freq_ja: 0.0
freq_ko: 0.0
