In [1]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from letter_frequency_lang_detection_utils import *

## Reading the character frequency data by language

The table of character frequencies by language is available on [Wikipedia](https://en.wikipedia.org/wiki/Letter_frequency).

In [2]:
# Local copy of the letter-frequency page; adjacent string literals are
# joined by the parser, so no explicit `+` or line continuation is needed.
characters_freq_path = (
    './languages_letter_frequencies/'
    'languages_letter_frequencies.html'
)

# read_html returns one DataFrame per <table> it finds; the first is used.
df_freq_raw = pd.read_html(characters_freq_path, encoding='utf-8')[0]
df_freq_raw.shape

(87, 17)

In [3]:
# Normalise the headers: lower-case them and keep only the text before the
# first '[' (e.g. a bracketed footnote marker), then index the rows by letter.
df_freq_raw.columns = [
    header.lower().partition('[')[0] for header in df_freq_raw.columns
]
df_freq_raw = df_freq_raw.set_index('letter')
df_freq_raw.head()

Unnamed: 0_level_0,english,french,german,spanish,portuguese,esperanto,italian,turkish,swedish,polish,dutch,danish,icelandic,finnish,czech,hungarian
letter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
a,8.167%,7.636%,6.516%,11.525%,14.634%,12.117%,11.745%,11.920%,9.383%,8.965%,7.49%,6.025%,10.110%,12.217%,8.421%,10.778%
b,1.492%,0.901%,1.886%,2.215%,1.043%,0.980%,0.927%,2.844%,1.535%,1.482%,1.58%,2.000%,1.043%,0.281%,0.822%,2.647%
c,2.782%,3.260%,2.732%,4.019%,3.882%,0.776%,4.501%,0.963%,1.486%,3.988%,1.24%,0.565%,~0%,0.281%,0.740%,0.924%
d,4.253%,3.669%,5.076%,5.010%,4.992%,3.044%,3.736%,4.706%,4.702%,3.293%,5.93%,5.858%,1.575%,1.043%,3.475%,2.410%
e,12.702%,14.715%,16.396%,12.181%,12.570%,8.995%,11.792%,8.912%,10.149%,7.921%,18.91%,15.453%,6.418%,7.968%,7.562%,11.926%


In [4]:
# Reshape to one row per language: transpose (letters become columns), copy
# the language names from the index into a 'language' column, then replace
# the index with a plain 0..n-1 range.
df_freq_raw = (
    df_freq_raw.T
    .assign(language=lambda frame: frame.index)
    .reset_index(drop=True)
)

df_freq_raw.head()

letter,a,b,c,d,e,f,g,h,i,j,...,û,ŭ,ü,ű,ů,ý,ź,ż,ž,language
0,8.167%,1.492%,2.782%,4.253%,12.702%,2.228%,2.015%,6.094%,6.966%,0.153%,...,~0%,0,~0%,0,0,0,0,0,0,english
1,7.636%,0.901%,3.260%,3.669%,14.715%,1.066%,0.866%,0.737%,7.529%,0.613%,...,0.060%,0,0,0,0,0,0,0,0,french
2,6.516%,1.886%,2.732%,5.076%,16.396%,1.656%,3.009%,4.577%,6.550%,0.268%,...,0,0,0.995%,0,0,0,0,0,0,german
3,11.525%,2.215%,4.019%,5.010%,12.181%,0.692%,1.768%,0.703%,6.247%,0.493%,...,0,0,0.012%,0,0,~0%,0,0,0,spanish
4,14.634%,1.043%,3.882%,4.992%,12.570%,1.023%,1.303%,0.781%,6.186%,0.397%,...,0,0,0.026%,0,0,0,0,0,0,portuguese


In [5]:
# Convert every frequency cell with get_percentage (in the previews,
# '8.167%' becomes 0.08167 and '~0%' / '0' become 0.0), leaving the label
# column out of the conversion and re-attaching it afterwards.
df_freq = (
    df_freq_raw
    .drop(columns='language')
    .applymap(get_percentage)
    .assign(language=df_freq_raw['language'])
)

In [6]:
# Preview the parsed table: frequencies are now floats, labels are unchanged.
df_freq.head()

letter,a,b,c,d,e,f,g,h,i,j,...,û,ŭ,ü,ű,ů,ý,ź,ż,ž,language
0,0.08167,0.01492,0.02782,0.04253,0.12702,0.02228,0.02015,0.06094,0.06966,0.00153,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,english
1,0.07636,0.00901,0.0326,0.03669,0.14715,0.01066,0.00866,0.00737,0.07529,0.00613,...,0.0006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,french
2,0.06516,0.01886,0.02732,0.05076,0.16396,0.01656,0.03009,0.04577,0.0655,0.00268,...,0.0,0.0,0.00995,0.0,0.0,0.0,0.0,0.0,0.0,german
3,0.11525,0.02215,0.04019,0.0501,0.12181,0.00692,0.01768,0.00703,0.06247,0.00493,...,0.0,0.0,0.00012,0.0,0.0,0.0,0.0,0.0,0.0,spanish
4,0.14634,0.01043,0.03882,0.04992,0.1257,0.01023,0.01303,0.00781,0.06186,0.00397,...,0.0,0.0,0.00026,0.0,0.0,0.0,0.0,0.0,0.0,portuguese


In [7]:
# List every letter column present in the table, plus the 'language' label.
df_freq.columns

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
       'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'â',
       'á', 'å', 'ä', 'ã', 'ą', 'æ', 'œ', 'ç', 'ĉ', 'ć', 'č', 'ď', 'ð', 'è',
       'é', 'ê', 'ë', 'ę', 'ě', 'ĝ', 'ğ', 'ĥ', 'î', 'ì', 'í', 'ï', 'ı', 'ĵ',
       'ł', 'ľ', 'ñ', 'ń', 'ň', 'ò', 'ö', 'ô', 'ó', 'ő', 'õ', 'ø', 'ř', 'ŝ',
       'ş', 'ś', 'š', 'ß', 'ť', 'þ', 'ù', 'ú', 'û', 'ŭ', 'ü', 'ű', 'ů', 'ý',
       'ź', 'ż', 'ž', 'language'],
      dtype='object', name='letter')

In [8]:
# Keep only the 26 unaccented letters a-z as features.
#
# The columns removed here are still Latin-script letters; they carry a
# diacritic or belong to only a few of the languages (à, ß, þ, ...).
#
# The list of columns to drop is derived from the frame itself instead of
# being typed out by hand. That keeps the cell re-runnable (a second run
# finds nothing left to drop instead of raising KeyError) and it cannot go
# stale if the source table gains or loses a letter.
#
# NOTE(review): the dropped columns hold part of each language's frequency
# mass, so the remaining rows presumably no longer sum to 1 - confirm this is
# acceptable when comparing against the book frequencies.
BASIC_LATIN_LETTERS = set('abcdefghijklmnopqrstuvwxyz')

df_freq = df_freq.drop(
    [
        col for col in df_freq.columns
        if col != 'language' and col not in BASIC_LATIN_LETTERS
    ],
    axis=1,
)

df_freq.columns

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
       'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'language'],
      dtype='object', name='letter')

In [9]:
# Sanity check of the reduced table: 16 languages x (26 letters + label).
print(df_freq.shape)
df_freq.head()

(16, 27)


letter,a,b,c,d,e,f,g,h,i,j,...,r,s,t,u,v,w,x,y,z,language
0,0.08167,0.01492,0.02782,0.04253,0.12702,0.02228,0.02015,0.06094,0.06966,0.00153,...,0.05987,0.06327,0.09056,0.02758,0.00978,0.0236,0.0015,0.01974,0.00074,english
1,0.07636,0.00901,0.0326,0.03669,0.14715,0.01066,0.00866,0.00737,0.07529,0.00613,...,0.06693,0.07948,0.07244,0.06311,0.01838,0.00049,0.00427,0.00128,0.00326,french
2,0.06516,0.01886,0.02732,0.05076,0.16396,0.01656,0.03009,0.04577,0.0655,0.00268,...,0.07003,0.0727,0.06154,0.04166,0.00846,0.01921,0.00034,0.00039,0.01134,german
3,0.11525,0.02215,0.04019,0.0501,0.12181,0.00692,0.01768,0.00703,0.06247,0.00493,...,0.06871,0.07977,0.04632,0.02927,0.01138,0.00017,0.00215,0.01008,0.00467,spanish
4,0.14634,0.01043,0.03882,0.04992,0.1257,0.01023,0.01303,0.00781,0.06186,0.00397,...,0.0653,0.06805,0.04336,0.03639,0.01575,0.00037,0.00253,6e-05,0.0047,portuguese


## Data split and model creation

In [10]:
# Training set: the whole reference table is used for fitting (one row per
# language); the books processed later in the notebook are the unseen samples.
X = df_freq.drop(columns='language').to_numpy()  # letter frequencies only
y = df_freq['language']                          # language labels

X.shape, y.shape

((16, 26), (16,))

In [11]:
# Show the labels: one entry per language, in the row order of X.
y

0        english
1         french
2         german
3        spanish
4     portuguese
5      esperanto
6        italian
7        turkish
8        swedish
9         polish
10         dutch
11        danish
12     icelandic
13       finnish
14         czech
15     hungarian
Name: language, dtype: object

In [12]:
# Fit a Gaussian naive Bayes classifier on the reference frequencies.
# `fit` returns the estimator itself, so construction and fitting can be
# chained; the bare `model` below keeps the cell's displayed output.
#
# NOTE(review): each class has exactly one training row, so the per-class
# variance comes only from GaussianNB's `var_smoothing` term. The classifier
# should then behave much like a nearest-reference-row lookup - confirm
# against the scikit-learn documentation.
model = GaussianNB().fit(X, y)
model

GaussianNB()

## Reading and processing the books

In [13]:
# book process pipeline

def process_book(book_path):
    """Run one book through the text pipeline and return its letter frequencies.

    Chains four helpers that are not defined in this notebook (presumably
    star-imported from ``letter_frequency_lang_detection_utils``):
    ``read_raw_book`` -> ``clean_raw_book`` -> ``count_chars`` -> ``get_freq_df``.

    Parameters
    ----------
    book_path
        Location of the book; passed unchanged to ``read_raw_book``.

    Returns
    -------
    tuple
        ``(chars_freq_df, book)``: the value produced by ``get_freq_df`` and
        the cleaned text produced by ``clean_raw_book``. In this notebook's
        outputs these are a one-row frame of a-z frequencies and a lower-case,
        letters-only string.
    """
    cleaned_text = clean_raw_book(read_raw_book(book_path))
    letter_counts = count_chars(cleaned_text)

    # get_freq_df receives both the counts and the text they were taken from.
    return get_freq_df(letter_counts, cleaned_text), cleaned_text

In [14]:
# File stems of the books to classify.
books = [
    'deeplearningbook',
    'ml_with_sklearn_keras_tf_en',
    'ml_with_sklearn_keras_tf_pt'
]

# Placeholder (frequency frame, cleaned text) pair per book; sharing a single
# tuple between the keys is safe because tuples are immutable.
books_mapper = dict.fromkeys(books, (None, None))
books_mapper

{'deeplearningbook': (None, None),
 'ml_with_sklearn_keras_tf_en': (None, None),
 'ml_with_sklearn_keras_tf_pt': (None, None)}

In [15]:
# reading and processing the books

for book_name in books:
    # every book is read from a PDF under ./books/ named after its key
    path_to_book = f'./books/{book_name}.pdf'
    df_book_chars, book = process_book(path_to_book)

    # replace the (None, None) placeholder with the real results
    books_mapper[book_name] = (df_book_chars, book)
    print(f'book {book_name} completed!')

book deeplearningbook completed!
book ml_with_sklearn_keras_tf_en completed!
book ml_with_sklearn_keras_tf_pt completed!


In [24]:
# For every processed book, show its name, the first 100 characters of the
# cleaned text and its letter-frequency frame.
for book_name, (df_book_freq, book) in books_mapper.items():
    print('book name =', book_name)
    print('book header')
    print(book[:100])  # print already ends the line with '\n'
    display(df_book_freq)
    print()

book name = deeplearningbook
book header
deeplearningiangoodfellowyoshuabengioaaroncourvillecontentswebsiteviiacknowledgmentsviiinotationxiin


Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,q,r,s,t,u,v,w,x,y,z
0,0.082568,0.015325,0.034255,0.032905,0.121933,0.02195,0.021133,0.039973,0.082644,0.001559,...,0.0016,0.064572,0.061125,0.093327,0.027048,0.011791,0.013638,0.006727,0.013264,0.002742



book name = ml_with_sklearn_keras_tf_en
book header
aureliengeronhandsonmachinelearningwithscikitlearnkerasandtensorflowconceptstoolsandtechniquestobuil


Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,q,r,s,t,u,v,w,x,y,z
0,0.081389,0.012148,0.034525,0.031778,0.119757,0.021384,0.022313,0.039544,0.081464,0.00136,...,0.001707,0.065115,0.074335,0.096437,0.02907,0.009148,0.012042,0.004582,0.016585,0.001507



book name = ml_with_sklearn_keras_tf_pt
book header
maosaobraaprendizadodemaquinacomscikitlearntensorflowconceitosferramentasetecnicasparaaconstrucaodes


Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,q,r,s,t,u,v,w,x,y,z
0,0.126552,0.008377,0.043037,0.05266,0.1229,0.012133,0.014027,0.008354,0.068913,0.002889,...,0.007424,0.069639,0.072776,0.053709,0.037647,0.014123,0.001585,0.005018,0.002081,0.006016





## Making predictions using the `GaussianNB` model

In [17]:
# Letter frequencies of the Deep Learning book as a 2-D array holding a
# single sample, ready to be passed to `model.predict`.
# NOTE(review): relies on the book frame's columns being in the same a-z
# order as the training matrix X - confirm in get_freq_df.
sample_dl = books_mapper['deeplearningbook'][0].to_numpy()

print(sample_dl)
print('\n prediction =', model.predict(sample_dl))

[[0.08256754 0.01532465 0.03425492 0.03290464 0.12193262 0.02194958
  0.02113298 0.03997333 0.08264357 0.00155937 0.0053039  0.04147056
  0.02801306 0.07572401 0.07221854 0.02693985 0.00160031 0.0645723
  0.06112532 0.09332667 0.02704805 0.01179139 0.01363807 0.00672656
  0.01326376 0.0027415 ]]

 prediction = ['english']


In [18]:
# Letter frequencies of the English edition of the scikit-learn/Keras/TF book
# as a 2-D array holding a single sample.
# NOTE(review): relies on the book frame's columns being in the same a-z
# order as the training matrix X - confirm in get_freq_df.
sample_ml_en = books_mapper['ml_with_sklearn_keras_tf_en'][0].to_numpy()

print(sample_ml_en)
print('\n prediction =', model.predict(sample_ml_en))

[[0.08138934 0.01214779 0.03452485 0.03177776 0.11975677 0.02138444
  0.02231311 0.03954356 0.08146441 0.00135964 0.00555812 0.04709804
  0.02776557 0.06914701 0.07087367 0.02335578 0.0017072  0.06511536
  0.07433533 0.09643713 0.02906961 0.00914768 0.01204213 0.00458218
  0.01658538 0.001507  ]]

 prediction = ['english']


In [19]:
# Letter frequencies of the Portuguese edition of the scikit-learn/Keras/TF
# book as a 2-D array holding a single sample.
# NOTE(review): relies on the book frame's columns being in the same a-z
# order as the training matrix X - confirm in get_freq_df.
sample_ml_pt = books_mapper['ml_with_sklearn_keras_tf_pt'][0].to_numpy()

print(sample_ml_pt)
print('\n prediction =', model.predict(sample_ml_pt))

[[0.12655198 0.00837724 0.04303695 0.05265958 0.12290006 0.01213303
  0.01402709 0.00835416 0.06891314 0.00288898 0.00143122 0.03370864
  0.04681121 0.05848833 0.09898603 0.03071232 0.00742387 0.06963914
  0.07277627 0.05370876 0.0376468  0.01412289 0.00158473 0.0050185
  0.00208104 0.00601573]]

 prediction = ['portuguese']


## Making predictions using `mean_squared_error`

In [20]:
# Language labels as a plain array, in the same row order as X.
languages = df_freq['language'].values
languages

array(['english', 'french', 'german', 'spanish', 'portuguese',
       'esperanto', 'italian', 'turkish', 'swedish', 'polish', 'dutch',
       'danish', 'icelandic', 'finnish', 'czech', 'hungarian'],
      dtype=object)

In [21]:
# Deep Learning book (expected: english).
# mse_predict comes from the utils module; presumably it returns the language
# whose reference row has the lowest mean squared error against the sample -
# confirm there.
mse_predict(sample=sample_dl, df_freq=df_freq, X=X)

'english'

In [22]:
# ML with scikit-learn, Keras and TF - English edition (expected: english).
# mse_predict comes from the utils module; presumably it returns the language
# whose reference row has the lowest mean squared error against the sample -
# confirm there.
mse_predict(sample=sample_ml_en, df_freq=df_freq, X=X)

'english'

In [23]:
# ML with scikit-learn, Keras and TF - Portuguese edition (expected: portuguese).
# mse_predict comes from the utils module; presumably it returns the language
# whose reference row has the lowest mean squared error against the sample -
# confirm there.
mse_predict(sample=sample_ml_pt, df_freq=df_freq, X=X)

'portuguese'