In [29]:
import re
import string
import numpy as np
import pandas as pd
from PyPDF2 import PdfReader
from unidecode import unidecode
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error

## Reading the characters frequency data

In [2]:
df_freq_raw = pd.read_html('./characters_freq/language_character_frequency.html')[0]
df_freq_raw.shape

(87, 17)

In [3]:
df_freq_raw.columns = [col.lower().split('[')[0] for col in df_freq_raw.columns]
df_freq_raw = df_freq_raw.set_index('letter', drop=True)
df_freq_raw.head()

Unnamed: 0_level_0,english,french,german,spanish,portuguese,esperanto,italian,turkish,swedish,polish,dutch,danish,icelandic,finnish,czech,hungarian
letter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
a,8.167%,7.636%,6.516%,11.525%,14.634%,12.117%,11.745%,11.920%,9.383%,8.965%,7.49%,6.025%,10.110%,12.217%,8.421%,10.778%
b,1.492%,0.901%,1.886%,2.215%,1.043%,0.980%,0.927%,2.844%,1.535%,1.482%,1.58%,2.000%,1.043%,0.281%,0.822%,2.647%
c,2.782%,3.260%,2.732%,4.019%,3.882%,0.776%,4.501%,0.963%,1.486%,3.988%,1.24%,0.565%,~0%,0.281%,0.740%,0.924%
d,4.253%,3.669%,5.076%,5.010%,4.992%,3.044%,3.736%,4.706%,4.702%,3.293%,5.93%,5.858%,1.575%,1.043%,3.475%,2.410%
e,12.702%,14.715%,16.396%,12.181%,12.570%,8.995%,11.792%,8.912%,10.149%,7.921%,18.91%,15.453%,6.418%,7.968%,7.562%,11.926%


In [4]:
df_freq_raw = df_freq_raw.T
df_freq_raw['language'] = df_freq_raw.index
df_freq_raw = df_freq_raw.reset_index(drop=True)

df_freq_raw.head()

letter,a,b,c,d,e,f,g,h,i,j,...,Ã»,Å­,Ã¼,Å±,Å¯,Ã½,Åº,Å¼,Å¾,language
0,8.167%,1.492%,2.782%,4.253%,12.702%,2.228%,2.015%,6.094%,6.966%,0.153%,...,~0%,0,~0%,0,0,0,0,0,0,english
1,7.636%,0.901%,3.260%,3.669%,14.715%,1.066%,0.866%,0.737%,7.529%,0.613%,...,0.060%,0,0,0,0,0,0,0,0,french
2,6.516%,1.886%,2.732%,5.076%,16.396%,1.656%,3.009%,4.577%,6.550%,0.268%,...,0,0,0.995%,0,0,0,0,0,0,german
3,11.525%,2.215%,4.019%,5.010%,12.181%,0.692%,1.768%,0.703%,6.247%,0.493%,...,0,0,0.012%,0,0,~0%,0,0,0,spanish
4,14.634%,1.043%,3.882%,4.992%,12.570%,1.023%,1.303%,0.781%,6.186%,0.397%,...,0,0,0.026%,0,0,0,0,0,0,portuguese


In [5]:
def get_percentage(string):
    char_list = [char for char in string if char.isnumeric() or char == '.']

    if char_list:
        number = ''
        for char in char_list:
            number += char
        return float(number) / 100
    else:
        return np.nan

In [6]:
df_freq = df_freq_raw.drop('language', axis=1).applymap(
    lambda x: get_percentage(x)
)

df_freq['language'] = df_freq_raw['language']

In [7]:
df_freq.head()

letter,a,b,c,d,e,f,g,h,i,j,...,Ã»,Å­,Ã¼,Å±,Å¯,Ã½,Åº,Å¼,Å¾,language
0,0.08167,0.01492,0.02782,0.04253,0.12702,0.02228,0.02015,0.06094,0.06966,0.00153,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,english
1,0.07636,0.00901,0.0326,0.03669,0.14715,0.01066,0.00866,0.00737,0.07529,0.00613,...,0.0006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,french
2,0.06516,0.01886,0.02732,0.05076,0.16396,0.01656,0.03009,0.04577,0.0655,0.00268,...,0.0,0.0,0.00995,0.0,0.0,0.0,0.0,0.0,0.0,german
3,0.11525,0.02215,0.04019,0.0501,0.12181,0.00692,0.01768,0.00703,0.06247,0.00493,...,0.0,0.0,0.00012,0.0,0.0,0.0,0.0,0.0,0.0,spanish
4,0.14634,0.01043,0.03882,0.04992,0.1257,0.01023,0.01303,0.00781,0.06186,0.00397,...,0.0,0.0,0.00026,0.0,0.0,0.0,0.0,0.0,0.0,portuguese


In [8]:
df_freq.columns

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
       'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Ã', 'Ã¢',
       'Ã¡', 'Ã¥', 'Ã¤', 'Ã£', 'Ä', 'Ã¦', 'Å', 'Ã§', 'Ä', 'Ä', 'Ä', 'Ä',
       'Ã°', 'Ã¨', 'Ã©', 'Ãª', 'Ã«', 'Ä', 'Ä', 'Ä', 'Ä', 'Ä¥', 'Ã®', 'Ã¬',
       'Ã­', 'Ã¯', 'Ä±', 'Äµ', 'Å', 'Ä¾', 'Ã±', 'Å', 'Å', 'Ã²', 'Ã¶', 'Ã´',
       'Ã³', 'Å', 'Ãµ', 'Ã¸', 'Å', 'Å', 'Å', 'Å', 'Å¡', 'Ã', 'Å¥', 'Ã¾',
       'Ã¹', 'Ãº', 'Ã»', 'Å­', 'Ã¼', 'Å±', 'Å¯', 'Ã½', 'Åº', 'Å¼', 'Å¾',
       'language'],
      dtype='object', name='letter')

In [9]:
df_freq = df_freq.drop([
    'Ã', 'Ã¢',
    'Ã¡', 'Ã¥', 'Ã¤', 'Ã£', 'Ä', 'Ã¦', 'Å', 'Ã§', 'Ä', 'Ä', 'Ä', 'Ä',
    'Ã°', 'Ã¨', 'Ã©', 'Ãª', 'Ã«', 'Ä', 'Ä', 'Ä', 'Ä', 'Ä¥', 'Ã®', 'Ã¬',
    'Ã­', 'Ã¯', 'Ä±', 'Äµ', 'Å', 'Ä¾', 'Ã±', 'Å', 'Å', 'Ã²', 'Ã¶', 'Ã´',
    'Ã³', 'Å', 'Ãµ', 'Ã¸', 'Å', 'Å', 'Å', 'Å', 'Å¡', 'Ã', 'Å¥', 'Ã¾',
    'Ã¹', 'Ãº', 'Ã»', 'Å­', 'Ã¼', 'Å±', 'Å¯', 'Ã½', 'Åº', 'Å¼', 'Å¾'
], axis=1)

df_freq.columns

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
       'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'language'],
      dtype='object', name='letter')

In [10]:
print(df_freq.shape)
df_freq.head()

(16, 27)


letter,a,b,c,d,e,f,g,h,i,j,...,r,s,t,u,v,w,x,y,z,language
0,0.08167,0.01492,0.02782,0.04253,0.12702,0.02228,0.02015,0.06094,0.06966,0.00153,...,0.05987,0.06327,0.09056,0.02758,0.00978,0.0236,0.0015,0.01974,0.00074,english
1,0.07636,0.00901,0.0326,0.03669,0.14715,0.01066,0.00866,0.00737,0.07529,0.00613,...,0.06693,0.07948,0.07244,0.06311,0.01838,0.00049,0.00427,0.00128,0.00326,french
2,0.06516,0.01886,0.02732,0.05076,0.16396,0.01656,0.03009,0.04577,0.0655,0.00268,...,0.07003,0.0727,0.06154,0.04166,0.00846,0.01921,0.00034,0.00039,0.01134,german
3,0.11525,0.02215,0.04019,0.0501,0.12181,0.00692,0.01768,0.00703,0.06247,0.00493,...,0.06871,0.07977,0.04632,0.02927,0.01138,0.00017,0.00215,0.01008,0.00467,spanish
4,0.14634,0.01043,0.03882,0.04992,0.1257,0.01023,0.01303,0.00781,0.06186,0.00397,...,0.0653,0.06805,0.04336,0.03639,0.01575,0.00037,0.00253,6e-05,0.0047,portuguese


## Model creation

In [11]:
X = df_freq.drop('language', axis=1).values
y = df_freq['language']

X.shape, y.shape

((16, 26), (16,))

In [12]:
y

0        english
1         french
2         german
3        spanish
4     portuguese
5      esperanto
6        italian
7        turkish
8        swedish
9         polish
10         dutch
11        danish
12     icelandic
13       finnish
14         czech
15     hungarian
Name: language, dtype: object

In [13]:
model = GaussianNB()
model.fit(X, y)

GaussianNB()

## Reading the book

In [16]:
def read_raw_book(path):
    reader = PdfReader(path)

    book_raw = ''
    for i in range(len(reader.pages)):
        page = reader.pages[i] 
        page_text = page.extract_text()
        book_raw += page_text   

    return book_raw

In [30]:
# characters cleaning

def clean_raw_book(raw_book):
    book = raw_book.lower()
    book = [char for char in book if char.isalpha()]

    book_text = ''
    for char in book:
        book_text += char

    book_text = re.sub(' +', '', book_text)
    book_text = unidecode(book_text)
    return book_text

In [19]:
# deep learning book
raw_book_dl_en = read_raw_book('./books/deeplearningbook.pdf')
len(raw_book_dl_en)

1556542

In [31]:
book_dl_en = clean_raw_book(raw_book_dl_en)
len(book_dl_en)

1367862

In [32]:
print(book_dl_en[:250])

deeplearningiangoodfellowyoshuabengioaaroncourvillecontentswebsiteviiacknowledgmentsviiinotationxiintroductionwhoshouldreadthisbookhistoricaltrendsindeeplearningiappliedmathandmachinelearningbasicslinearalgebrascalarsvectorsmatricesandtensorsmultiply


In [26]:
# ml with sklearn, keras and tf en
raw_book_ml_en = read_raw_book('./books/ml-with-sklearn-keras-tf-en.pdf')
len(raw_book_ml_en)

489579

In [33]:
book_ml_en = clean_raw_book(raw_book_ml_en)
len(book_ml_en)

359654

In [34]:
print(book_ml_en[:250])

aureliengeronhandsonmachinelearningwithscikitlearnkerasandtensorflowconceptstoolsandtechniquestobuildintelligentsystemssecondeditionbostonfarnhamsebastopoltokyobeijingbostonfarnhamsebastopoltokyobeijinglsihandsonmachinelearningwithscikitlearnkerasand


In [35]:
# ml with sklearn, keras and tf pt
raw_book_ml_pt = read_raw_book('./books/ml-with-sklearn-keras-tf-pt.pdf')
len(raw_book_ml_pt)

1152920

In [36]:
book_ml_pt = clean_raw_book(raw_book_ml_pt)
len(book_ml_pt)

866395

In [37]:
print(book_ml_pt[:250])

maosaobraaprendizadodemaquinacomscikitlearntensorflowconceitosferramentasetecnicasparaaconstrucaodesistemasinteligentesaureliengeronriodejaneirocgmiolohandsonmachinelearningindbmaosaobraaprendizadomaquinacomscikitlearnetensorflowcopyrightdastarlinalt


## Counting the characters in the book

In [19]:
characters_freq = {char: 0 for char in string.ascii_lowercase}
characters_freq

{'a': 0,
 'b': 0,
 'c': 0,
 'd': 0,
 'e': 0,
 'f': 0,
 'g': 0,
 'h': 0,
 'i': 0,
 'j': 0,
 'k': 0,
 'l': 0,
 'm': 0,
 'n': 0,
 'o': 0,
 'p': 0,
 'q': 0,
 'r': 0,
 's': 0,
 't': 0,
 'u': 0,
 'v': 0,
 'w': 0,
 'x': 0,
 'y': 0,
 'z': 0}

In [39]:
def count_chars(book):
    characters_freq = {char: 0 for char in string.ascii_lowercase}

    for char in book:
        if char in list(characters_freq.keys()):
            characters_freq[char] += 1

    characters_freq = {k: [v] for k, v in characters_freq.items()}
    return characters_freq

In [40]:
# deep learning book
dl_book_en_characters_freq = count_chars(book_dl_en)
dl_book_en_characters_freq

{'a': [112941],
 'b': [20962],
 'c': [46856],
 'd': [45009],
 'e': [166787],
 'f': [30024],
 'g': [28907],
 'h': [54678],
 'i': [113045],
 'j': [2133],
 'k': [7255],
 'l': [56726],
 'm': [38318],
 'n': [103580],
 'o': [98785],
 'p': [36850],
 'q': [2189],
 'r': [88326],
 's': [83611],
 't': [127658],
 'u': [36998],
 'v': [16129],
 'w': [18655],
 'x': [9201],
 'y': [18143],
 'z': [3750]}

In [41]:
# ml en
ml_book_en_characters_freq = count_chars(book_ml_en)
ml_book_en_characters_freq

{'a': [29272],
 'b': [4369],
 'c': [12417],
 'd': [11429],
 'e': [43071],
 'f': [7691],
 'g': [8025],
 'h': [14222],
 'i': [29299],
 'j': [489],
 'k': [1999],
 'l': [16939],
 'm': [9986],
 'n': [24869],
 'o': [25490],
 'p': [8400],
 'q': [614],
 'r': [23419],
 's': [26735],
 't': [34684],
 'u': [10455],
 'v': [3290],
 'w': [4331],
 'x': [1648],
 'y': [5965],
 'z': [542]}

In [42]:
# deep learning book pt
ml_book_pt_characters_freq = count_chars(book_ml_pt)
ml_book_pt_characters_freq

{'a': [109644],
 'b': [7258],
 'c': [37287],
 'd': [45624],
 'e': [106480],
 'f': [10512],
 'g': [12153],
 'h': [7238],
 'i': [59706],
 'j': [2503],
 'k': [1240],
 'l': [29205],
 'm': [40557],
 'n': [50674],
 'o': [85761],
 'p': [26609],
 'q': [6432],
 'r': [60335],
 's': [63053],
 't': [46533],
 'u': [32617],
 'v': [12236],
 'w': [1373],
 'x': [4348],
 'y': [1803],
 'z': [5212]}

In [50]:
def get_freq_df(freq_dict, book):
    df_freq = pd.DataFrame(freq_dict)

    total_chars = len(book)

    df_freq = df_freq / total_chars
    return df_freq

In [51]:
dl_freq_df = get_freq_df(dl_book_en_characters_freq, book_dl_en)
dl_freq_df

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,q,r,s,t,u,v,w,x,y,z
0,0.082568,0.015325,0.034255,0.032905,0.121933,0.02195,0.021133,0.039973,0.082644,0.001559,...,0.0016,0.064572,0.061125,0.093327,0.027048,0.011791,0.013638,0.006727,0.013264,0.002742


In [52]:
ml_en_freq_df = get_freq_df(ml_book_en_characters_freq, book_ml_en)
ml_en_freq_df

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,q,r,s,t,u,v,w,x,y,z
0,0.081389,0.012148,0.034525,0.031778,0.119757,0.021384,0.022313,0.039544,0.081464,0.00136,...,0.001707,0.065115,0.074335,0.096437,0.02907,0.009148,0.012042,0.004582,0.016585,0.001507


In [53]:
ml_pt_freq_df = get_freq_df(ml_book_pt_characters_freq, book_ml_pt)
ml_pt_freq_df

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,q,r,s,t,u,v,w,x,y,z
0,0.126552,0.008377,0.043037,0.05266,0.1229,0.012133,0.014027,0.008354,0.068913,0.002889,...,0.007424,0.069639,0.072776,0.053709,0.037647,0.014123,0.001585,0.005018,0.002081,0.006016


In [54]:
sample_dl = dl_freq_df.values
sample_dl

array([[0.08256754, 0.01532465, 0.03425492, 0.03290464, 0.12193262,
        0.02194958, 0.02113298, 0.03997333, 0.08264357, 0.00155937,
        0.0053039 , 0.04147056, 0.02801306, 0.07572401, 0.07221854,
        0.02693985, 0.00160031, 0.0645723 , 0.06112532, 0.09332667,
        0.02704805, 0.01179139, 0.01363807, 0.00672656, 0.01326376,
        0.0027415 ]])

In [55]:
sample_ml_en = ml_en_freq_df.values
sample_ml_en

array([[0.08138934, 0.01214779, 0.03452485, 0.03177776, 0.11975677,
        0.02138444, 0.02231311, 0.03954356, 0.08146441, 0.00135964,
        0.00555812, 0.04709804, 0.02776557, 0.06914701, 0.07087367,
        0.02335578, 0.0017072 , 0.06511536, 0.07433533, 0.09643713,
        0.02906961, 0.00914768, 0.01204213, 0.00458218, 0.01658538,
        0.001507  ]])

In [56]:
sample_ml_pt = ml_pt_freq_df.values
sample_ml_pt

array([[0.12655198, 0.00837724, 0.04303695, 0.05265958, 0.12290006,
        0.01213303, 0.01402709, 0.00835416, 0.06891314, 0.00288898,
        0.00143122, 0.03370864, 0.04681121, 0.05848833, 0.09898603,
        0.03071232, 0.00742387, 0.06963914, 0.07277627, 0.05370876,
        0.0376468 , 0.01412289, 0.00158473, 0.0050185 , 0.00208104,
        0.00601573]])

## Making a prediction using the model

In [57]:
model.predict(sample_dl)

array(['english'], dtype='<U10')

In [58]:
model.predict(sample_ml_en)

array(['english'], dtype='<U10')

In [59]:
model.predict(sample_ml_pt)

array(['portuguese'], dtype='<U10')

## Making a prediction using Mean Squared Error

In [60]:
languages = df_freq['language'].values
languages

array(['english', 'french', 'german', 'spanish', 'portuguese',
       'esperanto', 'italian', 'turkish', 'swedish', 'polish', 'dutch',
       'danish', 'icelandic', 'finnish', 'czech', 'hungarian'],
      dtype=object)

In [63]:
def mse_predict(sample):
    languages = df_freq['language'].values

    mse = []
    for i in range(len(languages)):
        mse.append(mean_squared_error(X[i], sample[0]))

    index = np.argmin(mse)
    return languages[index]

In [65]:
mse_predict(sample_dl)

'english'

In [66]:
mse_predict(sample_ml_en)

'english'

In [67]:
mse_predict(sample_ml_pt)

'portuguese'