In [166]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer, one_hot
from keras.models import Sequential
from keras.models import load_model
from keras.utils import to_categorical
from keras.layers import Activation, Dense, Dropout
from keras.callbacks import ModelCheckpoint, TensorBoard
import keras.optimizers
from sklearn.preprocessing import LabelBinarizer
import sklearn.datasets as skds
from pathlib import Path
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re
import os
from unidecode import unidecode
from keras import layers

max_letters=12
char_count=26
#char_count=104

In [167]:
file_names = os.listdir('./training/clean/')
# Create Dictionary for language (= File name) and Text -- rename the files such as: en_clean.txt = english etc.
file_name_and_text = {}
for file in file_names:
    with open('./training/clean/' + file, "r",encoding="utf" ) as target_file:
         file_name_and_text[file] = target_file.read()
data_grouped = (pd.DataFrame.from_dict(file_name_and_text, orient='index')
             .reset_index().rename(index = str, columns = {'index': 'language', 0: 'wikitext'}))
label_names = data_grouped['language'].tolist()
print(label_names)
data_grouped.head()

['English', 'French', 'German', 'Romanian']


Unnamed: 0,language,wikitext
0,English,History of the Jews in Romania The history of...
1,French,La Roche-sur-Yon La Roche-sur-Yon est une com...
2,German,Paris Paris ist die Hauptstadt der Französisc...
3,Romanian,Avrig Avrig în dialectul săsesc Frek Fraek în...


In [168]:
def process(page_content, max_word_length):
    count=0
    # unidecode will replace special chars like ă or ü with a or u
    #page_content = unidecode(page_content)
    words = re.sub(r'[^a-zA-Z ]', ' ', page_content)
    lower = words.lower()
    word_list = lower.split()
    short_words = []
    for word in word_list:
        # set to lower number for testing, limits total number of words used
        if count >= 100000:
            break
            # and len(word) > 2
        if len(word) <= max_word_length and len(word) > 1:
            short_words.append(word)
            count=count+1
    
    # sorting may be better for machine learning but will cause loss of end of alphabet words if full list is not used
    #short_words=list(set(short_words))
    #short_words.sort()
    return short_words

#test= process(data_grouped.loc[data_grouped.language == 'English','wikitext'].tolist()[0], 12)
#print(len(test))



In [169]:
# one hot encode words here
def convert_dic_to_vector(dic, max_word_length):
    new_list = []
    for word in dic:
        vec = ''
        n = len(word)
        for i in range(n):
            current_letter = word[i]
            ind = ord(current_letter)-97
            #ind = ord(current_letter)
            placeholder = (str(0)*ind) + str(1) + str(0)*((char_count-1)-ind)
            vec = vec + placeholder
        if n < max_word_length:
            excess = max_word_length-n
            vec = vec +str(0)*char_count*excess
        new_list.append(vec)
    print(len(new_list))
    return new_list

In [170]:
# test special charactor values here
ord("ă")-97

162

In [171]:
# one hot encode words here
def create_output_vector(tag_index, number_of_languages):
    out = str(0)*tag_index + str(1) + str(0)*(number_of_languages-1-tag_index)
    return out

In [172]:
# build dictionaries to train from
word_data = []
language_data = []
master_dic = []

count = 0

for lang in label_names:
    print('generating dictionary for ' + lang)
    dic = process(data_grouped.loc[data_grouped.language == lang,'wikitext'].tolist()[0], max_letters)
    for word in dic:
        master_dic.append(word)
    vct = convert_dic_to_vector(dic, max_letters)
    for vector in vct:
        word_data.append(vector)
    output_vct = create_output_vector(count, len(label_names))
    for i in range(len(vct)):
        language_data.append(output_vct)
    count += 1

generating dictionary for English
100000
generating dictionary for French
100000
generating dictionary for German
100000
generating dictionary for Romanian
100000


In [173]:
#data_grouped.loc[data_grouped.language == "romanian",'wikitext'].tolist()[0]

In [174]:
# one hot encoded dataframe
arr = []
for i in range(len(word_data)):
    entry = []
    entry.append(master_dic[i])
    for digit in language_data[i]:
        entry.append(float(digit))
    for digit in word_data[i]:
        entry.append(float(digit))
    arr.append(entry)
    #print(entry)

#uncomment to save large arr
#arr = np.array(arr)
#np.save('arr.npy', arr)
#df=pd.DataFrame(arr)
#df.to_csv('data.csv')

In [175]:
#df=pd.DataFrame(arr)
#df.head()

In [176]:
#data = np.load('arr.npy')
data = np.array(arr)

In [177]:
#split into train and test, verify array shapes

inputs = data[:, 2+len(label_names):]
labels = data[:, 1:1+len(label_names)]

#inputs = data[:, 6:]
#labels = data[:, 1:5]

x_train, x_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.20)

print(x_test.shape)
print(y_test.shape)
print(x_train.shape)
print(y_train.shape)

(80000, 311)
(80000, 4)
(320000, 311)
(320000, 4)


In [222]:
# build the brain
network = Sequential()
network.add(Dense(512, input_dim=(char_count*max_letters)-1))
network.add(Activation('relu'))
network.add(Dropout(0.5))
network.add(Dense(512, activation='sigmoid'))
network.add(Dropout(0.4))
network.add(Dense(512, activation='sigmoid'))
network.add(Dropout(0.3))
network.add(Dense(len(label_names), activation='softmax'))

network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [179]:
# network = Sequential()
# network.add(Dense(512, input_shape=((char_count*max_letters)-1,)))
# network.add(Activation('relu'))
# network.add(Dropout(0.3))
# network.add(Dense(512))
# network.add(Activation('relu'))
# network.add(Dropout(0.3))
# network.add(Dense(len(label_names)))
# network.add(Activation('softmax'))
# network.summary()
 
# network.compile(loss='categorical_crossentropy',
#               optimizer='adam',
#               metrics=['accuracy'])
 
# history = network.fit(x_train, y_train,
#                     batch_size=1000,
#                     epochs=30,
#                     verbose=1,
#                     validation_split=0.1)

In [223]:
# settings for loggin, nice to use with tensorboard
filepath = "weights.hdf5"
#checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, mode='max')
tboard = TensorBoard(log_dir='logs', write_graph=True, write_images=True)
callbacks_list = [checkpoint, tboard]

#train the brain, maybe best is 200 epochs and 1000 batch size
network.fit(x_train, y_train, epochs=100, batch_size=1000, validation_data=(x_test, y_test), callbacks=callbacks_list)


Train on 320000 samples, validate on 80000 samples
Epoch 1/100

Epoch 00001: saving model to weights.hdf5
Epoch 2/100

Epoch 00002: saving model to weights.hdf5
Epoch 3/100

Epoch 00003: saving model to weights.hdf5
Epoch 4/100

Epoch 00004: saving model to weights.hdf5
Epoch 5/100

Epoch 00005: saving model to weights.hdf5
Epoch 6/100

Epoch 00006: saving model to weights.hdf5
Epoch 7/100

Epoch 00007: saving model to weights.hdf5
Epoch 8/100

Epoch 00008: saving model to weights.hdf5
Epoch 9/100

Epoch 00009: saving model to weights.hdf5
Epoch 10/100

Epoch 00010: saving model to weights.hdf5
Epoch 11/100

Epoch 00011: saving model to weights.hdf5
Epoch 12/100

Epoch 00012: saving model to weights.hdf5
Epoch 13/100

Epoch 00013: saving model to weights.hdf5
Epoch 14/100

Epoch 00014: saving model to weights.hdf5
Epoch 15/100

Epoch 00015: saving model to weights.hdf5
Epoch 16/100

Epoch 00016: saving model to weights.hdf5
Epoch 17/100

Epoch 00017: saving model to weights.hdf5
Epoch 


Epoch 00042: saving model to weights.hdf5
Epoch 43/100

Epoch 00043: saving model to weights.hdf5
Epoch 44/100

Epoch 00044: saving model to weights.hdf5
Epoch 45/100

Epoch 00045: saving model to weights.hdf5
Epoch 46/100

Epoch 00046: saving model to weights.hdf5
Epoch 47/100

Epoch 00047: saving model to weights.hdf5
Epoch 48/100

Epoch 00048: saving model to weights.hdf5
Epoch 49/100

Epoch 00049: saving model to weights.hdf5
Epoch 50/100

Epoch 00050: saving model to weights.hdf5
Epoch 51/100

Epoch 00051: saving model to weights.hdf5
Epoch 52/100

Epoch 00052: saving model to weights.hdf5
Epoch 53/100

Epoch 00053: saving model to weights.hdf5
Epoch 54/100

Epoch 00054: saving model to weights.hdf5
Epoch 55/100

Epoch 00055: saving model to weights.hdf5
Epoch 56/100

Epoch 00056: saving model to weights.hdf5
Epoch 57/100

Epoch 00057: saving model to weights.hdf5
Epoch 58/100

Epoch 00058: saving model to weights.hdf5
Epoch 59/100

Epoch 00059: saving model to weights.hdf5
Epoch


Epoch 00084: saving model to weights.hdf5
Epoch 85/100

Epoch 00085: saving model to weights.hdf5
Epoch 86/100

Epoch 00086: saving model to weights.hdf5
Epoch 87/100

Epoch 00087: saving model to weights.hdf5
Epoch 88/100

Epoch 00088: saving model to weights.hdf5
Epoch 89/100

Epoch 00089: saving model to weights.hdf5
Epoch 90/100

Epoch 00090: saving model to weights.hdf5
Epoch 91/100

Epoch 00091: saving model to weights.hdf5
Epoch 92/100

Epoch 00092: saving model to weights.hdf5
Epoch 93/100

Epoch 00093: saving model to weights.hdf5
Epoch 94/100

Epoch 00094: saving model to weights.hdf5
Epoch 95/100

Epoch 00095: saving model to weights.hdf5
Epoch 96/100

Epoch 00096: saving model to weights.hdf5
Epoch 97/100

Epoch 00097: saving model to weights.hdf5
Epoch 98/100

Epoch 00098: saving model to weights.hdf5
Epoch 99/100

Epoch 00099: saving model to weights.hdf5
Epoch 100/100

Epoch 00100: saving model to weights.hdf5


<keras.callbacks.callbacks.History at 0x170c99e6f98>

In [225]:
    # make prediction and show guess percents
    #ă ü
    dic = []
    dic.append(unidecode("gesundkeit").lower())
    vct_str = convert_dic_to_vector(dic, max_letters-1)
    vct = np.zeros((1, (char_count * max_letters)-1))
    count = 0
    print(len(vct_str[0]))
    for digit in vct_str[0]:
        vct[0,count] = int(digit)
        count += 1
    prediction_vct = network.predict(vct)

    langs = list(label_names)
    for i in range(len(label_names)):
        lang = langs[i]
        score = prediction_vct[0][i]
        print(lang + ': ' + str(round(100*score, 2)) + '%')
    print('\n')

1
286
English: 1.01%
French: 1.77%
German: 30.05%
Romanian: 67.18%




In [224]:
# explicitly save model
network.save('lang_detect_n2.hdf5')

In [183]:
# delete to test loading of model
del network

In [216]:
# load model
network = load_model('lang_detect_long.hdf5')


In [None]:
%tensorboard --logdir logs