In [54]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer, one_hot
from keras.models import Sequential
from keras.models import load_model
from keras.utils import to_categorical
from keras.layers import Activation, Dense, Dropout
from keras.callbacks import ModelCheckpoint, TensorBoard
import keras.optimizers
from sklearn.preprocessing import LabelBinarizer
import sklearn.datasets as skds
from pathlib import Path
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re
import os
from unidecode import unidecode

max_letters=12
char_count=84

In [55]:
file_names = os.listdir('./training/clean/')
# Create Dictionary for language (= File name) and Text -- rename the files such as: en_clean.txt = english etc.
file_name_and_text = {}
for file in file_names:
    with open('./training/clean/' + file, "r",encoding="utf8" ) as target_file:
         file_name_and_text[file] = target_file.read()
data_grouped = (pd.DataFrame.from_dict(file_name_and_text, orient='index')
             .reset_index().rename(index = str, columns = {'index': 'language', 0: 'wikitext'}))
label_names = data_grouped['language'].tolist()
print(label_names)
data_grouped.head()

['English', 'French', 'German', 'Romanian']


Unnamed: 0,language,wikitext
0,English,History of the Jews in Romania The history of...
1,French,La Roche-sur-Yon La Roche-sur-Yon est une com...
2,German,Paris Paris ist die Hauptstadt der Französisc...
3,Romanian,Avrig Avrig în dialectul săsesc Frek Fraek în...


In [56]:
def process(page_content, max_word_length):
    count=0
    # unidecode will replace special chars like ă or ü with a or u
    #page_content = unidecode(page_content)
    words = re.sub(r'[^a-zA-Z ]', ' ', page_content)
    lower = words.lower()
    word_list = lower.split()
    short_words = []
    for word in word_list:
        # set to lower number for testing, limits total number of words used
        if count >= 100000:
            break
        if len(word) <= max_word_length and len(word) > 2:
            short_words.append(word)
            count=count+1
    
    # sorting may be better for machine learning but will cause loss of end of alphabet words if full list is not used
    #short_words=list(set(short_words))
    #short_words.sort()
    return short_words

test= process(data_grouped.loc[data_grouped.language == 'English','wikitext'].tolist()[0], 12)
print(len(test))



100000


In [57]:
# one hot encode words here
def convert_dic_to_vector(dic, max_word_length):
    new_list = []
    for word in dic:
        vec = ''
        n = len(word)
        for i in range(n):
            current_letter = word[i]
            ind = ord(current_letter)-97
            #ind = ord(current_letter)
            placeholder = (str(0)*ind) + str(1) + str(0)*((char_count-1)-ind)
            vec = vec + placeholder
        if n < max_word_length:
            excess = max_word_length-n
            vec = vec +str(0)*char_count*excess
        new_list.append(vec)
    print(len(new_list))
    return new_list

In [58]:
# test special charactor values here
ord("ă")-97

162

In [59]:
# one hot encode words here
def create_output_vector(tag_index, number_of_languages):
    out = str(0)*tag_index + str(1) + str(0)*(number_of_languages-1-tag_index)
    return out

In [60]:
# build dictionaries to train from
word_data = []
language_data = []
master_dic = []

count = 0

for lang in label_names:
    print('generating dictionary for ' + lang)
    dic = process(data_grouped.loc[data_grouped.language == lang,'wikitext'].tolist()[0], max_letters)
    for word in dic:
        master_dic.append(word)
    vct = convert_dic_to_vector(dic, max_letters)
    for vector in vct:
        word_data.append(vector)
    output_vct = create_output_vector(count, len(label_names))
    for i in range(len(vct)):
        language_data.append(output_vct)
    count += 1

generating dictionary for English
100000
generating dictionary for French
100000
generating dictionary for German
100000
generating dictionary for Romanian
100000


In [61]:
#data_grouped.loc[data_grouped.language == "romanian",'wikitext'].tolist()[0]

In [None]:
# one hot encoded dataframe
arr = []
for i in range(len(word_data)):
    entry = []
    entry.append(master_dic[i])
    for digit in language_data[i]:
        entry.append(float(digit))
    for digit in word_data[i]:
        entry.append(float(digit))
    arr.append(entry)
    #print(entry)


arr = np.array(arr)
np.save('arr.npy', arr)
#df=pd.DataFrame(arr)
#df.to_csv('data.csv')

In [None]:
df=pd.DataFrame(arr)
df.head()

In [None]:
data = np.load('arr.npy')

In [None]:
#split into train and test, verify array shapes

inputs = data[:, 2+len(label_names):]
labels = data[:, 1:1+len(label_names)]

#inputs = data[:, 6:]
#labels = data[:, 1:5]

x_train, x_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.20)

print(x_test.shape)
print(y_test.shape)
print(x_train.shape)
print(y_train.shape)

In [None]:
# build the brain
network = Sequential()
network.add(Dense(200, input_dim=(char_count*max_letters)-1, activation='sigmoid'))
network.add(Dense(150, activation='sigmoid'))
network.add(Dense(100, activation='sigmoid'))
network.add(Dense(100, activation='sigmoid'))
network.add(Dense(len(label_names), activation='softmax'))

network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# settings for loggin, nice to use with tensorboard
filepath = "weights.hdf5"
#checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, mode='max')
tboard = TensorBoard(log_dir='logs', write_graph=True, write_images=True)
callbacks_list = [checkpoint, tboard]

#train the brain, maybe best is 200 epochs and 1000 batch size
network.fit(x_train, y_train, epochs=200, batch_size=1000, validation_data=(x_test, y_test), callbacks=callbacks_list)


In [39]:
    # make prediction and show guess percents
    #ă ü
    dic = []
    dic.append("bonjour")
    vct_str = convert_dic_to_vector(dic, max_letters-1)
    vct = np.zeros((1, (char_count * max_letters)-1))
    count = 0
    print(len(vct_str[0]))
    for digit in vct_str[0]:
        vct[0,count] = int(digit)
        count += 1
    prediction_vct = network.predict(vct)

    langs = list(label_names)
    for i in range(len(label_names)):
        lang = langs[i]
        score = prediction_vct[0][i]
        print(lang + ': ' + str(round(100*score, 2)) + '%')
    print('\n')

1
1144
English: 90.4%
French: 5.07%
German: 3.19%
Romanian: 1.35%




In [40]:
# explicitly save model
network.save('lang_detect.hdf5')

In [52]:
# delete to test loading of model
del network

In [53]:
# load model
network = load_model('lang_detect.hdf5')
