In [54]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer, one_hot
from keras.models import Sequential
from keras.models import load_model
from keras.utils import to_categorical
from keras.layers import Activation, Dense, Dropout
from keras.callbacks import ModelCheckpoint, TensorBoard
import keras.optimizers
from sklearn.preprocessing import LabelBinarizer
import sklearn.datasets as skds
from pathlib import Path
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re
from unidecode import unidecode

max_letters=12
char_count=104

In [55]:
# Source file directory
path_train = "./training"
 
files_train = skds.load_files(path_train,load_content=False)
 
label_index = files_train.target
label_names = files_train.target_names
labelled_files = files_train.filenames

data_tags = ["language","wikitext"]
data_list = []

In [56]:
data_tags

['language', 'wikitext']

In [57]:
# Read and add data from file to a list
i=0
for f in labelled_files:
    # data_list.append((f,label_names[label_index[i]],Path(f).read_text(encoding="utf8")))
    data_list.append((label_names[label_index[i]],Path(f).read_text(encoding="utf8")))
    i += 1
 
# We have training data available as dictionary filename, category, data
data = pd.DataFrame.from_records(data_list, columns=data_tags)

In [58]:
# instead of all this, should just stick with unsplit data files

data_english = data.loc[data['language'] == "english"].reset_index(drop=True)
data_german = data.loc[data['language'] == "german"].reset_index(drop=True)
data_french = data.loc[data['language'] == "french"].reset_index(drop=True)
data_romanian = data.loc[data['language'] == "romanian"].reset_index(drop=True)



data_sorted=pd.concat([data_english, data_german, data_french, data_romanian]).sort_index(kind='mergesort').reset_index(drop=True)
data_sorted.head(20)
data_english.head()
data_english_m=data_english.groupby('language')['wikitext'].apply(' '.join).reset_index()
data_french_m=data_french.groupby('language')['wikitext'].apply(' '.join).reset_index()
data_german_m=data_german.groupby('language')['wikitext'].apply(' '.join).reset_index()
data_romanian_m=data_romanian.groupby('language')['wikitext'].apply(' '.join).reset_index()

data_grouped=data.groupby('language')['wikitext'].apply(' '.join).reset_index()
data_grouped.head()

Unnamed: 0,language,wikitext
0,english,Zealand and Australia for the first time The b...
1,french,parlement de Bosnie-Herzégovine Les électeurs ...
2,german,die westliche Linke könne die Sowjetunion kein...
3,romanian,în secolul al XIV-lea și comiți ai comitatului...


In [59]:
def process(page_content, max_word_length):
    count=0
    # unidecode will replace special chars like ă or ü with a or u
    #page_content = unidecode(page_content)
    words = re.sub(r'[^a-zA-Z ]', ' ', page_content)
    lower = words.lower()
    word_list = lower.split()
    short_words = []
    for word in word_list:
        # set to lower number for testing, limits total number of words used
        if count >= 10000:
            break
        if len(word) <= max_word_length and len(word) > 2:
            short_words.append(word)
            count=count+1
    
    # sorting may be better for machine learning but will cause loss of end of alphabet words if full list is not used
    #short_words=list(set(short_words))
    #short_words.sort()
    return short_words

test= process(data_grouped.loc[data_grouped.language == 'english','wikitext'].tolist()[0], 12)
print(len(test))



10000


In [60]:
# one hot encode words here
def convert_dic_to_vector(dic, max_word_length):
    new_list = []
    for word in dic:
        vec = ''
        n = len(word)
        for i in range(n):
            current_letter = word[i]
            ind = ord(current_letter)-97
            #ind = ord(current_letter)
            placeholder = (str(0)*ind) + str(1) + str(0)*((char_count-1)-ind)
            vec = vec + placeholder
        if n < max_word_length:
            excess = max_word_length-n
            vec = vec +str(0)*char_count*excess
        new_list.append(vec)
    print(len(new_list))
    return new_list

In [61]:
# test special charactor values here
ord("ă")-97

162

In [62]:
# one hot encode words here
def create_output_vector(tag_index, number_of_languages):
    out = str(0)*tag_index + str(1) + str(0)*(number_of_languages-1-tag_index)
    return out

In [63]:
# build dictionaries to train from
word_data = []
language_data = []
master_dic = []

count = 0

for lang in label_names:
    print('generating dictionary for ' + lang)
    dic = process(data_grouped.loc[data_grouped.language == lang,'wikitext'].tolist()[0], max_letters)
    for word in dic:
        master_dic.append(word)
    vct = convert_dic_to_vector(dic, max_letters)
    for vector in vct:
        word_data.append(vector)
    output_vct = create_output_vector(count, len(label_names))
    for i in range(len(vct)):
        language_data.append(output_vct)
    count += 1

generating dictionary for english
10000
generating dictionary for french
10000
generating dictionary for german
10000
generating dictionary for romanian
10000


In [64]:
#data_grouped.loc[data_grouped.language == "romanian",'wikitext'].tolist()[0]

In [65]:
# one hot encoded dataframe
arr = []
for i in range(len(word_data)):
    entry = []
    entry.append(master_dic[i])
    for digit in language_data[i]:
        entry.append(float(digit))
    for digit in word_data[i]:
        entry.append(float(digit))
    arr.append(entry)
    #print(entry)


arr = np.array(arr)
np.save('arr.npy', arr)
#df=pd.DataFrame(arr)
#df.to_csv('data.csv')

In [66]:
df=pd.DataFrame(arr)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1243,1244,1245,1246,1247,1248,1249,1250,1251,1252
0,zealand,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,and,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,australia,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,for,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,the,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
data = np.load('arr.npy')

In [68]:
#split into train and test, verify array shapes

inputs = data[:, 2+len(label_names):]
labels = data[:, 1:1+len(label_names)]

#inputs = data[:, 6:]
#labels = data[:, 1:5]

x_train, x_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.20)

print(x_test.shape)
print(y_test.shape)
print(x_train.shape)
print(y_train.shape)

(8000, 1247)
(8000, 4)
(32000, 1247)
(32000, 4)


In [69]:
# build the brain
network = Sequential()
network.add(Dense(200, input_dim=(char_count*max_letters)-1, activation='sigmoid'))
network.add(Dense(150, activation='sigmoid'))
network.add(Dense(100, activation='sigmoid'))
network.add(Dense(100, activation='sigmoid'))
network.add(Dense(len(label_names), activation='softmax'))

network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [70]:
# settings for loggin, nice to use with tensorboard
filepath = "weights.hdf5"
#checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, mode='max')
tboard = TensorBoard(log_dir='logs', write_graph=True, write_images=True)
callbacks_list = [checkpoint, tboard]

#train the brain, maybe best is 200 epochs and 1000 batch size
network.fit(x_train, y_train, epochs=10, batch_size=1000, validation_data=(x_test, y_test), callbacks=callbacks_list)


Train on 32000 samples, validate on 8000 samples
Epoch 1/10

Epoch 00001: saving model to weights.hdf5
Epoch 2/10

Epoch 00002: saving model to weights.hdf5
Epoch 3/10

Epoch 00003: saving model to weights.hdf5
Epoch 4/10

Epoch 00004: saving model to weights.hdf5
Epoch 5/10

Epoch 00005: saving model to weights.hdf5
Epoch 6/10

Epoch 00006: saving model to weights.hdf5
Epoch 7/10

Epoch 00007: saving model to weights.hdf5
Epoch 8/10

Epoch 00008: saving model to weights.hdf5
Epoch 9/10

Epoch 00009: saving model to weights.hdf5
Epoch 10/10

Epoch 00010: saving model to weights.hdf5


<keras.callbacks.callbacks.History at 0x1b70d4354e0>

In [73]:
    # make prediction and show guess percents
    #ă ü
    dic = []
    dic.append("bonjour")
    vct_str = convert_dic_to_vector(dic, max_letters-1)
    vct = np.zeros((1, (char_count * max_letters)-1))
    count = 0
    print(len(vct_str[0]))
    for digit in vct_str[0]:
        vct[0,count] = int(digit)
        count += 1
    prediction_vct = network.predict(vct)

    langs = list(label_names)
    for i in range(len(label_names)):
        lang = langs[i]
        score = prediction_vct[0][i]
        print(lang + ': ' + str(round(100*score, 2)) + '%')
    print('\n')

1
1144
english: 79.38%
french: 16.13%
german: 1.0%
romanian: 3.49%




In [74]:
# explicitly save model
network.save('lang_detect.hdf5')

In [52]:
# delete to test loading of model
del network

In [53]:
# load model
network = load_model('lang_detect.hdf5')
