In [1]:
# Import libraries

# core libs
import random
from collections import Counter

# numpy
import numpy as np

# Sklearn
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# keras
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Embedding, Conv1D, MaxPool1D, Flatten, Dense
from keras.models import Model

Using TensorFlow backend.


In [4]:
# Report the versions of the main libraries this notebook was run with
print(
    f'keras= {keras.__version__}',
    f'sklearn= {sklearn.__version__}',
    f'numpy= {np.__version__}',
    sep='\n',
)

keras= 2.2.4
sklearn= 0.20.4
numpy= 1.16.4


In [6]:
# Basic configuration
# Directory holding one raw text file per language (<lang_code>.txt).
# NOTE(review): absolute, machine-specific path - adjust for your environment.
data_dir = '/media/divyesh/WorkSpace/Blogs/raw_data'
# Number of lines (treated as articles) kept from the start of each language file
num_of_articles = 10000
# Fixed sequence length: encoded inputs are padded/truncated to this many characters
sentense_len = 150
# Shingle (substring) lengths, in characters, sampled from every line
shingles_range = (70, 100, 130)
# Number of shingles sampled per line for each shingle length
shingle_per_line = 10
# Token that stands in for any out-of-vocabulary character
oov_str = 'oov'
# Seed Python's RNG so the randomly sampled shingles are identical on every run
random_seed = 42
random.seed(random_seed)

In [29]:
# Map each two-letter language code to the full language name
lang_code_dict = dict(
    en='english',
    de='german',
    fr='french',
    it='italian',
    es='spanish',
)

In [8]:
# Map each language code to its raw data file: <data_dir>/<code>.txt
data_info = {
    code: f'{data_dir}/{code}.txt'
    for code in ('en', 'de', 'fr', 'it', 'es')
}

In [9]:
# Show, for every language, its code, full name and data file
for code in data_info:
    print(code, lang_code_dict[code], data_info[code])

en english /media/divyesh/WorkSpace/Blogs/raw_data/en.txt
de german /media/divyesh/WorkSpace/Blogs/raw_data/de.txt
fr french /media/divyesh/WorkSpace/Blogs/raw_data/fr.txt
it italian /media/divyesh/WorkSpace/Blogs/raw_data/it.txt
es spanish /media/divyesh/WorkSpace/Blogs/raw_data/es.txt


In [10]:
# Load the first `num_of_articles` lines of every language file
data_dict = {}
for code, path in data_info.items():
    with open(path, encoding='utf-8') as handle:
        head = handle.readlines()[:num_of_articles]
    # normalise: lower-case, then trim surrounding whitespace (incl. the newline)
    data_dict[code] = [text.lower().strip() for text in head]
    print(code, len(data_dict[code]))

en 10000
de 10000
fr 10000
it 10000
es 10000


In [11]:
def generate_shingles(line, length, total):
    """
    Sample random fixed-length substrings ("shingles") from a single line.

    When the line is longer than `length`, `total` substrings of exactly
    `length` characters are taken at random start positions (the same
    shingle may be drawn more than once). Otherwise the whole line is
    returned as the only shingle.
    """
    # TODO: de-duplicate the sampled shingles (e.g. collect them in a set)
    last_start = len(line) - length
    if last_start <= 0:
        # the line is not longer than the shingle length: keep it whole
        return [line]
    starts = [random.randint(0, last_start) for _ in range(total)]
    return [line[begin:begin + length] for begin in starts]

In [12]:
def generate_shingles_lines(line, length, total):
    """
    Generate shingles from a list of lines.

    Parameters
    ----------
    line : iterable of str
        The lines to sample from. The name is singular only to keep the
        existing signature; the argument is a collection of lines.
    length : int
        Shingle length, forwarded to `generate_shingles`.
    total : int
        Number of shingles requested per line, forwarded to
        `generate_shingles`.

    Returns
    -------
    list of str
        The shingles of all lines, concatenated in input order.
    """
    shingle_list = []
    # Iterate over the argument itself rather than any module-level
    # `lines` variable, so the result depends only on what the caller passes.
    for text in line:
        shingle_list.extend(generate_shingles(line=text, length=length, total=total))
    return shingle_list

In [13]:
# Generate shingles for every language: for each configured shingle length,
# request `shingle_per_line` shingles from every line of that language.
shingle_data_dict = {}
# NOTE(review): do not rename the loop variable `lines` without checking
# generate_shingles_lines, which may depend on a module-level variable of that name.
for lang, lines in data_dict.items():
    shingle_list = []
    for s_range in shingles_range:
        # s_range is one shingle length (in characters) from the config
        shingles = generate_shingles_lines(lines, s_range, shingle_per_line)
        shingle_list.extend(shingles)
    shingle_data_dict[lang] = shingle_list
    # number of samples produced for this language
    print(lang, len(shingle_list))

en 300000
de 300000
fr 300000
it 300000
es 300000


In [14]:
# Flatten the per-language shingles into two parallel lists: samples and labels
data_lines, labels = [], []
for lang, samples in shingle_data_dict.items():
    data_lines += samples
    # one label (the language code) per sample
    labels += [lang] * len(samples)
print(len(data_lines), len(labels))

1500000 1500000


In [15]:
# Number of most frequent characters kept in the vocabulary
vocab_size = 76

# For every character, count the number of data lines it appears in
# (set() makes each line contribute at most once per character).
# Counting straight from the strings avoids first building a separate
# list of characters for every data line.
char_counts = Counter(ch for line in data_lines for ch in set(line))

# Vocabulary: the most common characters, followed by the OOV token
char_vocab = [ch for ch, _ in char_counts.most_common(vocab_size)] + [oov_str]
print(char_vocab)

[' ', 'e', 'a', 'i', 'n', 'r', 't', 's', 'o', 'l', 'd', 'c', 'u', 'm', 'p', 'g', 'h', 'b', 'f', 'v', 'w', 'z', 'y', 'k', 'é', 'q', 'j', 'x', 'ó', 'í', 'ü', 'á', 'ä', 'è', 'ö', 'à', 'ñ', 'ú', 'ß', 'ò', 'ç', 'ù', 'ê', 'ô', 'â', 'î', 'ì', 'œ', 'û', 'ï', 'ō', '²', 'š', 'ë', 'č', 'ã', 'ł', 'ā', 'ø', 'ć', 'ū', 'ž', 'ı', 'å', 'ř', 'ş', 'ý', 'æ', 'α', 'ο', 'ă', 'о', 'а', 'ń', 'н', 'ν', 'oov']


In [16]:
# Character -> integer index; indices start at 1 because 0 is kept for padding
ch2int = {ch: idx for idx, ch in enumerate(char_vocab, start=1)}
print(ch2int)
print()
# Reverse lookup: integer index -> character
int2ch = dict(zip(ch2int.values(), ch2int.keys()))
print(int2ch)

{' ': 1, 'e': 2, 'a': 3, 'i': 4, 'n': 5, 'r': 6, 't': 7, 's': 8, 'o': 9, 'l': 10, 'd': 11, 'c': 12, 'u': 13, 'm': 14, 'p': 15, 'g': 16, 'h': 17, 'b': 18, 'f': 19, 'v': 20, 'w': 21, 'z': 22, 'y': 23, 'k': 24, 'é': 25, 'q': 26, 'j': 27, 'x': 28, 'ó': 29, 'í': 30, 'ü': 31, 'á': 32, 'ä': 33, 'è': 34, 'ö': 35, 'à': 36, 'ñ': 37, 'ú': 38, 'ß': 39, 'ò': 40, 'ç': 41, 'ù': 42, 'ê': 43, 'ô': 44, 'â': 45, 'î': 46, 'ì': 47, 'œ': 48, 'û': 49, 'ï': 50, 'ō': 51, '²': 52, 'š': 53, 'ë': 54, 'č': 55, 'ã': 56, 'ł': 57, 'ā': 58, 'ø': 59, 'ć': 60, 'ū': 61, 'ž': 62, 'ı': 63, 'å': 64, 'ř': 65, 'ş': 66, 'ý': 67, 'æ': 68, 'α': 69, 'ο': 70, 'ă': 71, 'о': 72, 'а': 73, 'ń': 74, 'н': 75, 'ν': 76, 'oov': 77}

{1: ' ', 2: 'e', 3: 'a', 4: 'i', 5: 'n', 6: 'r', 7: 't', 8: 's', 9: 'o', 10: 'l', 11: 'd', 12: 'c', 13: 'u', 14: 'm', 15: 'p', 16: 'g', 17: 'h', 18: 'b', 19: 'f', 20: 'v', 21: 'w', 22: 'z', 23: 'y', 24: 'k', 25: 'é', 26: 'q', 27: 'j', 28: 'x', 29: 'ó', 30: 'í', 31: 'ü', 32: 'á', 33: 'ä', 34: 'è', 35: 'ö', 36: '

In [17]:
def encode(in_ls, key):
    """
    Map every character of `in_ls` to its integer index in `key`.

    Characters that are missing from `key` receive the index stored under
    the out-of-vocabulary token (`oov_str`).
    """
    lookup = key.get
    return [
        # fall back to the OOV index only when the character is unknown
        code if code is not None else lookup(oov_str)
        for code in map(lookup, in_ls)
    ]

In [18]:
# Encode every data line as a list of character indices
encoded_ls = [encode(text, ch2int) for text in data_lines]
print(len(encoded_ls))

1500000


In [19]:
# Number of encoded samples that contain at least one out-of-vocabulary character
counts = sum(1 for enc in encoded_ls if ch2int[oov_str] in enc)
counts

15790

In [20]:
# Pad or truncate every encoded sequence at its end so all have length `sentense_len`
X = pad_sequences(
    encoded_ls,
    maxlen=sentense_len,
    padding='post',
    truncating='post',
)

In [21]:
# Encode the language-code labels ('de', 'en', ...) as integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(labels)
encoded_labels = label_encoder.transform(labels)
print(label_encoder.classes_)

['de' 'en' 'es' 'fr' 'it']


In [22]:
# One-hot encode the integer class ids so they match the softmax output of the model
y = to_categorical(encoded_labels)

In [23]:
# Sanity check: features and targets must have the same number of rows
print(X.shape, y.shape)

(1500000, 150) (1500000, 5)


In [24]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    random_state=42,
)

In [26]:
# Sanity check: shapes of the train/test features and targets
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1050000, 150) (450000, 150) (1050000, 5) (450000, 5)


In [27]:
# Build the neural network:
# character embedding -> two (Conv1D, Conv1D, MaxPool1D) stages -> dense classifier
# One output unit per language known to the label encoder
num_classes = len(label_encoder.classes_)

inp = Input(shape=(sentense_len, ))
# input_dim is the vocabulary size plus one, because index 0 is kept for padding
x = Embedding(input_dim=len(char_vocab) + 1, output_dim=32)(inp)
x = Conv1D(64, 3, activation='relu')(x)
x = Conv1D(64, 3, activation='relu')(x)
x = MaxPool1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = Conv1D(128, 3, activation='relu')(x)
x = MaxPool1D(3)(x)
x = Flatten()(x)
x = Dense(256, activation='relu')(x)
x = Dense(num_classes, activation='softmax')(x)
# `outputs` is the Keras 2 keyword; the singular `output` is a deprecated
# spelling that triggers a warning
model = Model(inputs=inp, outputs=x)
model.summary()
# categorical cross-entropy matches the one-hot encoded targets
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 32)           2496      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 148, 64)           6208      
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 146, 64)           12352     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 48, 64)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 46, 128)           24704     
_________________________________________________________________
conv

  del sys.path[0]


In [28]:
# Train for 5 epochs; the held-out split is passed as validation data
# NOTE(review): the same split is later used for the final report
model.fit(
    X_train, y_train,
    batch_size=128,
    epochs=5,
    validation_data=(X_test, y_test),
)

Instructions for updating:
Use tf.cast instead.
Train on 1050000 samples, validate on 450000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f7e4fb0f710>

In [30]:
# Predict on the test data; each row holds one softmax score per language class
pred = model.predict(X_test)

In [31]:
# Turn score rows / one-hot rows into integer class ids (index of the largest entry)
pred_y = np.argmax(pred, axis=1).ravel()
actual_y = np.argmax(y_test, axis=1).ravel()

In [32]:
# Per-language precision / recall / F1 on the test split
report = classification_report(
    y_true=actual_y,
    y_pred=pred_y,
    target_names=label_encoder.classes_,
)
print(report)

              precision    recall  f1-score   support

          de       0.97      0.98      0.98     89924
          en       0.98      0.97      0.97     89480
          es       0.99      0.98      0.99     90056
          fr       0.97      0.98      0.97     90318
          it       0.98      0.98      0.98     90222

   micro avg       0.98      0.98      0.98    450000
   macro avg       0.98      0.98      0.98    450000
weighted avg       0.98      0.98      0.98    450000



In [33]:
def predict(line):
    """
    Predict the language of a single line of text.

    The text is normalised the same way as the training lines (lower-cased
    and stripped of surrounding whitespace), encoded character by character
    with `ch2int`, padded/truncated to `sentense_len` and scored by `model`.

    Parameters
    ----------
    line : str
        Raw input text.

    Returns
    -------
    tuple
        (predicted language code, score of that class).
    """
    # keep inference preprocessing identical to the data-loading step
    line = line.lower().strip()
    # iterating a string yields its characters, so it can be encoded directly
    encoded = encode(line, ch2int)
    padded = pad_sequences([encoded], maxlen=sentense_len, truncating='post', padding='post')
    # the batch holds a single sample; take its row of class scores
    scores = model.predict(padded)[0]
    max_index = scores.argmax()
    return label_encoder.classes_[max_index], scores[max_index]

In [34]:
# sample prediction: prints the (language code, score) tuple returned by predict
print(predict('this is sample text'))

('en', 0.80307883)


In [35]:
# Real-world sample sentences taken from Google News.
# Each entry is (expected language code, text).
test_data = [
    ('en', 'Today rural India and its villages have declared themselves'),
    ('de', 'Es ist einer dieser Momente, bei denen man dabei gewesen sein will'),
    ('fr', 'Mais rien ne permet pour l’instant de confirmer ces propos.'),
    ('it', 'Il peso della compartecipazione dei cittadini (il ticket appunto) sarà cacolato'),
    ('es', 'Después de la evaluación y las pruebas médicas, se descubrió que tenía un')
]

In [36]:
# Run the model on every real-world sample and show prediction next to the expected language
for expected_lang, text in test_data:
    print('-----------------')
    print(f'Data:{text}')
    outcome = predict(text)
    print(f'Predicted:{outcome}, Actual:{expected_lang}')

-----------------
Data:Today rural India and its villages have declared themselves
Predicted:('en', 0.9981042), Actual:en
-----------------
Data:Es ist einer dieser Momente, bei denen man dabei gewesen sein will
Predicted:('de', 0.99996316), Actual:de
-----------------
Data:Mais rien ne permet pour l’instant de confirmer ces propos.
Predicted:('fr', 0.9699458), Actual:fr
-----------------
Data:Il peso della compartecipazione dei cittadini (il ticket appunto) sarà cacolato
Predicted:('it', 0.99970454), Actual:it
-----------------
Data:Después de la evaluación y las pruebas médicas, se descubrió que tenía un
Predicted:('es', 0.999961), Actual:es
