# Identify languages

## Links 
https://github.com/scikit-learn/scikit-learn/blob/master/doc/tutorial/text_analytics/solutions/exercise_01_language_train_model.py

In [63]:
import pathlib
import numpy as np

import warnings
warnings.filterwarnings('ignore')
import pandas as pd

from pandas.api.types import CategoricalDtype

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics

## Get the dataset

Display the directory containing the languages data

In [91]:
!ls data/languages/short_paragraphs/

ar  de	en  es	fr  it	ja  nl	pl  pt	ru


Setup the language folder and the language code

In [92]:
languages_data_folder = pathlib.Path('.') / 'data' / 'languages' / 'short_paragraphs'
language_code =  {
    'ar': 'Arabic', 'de': 'German', 'en': 'English', 'es': 'Spanish', 'fr': 'French',
    'it': 'Italian', 'ja': 'Japanese', 'nl': 'Dutch', 'pl': 'Polish', 'pt': 'Portuguese',
    'ru': 'Russian'}

Load the dataset in the scikit format (parent directory with each category in child directory)

In [93]:
dataset = load_files(languages_data_folder)

Display information about the dataset

In [94]:
pd.Series({"number_of_documents": len(dataset.data), "languages": dataset.target_names})

number_of_documents                                            8782
languages              [ar, de, en, es, fr, it, ja, nl, pl, pt, ru]
dtype: object

Display the number of documents in each languages

In [95]:
lang_map = {idx: lang for idx, lang in enumerate(dataset.target_names)}
lang = pd.Categorical(dataset.target).rename_categories(lang_map)
lang.value_counts()

ar     298
de    1076
en    1082
es    1053
fr    1054
it    1019
nl     580
pl     608
pt    1055
ru     957
dtype: int64

Display an example Spanish document

In [96]:
es_doc_idx = np.nonzero(dataset.target == 3)[0][0]
print(dataset.data[es_doc_idx])

b'debido a la disparidad entre'


Split the data into training and test set

In [98]:
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.5)

## Create the text features

Use a td-idf vectorizer with a sequence of 1 to 3 characters

In [112]:
# characters instead of word tokens
vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char', use_idf=False)

## Create the classifier

Create a pipeline with two stages

In [144]:
clf = Pipeline([
    ('vec', vectorizer),
    ('clf', Perceptron(tol=1e-3)),
])
clf.fit(docs_train, y_train)

Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
   ..._jobs=1, penalty=None, random_state=0,
      shuffle=True, tol=0.001, verbose=0, warm_start=False))])

Use the classifier on the training data to predict language category

In [145]:
y_predicted = clf.predict(docs_test)

## Print the classification report

In [146]:
print(metrics.classification_report(
    y_test, y_predicted, target_names=dataset.target_names))

             precision    recall  f1-score   support

         ar       0.64      1.00      0.78       150
         de       0.97      0.98      0.97       519
         en       0.94      0.96      0.95       521
         es       0.92      0.89      0.91       540
         fr       0.95      0.94      0.95       532
         it       0.95      0.94      0.95       519
         ja       0.97      0.87      0.92       295
         nl       0.99      0.97      0.98       299
         pl       0.93      0.89      0.91       542
         pt       1.00      0.98      0.99       474

avg / total       0.95      0.94      0.94      4391



Print the confusion matrix

In [147]:
metrics.confusion_matrix(y_test, y_predicted)

array([[150,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  3, 510,   3,   0,   0,   0,   3,   0,   0,   0],
       [  8,   3, 498,   3,   4,   1,   0,   1,   3,   0],
       [ 18,   1,   4, 482,   7,  12,   0,   1,  15,   0],
       [ 11,   0,   2,   6, 502,   3,   3,   0,   5,   0],
       [  6,   1,   5,   7,   3, 488,   0,   1,   8,   0],
       [ 11,  10,   7,   1,   6,   0, 257,   1,   2,   0],
       [  3,   1,   1,   0,   0,   2,   0, 290,   2,   0],
       [ 21,   2,   4,  24,   3,   4,   0,   0, 484,   0],
       [  3,   0,   3,   0,   1,   1,   2,   0,   0, 464]])

Make a prediction on a simple example

In [149]:
sentences = [
    u'This is a language detection test.',
    u'Ceci est un test de d\xe9tection de la langue.',
    u'Dies ist ein Test, um die Sprache zu erkennen.',
]

predicted = clf.predict(sentences)
for s, p in zip(sentences, predicted):
    print(u'The language of "%s" is "%s"' % (s, dataset.target_names[p]))

The language of "This is a language detection test." is "en"
The language of "Ceci est un test de détection de la langue." is "fr"
The language of "Dies ist ein Test, um die Sprache zu erkennen." is "de"
