In [1]:
"""Build a language detector model

The goal of this exercise is to train a linear classifier on text features
that represent sequences of up to N consecutive characters so as to be able
to recognize natural languages by using the frequencies of short character
sequences as 'fingerprints'.

The script saves the trained model to disk for later use
"""
# Author: Olivier Grisel <olivier.grisel@ensta.org>
# License: Simplified BSD
# Adapted by: Francesco Mosconi

import numpy as np
from sklearn.datasets import load_files


# Load the labelled paragraphs from the hard-coded data folder.
# load_files treats every sub-folder of that directory as one category,
# which here means one language per sub-folder.
try:
    dataset = load_files('./wikidata/short_paragraphs')
except OSError as ex:
    print(ex)
    print("Couldn't import the data, did you unzip the wikidata.zip folder?")
    # `exit` is a convenience object injected for interactive sessions (by
    # the `site` module or by IPython) and should not be relied on in
    # program code; raising SystemExit directly gives the same exit status
    # without depending on it.
    raise SystemExit(-1)

In [2]:
# Integer-encoded label of each loaded document; per the load_files
# documentation each value is an index into dataset.target_names.
dataset.target

array([4, 5, 3, ..., 5, 3, 3])

In [3]:
# Names of the categories; the displayed list holds ten two-letter
# language codes.
dataset.target_names

['ar', 'de', 'en', 'es', 'fr', 'it', 'nl', 'pl', 'pt', 'ru']

In [4]:
# Short aliases for the documents and their integer language labels.
docs, y = dataset.data, dataset.target

In [5]:
# TASK: Split the dataset into a training set and a test set
# (hold out 20% of the data for testing).
from sklearn.model_selection import train_test_split

# The fixed random_state makes the shuffle, and therefore the split,
# reproducible from run to run.
docs_train, docs_test, y_train, y_test = train_test_split(
    docs,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# TASK: Build a vectorizer that splits strings into sequences of
# 1 to 3 characters instead of word tokens, using the class
# TfidfVectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
)

In [7]:
# TASK: Use make_pipeline to chain the character n-gram vectorizer
#       defined earlier with a classifier of your choice, and store
#       the resulting pipeline in a variable named `model`.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Logistic regression with its default settings serves as the classifier.
clf = LogisticRegression()

# Vectorizer first, classifier second: raw text in, predicted label out.
model = make_pipeline(vectorizer, clf)

In [8]:
# TASK: Fit the pipeline on the training set
# fit() returns the fitted pipeline itself, so as the last expression of
# the cell its repr is displayed as the cell output.
model.fit(docs_train, y_train)

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_i...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [9]:
# model

In [10]:
# TASK: Predict the outcome on the testing set.
# Store the result in a variable named y_predicted
# The pipeline is given the raw test documents; its vectorizer step
# transforms them before the classifier predicts.
y_predicted = model.predict(docs_test)

In [11]:
# TASK: Print the classification report
from sklearn.metrics import classification_report

# Label the rows with the language codes instead of bare integer class ids
# so the report can be read without looking up dataset.target_names.
# Passing `labels` explicitly keeps every name aligned with its class id
# even if some language happened to be absent from the test split.
print(classification_report(
    y_test,
    y_predicted,
    labels=list(range(len(dataset.target_names))),
    target_names=dataset.target_names,
))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        62
          1       0.96      0.98      0.97       193
          2       0.92      0.98      0.95       221
          3       0.93      0.94      0.93       213
          4       0.97      0.99      0.98       223
          5       0.97      0.96      0.97       198
          6       1.00      0.82      0.90        78
          7       0.99      0.97      0.98       112
          8       0.95      0.92      0.93       207
          9       1.00      0.98      0.99       185

avg / total       0.96      0.96      0.96      1692



In [12]:
# TASK: Print the confusion matrix. Bonus points if you make it pretty.
# from sklearn.metrics import confusion_matrix
# print(confusion_matrix(y_test, y_predicted))

In [16]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

In [19]:
# TASK: Is the score good? Can you improve it changing
#       the parameters or the classifier?
#       Try using cross validation and grid search
# clf = RandomForestClassifier()
# clf = AdaBoostClassifier()
# vectorizer = TfidfVectorizer(ngram_range=(1, 6), analyzer='char')
# model_rt = make_pipeline(vectorizer, clf)
# model_rt.fit(docs_train, y_train)
# y_predicted_rt = model_rt.predict(docs_test)
# NOTE(review): with the experiment above commented out, this statement
# prints the report for the baseline `y_predicted` again, not for
# `y_predicted_rt`; the numbers shown are therefore unchanged.
print(classification_report(y_test, y_predicted))
# Ideas to try: a gradient-descent based classifier (presumably
# SGDClassifier; confirm intent), or logistic regression with C=10.

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        62
          1       0.96      0.98      0.97       193
          2       0.92      0.98      0.95       221
          3       0.93      0.94      0.93       213
          4       0.97      0.99      0.98       223
          5       0.97      0.96      0.97       198
          6       1.00      0.82      0.90        78
          7       0.99      0.97      0.98       112
          8       0.95      0.92      0.93       207
          9       1.00      0.98      0.99       185

avg / total       0.96      0.96      0.96      1692



In [14]:
# TASK: Use dill and gzip to persist the trained model to disk.
#       1) gzip.open a file called my_model.dill.gz
#       2) dump to the file both your trained classifier
#          and the target_names of the dataset (for later use)
#    They should be passed as a list [model, dataset.target_names]
