In [369]:
import os
import sys

from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron, PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [370]:
# Location of the paragraph corpus. Set the PARAGRAPHS_DIR environment
# variable to point at a local copy of the data; the original author's path
# is kept as the fallback so existing setups keep working unchanged.
folder = os.environ.get(
    'PARAGRAPHS_DIR',
    '/Users/adavani/Data/NLP/scikit-learn/paragraphs')

# Load every document under `folder`; the resulting bunch exposes the raw
# texts (.data), integer labels (.target) and label names (.target_names).
dataset = load_files(folder)

In [371]:
# Hold out half of the corpus for evaluation. The fixed random_state makes the
# train/test partition identical on every run, so results can be compared
# across kernel restarts instead of changing with each reshuffle.
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.5, random_state=0)

In [372]:
# Feature extractor: character bigrams with IDF re-weighting switched off,
# i.e. each document is described by its two-character sequence frequencies.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 2),
    use_idf=False)

In [373]:
# Fit the vectorizer on the training documents; the returned estimator's repr
# is what this cell displays.
# NOTE(review): the Pipeline built from this vectorizer is fitted on the same
# documents further down and presumably refits it, which would make this
# standalone fit redundant - confirm before removing.
vectorizer.fit(docs_train)

TfidfVectorizer(analyzer='char', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=False,
        vocabulary=None)

In [374]:
# Chain the bigram vectorizer and a linear SVM into one estimator so that raw
# text can be handed directly to fit() / predict().
clf = Pipeline(steps=[('vec', vectorizer), ('clf', LinearSVC())])

In [375]:
# Train the vectorizer + LinearSVC pipeline on the training half of the
# corpus; the fitted pipeline's repr is what this cell displays.
clf.fit(docs_train,y_train)

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='char', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [376]:
# Predict a language for every held-out document, then print the
# per-language precision / recall / F1 table for the LinearSVC pipeline.
y_predicted = clf.predict(docs_test)
print(metrics.classification_report(
    y_test,
    y_predicted,
    target_names=dataset.target_names))

             precision    recall  f1-score   support

         ar       1.00      1.00      1.00        15
         de       1.00      0.99      0.99        77
         en       0.99      1.00      0.99        68
         es       1.00      0.98      0.99        58
         fr       1.00      1.00      1.00        60
         it       1.00      1.00      1.00        45
         ja       1.00      1.00      1.00        33
         nl       1.00      1.00      1.00        23
         pl       0.95      1.00      0.98        21
         pt       1.00      1.00      1.00        47
         ru       1.00      1.00      1.00        35

avg / total       1.00      1.00      1.00       482



In [377]:
# Confusion matrix for the LinearSVC predictions: one row per true language,
# one column per predicted language, in the order of dataset.target_names.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_predicted)
print(cm)

[[15  0  0  0  0  0  0  0  0  0  0]
 [ 0 76  0  0  0  0  0  0  1  0  0]
 [ 0  0 68  0  0  0  0  0  0  0  0]
 [ 0  0  1 57  0  0  0  0  0  0  0]
 [ 0  0  0  0 60  0  0  0  0  0  0]
 [ 0  0  0  0  0 45  0  0  0  0  0]
 [ 0  0  0  0  0  0 33  0  0  0  0]
 [ 0  0  0  0  0  0  0 23  0  0  0]
 [ 0  0  0  0  0  0  0  0 21  0  0]
 [ 0  0  0  0  0  0  0  0  0 47  0]
 [ 0  0  0  0  0  0  0  0  0  0 35]]


In [378]:
# Hand-written sample sentences used as a quick sanity check of the trained
# language classifiers.
sentences = [
    # French, written with a literal accented character.
    u'Ceci est un test de détection de la langue.',
    # English.
    u'This is a language detection test.',
    # The same French sentence as the first entry, with the accent spelled as
    # the \xe9 escape; both literals denote the same text.
    u'Ceci est un test de d\xe9tection de la langue.',
    # German.
    u'Dies ist ein Test, um die Sprache zu erkennen.',
]

In [379]:
# Run the LinearSVC pipeline on the sample sentences; `predicted` holds one
# integer label per sentence, used below as an index into dataset.target_names.
predicted = clf.predict(sentences)

In [380]:
# Show each sample sentence next to the language name predicted for it.
# A single pre-formatted string is handed to print so the cell produces the
# same text whether print is a statement (Python 2) or a function (Python 3),
# matching the print(...) form used by the other cells.
for sentence, label_idx in zip(sentences, predicted):
    print(u'%s %s' % (sentence, dataset.target_names[label_idx]))

Ceci est un test de détection de la langue. fr
This is a language detection test. fr
Ceci est un test de détection de la langue. fr
Dies ist ein Test, um die Sprache zu erkennen. de


In [381]:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression

# Second model: the same character-bigram features fed to logistic regression.
# clone() gives this pipeline its own unfitted copy of the vectorizer (same
# parameters), so fitting clf1 does not refit the instance that is already a
# step of the LinearSVC pipeline `clf`.
clf1 = Pipeline([
    ('vec', clone(vectorizer)),
    ('clf', LogisticRegression()),
])
clf1.fit(docs_train, y_train)

# NOTE: this replaces the y_predicted produced earlier by the LinearSVC
# pipeline; from here on it holds the logistic-regression predictions.
y_predicted = clf1.predict(docs_test)

In [382]:
# Per-language precision / recall / F1 table for whatever predictions
# y_predicted currently holds, scored against the held-out labels.
print(metrics.classification_report(
    y_test,
    y_predicted,
    target_names=dataset.target_names))

             precision    recall  f1-score   support

         ar       1.00      1.00      1.00        15
         de       0.92      0.99      0.95        77
         en       0.97      1.00      0.99        68
         es       1.00      0.98      0.99        58
         fr       1.00      1.00      1.00        60
         it       1.00      1.00      1.00        45
         ja       0.94      1.00      0.97        33
         nl       1.00      0.65      0.79        23
         pl       0.95      0.90      0.93        21
         pt       1.00      1.00      1.00        47
         ru       1.00      1.00      1.00        35

avg / total       0.98      0.98      0.97       482



In [383]:
# Classify the sample sentences with the logistic-regression pipeline (clf1),
# the model trained and evaluated in the cells just above. Calling
# clf.predict here would only repeat the LinearSVC predictions already shown
# earlier in the notebook.
# NOTE: the output stored under this cell is identical to the earlier
# LinearSVC output and was produced with `clf`; re-run the cell to refresh it.
predicted = clf1.predict(sentences)

# One pre-formatted string per line keeps the output identical under both the
# Python 2 print statement and the Python 3 print function.
for sentence, label_idx in zip(sentences, predicted):
    print(u'%s %s' % (sentence, dataset.target_names[label_idx]))

Ceci est un test de détection de la langue. fr
This is a language detection test. fr
Ceci est un test de détection de la langue. fr
Dies ist ein Test, um die Sprache zu erkennen. de
