In [1]:
import sklearn
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.datasets.base import Bunch
import json
from sklearn.cross_validation import train_test_split
import numpy as np
from sklearn import metrics
from sklearn.linear_model import SGDClassifier

# Prepare

## Format dataset

In [2]:
# Container for the corpus: raw texts, their integer labels, and the label
# names indexed by those integers (0 -> 'notjob', 1 -> 'job').
dataset = Bunch(
    data=[],
    target=[],
    target_names=['notjob', 'job'],
)

## Import data

In [3]:
# Each entry pairs a file stem under dataset/ with the class index assigned to
# every text in that file (index into dataset.target_names).
datasetNames = [('bbc', 0), ('medium', 0), ('jobs', 1)]
for name, label in datasetNames:
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which may fail on non-ASCII text.
    with open('dataset/' + name + '.json', encoding='utf-8') as f:
        for line in f:
            # A blank (e.g. trailing) line is not valid JSON; skip it rather
            # than crash in json.loads.
            if not line.strip():
                continue
            # Every non-empty line is parsed as one JSON collection of texts.
            texts = json.loads(line)
            print(name, ': ', len(texts))
            dataset.data.extend(texts)
            dataset.target.extend([label] * len(texts))

bbc :  2225
medium :  496
jobs :  4106


## Split train / test

In [4]:
# Hold out part of the corpus for evaluation (default split size).
# The seed is fixed so that the split, and therefore every metric reported
# below, is reproducible after a kernel restart; 42 matches the seed already
# used for the SGD classifier in this notebook.
data_train, data_test, target_train, target_test = train_test_split(
    dataset.data,
    dataset.target,
    random_state=42,
)

# Classifiers

## Naive Bayes

### Building

In [5]:
# Bag-of-words -> TF-IDF -> multinomial Naive Bayes.
# The vectorizer's float min_df / max_df bounds are document-frequency
# proportions: terms outside the 0.1-0.3 band are dropped from the vocabulary.
nb_steps = [
    ('vect', CountVectorizer(min_df=0.1, max_df=0.3)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(nb_steps)

### Train

In [6]:
# Fit every pipeline stage on the training split. fit() trains the pipeline in
# place and returns the same object, so no rebinding is needed; the trailing
# semicolon keeps the notebook from echoing the estimator repr.
text_clf.fit(data_train, target_train);

### Evaluation of the performance

In [7]:
# Accuracy on the held-out split: fraction of predictions equal to the label.
predicted = text_clf.predict(data_test)
is_correct = predicted == np.asarray(target_test)
is_correct.mean()

0.97363796133567659

In [8]:
# Per-class precision / recall / F1 for the predictions made above.
report = metrics.classification_report(
    target_test,
    predicted,
    target_names=dataset.target_names,
)
print(report)

             precision    recall  f1-score   support

     notjob       0.95      0.99      0.97       674
        job       0.99      0.96      0.98      1033

avg / total       0.97      0.97      0.97      1707



In [9]:
# Spot check on a non-job text that an earlier version of the model
# classified as 'job' with probability ~0.90.
docs_new = ['How do I get developer access to the Medium API? This is currently an early access program, so if you’re interested, let us know by emailing developers@medium.com. Once approved, the easiest way to get started on the platform is to use an SDK. There are currently three official SDKs, for Go, Python, and NodeJS. There is also a reference WordPress plugin that demonstrates PHP.']

# predict_proba returns one row per document; take the row for the only one.
results = text_clf.predict_proba(docs_new)[0]
for class_index in (0, 1):
    print(dataset.target_names[class_index], results[class_index])

notjob 0.717468641733
job 0.282531358267


## Stochastic Gradient Descent (SGD)

### Building

In [10]:
# Bag-of-words -> TF-IDF -> linear model trained with SGD.
# The 'modified_huber' loss is used so the classifier exposes predict_proba,
# which the spot check further down relies on.
# NOTE(review): `n_iter` is kept as in the original; later scikit-learn
# releases are understood to replace it with `max_iter` -- confirm against the
# installed version before upgrading.
sgd_classifier = SGDClassifier(
    loss='modified_huber',
    penalty='l2',
    alpha=1e-3,
    n_iter=5,
    random_state=42,
)
text_clf = Pipeline([
    ('vect', CountVectorizer(min_df=0.1, max_df=0.3)),
    ('tfidf', TfidfTransformer()),
    ('clf', sgd_classifier),
])

### Train

In [11]:
# Fit every pipeline stage on the training split. fit() trains the pipeline in
# place and returns the same object, so no rebinding is needed; the trailing
# semicolon keeps the notebook from echoing the estimator repr.
text_clf.fit(data_train, target_train);

### Evaluation of the performance

In [12]:
# Predict the held-out split and print per-class precision / recall / F1.
predicted = text_clf.predict(data_test)
report = metrics.classification_report(
    target_test,
    predicted,
    target_names=dataset.target_names,
)
print(report)

             precision    recall  f1-score   support

     notjob       0.99      0.99      0.99       674
        job       0.99      0.99      0.99      1033

avg / total       0.99      0.99      0.99      1707



In [13]:
# Spot check on a non-job text that an earlier version of the model
# classified as 'job' with probability ~0.90.
docs_new = ['How do I get developer access to the Medium API? This is currently an early access program, so if you’re interested, let us know by emailing developers@medium.com. Once approved, the easiest way to get started on the platform is to use an SDK. There are currently three official SDKs, for Go, Python, and NodeJS. There is also a reference WordPress plugin that demonstrates PHP.']
# predict_proba returns one row per document; take the row for the only one.
results = text_clf.predict_proba(docs_new)[0]
for class_index in (0, 1):
    print(dataset.target_names[class_index], results[class_index])

notjob 0.656933796975
job 0.343066203025
