In [12]:
import sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.datasets.base import Bunch
import json
from sklearn.cross_validation import train_test_split
import numpy as np
from sklearn import metrics
from sklearn.linear_model import SGDClassifier

In [13]:
dataset = Bunch()
dataset.data = [];
dataset.target = [];
dataset.target_names = ['notjob', 'job'];

In [14]:
# twenty_train.data = ['This is amazing', 'This is so cool', 'This is terrible'];
# twenty_train.target =  [0, 0, 1];

In [15]:
with open('dataset/bbc.json') as f:
    for line in f:
        texts = json.loads(line)
        for text in texts:
            dataset.data.append(text)
            dataset.target.append(0)
with open('dataset/medium.json') as f:
    for line in f:
        texts = json.loads(line)
        for text in texts:
            dataset.data.append(text)
            dataset.target.append(0)
with open('dataset/jobs.json') as f:
    for line in f:
        texts = json.loads(line)
        for text in texts:
            dataset.data.append(text)
            dataset.target.append(1)

In [16]:
text_clf = Pipeline([
    ('vect', CountVectorizer(
                min_df=0.1,
                max_df=0.3
            )),
    ('tfidf', TfidfTransformer(
                sublinear_tf=True
            )),
    ('clf', MultinomialNB())
])

In [17]:
data_train, data_test, target_train, target_test = train_test_split(dataset.data, dataset.target)
text_clf = text_clf.fit(data_train, target_train)

In [18]:
predicted = text_clf.predict(data_test)
np.mean(predicted == target_test)

0.97012302284710017

In [19]:
print(metrics.classification_report(target_test, predicted, target_names=dataset.target_names))

             precision    recall  f1-score   support

     notjob       0.95      0.98      0.96       713
        job       0.98      0.96      0.97       994

avg / total       0.97      0.97      0.97      1707



In [20]:
docs_new = ['How do I get developer access to the Medium API?This is currently an early access program, so if you’re interested, let us know by emailing developers@medium.com. Once approved, the easiest way to get started on the platform is to use an SDK. There are currently three official SDKs, for Go, Python, and NodeJS. There is also a reference WordPress plugin that demonstrates PHP.']

results = text_clf.predict_proba(docs_new)[0]
print(dataset.target_names[0], results[0])
print(dataset.target_names[1], results[1])

notjob 0.713042376461
job 0.286957623539


In [21]:
text_clf = Pipeline([
    ('vect', CountVectorizer(
            min_df=0.1,
            max_df=0.3
            )),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='modified_huber', penalty='l2',
    alpha=1e-3, n_iter=5, random_state=42)),
 ])
data_train, data_test, target_train, target_test = train_test_split(dataset.data, dataset.target)
text_clf = text_clf.fit(data_train, target_train)
predicted = text_clf.predict(data_test)

print(metrics.classification_report(target_test, predicted, target_names=dataset.target_names))

             precision    recall  f1-score   support

     notjob       0.99      0.99      0.99       680
        job       1.00      0.99      1.00      1027

avg / total       0.99      0.99      0.99      1707



In [22]:
docs_new = ['How do I get developer access to the Medium API?This is currently an early access program, so if you’re interested, let us know by emailing developers@medium.com. Once approved, the easiest way to get started on the platform is to use an SDK. There are currently three official SDKs, for Go, Python, and NodeJS. There is also a reference WordPress plugin that demonstrates PHP.']
results = text_clf.predict_proba(docs_new)[0]
print(dataset.target_names[0], results[0])
print(dataset.target_names[1], results[1])

notjob 0.689561143885
job 0.310438856115
