In [35]:
import sklearn
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.datasets.base import Bunch
import json
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
import unicodedata
from collections import OrderedDict
import os
import time

# ENV VAR

In [2]:
SHORTEN_DATASET = os.environ['SHORTEN_DATASET'] == 'True'
print('SHORTEN_DATASET:', SHORTEN_DATASET)

SHORTEN_DATASET: True


# Prepare

## Prepare dataset variables

In [3]:
articles_data = []
jobs_data = []

## Import data

Read files

_articles_

In [40]:
for filename in ['gizmodo', 'lifehacker', 'techcrunch', 'theverge']:
    with open('./dataset/' + filename + '.json') as f:
        lines = f.read().splitlines()
    for line in lines:
        articles_data.append(
            unicodedata.normalize("NFKD", json.loads(line)['body'])
        )

In [41]:
print(articles_data[1])

Huawei has just launched the Mate 9, an all-metal Android that feels very promising indeed. It's also rather large, featuring a finger-stretching 5.9-inch display. We've spent some quality time with it, and you can find our first impressions here. Unexpectedly, however, the company appears to be delaying the Mate 9's UK release. The press release lists France, Germany, Italy, Poland and Spain among its "first-wave" launch markets, but leaves out the UK. Completely. This despite execs saying that only the grey and silver models of the phone would be available to British consumers... so it's still likely to come here at some point... probably. Huawei hasn't mentioned any specific dates yet, and we've asked them to shed more light on the UK situation.


In [42]:
for filename in ['reed']:
    with open('./dataset/' + filename + '.json') as f:
        lines = f.read().splitlines()
    for line in lines:
        jobs_data.append(
            unicodedata.normalize("NFKD", json.loads(line)['description'])
        )

In [43]:
jobs_data[1]

'IT Strategic Supplier Manager - Various Locations - Permanent - £35,000 - £41,000 per annum plus benefits   The Company  An IT Strategic Supplier Manager is required by a specialist health authority that has recently undergone an IT transformation. They are currently recruiting for a number of opportunities within  their IT Department to deliver these specialists Greenfield projects.   This position can be based in either Birmingham, Colindale, Bristol, Manchester or Liverpool.   The Role  This position will act as an advocate for our organisation by positively promoting donation to your friends, family, colleagues and customers   Responsibilities  *You will act as custodian of the ICT Supplier management framework, ensuring suppliers meet their contractual obligations.  *You will ensure that contracts are delivering value for money  *You will provide support during re-procurement and tendering for new IT services to ensure contracts are fit for purpose and transition well into the op

_dataset size_

### Remove dupes

In [44]:
print('jobs_data size: ', len(jobs_data))
jobs_data = list(OrderedDict.fromkeys(jobs_data))
print('jobs_data deduped size: ', len(jobs_data))
print('articles_data size: ', len(articles_data))
articles_data = list(OrderedDict.fromkeys(articles_data))
print('articles_data deduped size: ', len(articles_data))

jobs_data size:  131536
jobs_data deduped size:  114005
articles_data size:  222908
articles_data deduped size:  217707


### Reduce size of dataset for experimentation

In [45]:
# if SHORTEN_DATASET:
#     jobs_data = jobs_data[0:5000]
#     articles_data = articles_data[0:5000]

## Create dataset and target

In [46]:
dataset = Bunch()
dataset.data = [];
dataset.target = [];
dataset.target_names = ['notjob', 'job'];

for data in articles_data:
    dataset.data.append(data)
    dataset.target.append(0)
for data in jobs_data:
    dataset.data.append(data)
    dataset.target.append(1)
print('dataset.data size: ', len(dataset.data))

dataset.data size:  331712


### Split train / test 

In [47]:
data_train, data_test, target_train, target_test = train_test_split(dataset.data, dataset.target)

# Classifiers

## Naive Bayes

### Building

In [48]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

In [49]:
params = {
    'clf__alpha': 0.20000000000000001,
    'vect__max_df': 0.69999999999999996,
    'vect__min_df': 0.10000000000000001,
    'vect__lowercase': True,
    'vect__stop_words': 'english',
    'vect__strip_accents': 'ascii',
}
text_clf.set_params(**params)

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=0.1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
      ...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True))])

### Train

In [50]:
start_time = time.time()
text_clf = text_clf.fit(data_train, target_train)
print('elapsed time: ',  time.time() - start_time)

elapsed time:  74.90340495109558


### Evaluation of the performance

In [51]:
predicted = text_clf.predict(data_test)
np.mean(predicted == target_test)

0.99529712521705571

In [52]:
print(metrics.classification_report(target_test, predicted, target_names=dataset.target_names))

             precision    recall  f1-score   support

     notjob       1.00      1.00      1.00     54524
        job       0.99      0.99      0.99     28404

avg / total       1.00      1.00      1.00     82928



In [53]:
# This text used to be classified as job at ~0.90
docs_new = ['How do I get developer access to the Medium API? This is currently an early access program, so if you’re interested, let us know by emailing developers@medium.com. Once approved, the easiest way to get started on the platform is to use an SDK. There are currently three official SDKs, for Go, Python, and NodeJS. There is also a reference WordPress plugin that demonstrates PHP.']

results = text_clf.predict_proba(docs_new)[0]
print(dataset.target_names[0], results[0])
print(dataset.target_names[1], results[1])

notjob 0.923177868816
job 0.0768221311836


## Stochastic Gradient Descent (SGD)

### Building

In [18]:
# text_clf = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', SGDClassifier()),
#  ])

In [19]:
# text_clf.get_params()

In [20]:
# params = {
#     'vect__max_df': 0.5,
#     'vect__min_df': 0.4,
#     'vect__ngram_range': (1, 2),
#     'vect__lowercase': True,
#     'vect__stop_words': 'english',
#     'vect__strip_accents': 'ascii',
#     'tfidf__sublinear_tf': True,
#     'clf__loss': 'modified_huber',
#     'clf__penalty': 'l2',
#     'clf__alpha': 1e-05, 
#     'clf__n_iter': 5
# }
# text_clf.set_params(**params)

### Train

In [21]:
# text_clf = text_clf.fit(data_train, target_train)

### Evaluation of the performance

In [22]:
# predicted = text_clf.predict(data_test)

# print(metrics.classification_report(target_test, predicted, target_names=dataset.target_names))

In [23]:
# # This text used to be classified as job at ~0.90
# docs_new = ['How do I get developer access to the Medium API? This is currently an early access program, so if you’re interested, let us know by emailing developers@medium.com. Once approved, the easiest way to get started on the platform is to use an SDK. There are currently three official SDKs, for Go, Python, and NodeJS. There is also a reference WordPress plugin that demonstrates PHP.']
# results = text_clf.predict_proba(docs_new)[0]
# print(dataset.target_names[0], results[0])
# print(dataset.target_names[1], results[1])

In [24]:
# docs_new = ['The Data Analytics and Reporting team acts as a central service for all business analytics within ICE Data Services. The team takes an entrepreneurial approach to problem solving, partnering closely with the project management office to design and implement solutions to key operational problems through analysis. The team is critical in developing key performance indicators and the corresponding monitoring tools to empower management. With no two problems the same, a close partnership with executive management, and a continually increasing demand this position offers an exciting opportunity to grow and develop in one of the market’s most exciting roles']
# results = text_clf.predict_proba(docs_new)[0]
# print(dataset.target_names[0], results[0])
# print(dataset.target_names[1], results[1])

# Export model

In [54]:
joblib.dump(text_clf, 'model.pkl') 
# clf = joblib.load('filename.pkl') 

['model.pkl']

# Improvement and experimentation

### Cross Validation (work in progress)

In [26]:
# predicted = cross_val_score(text_clf, data_train, target_train, cv=10)

In [27]:
# predicted

In [28]:
# cv = KFold(len(dataset.data), n_folds=5, shuffle=True, random_state=241)
# print(cv)

### Grid Search

In [33]:
parameters = {
    'vect__max_df': np.arange(0.7, 1, 0.1),
    'vect__min_df': np.arange(0.1, 0.5, 0.1),
    'clf__alpha':  np.arange(0.1, 1, 0.1)
}

gs = GridSearchCV(text_clf, parameters, n_jobs=-1, verbose=1)

In [34]:
gs.fit(dataset.data, dataset.target)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   49.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 432 out of 432 | elapsed:  9.1min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=0.1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
      ...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=0.4, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'vect__min_df': array([ 0.1,  0.2,  0.3,  0.4]), 'clf__alpha': array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9]), 'vect__max_df': array([ 0.7,  0.8,  0.9,  1. ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [37]:
best_parameters = gs.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

	clf__alpha: 0.20000000000000001
	vect__max_df: 0.69999999999999996
	vect__min_df: 0.10000000000000001
