In [26]:
import sklearn
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.datasets.base import Bunch
import json
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
import unicodedata
from collections import OrderedDict
import os

# ENV VAR

In [27]:
# Feature flag read from the environment. When true, the "Reduce size of
# dataset" cell truncates both corpora for quick experiments.
# A missing variable means "off" instead of raising KeyError, and the value is
# compared case-insensitively so 'True', 'true' and '1' all enable the flag.
_shorten_flag = os.environ.get('SHORTEN_DATASET', 'False')
SHORTEN_DATASET = _shorten_flag.strip().lower() in ('true', '1')
print('SHORTEN_DATASET:', SHORTEN_DATASET)

SHORTEN_DATASET: True


# Prepare

## Prepare dataset variables

In [41]:
# Raw text accumulators, filled by the "Import data" cells:
# article bodies and job descriptions respectively.
articles_data, jobs_data = [], []

## Import data

Read files

_articles_

In [46]:
# Load article bodies from JSON-lines files (one JSON object per line) and
# append their NFKD-normalised 'body' text to articles_data.
for filename in ['gizmodo']:
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open('./dataset/' + filename + '.json', encoding='utf-8') as f:
        # Iterate the file object rather than f.read().splitlines():
        # str.splitlines() also breaks on characters such as U+2028 / U+0085,
        # which may appear unescaped inside a JSON string and would cut a
        # record in half. File iteration only splits on real newlines and
        # avoids holding the whole file in memory twice.
        for line in f:
            if not line.strip():
                continue  # tolerate blank or trailing empty lines
            articles_data.append(
                unicodedata.normalize("NFKD", json.loads(line)['body'])
            )

In [None]:
# Spot-check: print the second loaded article body (index 1) to eyeball the
# parsed, normalised text.
print(articles_data[1])

Huawei has just launched the Mate 9, an all-metal Android that feels very promising indeed. It's also rather large, featuring a finger-stretching 5.9-inch display. We've spent some quality time with it, and you can find our first impressions here. Unexpectedly, however, the company appears to be delaying the Mate 9's UK release. The press release lists France, Germany, Italy, Poland and Spain among its "first-wave" launch markets, but leaves out the UK. Completely. This despite execs saying that only the grey and silver models of the phone would be available to British consumers... so it's still likely to come here at some point... probably. Huawei hasn't mentioned any specific dates yet, and we've asked them to shed more light on the UK situation.


In [None]:
# Load job adverts from JSON-lines files (one JSON object per line) and
# append their NFKD-normalised 'description' text to jobs_data.
for filename in ['reed']:
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open('./dataset/' + filename + '.json', encoding='utf-8') as f:
        # Iterate the file object rather than f.read().splitlines():
        # str.splitlines() also breaks on characters such as U+2028 / U+0085,
        # which may appear unescaped inside a JSON string and would cut a
        # record in half. File iteration only splits on real newlines and
        # avoids holding the whole file in memory twice.
        for line in f:
            if not line.strip():
                continue  # tolerate blank or trailing empty lines
            jobs_data.append(
                unicodedata.normalize("NFKD", json.loads(line)['description'])
            )

In [None]:
# Spot-check: display the second loaded job description (index 1) as the
# cell's output value.
jobs_data[1]

'IT Strategic Supplier Manager - Various Locations - Permanent - £35,000 - £41,000 per annum plus benefits   The Company  An IT Strategic Supplier Manager is required by a specialist health authority that has recently undergone an IT transformation. They are currently recruiting for a number of opportunities within  their IT Department to deliver these specialists Greenfield projects.   This position can be based in either Birmingham, Colindale, Bristol, Manchester or Liverpool.   The Role  This position will act as an advocate for our organisation by positively promoting donation to your friends, family, colleagues and customers   Responsibilities  *You will act as custodian of the ICT Supplier management framework, ensuring suppliers meet their contractual obligations.  *You will ensure that contracts are delivering value for money  *You will provide support during re-procurement and tendering for new IT services to ensure contracts are fit for purpose and transition well into the op

_dataset size_

### Remove dupes

In [None]:
def _unique_in_order(texts):
    """Return ``texts`` as a list with repeated entries dropped.

    The first occurrence of each text is kept and the original ordering of
    those first occurrences is preserved.
    """
    return list(OrderedDict.fromkeys(texts))


# Report the size of each corpus before and after removing exact duplicates.
print('jobs_data size: ', len(jobs_data))
jobs_data = _unique_in_order(jobs_data)
print('jobs_data deduped size: ', len(jobs_data))

print('articles_data size: ', len(articles_data))
articles_data = _unique_in_order(articles_data)
print('articles_data deduped size: ', len(articles_data))

jobs_data size:  126536
jobs_data deduped size:  114005
articles_data size:  46995
articles_data deduped size:  46981


### Reduce size of dataset for experimentation

In [None]:
# Experimentation mode: keep only the first 20 texts of each corpus so the
# rest of the notebook runs quickly.
if SHORTEN_DATASET:
    jobs_data, articles_data = jobs_data[:20], articles_data[:20]

## Create dataset and target

In [None]:
# Assemble the labelled corpus: every article is class 0 ('notjob') and every
# job advert is class 1 ('job'). Articles come first, then jobs, so data and
# target stay index-aligned.
dataset = Bunch()
dataset.data = list(articles_data) + list(jobs_data)
dataset.target = [0] * len(articles_data) + [1] * len(jobs_data)
dataset.target_names = ['notjob', 'job']
print('dataset.data size: ', len(dataset.data))

dataset.data size:  40


### Split train / test 

In [23]:
# Hold out a test partition.
# - random_state pins the shuffle so the reported metrics are reproducible
#   across kernel restarts (same seed as the SGD classifier's random_state).
# - stratify keeps the notjob/job ratio identical in the train and test
#   partitions, which matters for the 20/20 shortened dataset and for the
#   imbalanced full corpus.
data_train, data_test, target_train, target_test = train_test_split(
    dataset.data,
    dataset.target,
    stratify=dataset.target,
    random_state=42,
)

NameError: name 'dataset' is not defined

# Classifiers

## Naive Bayes

### Building

In [88]:
# Naive Bayes text classifier: token counts -> TF-IDF weighting -> MultinomialNB.
# min_df / max_df are floats, i.e. document-frequency proportions: only terms
# found in at least 10% and at most 30% of the documents are kept.
nb_steps = [
    ('vect', CountVectorizer(stop_words='english', min_df=0.1, max_df=0.3)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(steps=nb_steps)

### Train

In [89]:
# Fit the Naive Bayes pipeline on the training split; the value returned by
# fit() is bound back to text_clf.
text_clf = text_clf.fit(data_train, target_train)

KeyboardInterrupt: 

### Evaluation of the performance

In [8]:
# Predict labels for the held-out texts, then show the fraction that match the
# true labels (plain accuracy) as the cell's output.
predicted = text_clf.predict(data_test)
(predicted == np.asarray(target_test)).mean()

0.96660808435852374

In [9]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
report = metrics.classification_report(
    y_true=target_test,
    y_pred=predicted,
    target_names=dataset.target_names,
)
print(report)

             precision    recall  f1-score   support

     notjob       0.96      0.95      0.96       692
        job       0.97      0.98      0.97      1015

avg / total       0.97      0.97      0.97      1707



In [10]:
# This text used to be classified as job at ~0.90
sample_text = 'How do I get developer access to the Medium API? This is currently an early access program, so if you’re interested, let us know by emailing developers@medium.com. Once approved, the easiest way to get started on the platform is to use an SDK. There are currently three official SDKs, for Go, Python, and NodeJS. There is also a reference WordPress plugin that demonstrates PHP.'
docs_new = [sample_text]

# Only one document is scored, so take the first (and only) row of probabilities
# and print it next to the matching class name.
results = text_clf.predict_proba(docs_new)[0]
for idx in (0, 1):
    print(dataset.target_names[idx], results[idx])

notjob 0.684198960661
job 0.315801039339


## Stochastic Gradient Descent (SGD)

### Building

In [67]:
# SGD text classifier: token counts -> TF-IDF weighting -> SGDClassifier.
# Every step starts with library defaults; the hyper-parameters are applied
# afterwards through set_params().
sgd_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
]
text_clf = Pipeline(steps=sgd_steps)

In [68]:
# text_clf.get_params()

In [84]:
# Hyper-parameters for the SGD pipeline, keyed as '<step>__<parameter>' so that
# Pipeline.set_params() routes each value to the right step.
# NOTE(review): `n_iter` was renamed `max_iter` in later scikit-learn
# releases - confirm against the installed version before upgrading.
params = dict(
    # vectoriser
    vect__max_df=0.7,
    vect__min_df=0.2,
    vect__ngram_range=(1, 2),
    vect__lowercase=True,
    vect__stop_words='english',
    vect__strip_accents='ascii',
    # tf-idf weighting
    tfidf__sublinear_tf=True,
    # classifier
    clf__loss='modified_huber',
    clf__penalty='l2',
    clf__alpha=1e-05,
    clf__n_iter=5,
    clf__random_state=42,
)
text_clf.set_params(**params)

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=0.2,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
      ...     penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False))])

### Train

In [85]:
# Fit the SGD pipeline on the training split; the value returned by fit() is
# bound back to text_clf.
text_clf = text_clf.fit(data_train, target_train)

### Evaluation of the performance

In [86]:
# Predict labels for the held-out texts and print per-class
# precision / recall / F1 for the SGD pipeline.
predicted = text_clf.predict(data_test)
report = metrics.classification_report(
    y_true=target_test,
    y_pred=predicted,
    target_names=dataset.target_names,
)
print(report)

             precision    recall  f1-score   support

     notjob       0.95      0.97      0.96       692
        job       0.98      0.97      0.97      1015

avg / total       0.97      0.97      0.97      1707



In [88]:
# This text used to be classified as job at ~0.90
sample_text = 'How do I get developer access to the Medium API? This is currently an early access program, so if you’re interested, let us know by emailing developers@medium.com. Once approved, the easiest way to get started on the platform is to use an SDK. There are currently three official SDKs, for Go, Python, and NodeJS. There is also a reference WordPress plugin that demonstrates PHP.'
docs_new = [sample_text]
# Only one document is scored, so take the first (and only) row of probabilities
# and print it next to the matching class name.
results = text_clf.predict_proba(docs_new)[0]
for idx in (0, 1):
    print(dataset.target_names[idx], results[idx])

notjob 1.0
job 0.0


In [89]:
# Job-advert style sample used to check the 'job' side of the classifier.
sample_text = 'The Data Analytics and Reporting team acts as a central service for all business analytics within ICE Data Services. The team takes an entrepreneurial approach to problem solving, partnering closely with the project management office to design and implement solutions to key operational problems through analysis. The team is critical in developing key performance indicators and the corresponding monitoring tools to empower management. With no two problems the same, a close partnership with executive management, and a continually increasing demand this position offers an exciting opportunity to grow and develop in one of the market’s most exciting roles'
docs_new = [sample_text]
# Only one document is scored, so take the first (and only) row of probabilities
# and print it next to the matching class name.
results = text_clf.predict_proba(docs_new)[0]
for idx in (0, 1):
    print(dataset.target_names[idx], results[idx])

notjob 0.0
job 1.0


# Export model

In [21]:
# joblib.dump(text_clf, 'model.pkl')
# clf = joblib.load('model.pkl')

# Improvement and experimentation

### Cross Validation (work in progress)

In [16]:
# predicted = cross_val_score(text_clf, data_train, target_train, cv=10)

In [17]:
# predicted

In [18]:
# cv = KFold(n_splits=5, shuffle=True, random_state=241)
# print(cv)

### Grid Search

In [26]:
# Search space: candidate upper document-frequency cut-offs for the vectoriser step.
parameters = dict(vect__max_df=(0.1, 0.5, 0.75))

# Grid search over the current text_clf pipeline; n_jobs=-1 requests all
# available cores and verbose=1 prints fit progress.
gs = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
)

In [27]:
# Run the grid search on the complete corpus (dataset.data / dataset.target),
# not on the data_train split.
# NOTE(review): rows from data_test therefore take part in tuning - confirm
# this is intended before reporting test-set scores for the tuned model.
gs.fit(dataset.data, dataset.target)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  2.4min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        ...     penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'vect__max_df': (0.1, 0.5, 0.75)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [28]:
# Print the winning value for each parameter that was part of the search grid,
# in alphabetical order of parameter name.
best_parameters = gs.best_estimator_.get_params()
tuned_names = sorted(parameters.keys())
for param_name in tuned_names:
    best_value = best_parameters[param_name]
    print("\t%s: %r" % (param_name, best_value))

	vect__max_df: 0.1
