In [1]:
## plot boilerplate
%matplotlib inline
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Seaborn theme: white background with grid lines; color_codes=True remaps the
# single-letter matplotlib color codes ("b", "g", "r", ...) to the seaborn palette.
sns.set(style="whitegrid", color_codes=True)
# Default size (width, height) in inches for every figure created below.
mpl.rcParams['figure.figsize'] = (12.0, 8.0)

## Prepare Test Data

Below we test the pipeline using the newsgroup data set.

First we create a target data set by writing the newsgroup data labeled `comp.graphics` from the training set to a jsonl file, `test_data/train.jsonl`.  This data set represents the resumes of employees who already fill the position being hired against.

Next we write all categories from the newsgroup test data set line by line to a jsonl file, `test_data/test.jsonl`.  Each line in this file represents a candidate's resume for comparison with the train set.

If the pipeline works then we would expect that candidate items labeled `comp.graphics` would score higher in general than items with other labels.

---

In [2]:
## prepare the data for testing
import json
from sklearn.datasets import fetch_20newsgroups

def ng_dirname(data_type):
    ''' build the path of the newsgroup jsonl file for one subset;
        data_type = 'test' or 'train'
    '''
    return 'test_data/{}.jsonl'.format(data_type)

# Both subsets are restricted to the same four newsgroups and are shuffled
# with the same fixed random_state.
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
fetch_options = dict(categories=categories, shuffle=True, random_state=42)
twenty_train = fetch_20newsgroups(subset='train', **fetch_options)
twenty_test = fetch_20newsgroups(subset='test', **fetch_options)

def _write_jsonl(path, bunch, only_name=None):
    ''' write one json record per line for every document in a newsgroup bunch

    path      -- destination jsonl file; its parent directory is created if missing
    bunch     -- object exposing `data`, `target` and `target_names`
                 (as returned by fetch_20newsgroups)
    only_name -- when given, keep only documents whose target name equals it
    '''
    import os

    out_dir = os.path.dirname(path)
    if out_dir:
        # open(..., 'w') does not create missing directories, so a fresh
        # checkout without test_data/ would fail with FileNotFoundError
        os.makedirs(out_dir, exist_ok=True)

    with open(path, 'w') as out:
        for content, target in zip(bunch.data, bunch.target):
            name = bunch.target_names[target]
            if only_name is not None and name != only_name:
                continue
            # int() turns the numpy integer label into a json-serialisable int
            json.dump({'label': int(target),
                       'name': name,
                       'content': content
                      }, out)
            out.write('\n')

# target set: only the `comp.graphics` documents of the training subset
_write_jsonl(ng_dirname('train'), twenty_train, only_name='comp.graphics')
# candidate set: every document of the test subset, all categories
_write_jsonl(ng_dirname('test'), twenty_test)

## Calculate Similarity 

---

In [None]:
from txtprocess import KeySelect, KeySelect2, StripTransform, CosineSim
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# This cell's import line does not bring in FileIterKeySelect, so import it
# here; otherwise a fresh kernel fails with NameError on the 'bykey' step.
from txtprocess import FileIterKeySelect

# Similarity pipeline: key selection -> text clean-up -> token counts ->
# tf-idf weighting -> cosine similarity.
pipe = Pipeline([
    ('bykey', FileIterKeySelect()),
    ('clean', StripTransform()),
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('cosine', CosineSim())
])

with open(ng_dirname('train'), 'r') as tf, open(ng_dirname('test'), 'r') as cf:
    # lazily parse one json record per line of each file
    target_docs = (json.loads(row) for row in tf)
    candidate_docs = (json.loads(row) for row in cf)
    # fit on the records of the train file, then predict on the records of
    # the test file; `d` is one minus that prediction
    fitted = pipe.fit(target_docs, None)
    d = 1 - fitted.predict(candidate_docs)

In [3]:
from txtprocess import KeySelect, FileIterKeySelect, StripTransform, CosineSim
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Name the (step name, transformer) pairs first, then wrap them in a Pipeline.
pipeline_steps = [
    ('bykey', FileIterKeySelect()),
    ('clean', StripTransform()),
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('cosine', CosineSim()),
]
pipe = Pipeline(pipeline_steps)

# Fit the pipeline directly on the open train file (an iterator of raw lines).
# Only the train file is opened: the test file handle was never read here.
# NOTE(review): the saved output of this cell is
# "TypeError: expected string or bytes-like object"; the cause lies inside the
# txtprocess steps, which are not visible from this notebook -- confirm what
# input FileIterKeySelect expects before relying on this cell.
with open(ng_dirname('train'), 'r') as tf:
    _ = pipe.fit(tf)

TypeError: expected string or bytes-like object

## Plot Results

Above we defined `d`, a vector holding one similarity score per candidate document.  It is in the same order as the lines of the original candidate jsonl file, so we can use this order to combine the scores with the labels and generate plots or tables.

---

In [None]:
# Pair each candidate's newsgroup name with its score. This relies on `d`
# being in the same order as the lines of the test file.
with open(ng_dirname('test'), 'r') as cf:
    names = [json.loads(line)['name'] for line in cf]
result = list(zip(names, d))

fig, ax = plt.subplots()
# `jitter` is a stripplot option, not a violinplot parameter, so it is not
# passed here; the axes are handed over explicitly instead of relying on the
# pyplot "current axes" state.
sns.violinplot(x="score", y="label",
               data=pd.DataFrame(result, columns=['label', 'score']),
               ax=ax)
ax.set_title('Plot 1: Cosine Similarity by Label')
plt.show()

OK, so not great. Comparing `comp.graphics` to the other labels in `Plot 1` above, we can see that we get only a slight lift from the cosine similarity measure.

In [None]:
import numpy as np
from sklearn.linear_model import SGDClassifier
# Classifier baseline: token counts -> tf-idf features -> linear model trained
# with SGD (hinge loss, l2 penalty). The pipeline takes raw text strings; the
# key-selection / clean-up stage below is left disabled.
text_clf = Pipeline([
#         ('clean', Pipeline([
#                     ('bykey', KeySelect()),
#                     ('clean', StripTransform())
#                 ])),
        ('features', Pipeline([
                    ('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer())
                ])),
        ('estimators', Pipeline([
                    # max_iter/tol replace the old `n_iter` argument, which was
                    # deprecated in scikit-learn 0.19 and later removed;
                    # tol=None disables early stopping so exactly five passes
                    # over the data are made, as n_iter=5 did.
                    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                          alpha=1e-3, max_iter=5, tol=None,
                                          random_state=42))
                ]))
])

In [None]:
def json_load(line):
    ''' parse a single jsonl line into a python object
    '''
    record = json.loads(line)
    return record
    
def select_keys(element):
    ''' return the (content, label) pair of a parsed record
    '''
    return tuple(element[key] for key in ('content', 'label'))

# Fit on the in-memory training subset, which covers all four categories.
# test_data/train.jsonl holds only the `comp.graphics` rows, and SGDClassifier
# refuses to fit when the training labels contain a single class.
_ = text_clf.fit(twenty_train.data, twenty_train.target)

# Candidates still come from the jsonl file; their integer labels were written
# from twenty_test.target, which was fetched with the same category list as
# twenty_train.
with open(ng_dirname('test'), 'r') as cf:
    x, y = zip(*(select_keys(json_load(line)) for line in cf))

predicted = text_clf.predict(x)
# `y` is a tuple; convert it so the comparison is explicitly element-wise
print(np.mean(predicted == np.asarray(y)))