In [6]:
# Quick check: the constant scorer yields one weight per whitespace-separated token.
scorer = Scoring(param=0.7)
scorer.weights(document="Ceci est un test pollution marine".split())

[0.5, 0.5, 0.5, 0.5, 0.5, 0.5]

In [66]:
# V1: without saving BM25 in the same repo as the spaCy model
import copy
import spacy
import pickle
import numpy as np
class Scoring(object):
    """Constant scorer: every token of a document receives the same weight."""

    def __init__(self, param):
        # The single weight value handed out for every token.
        self.param = param

    def weights(self, document):
        """Return a list with one entry per item of ``document``, each equal to ``self.param``."""
        token_weights = []
        for _ in document:
            token_weights.append(self.param)
        return token_weights
    
class ScorerComponent(object):
    """Pipeline component that overwrites ``doc.vector`` with a weighted average of token vectors.

    Tokens whose ``pos_`` is listed in ``IGNORED_POS`` are left out; the remaining
    tokens are weighted by the scorer registered through ``add_scorer`` (or
    restored by ``from_disk``).
    """

    name = "scorer_component"

    def __init__(self):
        # Only the "scorer" key is used; it is filled by add_scorer() or from_disk().
        self.data = {}
        self.IGNORED_POS = ['PRON', 'AUX', 'DET', "PUNCT"]

    def add_scorer(self, scorer):
        """Register ``scorer``; a deep copy is stored so later changes to the caller's object are not seen here."""
        self.data["scorer"] = copy.deepcopy(scorer)

    # TODO: refactor
    def __call__(self, doc):
        """Replace ``doc.vector`` with the scorer-weighted average of the kept token vectors.

        :param doc: iterable of tokens exposing ``text``, ``pos_`` and ``vector``.
        :return: the same ``doc`` object. Its vector is left untouched when no
            token survives the POS filter or when the weights sum to zero,
            because a weighted average is undefined in both cases.
        :raises KeyError: if tokens were kept but no scorer has been registered.
        """
        # Filter once so texts and vectors are guaranteed to stay aligned.
        kept_tokens = [token for token in doc if token.pos_ not in self.IGNORED_POS]
        if not kept_tokens:
            return doc

        word_tokens = [token.text for token in kept_tokens]
        weights = np.array(self.data["scorer"].weights(word_tokens), dtype=np.float32)
        if weights.sum() == 0:
            # np.average would raise ZeroDivisionError; keep the existing vector instead.
            return doc

        word_vectors = [token.vector for token in kept_tokens]
        doc.vector = np.average(word_vectors, weights=weights, axis=0)
        return doc

    def to_disk(self, path, **kwargs):
        """Pickle the registered scorer to ``path / "words_scorer.pckl"``.

        :param path: directory (``pathlib.Path``-like) reserved for this component.
        :raises KeyError: if no scorer has been registered.
        """
        # Create the component directory when it is missing, otherwise open() below fails.
        path.mkdir(parents=True, exist_ok=True)
        data_path = path / "words_scorer.pckl"
        print("Saving scorer to {}.".format(data_path))
        with open(data_path, "wb") as f:
            pickle.dump(self.data["scorer"], f)

    def from_disk(self, path, **kwargs):
        """Load the scorer pickled by ``to_disk`` from ``path`` and return ``self``.

        :param path: directory (``pathlib.Path``-like) that contains ``words_scorer.pckl``.
        """
        data_path = path / "words_scorer.pckl"
        print("Loading scorer.")
        # NOTE(security): pickle.load executes arbitrary code; only load model
        # directories that come from a trusted source.
        with open(data_path, "rb") as f:
            self.data["scorer"] = pickle.load(f)
        return self
# Load the 'fr_core_news_md' spaCy model; later cells add the scorer component to this `nlp`.
nlp = spacy.load('fr_core_news_md')

In [67]:
# Build the component around a constant scorer and attach it to the pipeline.
scorer = Scoring(0.6)
scorer_component = ScorerComponent()
scorer_component.add_scorer(scorer)
# Ask the pipeline whether the pipe exists instead of searching its string
# representation, which also matches any pipe whose name merely contains this one.
# Checking first keeps the cell re-runnable: a second run replaces the pipe.
if nlp.has_pipe(scorer_component.name):
    nlp.replace_pipe(scorer_component.name, scorer_component)
else:
    nlp.add_pipe(scorer_component)
# Persist the whole pipeline; the component's to_disk() is called as part of this.
nlp.to_disk("../../data/model/")

Saving scorer to ..\..\data\model\scorer_component\words_scorer.pckl.


In [56]:
# loading back
from spacy.language import Language
# Register a factory under the component's name before loading, so that
# spacy.load can build a ScorerComponent and restore it via from_disk().
Language.factories[ScorerComponent.name] = lambda nlp, **cfg: ScorerComponent()
nlp_l = spacy.load("../../data/model/")
nlp_l.pipeline

Loading scorer.


[('tagger', <spacy.pipeline.pipes.Tagger at 0x276c144cc88>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x276c1453228>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x276c1453348>),
 ('scorer_component', <__main__.ScorerComponent at 0x276c144ce88>)]

In [79]:
# nlp = spacy.load('fr_core_news_md')
# Same sentence through the reloaded pipeline (nlp_l) and the in-memory one (nlp).
a = nlp_l("est un test pollution marine").vector
b = nlp("est un test pollution marine").vector

In [80]:
from sklearn.metrics.pairwise import cosine_similarity
# cosine_similarity works on 2-D inputs and returns a matrix; take its single entry.
cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0, 0]

0.83117425

In [86]:
def similarity_to_vector(doc, vector):
    """
    Doc extension method: cosine similarity between ``doc.vector`` and ``vector``.
    :param doc: object exposing a ``vector`` attribute (a doc produced by a spacy model).
    :param vector: numpy array with the same number of elements as ``doc.vector``.
    :return: the cosine similarity as a scalar.
    :raises ValueError: if ``vector`` is None.
    """
    if vector is None:
        raise ValueError("Forgotten 'vector' argument.")
    # cosine_similarity expects 2-D inputs, so present each vector as a single row.
    doc_row = doc.vector.reshape(1, -1)
    other_row = vector.reshape(1, -1)
    similarity_matrix = cosine_similarity(doc_row, other_row)
    return similarity_matrix[0][0]


# Expose similarity_to_vector on Doc objects as `doc._.similarity_to_vector(vector)`.
# NOTE(review): force=True is presumably there so re-running this cell overwrites the earlier registration - confirm.
spacy.tokens.Doc.set_extension("similarity_to_vector", method=similarity_to_vector, force=True)

In [87]:
# Compare a sentence with the same sentence plus one extra word.
reference_doc = nlp("Ceci est un test pollution marine")
noisy_vector = nlp("Ceci est un test pollution marine error").vector
reference_doc._.similarity_to_vector(noisy_vector)

0.9759711

In [70]:
# Inspect the entities found by the reloaded pipeline (the recorded output is an empty tuple).
nlp_l("est un test pollution marine").ents

()

In [88]:
from pathlib import Path

# Compose the input file path from its folder and its file name.
data_folder = Path("source_data/text_files/")
file_to_open = data_folder.joinpath("raw_data.txt")

In [104]:
# parents=True also creates the missing "my/" parent; with parents=False this
# cell raises FileNotFoundError on a clean working directory.
Path("my/directory/").mkdir(parents=True, exist_ok=True)
