In [5]:
import json
import os

import numpy as np
import pandas as pd
from neo4j.v1 import GraphDatabase

# Connection settings can be overridden through environment variables so that
# real credentials never have to be stored in the notebook. The fallbacks are
# the original local-development values, so behaviour is unchanged when the
# variables are not set.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "zelda1")

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [6]:
def get_names(tx):
    """Return the ``n.name`` value of every record matched as ``(n:Entity)``.

    Args:
        tx: Object exposing ``run(query)``; the result is iterated and each
            record is indexed with the key ``"n.name"``.

    Returns:
        list: One entry per returned record, in result order.
    """
    result = tx.run("MATCH (n:Entity) RETURN n.name")
    return [row["n.name"] for row in result]

# Run get_names through the session's read_transaction and keep the returned
# list in the notebook-level variable `names`. The session is used as a
# context manager, so its lifetime is limited to this block.
with driver.session() as session:
    names = session.read_transaction(get_names)

In [7]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.feature_extraction.text import TfidfVectorizer

class TfidfClassifier(BaseEstimator, ClassifierMixin):
    """Closest-name matcher built on character n-gram TF-IDF vectors.

    ``fit`` vectorises a collection of reference names. ``predict`` returns,
    for every query string, the reference name whose TF-IDF row has the
    largest dot product with the query's TF-IDF row.

    Parameters
    ----------
    ngrams : int, default=3
        Length of the character n-grams used as TF-IDF terms.

    Attributes
    ----------
    names_ : numpy.ndarray
        Reference names passed to ``fit``.
    matrix_ : sparse matrix
        TF-IDF matrix of the reference names, one row per name.
    """

    def __init__(self, ngrams=3):
        self.ngrams = ngrams
        # The analyzer is a bound method: it reads ``self.ngrams`` each time
        # it is called, so a later change of the parameter is picked up.
        self.tfidf = TfidfVectorizer(analyzer=self.ngrams_analyzer)

    def ngrams_analyzer(self, s):
        """Split ``s`` (lower-cased) into overlapping character n-grams.

        A non-empty string shorter than ``self.ngrams`` is returned as a
        single term instead of producing no terms at all, so short names can
        still be matched exactly. An empty string yields an empty list.
        """
        s = s.lower()
        size = self.ngrams
        if 0 < len(s) < size:
            return [s]
        return [s[start:start + size] for start in range(len(s) + 1 - size)]

    def _require_fitted(self):
        """Raise ``RuntimeError`` unless ``fit`` has stored ``matrix_``."""
        if not hasattr(self, "matrix_"):
            raise RuntimeError("You must train classifer before predicting data!")

    def fit(self, X, y=None):
        """Learn the TF-IDF vocabulary from the reference names in ``X``.

        Parameters
        ----------
        X : iterable of str
            Reference names. The iterable is materialised once, so a one-shot
            iterator is accepted.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If ``ngrams`` is not an integer (booleans are rejected).
        ValueError
            If ``ngrams`` is smaller than 1.
        """
        # Explicit checks instead of ``assert``: assertions are removed when
        # Python runs with -O, and ``type(x) == int`` rejects numpy integers.
        is_integer = isinstance(self.ngrams, (int, np.integer))
        if isinstance(self.ngrams, bool) or not is_integer:
            raise TypeError("ngrams parameter must be integer")
        if self.ngrams < 1:
            raise ValueError("ngrams parameter must be >= 1")

        reference_names = list(X)
        self.names_ = np.array(reference_names)
        self.matrix_ = self.tfidf.fit_transform(reference_names)

        return self

    def get_similarity(self, X):
        """Return the query-by-reference matrix of TF-IDF dot products.

        Row ``i`` holds the dot product of query ``X[i]`` with every
        reference name stored by ``fit``.
        """
        query_matrix = self.tfidf.transform(X)
        return query_matrix.dot(self.matrix_.T)

    def predict(self, X, y=None):
        """Return the best-matching reference name for each query in ``X``.

        Parameters
        ----------
        X : iterable of str
            Query strings.
        y : ignored

        Returns
        -------
        numpy.ndarray
            One reference name per query. A query that shares no term with
            any reference name has zero similarity everywhere, so the name
            returned for it is not meaningful.

        Raises
        ------
        RuntimeError
            If the classifier has not been fitted.
        """
        self._require_fitted()

        similarity = self.get_similarity(X)
        if similarity.shape[0] == 0:
            # No queries: return an empty array with the dtype of the names.
            return self.names_[:0]

        best_index = np.asarray(similarity.argmax(axis=1)).ravel()
        return self.names_[best_index]

    def score(self, X, y=None):
        """Score the matcher on the queries in ``X``.

        Parameters
        ----------
        X : iterable of str
            Query strings; must contain at least one element.
        y : array-like of str, optional
            Expected reference name for each query.

        Returns
        -------
        float
            With ``y``: the fraction of queries whose prediction equals the
            expected name. Without ``y``: the mean, over the queries, of the
            highest similarity found for each query.

        Raises
        ------
        RuntimeError
            If the classifier has not been fitted.
        ValueError
            If ``X`` is empty or ``y`` does not have one entry per query.
        """
        self._require_fitted()

        queries = list(X)
        if not queries:
            raise ValueError("X must contain at least one sample")

        if y is None:
            best_similarity = self.get_similarity(queries).max(axis=1)
            return float(best_similarity.toarray().ravel().mean())

        predicted = self.predict(queries)
        expected = np.asarray(y)
        if expected.shape != predicted.shape:
            raise ValueError("y must contain exactly one label per sample in X")
        return float(np.mean(predicted == expected))

In [8]:
# fit() returns the estimator itself, so construction and fitting are chained;
# the trailing expression keeps the estimator repr as the cell output.
tfidf = TfidfClassifier(ngrams=3).fit(names)
tfidf

TfidfClassifier(ngrams=3)

In [15]:
# Misspelled / partial queries used to eyeball the matcher's output.
sample_queries = ["dark lin", "Gano n", "Zora riv", "zelda a link to the past"]
tfidf.predict(sample_queries)

array(['Dark Link', 'Ganon', 'Zora River',
       'The Legend of Zelda: A Link to the Past'], dtype='<U80')

In [None]:
# NOTE(review): neither `nlp` nor `question` is defined in the cells visible
# here, and this cell has no execution count — confirm both are defined
# earlier in a fresh-kernel run before relying on this line.
question_doc = nlp(question)