<a href="https://colab.research.google.com/github/kgpark88/nlp/blob/main/pylucene_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

참조사이트 : https://notebook.community/paulovn/ml-vm-notebook/vmfiles/IPNB/Examples/g%20Misc/20%20Pylucene

# Initialization
Importing `lucene` brings the whole Lucene namespace into the Python context; from then on, all Lucene modules can be imported (including the supporting Java modules).

In [None]:
import lucene

In [None]:
# Print the version string reported by the lucene module.
print(lucene.VERSION)

8.6.1


In [None]:
# We can check all the Lucene packages included in this distribution of Pylucene.
# The classpath entries are joined with the platform path separator (';' on
# Windows, ':' on POSIX), so split on os.pathsep: splitting on a literal ':'
# cuts Windows entries apart at the drive-letter colon.
import os

for p in sorted(lucene.CLASSPATH.split(os.pathsep)):
    print(p)

\python37\lib\site-packages\lucene-8.6.1-py3.7-win-amd64.egg\lucene\antlr4-runtime-4.5.1-1.jar;c
\python37\lib\site-packages\lucene-8.6.1-py3.7-win-amd64.egg\lucene\asm-7.2.jar;c
\python37\lib\site-packages\lucene-8.6.1-py3.7-win-amd64.egg\lucene\asm-commons-7.2.jar;c
\python37\lib\site-packages\lucene-8.6.1-py3.7-win-amd64.egg\lucene\extensions.jar;c
\python37\lib\site-packages\lucene-8.6.1-py3.7-win-amd64.egg\lucene\hppc-0.8.1.jar
\python37\lib\site-packages\lucene-8.6.1-py3.7-win-amd64.egg\lucene\lucene-analyzers-common-8.6.1.jar;c
\python37\lib\site-packages\lucene-8.6.1-py3.7-win-amd64.egg\lucene\lucene-analyzers-kuromoji-8.6.1.jar;c
\python37\lib\site-packages\lucene-8.6.1-py3.7-win-amd64.egg\lucene\lucene-analyzers-nori-8.6.1.jar;c
\python37\lib\site-packages\lucene-8.6.1-py3.7-win-amd64.egg\lucene\lucene-analyzers-stempel-8.6.1.jar;c
\python37\lib\site-packages\lucene-8.6.1-py3.7-win-amd64.egg\lucene\lucene-backward-codecs-8.6.1.jar;c
\python37\lib\site-packages\lucene-8.6.1-py

### The first operation is always to initialize the lucene backend.  This only needs to be done once for each running Python process

In [None]:
# Start the Java VM only when getVMEnv() reports none yet, so this cell can be re-run.
if not lucene.getVMEnv():
    # Pass the headless flag to the JVM as a VM argument.
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

# Tests
Let's test a few Lucene components

In [None]:
# Sample sentences that the tokenizer and analyzer demos below iterate over.
test_strings = (
    "PyLucene is a Python extension for accessing Java Lucene.",
    "Its goal is to allow you to use Lucene's text indexing and searching capabilities from Python.")

In [None]:
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute

def fetch_terms(obj):
    """Lazily yield every term of a token stream as a Python string.

    `obj` is a token-stream-like object exposing `getAttribute`,
    `clearAttributes`, `reset`, `incrementToken`, `end` and `close`.
    The stream is ended and closed when the generator finishes (or is
    closed early), whether or not an error occurred.
    """
    term_attr = obj.getAttribute(CharTermAttribute.class_)
    try:
        obj.clearAttributes()
        obj.reset()
        while True:
            if not obj.incrementToken():
                break
            # The attribute object is shared; read it out for the current token.
            yield term_attr.toString()
    finally:
        obj.end()
        obj.close()

# Stemming

In [None]:
from lucene import JArray_char, JArray

from org.tartarus.snowball.ext import SpanishStemmer, EnglishStemmer

def stem(stemmer, word):
    """Stem a single word with the given Snowball stemmer and return the result.

    The word is handed to the stemmer as a Java char array; after stemming,
    only the first `getCurrentBufferLength()` characters of the stemmer's
    buffer are returned.
    """
    # Load the word into the stemmer.
    stemmer.setCurrent(JArray_char(word), len(word))
    # Run the stemming algorithm.
    stemmer.stem()
    # Read back the output buffer and the number of characters to keep.
    buffer = stemmer.getCurrentBuffer()
    size = stemmer.getCurrentBufferLength()
    text = ''.join(buffer)
    return text[:size]

# Stem a handful of sample words, first with the Spanish stemmer and then
# with the English one, printing "word -> stem" for each.
for stemmer_cls, sample_words in (
        (SpanishStemmer, ('haciendo', 'lunes', 'vino', 'lápiz')),
        (EnglishStemmer, ('making', 'Monday', 'came', 'pencil')),
):
    st = stemmer_cls()
    for w in sample_words:
        print( w, '->', stem(st, w))

haciendo -> hac
lunes -> lun
vino -> vin
lápiz -> lapiz
making -> make
Monday -> Monday
came -> came
pencil -> pencil


In [None]:
from java.io import StringReader

def tokenize( tk, data ):
    """Feed a string to a tokenizer and return the tokens it emits as a list."""
    reader = StringReader(data)
    tk.setReader(reader)
    return [term for term in fetch_terms(tk)]

In [None]:
from org.apache.lucene.analysis.standard import StandardTokenizer
from org.apache.lucene.analysis.core import LetterTokenizer
from org.apache.lucene.analysis.ngram import NGramTokenizer

# Tokenizer
- StandardTokenizer : A grammar-based tokenizer constructed with JFlex. This class implements the Word Break rules from the Unicode Text Segmentation algorithm, as specified in Unicode Standard Annex #29.  
- LetterTokenizer : A tokenizer that divides text at non-letters. That's to say, it defines tokens as maximal strings of adjacent letters, as defined by java.lang.Character.isLetter() predicate.  
- NGramTokenizer : Tokenizes the input into n-grams of the given size(s).

In [None]:
# Run each sample sentence through every tokenizer and print the tokens.
tokenizers = (
    StandardTokenizer(),
    LetterTokenizer(),
    NGramTokenizer(4, 4),
)

for n, t in enumerate(tokenizers):
    # Numbered banner followed by the tokenizer's string representation.
    header = f"\n{n + 1} -----------"
    print(header, str(t))
    for s in test_strings:
        tokens = tokenize(t, s)
        print("\n", tokens)


1 ----------- StandardTokenizer@31ff43be term=,bytes=[],startOffset=0,endOffset=0,positionIncrement=1,positionLength=1,type=word,termFrequency=1

 ['PyLucene', 'is', 'a', 'Python', 'extension', 'for', 'accessing', 'Java', 'Lucene']

 ['Its', 'goal', 'is', 'to', 'allow', 'you', 'to', 'use', "Lucene's", 'text', 'indexing', 'and', 'searching', 'capabilities', 'from', 'Python']

2 ----------- LetterTokenizer@7b205dbd term=,bytes=[],startOffset=0,endOffset=0,positionIncrement=1,positionLength=1,type=word,termFrequency=1

 ['PyLucene', 'is', 'a', 'Python', 'extension', 'for', 'accessing', 'Java', 'Lucene']

 ['Its', 'goal', 'is', 'to', 'allow', 'you', 'to', 'use', 'Lucene', 's', 'text', 'indexing', 'and', 'searching', 'capabilities', 'from', 'Python']

3 ----------- NGramTokenizer@106cc338 term=,bytes=[],startOffset=0,endOffset=0,positionIncrement=1,positionLength=1,type=word,termFrequency=1

 ['PyLu', 'yLuc', 'Luce', 'ucen', 'cene', 'ene ', 'ne i', 'e is', ' is ', 'is a', 's a ', ' a P', '

# Analyzer
- KeywordAnalyzer: "Tokenizes" the entire stream as a single token. This is useful for data like zip codes, ids, and some product names.  
- SimpleAnalyzer: An Analyzer that filters LetterTokenizer with LowerCaseFilter 
- SpanishAnalyzer: built from a StandardTokenizer filtered with StandardFilter, LowerCaseFilter, StopFilter, SetKeywordMarkerFilter (if a stem exclusion set is provided), and SpanishLightStemFilter.  
- ShingleAnalyzerWrapper: A ShingleAnalyzerWrapper wraps a ShingleFilter around another Analyzer. A shingle is another name for a token based n-gram.  

In [None]:
from java.io import StringReader
    
def analyze(anal, data):
    """Run a string through an analyzer and return the analyzed terms as a list."""
    reader = StringReader(data)
    # "dummy" is the field name handed to tokenStream.
    ts = anal.tokenStream("dummy", reader)
    return [term for term in fetch_terms(ts)]

In [None]:
from org.apache.lucene.analysis.core import KeywordAnalyzer, SimpleAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.es import SpanishAnalyzer
from org.apache.lucene.analysis.shingle import ShingleAnalyzerWrapper

# Run each sample sentence through every analyzer and print the resulting terms.
analyzers = (
    KeywordAnalyzer(),
    SimpleAnalyzer(),
    SpanishAnalyzer(),
    # Shingle wrappers around the two analyzers above, called with sizes 2 and 3.
    ShingleAnalyzerWrapper(SimpleAnalyzer(), 2, 3),
    ShingleAnalyzerWrapper(SpanishAnalyzer(), 2, 3),
)

for n, a in enumerate(analyzers):
    # Numbered banner followed by the analyzer's string representation.
    print(f"\n {n + 1} ----------- {a}")
    for s in test_strings:
        terms = analyze(a, s)
        print("\n", terms)


 1 ----------- org.apache.lucene.analysis.core.KeywordAnalyzer@2631f68c

 ['PyLucene is a Python extension for accessing Java Lucene.']

 ["Its goal is to allow you to use Lucene's text indexing and searching capabilities from Python."]

 2 ----------- org.apache.lucene.analysis.core.SimpleAnalyzer@19835e64

 ['pylucene', 'is', 'a', 'python', 'extension', 'for', 'accessing', 'java', 'lucene']

 ['its', 'goal', 'is', 'to', 'allow', 'you', 'to', 'use', 'lucene', 's', 'text', 'indexing', 'and', 'searching', 'capabilities', 'from', 'python']

 3 ----------- org.apache.lucene.analysis.es.SpanishAnalyzer@a87f8ec

 ['pylucen', 'is', 'python', 'extension', 'for', 'accessing', 'java', 'lucen']

 ['its', 'goal', 'is', 'to', 'allow', 'you', 'to', 'use', "lucene's", 'text', 'indexing', 'and', 'searching', 'capabiliti', 'from', 'python']

 4 ----------- org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper@22356acd

 ['pylucene', 'pylucene is', 'pylucene is a', 'is', 'is a', 'is a python', 'a'

### 참조: https://pythonhosted.org/lupyne/index.html

# Lupyne is:
- high-level Pythonic search engine library, built on PyLucene  
- RESTful JSON search server, built on CherryPy  
- simple Python client for interacting with the server  

In [None]:
from lupyne import engine   # don't forget to call lucene.initVM

# Create an in-memory index (no filename supplied).
indexer = engine.Indexer()
# Create a stored 'name' field.
indexer.set('name', stored=True)
# Create an indexed 'text' field (the default).
indexer.set('text', engine.Field.Text)
# Add a document to the index.
indexer.add(name='sample', text='hello world')
# Commit changes; the document is now searchable.
indexer.commit()


In [None]:
# Run the query 'text:hello' against the index; the result is a sequence of matching documents.
hits = indexer.search('text:hello')

In [None]:
# Display the repr of the search result object.
hits

<lupyne.engine.documents.Hits at 0x21d682c2d48>

In [None]:
# Number of documents in the returned sequence.
len(hits)

1

In [None]:
# NOTE(review): presumably the total number of matches for the query; confirm against the lupyne docs.
hits.count 

1

# indexers
Basic indexing and searching example adapted from http://lucene.apache.org/core/4_10_1/core/index.html

In [None]:
import lucene
from org.apache.lucene import analysis, document, index, queryparser, search, store, util
from lupyne import engine
# Returns the JCC environment object.
# NOTE(review): an earlier cell already starts the VM behind a getVMEnv() guard; confirm this repeated call is intended.
lucene.initVM()

<jcc.JCCEnv at 0x21d682b45d0>

In [None]:
# # # lucene # # #

# Analyzer shared by the index writer configuration and the query parser in the following cells.
analyzer = analysis.standard.StandardAnalyzer()

In [None]:
# Store the index in memory:
directory = store.RAMDirectory()

# To store an index on disk, use an FSDirectory instead, for example:
# directory = store.FSDirectory.open(Paths.get("/tmp/testindex"))
# NOTE(review): assumes java.nio.file.Paths is imported and that FSDirectory.open takes a Path in this Lucene version; confirm.

# Open a writer on the directory, configured to analyze text with `analyzer`.
config = index.IndexWriterConfig( analyzer)
iwriter = index.IndexWriter(directory, config)
# Build one document with a single field using the TextField.TYPE_STORED field type, and index it.
doc = document.Document()
text = "This is the text to be indexed."
doc.add(document.Field("fieldname", text, document.TextField.TYPE_STORED))
iwriter.addDocument(doc)
# Close the writer before the index is opened for reading in the next cell.
iwriter.close()

In [None]:
# Now search the index:
# Previous form, kept for reference:
# ireader = index.IndexReader.open(directory)
ireader = index.DirectoryReader.open(directory)

In [None]:
# Searcher that runs queries against the reader opened in the previous cell.
isearcher = search.IndexSearcher(ireader)

In [None]:
# Parse a simple query that searches for "text":
# Previous form, which also took a Version argument, kept for reference:
# parser = queryparser.classic.QueryParser(util.Version.LUCENE_CURRENT, "fieldname", analyzer)
# NOTE(review): "fieldname" is presumably the default field for query terms that name no field; confirm.
parser = queryparser.classic.QueryParser("fieldname", analyzer)
query = parser.parse("text")

In [None]:
# Run the query (requesting up to 1000 results) and verify the single hit.
# The reader and the directory are closed in a `finally` clause so they are
# released even when one of the assertions fails.
try:
    hits = isearcher.search(query, 1000).scoreDocs
    assert len(hits) == 1

    # Iterate through the results:
    for hit in hits:
        # Load the stored document for this hit and compare its field to the indexed text.
        hitDoc = isearcher.doc(hit.doc)
        assert hitDoc['fieldname'] == text
finally:
    ireader.close()
    directory.close()

# queries
- Convenient Query creation.  
- Operator overloading is used for combining boolean clauses.

In [None]:
import lucene
from org.apache.lucene import index, search
from org.apache.lucene.search import spans
from lupyne.engine import Query
# Returns the JCC environment object.
# NOTE(review): an earlier cell already starts the VM behind a getVMEnv() guard; confirm this repeated call is intended.
lucene.initVM()

<jcc.JCCEnv at 0x21d68568810>

In [None]:
# # # lucene # # #

# PhraseQuery and BooleanQuery cannot be created with a no-argument
# constructor in this Lucene version (doing so raises InvalidArgsError), so
# both are assembled through their nested Builder classes.
q1 = search.TermQuery(index.Term('text', 'lucene'))

phrase_builder = search.PhraseQuery.Builder()
phrase_builder.add(index.Term('text', 'search'))
phrase_builder.add(index.Term('text', 'engine'))
q2 = phrase_builder.build()

# Both clauses are required (MUST), which renders with a leading '+'.
boolean_builder = search.BooleanQuery.Builder()
boolean_builder.add(q1, search.BooleanClause.Occur.MUST)
boolean_builder.add(q2, search.BooleanClause.Occur.MUST)
q3 = boolean_builder.build()
assert str(q3) == '+text:lucene +text:"search engine"'

# Span queries: restrict 'hello' to positions 0-10, then exclude spans where
# 'hello' is immediately followed by 'world' (slop 0, in order).
q1 = spans.SpanTermQuery(index.Term('text', 'hello'))
q2 = spans.SpanTermQuery(index.Term('text', 'world'))
q3 = spans.SpanPositionRangeQuery(q1, 0, 10)
q4 = spans.SpanNearQuery([q1, q2], 0, True)
q5 = spans.SpanNotQuery(q3, q4)
assert str(q5) == 'spanNot(spanPosRange(text:hello, 0, 10), spanNear([text:hello, text:world], 0, true), 0, 0)'


InvalidArgsError: (<class 'org.apache.lucene.search.PhraseQuery'>, '__init__', ())