# PyLucene

This notebook demonstrates using Lucene (through the PyLucene bindings) to index files and then search them.

To set up PyLucene:
```
conda activate nlu
cd pylucene-8.1.1
conda install jcc
make
make install
```

In [22]:
import sys, os, lucene, threading, time
from datetime import datetime

from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig, IndexOptions, DirectoryReader
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.queryparser.classic import QueryParser

In [23]:
# Directory the Lucene index is written to by IndexFiles and opened from by search_loop.
INDEX_DIR = "indexes/licenses"
# Root of the directory tree that is walked for .txt files to index.
DATA_DIR = "data/licenses"

In [3]:
# Start the embedded JVM once per kernel. JCC raises ValueError if initVM() is
# called with vmargs while a JVM is already running, so reuse the existing
# environment when this cell is re-run. Either way the cell evaluates to the
# JCCEnv object.
lucene.getVMEnv() or lucene.initVM(vmargs=['-Djava.awt.headless=true'])

<jcc.JCCEnv at 0x7f606002b2f0>

In [4]:
class Ticker(object):
    """Prints a dot to stdout once per second until ``tick`` is set to False.

    Intended to be driven from a background thread via ``run``; the owner
    stops it by assigning ``ticker.tick = False``.
    """

    def __init__(self):
        # Flag polled by run(); flip to False to end the loop.
        self.tick = True

    def run(self):
        """Emit '.' and sleep one second, repeating while ``self.tick`` is truthy."""
        while True:
            if not self.tick:
                return
            stream = sys.stdout
            stream.write('.')
            stream.flush()
            time.sleep(1.0)

In [5]:
"""                                                                                                                           
This class is loosely based on the Lucene (Java implementation) demo class
org.apache.lucene.demo.IndexFiles.  It takes a directory as an argument
and indexes all of the .txt files in that directory and downward recursively.
It indexes the file name, the containing directory path and the file contents.
The resulting Lucene index is written to the directory passed as storeDir.
"""
class IndexFiles(object):
    """Build a Lucene index from the ``.txt`` files under a directory tree.

    Constructing an instance does all the work: it opens an IndexWriter on
    ``storeDir`` in CREATE mode, walks ``root`` recursively, adds one
    document per ``.txt`` file, commits and closes the writer.

    Each document carries up to three fields:
      * ``name``     - the file name (stored, not tokenized)
      * ``path``     - the directory containing the file (stored, not tokenized)
      * ``contents`` - the file text (tokenized, not stored); omitted for
        empty files
    """

    # Token limit handed to LimitTokenCountAnalyzer.
    MAX_TOKENS = 1048576

    def __init__(self, root, storeDir, analyzer):
        """Index every ``.txt`` file under ``root`` into ``storeDir``.

        Args:
            root: directory tree to walk for files.
            storeDir: directory the index is written to; created (including
                missing parents) when it does not exist.
            analyzer: Lucene analyzer used for tokenized fields; it is
                wrapped in a LimitTokenCountAnalyzer.
        """
        # makedirs (not mkdir) so nested targets such as "indexes/licenses"
        # work on a fresh checkout where the parent directory is missing.
        os.makedirs(storeDir, exist_ok=True)

        store = SimpleFSDirectory(Paths.get(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, self.MAX_TOKENS)
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        ticker = Ticker()
        try:
            self.indexDocs(root, writer)
            # Keep the cursor on this line so the ticker dots and 'done'
            # follow the message.
            print('commit index', end=' ')
            threading.Thread(target=ticker.run).start()
            writer.commit()
            writer.close()
        except BaseException:
            # Drop uncommitted changes and close the writer so a failed run
            # does not keep the index write lock held for the rest of the
            # session.
            writer.rollback()
            raise
        finally:
            # Always stop the ticker thread, even when commit/close fails;
            # otherwise it would keep printing dots forever.
            ticker.tick = False
        print('done')

    def indexDocs(self, root, writer):
        """Add one document per ``.txt`` file found under ``root``.

        Failures on an individual file are reported on stdout and do not
        stop the walk.

        Args:
            root: directory tree to walk.
            writer: open IndexWriter the documents are added to.
        """
        # Stored verbatim, indexed as a single term: used for name and path.
        stored_type = FieldType()
        stored_type.setStored(True)
        stored_type.setTokenized(False)
        stored_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        # Analyzed full text with positions, not stored: used for contents.
        text_type = FieldType()
        text_type.setStored(False)
        text_type.setTokenized(True)
        text_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        # dirpath (rather than rebinding ``root``) keeps the parameter intact.
        for dirpath, dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.txt'):
                    continue
                print("adding %s" % filename)
                try:
                    path = os.path.join(dirpath, filename)
                    # ``with`` closes the handle even when read() raises
                    # (for example on a decoding error).
                    with open(path) as source:
                        contents = source.read()
                    doc = Document()
                    doc.add(Field("name", filename, stored_type))
                    doc.add(Field("path", dirpath, stored_type))
                    if len(contents) > 0:
                        doc.add(Field("contents", contents, text_type))
                    else:
                        print("warning: no content in %s" % filename)
                    writer.addDocument(doc)
                except Exception as e:
                    print("Failed in indexDocs: %s" % e)


In [6]:
def index_test1():
    """Index DATA_DIR into INDEX_DIR with a StandardAnalyzer and report timing.

    Prints the Lucene version first, then the elapsed wall-clock time on
    success. On failure the error is printed and re-raised.
    """
    print('lucene version %s' % lucene.VERSION)
    started_at = datetime.now()
    try:
        IndexFiles(DATA_DIR, INDEX_DIR, StandardAnalyzer())
        elapsed = datetime.now() - started_at
        print('Elapsed: %s' % elapsed)
    except Exception as err:
        print("Failed: %s" % err)
        raise err

In [7]:
# Build the index; the output lists each file added and the elapsed time.
index_test1()

lucene version 8.1.1
adding apache1.0.txt
adding lpgl2.0.txt
adding mozilla1.1.txt
adding cpl1.0.txt
adding epl1.0.txt
adding mozilla_eula_thunderbird2.txt
adding gpl3.0.txt
adding freebsd.txt
adding mit.txt
adding gpl2.0.txt
adding mozilla_eula_firefox3.txt
adding apache1.1.txt
adding apache2.0.txt
adding lgpl2.1.txt
adding gpl1.0.txt
adding lgpl3.txt
commit index
.done
Elapsed: 0:00:00.175316


In [20]:
def search_loop(max_hits=50):
    """Interactively query the "contents" field of the index in INDEX_DIR.

    Prompts for queries until an empty line is entered. For every query the
    stored "path" and "name" of each returned document are printed.

    Args:
        max_hits: maximum number of documents fetched per query. The
            "total matching documents" line reports the number of documents
            returned, so it never exceeds this value.
    """
    directory = SimpleFSDirectory(Paths.get(INDEX_DIR))
    try:
        reader = DirectoryReader.open(directory)
        try:
            _run_queries(IndexSearcher(reader), StandardAnalyzer(), max_hits)
        finally:
            # Release the index files when the loop ends or fails.
            reader.close()
    finally:
        directory.close()


def _run_queries(searcher, analyzer, max_hits):
    """Prompt/parse/search/print loop; returns when an empty query is entered."""
    print("Hit enter with no input to quit.")
    while True:
        command = input("Query:")
        if command == '':
            return
        print("Searching for: %s" % command)
        try:
            query = QueryParser("contents", analyzer).parse(command)
        except lucene.JavaError as e:
            # A malformed query (e.g. an unbalanced parenthesis) should not
            # end the interactive session; report it and prompt again.
            print("Invalid query: %s" % e)
            continue
        scoreDocs = searcher.search(query, max_hits).scoreDocs
        print("%s total matching documents." % len(scoreDocs))

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print('path:', doc.get("path"), 'name:', doc.get("name"))


In [24]:
# Start the interactive prompt; submit an empty query to stop.
search_loop()

Hit enter with no input to quit.
Query:property
Searching for: property
9 total matching documents.
path: data/licenses name: cpl1.0.txt
path: data/licenses name: epl1.0.txt
path: data/licenses name: mozilla1.1.txt
path: data/licenses name: mozilla_eula_thunderbird2.txt
path: data/licenses name: mozilla_eula_firefox3.txt
path: data/licenses name: gpl2.0.txt
path: data/licenses name: lpgl2.0.txt
path: data/licenses name: lgpl2.1.txt
path: data/licenses name: gpl3.0.txt
Query:free
Searching for: free
13 total matching documents.
path: data/licenses name: gpl1.0.txt
path: data/licenses name: gpl2.0.txt
path: data/licenses name: lgpl2.1.txt
path: data/licenses name: lpgl2.0.txt
path: data/licenses name: gpl3.0.txt
path: data/licenses name: lgpl3.txt
path: data/licenses name: mozilla_eula_firefox3.txt
path: data/licenses name: mit.txt
path: data/licenses name: mozilla_eula_thunderbird2.txt
path: data/licenses name: apache2.0.txt
path: data/licenses name: cpl1.0.txt
path: data/licenses name: