In [1]:
import lucene
from org.apache.lucene.document import Document, Field
from org.apache.lucene.search import IndexSearcher, Explanation
from org.apache.lucene.search.similarities import TFIDFSimilarity, LMJelinekMercerSimilarity;
from org.apache.lucene.index import IndexReader,DirectoryReader,TermsEnum,Term
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory, FSDirectory
from org.apache.lucene.util import Version, BytesRefIterator
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.core import WhitespaceAnalyzer 
from org.apache.lucene.queryparser.flexible.standard import StandardQueryParser
from sklearn.utils import check_random_state
from functools import partial
import numpy as np
from java.io import File
from org.apache.lucene.analysis.en import EnglishAnalyzer
import lime
from lime import lime_ranker
import re

In [2]:



# Input locations: the Lucene index directory and the tab-separated sample file
# that load_samples() reads.
# NOTE(review): both are absolute paths on one developer's machine; change them
# before running this notebook anywhere else.
index_path = '/Users/manishav/workspace/irexplain/trec_index'
samples_path = '/Users/manishav/workspace/irexplain/samples/pointwise/samples_tfidf.txt'



def load_samples(file_path):
    """Read samples from a tab-separated file and group them by document id.

    A usable line has at least five tab-separated fields. Fields 0, 1, 3 and 4
    are read as query id, sample id, sample text and sample score (field 2 is
    ignored). The document id is the sample id with everything from its last
    ``_`` onwards removed, e.g. ``FBIS4-23025_172`` -> ``FBIS4-23025``.

    Lines that cannot be parsed (too few fields, no ``_`` in the sample id,
    non-numeric score) are printed as their split field list and skipped.
    Whitespace-only lines are skipped silently. A document id is only added to
    the result once one of its lines has parsed successfully, so the result
    never contains an empty sample list.

    Args:
        file_path: path of the tab-separated sample file.

    Returns:
        dict mapping document id -> list of dicts with keys ``'query_id'``,
        ``'sample_id'``, ``'sample_text'`` (stripped) and ``'sample_score'``
        (float), in file order.
    """
    samples_list = {}
    with open(file_path, 'r') as ifile:
        for line in ifile:
            if not line.strip():
                continue
            split = line.split('\t')
            try:
                sample_id = split[1]
                # rindex raises ValueError when the id has no '_' separator.
                doc_id = sample_id[:sample_id.rindex('_')]
                sample = {'query_id': split[0],
                          'sample_id': sample_id,
                          'sample_text': split[3].strip(),
                          'sample_score': float(split[4])}
            except (IndexError, ValueError):
                # Malformed row: report it and carry on with the rest of the file.
                print(split)
                continue
            samples_list.setdefault(doc_id, []).append(sample)
    return samples_list

#[\\\\/:*?"<>|]

def tokenize_text(text):
    """Run ``text`` through Lucene's query parser and return the parsed form.

    The text is parsed by a StandardQueryParser built on an EnglishAnalyzer,
    using an empty default field, and the resulting query is rendered back to
    a string (again with an empty field). Every ``(`` and ``)`` is then
    removed from that string.

    Args:
        text: the raw text to analyse.

    Returns:
        The parsed query string with all parentheses removed.
    """
    query_parser = StandardQueryParser(EnglishAnalyzer())
    query_string = query_parser.parse(text, '').toString('')
    # Strip the parentheses that appear in the query's string form.
    return re.sub('[)()]', '', query_string)
        

In [3]:
# Build the LIME ranker explainer; index_path is handed to it as one of its arguments.
# NOTE(review): all arguments are positional and their meaning is not visible in
# this notebook; confirm them against lime_ranker.LimeRankerExplainer's signature.
ranker_explanation = lime_ranker.LimeRankerExplainer(1, None, True, [0,1,2,3,4], index_path, 1)

In [4]:
# Parse the sample file into a dict keyed by document id. Rows that fail to
# parse are printed by load_samples() instead of being loaded.
samples = load_samples(samples_path)

['449 ', 'FBIS4-23025_172', '', 'spokesman']


In [5]:
# For every document: collect the scores of its samples, normalise each sample
# text with tokenize_text(), and ask the explainer for an explanation.
# NOTE(review): explain_object is reassigned on every iteration, so only the
# explanation of the last document processed is still available after the loop.
# NOTE(review): the meaning of the final argument (10) is not visible here;
# confirm against lime_ranker.
for doc_id, sample_list in samples.items():
    sample_scores = [x['sample_score'] for x in sample_list]
    sample_texts = [tokenize_text(x['sample_text']) for x in sample_list]
    explain_object = ranker_explanation.explain_document_label(doc_id, sample_texts, sample_scores, 10)

dict_keys(['0091', '1', '10', '130', '182123', '1990', '27', '30', '77', 'accid', 'ad', 'american', 'author', 'bu', 'carri', 'center', 'citi', 'column', 'desk', 'determin', 'diplomat', 'edit', 'embank', 'februari', 'foot', 'foreign', 'germani', 'home', 'hurt', 'immedi', 'injur', 'injuri', 'la022790', 'leav', 'militari', 'minor', 'oberwesel', 'page', 'part', 'passeng', 'personnel', 'plung', 'polic', 'recreat', 'rel', 'report', 'rhine', 'river', 'road', 'spokesman', 'staff', 'suffer', 'time', 'tour', 'tourism', 'tourist', 'tuesdai', 'twenti', 'u.', 'west', 'wire', 'word', 'world', 'youth'])
'2790' is not in list  :: Sample 2790 0091 youth passeng staff diplomat center 182123 77 77 182123 wire suffer
'uari' is not in list  :: Sample west 77 77 recreat 30 recreat author uari river foot recreat desk
'2790' is not in list  :: Sample accid injur passeng ad minor la022790 0091 oberwesel 2790 0091 germani recreat recreat part
'2790' is not in list  :: Sample home 2790 0091 0091 1990 la022790 00

ValueError: Found input variables with inconsistent numbers of samples: [51, 50]

In [None]:
# Inspect one parsed sample (index 2) of document LA022790-0091.
samples['LA022790-0091'][2]

In [None]:
# Start the JVM that PyLucene runs on, unless one is already running.
# lucene.initVM() raises ValueError when it is given vmargs while a JVM is
# already up, so an unconditional call makes this cell fail on re-run.
# NOTE(review): earlier cells already instantiate Lucene classes (for example
# EnglishAnalyzer inside tokenize_text), so the JVM has to be running before
# this point. Consider moving this cell directly below the imports.
if lucene.getVMEnv() is None:
    lucene.initVM(classpath=lucene.CLASSPATH, vmargs=['-Djava.awt.headless=true'])
analyzer = StandardAnalyzer()
print(lucene.CLASSPATH)

In [None]:
# Debugging probe: run the tokenizer on a hand-written string.
# NOTE(review): the input looks like the unstemmed form of the first sample
# reported as failing in the explanation run above; confirm.
tokenize_text('2790-0091 youths passengers staff diplomatic center 182123 77 77 182123 wire suffered')

In [None]:
# Turn the index location string into the path object passed to FSDirectory.open below.
indexPath = File(index_path).toPath()

In [None]:
# Open the on-disk Lucene index directory at indexPath.
index_dir = FSDirectory.open(indexPath)

In [None]:
# Open an index reader over the directory.
# NOTE(review): no reader.close() appears in the visible cells.
reader = DirectoryReader.open(index_dir)

In [None]:
# Searcher over the reader; used below to look up a document by its id.
searcher = IndexSearcher(reader)

In [None]:
# Look up document LA022790-0091 by querying the 'id' field, keeping at most
# one hit. text_field names the field whose term vector is read in the next
# cell. Note that `analyzer` rebinds the name assigned in an earlier cell.
text_field = 'words'
analyzer = WhitespaceAnalyzer()
query_parser = QueryParser('id', analyzer)
score_docs = searcher.search(query_parser.parse(str('LA022790-0091')),1).scoreDocs

In [None]:
import math

# Compute a TF-IDF weight for every term in the term vector of the document
# found by the id lookup (score_docs[0]). All three dicts stay empty when the
# lookup returned no hit or the field has no stored term vector.
tc_dict = {}                     # Counts of each term
dc_dict = {}                     # Number of docs associated with each term
tfidf_dict = {}                  # term -> tf * idf
if len(score_docs) > 0:
    # get the tf-idf vector.
    termVector = reader.getTermVector(score_docs[0].doc, text_field)
    N_terms = 0
    if termVector is None:
        # getTermVector returns null when term vectors were not indexed for
        # this document/field; report it instead of failing on .iterator().
        print('error in term_dict: no term vector for field', text_field)
    else:
        termsEnumvar = termVector.iterator()
        termsref = BytesRefIterator.cast_(termsEnumvar)
        try:
            while termsref.next():
                termval = TermsEnum.cast_(termsref)
                fg = termval.term().utf8ToString()       # Term in unicode
                tc = termval.totalTermFreq()             # Term count in the doc

                # Number of docs having this term in the index
                dc = reader.docFreq(Term(text_field, termval.term()))
                N_terms = N_terms + 1
                tc_dict[fg] = tc
                dc_dict[fg] = dc
        except Exception as err:
            # Best effort: keep whatever was collected, but show what went wrong.
            print('error in term_dict', err)

    # Compute TF-IDF for each term
    # NOTE(review): N_terms is incremented once per enumerated term, so tf is
    # normalised by the number of distinct terms rather than by the total
    # token count of the document; confirm this is intended.
    num_docs = reader.numDocs()    # loop-invariant, so fetched once
    for term in tc_dict:
        tf = tc_dict[term] / N_terms
        idf = 1 + math.log(num_docs / (dc_dict[term] + 1))
        tfidf_dict[term] = tf * idf


In [None]:
# Display the computed term -> TF-IDF mapping.
tfidf_dict