In [1]:
import os
import re

import dash
import dash_core_components as dcc
import dash_html_components as html
import matplotlib.pyplot as plt
import pandas as pd
import pyterrier as pt
from dash import jupyter_dash
from dash.dependencies import Input, Output, State

jupyter_dash.default_mode="external"


The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc
The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initialise PyTerrier only when it has not been started in this kernel yet,
# so the cell can be re-run without calling pt.init() a second time.
terrier_running = pt.started()
if not terrier_running:
    pt.init()

PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
dataset = pt.datasets.get_dataset('irds:cord19/trec-covid')

# In-memory lookup docno -> {"title", "abstract"}, used by the search UI to
# render results without going back to the corpus.
# `or` (rather than a .get default) is used so that documents whose title or
# abstract is present but empty/None also get the fallback value.
# If a docno occurs more than once in the corpus, the last occurrence wins.
doc_info_dict = {
    doc['docno']: {
        "title": doc.get("title") or "No Title Available",
        "abstract": doc.get("abstract") or "",
    }
    for doc in dataset.get_corpus_iter()
}


cord19/trec-covid documents: 100%|██████████| 192509/192509 [00:01<00:00, 179647.33it/s]


In [6]:
# Uncomment to delete the on-disk index and force a full rebuild in the next cell:
# !rm -rf ./cord19-index

In [4]:
# Build the default index over title + abstract, or reuse the one already on
# disk. Without the existence check, re-running this cell against an existing
# ./cord19-index directory fails, and a fresh build re-reads the whole corpus.
# Delete the directory to force a rebuild.
# NOTE(review): the build log reports "Key ... is not unique" from the
# MetaIndexBuilder, i.e. the corpus iterator yields at least one repeated
# docno — confirm whether duplicates should be filtered before indexing.
INDEX_DIR = './cord19-index'
index_properties = os.path.join(INDEX_DIR, 'data.properties')
if os.path.exists(index_properties):
    indexref = pt.IndexRef.of(index_properties)
else:
    indexer = pt.index.IterDictIndexer(INDEX_DIR)  # initialize an indexer object
    indexref = indexer.index(dataset.get_corpus_iter(), fields=('title', 'abstract'))
index = pt.IndexFactory.of(indexref)
BM25_br = pt.BatchRetrieve(index, wmodel="BM25")

cord19/trec-covid documents:   1%|          | 1941/192509 [00:03<01:55, 1649.76it/s]



cord19/trec-covid documents: 100%|██████████| 192509/192509 [00:34<00:00, 5553.30it/s]


11:01:52.439 [ForkJoinPool-1-worker-3] ERROR org.terrier.structures.indexing.Indexer - Could not finish MetaIndexBuilder: 
java.io.IOException: Key 8lqzfj2e is not unique: 37597,11755
For MetaIndex, to suppress, set metaindex.compressed.reverse.allow.duplicates=true
	at org.terrier.structures.collections.FSOrderedMapFile$MultiFSOMapWriter.mergeTwo(FSOrderedMapFile.java:1374)
	at org.terrier.structures.collections.FSOrderedMapFile$MultiFSOMapWriter.close(FSOrderedMapFile.java:1308)
	at org.terrier.structures.indexing.BaseMetaIndexBuilder.close(BaseMetaIndexBuilder.java:321)
	at org.terrier.structures.indexing.classical.BasicIndexer.indexDocuments(BasicIndexer.java:270)
	at org.terrier.structures.indexing.classical.BasicIndexer.createDirectIndex(BasicIndexer.java:388)
	at org.terrier.structures.indexing.Indexer.index(Indexer.java:377)
	at org.terrier.python.ParallelIndexer$3.apply(ParallelIndexer.java:131)
	at org.terrier.python.ParallelIndexer$3.apply(ParallelIndexer.java:120)
	at java.

In [8]:
# Uncomment to delete the on-disk "vanilla" index and force a full rebuild in the next cell:
# !rm -rf ./cord19-index_v2

In [5]:
# "Vanilla" index: same corpus and fields as the default index, but built with
# stemming and stopword removal switched off (stemmer=None, stopwords=None).
# Reuse the index already on disk when present so the cell can be re-run;
# delete ./cord19-index_v2 to force a rebuild.
INDEX_V2_DIR = './cord19-index_v2'
index_v2_properties = os.path.join(INDEX_V2_DIR, 'data.properties')
if os.path.exists(index_v2_properties):
    indexref_v2 = pt.IndexRef.of(index_v2_properties)
else:
    indexer_v2 = pt.index.IterDictIndexer(INDEX_V2_DIR, stemmer=None, stopwords=None)  # initialize an indexer object
    indexref_v2 = indexer_v2.index(dataset.get_corpus_iter(), fields=('title', 'abstract'))
index_v2 = pt.IndexFactory.of(indexref_v2)
BM25_br_vanilla = pt.BatchRetrieve(index_v2, wmodel="BM25")

cord19/trec-covid documents:   7%|▋         | 14416/192509 [00:02<00:26, 6652.53it/s]



cord19/trec-covid documents: 100%|██████████| 192509/192509 [00:32<00:00, 5874.03it/s]

11:02:29.320 [ForkJoinPool-2-worker-3] ERROR org.terrier.structures.indexing.Indexer - Could not finish MetaIndexBuilder: 
java.io.IOException: Key 8lqzfj2e is not unique: 37597,11755
For MetaIndex, to suppress, set metaindex.compressed.reverse.allow.duplicates=true
	at org.terrier.structures.collections.FSOrderedMapFile$MultiFSOMapWriter.mergeTwo(FSOrderedMapFile.java:1374)
	at org.terrier.structures.collections.FSOrderedMapFile$MultiFSOMapWriter.close(FSOrderedMapFile.java:1308)
	at org.terrier.structures.indexing.BaseMetaIndexBuilder.close(BaseMetaIndexBuilder.java:321)
	at org.terrier.structures.indexing.classical.BasicIndexer.indexDocuments(BasicIndexer.java:270)
	at org.terrier.structures.indexing.classical.BasicIndexer.createDirectIndex(BasicIndexer.java:388)
	at org.terrier.structures.indexing.Indexer.index(Indexer.java:377)
	at org.terrier.python.ParallelIndexer$3.apply(ParallelIndexer.java:131)
	at org.terrier.python.ParallelIndexer$3.apply(ParallelIndexer.java:120)
	at java.




11:02:33.816 [ForkJoinPool-2-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 56 empty documents


In [6]:
from deep_model_utils import DeepModel

# Load one DeepModel per model name and register it in `model_hub` under that
# name. Each model has its own directory and `.npy` embeddings file inside
# `model_dir`; all of them share the same documents.json.
# The loop variable `m` is intentionally left bound to the last model loaded.
model_dir = 'ceng596'
documents_path = f'{model_dir}/documents.json'
model_hub = {}
for model_name in ('all-MiniLM-L6-v2', 'all-MiniLM-L12-v2', 'all-distilroberta-v1'):
    checkpoint = f'{model_dir}/{model_name}'
    m = DeepModel(
        model_path=checkpoint,
        embeddings_path=f'{checkpoint}.npy',
        documents_path=documents_path,
    )
    model_hub[model_name] = m

In [7]:
# Smoke-test one dense retriever, selected explicitly by name rather than via
# the loop variable `m` left over from the model-loading cell.
model_hub['all-distilroberta-v1'].retrieve_top_documents('covid', 10)

['ovxmy1as',
 'm2k6usaz',
 '2onwu92a',
 'tq0dzwy1',
 'w86qc3fq',
 '9tybviyk',
 'tycggzr3',
 'zp4uy1v7',
 'gvfooevu',
 'b2d7t7ge']

In [8]:

# Bo1 query-expansion rewriter over `index`.
bo1 = pt.rewrite.Bo1QueryExpansion(index)

# Create the pipeline for query expansion and retrieval:
# retrieve -> rewrite the query with Bo1 -> retrieve again.
pipelineQE = BM25_br >> bo1 >> BM25_br

# Define the Dash application.
# Dash `style` dictionaries use camelCased CSS property names
# ("marginBottom", not "margin-bottom").
app = dash.Dash(__name__)

# Row 1: free-text query box and the number of results to show (1-50).
keyword_row = html.Div(
    style={"marginBottom": "20px"},
    children=[
        html.Label("Enter your keyword:  ", style={"fontWeight": "bold", "fontSize": "16px"}),
        dcc.Input(
            id="keyword-input",
            type="text",
            value="",
            style={"width": "50%", "padding": "10px", "fontSize": "16px", "borderRadius": "10px"},
        ),
        html.Label("Results per page:", style={"fontSize": "14px", "marginLeft": "20px", "marginRight": "10px"}),
        dcc.Input(
            id="result-limit-input",
            type="number",
            value=10,
            min=1,
            max=50,
            style={"width": "60px", "padding": "5px", "fontSize": "14px"},
        ),
    ],
)

# Row 2: retrieval model selector — the three BM25 variants plus one entry per
# model registered in `model_hub`.
model_options = [
    {"label": "BM25_br", "value": "BM25_br"},
    {"label": "BM25_br_vanilla", "value": "BM25_br_vanilla"},
    {"label": "BM25_br_QE", "value": "BM25_br_QE"},
]
model_options += [{"label": f'Vector similarity ({k})', "value": k} for k in model_hub.keys()]

model_row = html.Div(
    style={"marginBottom": "20px"},
    children=[
        html.Label("Retrieval Model:", style={"fontSize": "14px", "marginRight": "10px"}),
        dcc.RadioItems(
            id="model-choice",
            options=model_options,
            value="BM25_br",  # Default selection
            labelStyle={"display": "inline-block", "marginRight": "10px", "fontSize": "14px"},
        ),
    ],
)

# Row 3: single checkbox; its value list contains "show" when ticked.
abstracts_row = html.Div(
    style={"marginBottom": "20px"},
    children=[
        html.Label("Show Abstracts:", style={"fontSize": "14px", "marginLeft": "20px", "marginRight": "10px"}),
        dcc.Checklist(
            id="show-abstracts",
            options=[{"label": " ", "value": "show"}],
            value=[],
            style={"display": "inline-block", "marginRight": "10px", "fontSize": "14px"},
        ),
    ],
)

search_button = html.Button(
    id="search-button",
    children="Search",
    n_clicks=0,
    style={
        "backgroundColor": "#4CAF50",  # Green color
        "color": "white",
        "border": "none",  # Remove border
        "padding": "10px 20px",
        "fontSize": "16px",
        "cursor": "pointer",
        "borderRadius": "5px",  # Rounded corners
    },
)

# Both output areas (expanded query text and result list) share one style.
output_style = {"textAlign": "left", "marginTop": "20px", "fontSize": "16px"}

app.layout = html.Div(
    style={"textAlign": "center", "fontFamily": "Arial, sans-serif"},
    children=[
        html.H1("BingBuster's Search Engine", style={"marginBottom": "20px", "color": "#333"}),
        keyword_row,
        model_row,
        abstracts_row,
        search_button,
        html.Div(id="expanded-query", children=[], style=dict(output_style)),
        html.Div(id="search-results", children=[], style=dict(output_style)),
    ],
)

def _format_results(doc_ids, show_abstracts):
    """Render docnos as an ordered HTML list of titles, each with its abstract nested below.

    The abstract list is always emitted; it is hidden with ``display: none``
    unless "show" is among the checklist values in ``show_abstracts``.
    A docno missing from ``doc_info_dict`` is rendered with placeholder text
    instead of aborting the whole result page.
    """
    abstract_display = "block" if "show" in (show_abstracts or []) else "none"
    placeholder = {"title": "No Title Available", "abstract": ""}
    items = []
    for docno in doc_ids:
        info = doc_info_dict.get(docno, placeholder)
        items.append(
            html.Li(
                children=[
                    html.Span(children=f"{info['title']}"),
                    html.Ul(
                        children=[html.Li(children=f"{info['abstract']}")],
                        style={"marginLeft": "20px", "display": abstract_display},
                    ),
                ],
                style={"marginBottom": "5px"},
            )
        )
    return html.Ol(children=items)


# Search callback. Only the Search button is an Input; the form fields are read
# as State, so a retrieval runs once per click instead of on every keystroke
# or option change.
@app.callback(
    [Output(component_id="search-results", component_property="children"),
     Output(component_id="expanded-query", component_property="children")],
    [Input(component_id="search-button", component_property="n_clicks")],
    [
        State(component_id="keyword-input", component_property="value"),
        State(component_id="result-limit-input", component_property="value"),
        State(component_id="model-choice", component_property="value"),
        State(component_id="show-abstracts", component_property="value"),
    ],
)
def update_results(n_clicks, keyword, result_limit, selected_model, show_abstracts):
    """Run the selected retrieval model for ``keyword`` and render the hits.

    Args:
        n_clicks: click count of the Search button (0 before the first click).
        keyword: query text from the keyword box.
        result_limit: value of the "Results per page" number input; None when
            the box is empty or holds an invalid number, in which case 10
            (the input's initial value) is used.
        selected_model: "BM25_br", "BM25_br_vanilla", "BM25_br_QE", or a key
            of ``model_hub``.
        show_abstracts: checklist value list; contains "show" when ticked.

    Returns:
        A 2-tuple ``(search_results_children, expanded_query_children)``.
        The second element is non-empty only for the query-expansion model.
        Any exception raised during retrieval or rendering is reported as an
        "Error: ..." string in the first element rather than propagated.
    """
    if not n_clicks or not keyword or not keyword.strip():
        return [], ""  # Handle initial state and empty input

    limit = int(result_limit) if result_limit else 10

    expanded_query = ""
    # Retrieve documents based on selected model
    try:
        if selected_model == "BM25_br":
            top_docs = retrieve_top_documents(keyword, BM25_br, limit)
        elif selected_model == "BM25_br_vanilla":
            top_docs = retrieve_top_documents(keyword, BM25_br_vanilla, limit)
        elif selected_model == "BM25_br_QE":
            # Perform query expansion
            rewritten = (BM25_br >> bo1).search(keyword)
            if rewritten.empty:
                # No expanded query was produced, so there is nothing to search with.
                return "No relevant documents found.", ""
            expanded_query = rewritten['query'].iloc[0]
            # Remove term weights ("^1.23") and the "applypipeline:off " control
            # prefix so the expanded query can be displayed and re-issued as text.
            expanded_query = re.sub(r'\^\d+\.\d+', '', expanded_query)
            expanded_query = re.sub(r'applypipeline:off ', '', expanded_query)
            top_docs = retrieve_top_documents(expanded_query, BM25_br, limit)
        elif selected_model in model_hub:
            top_docs = model_hub[selected_model].retrieve_top_documents(keyword, limit)
        else:
            return f"Error: unknown retrieval model {selected_model!r}", ""

        if not top_docs:
            return "No relevant documents found.", ""

        # Format results as HTML list
        results_list = _format_results(top_docs[:limit], show_abstracts)
        expanded_label = f"Expanded Query: {expanded_query}" if expanded_query else ""
        return results_list, expanded_label
    except Exception as e:
        # Deliberate best effort: show the failure in the page instead of a broken callback.
        return f"Error: {str(e)}", ""

def retrieve_top_documents(keyword, retrieval_model, result_limit):
    """Return the docnos of the first ``result_limit`` hits for ``keyword``.

    Args:
        keyword: query text passed to ``retrieval_model.search(query=...)``.
        retrieval_model: object whose ``search`` returns a DataFrame with a
            "docno" column (e.g. a PyTerrier retriever).
        result_limit: number of leading rows to keep.

    Returns:
        list of docno values, in the order returned by the retriever.
    """
    ranked = retrieval_model.search(query=keyword)
    return ranked.head(result_limit)["docno"].tolist()

# Start the Dash server. `app.run` is the current entry point;
# `app.run_server` is its deprecated alias.
if __name__ == "__main__":
    app.run(debug=True)


Dash app running on http://127.0.0.1:8050/




In [None]:
# http://127.0.0.1:8050/