In [1]:
import pickle
import numpy as np
import json
from sentence_transformers import SentenceTransformer

import warnings
warnings.filterwarnings('ignore')


In [2]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is always closed,
    even if unpickling fails.

    NOTE(review): ``pickle.load`` can execute arbitrary code embedded in the
    file; only load artifacts that come from a trusted source.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# Per-document metadata; retrieve() indexes all of these with the ids
# returned by index.search().
titles = _load_pickle('Files/titles.pkl')
embeddings = _load_pickle('Files/embeddings.pkl')
authors = _load_pickle('Files/authors.pkl')
years = _load_pickle('Files/years.pkl')
summary = _load_pickle('Files/summary.pkl')

# Search index, previously loaded through an open() call that was never closed.
index = _load_pickle('Files/index.sav')

# The encoder is instantiated by model name instead of being unpickled:
#model = pickle.load(open('Files/model.sav' , 'rb'))
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

In [9]:
# simple query to return the first 5 similar documents (Titles, Authors, Years, Summary)

def retrieve(query, k=5):
    """Return up to ``k`` documents matching ``query``.

    The query is encoded with the module-level ``model`` and passed to
    ``index.search``; every returned document id is then looked up in the
    module-level ``titles``, ``authors``, ``years`` and ``summary``
    collections.

    Args:
        query: Query text to encode.
        k: Number of hits requested from the index (default 5).

    Returns:
        A list of dictionaries with the keys ``'Title'``, ``'Author'``,
        ``'Year'`` and ``'Summary'``, in the order the index returned them.
        The list can be shorter than ``k`` when the index returns fewer
        valid ids.
    """
    xq = model.encode([query])
    _, I = index.search(xq, k)

    results = []
    # Walk the ids that actually came back instead of assuming exactly k.
    for doc_id in I[0]:
        # NOTE(review): presumably a FAISS-style index, which fills missing
        # hits with -1; a negative id would otherwise wrap around and
        # silently return the last document. Confirm the index type.
        if doc_id < 0:
            continue
        results.append(
            {
                'Title': titles[doc_id],
                'Author': authors[doc_id],
                'Year': years[doc_id],
                'Summary': summary[doc_id]
            }
        )

    return results

In [12]:
# testing 😀
# Smoke test: run one sample query through retrieve() with its default k and
# keep the hits in `results`, which later cells write to JSON files.

query = 'What is the best way to learn Python?'

results = retrieve(query)

# The triple-quoted string below is a disabled print loop. Because it is the
# last expression in the cell, its text is echoed as the cell output rather
# than being executed.
""" for result in results:
    print(result['Title'])
    print(result['Author'])
    print(result['Year'])
    print(result['Summary'])
    print('') """

" for result in results:\n    print(result['Title'])\n    print(result['Author'])\n    print(result['Year'])\n    print(result['Summary'])\n    print('') "

In [13]:
def write_json(data):
    """Serialize ``data`` to ``Sample_jsons/excport.json`` and return that path.

    Args:
        data: Any object ``json.dump`` can serialize (e.g. the list of
            result dictionaries produced by ``retrieve``).

    Returns:
        The relative path of the written file, ``'Sample_jsons/excport.json'``.

    Raises:
        TypeError: If ``data`` contains values that are not JSON serializable.
    """
    # Local import keeps this cell runnable on its own.
    import os

    base_path = 'Sample_jsons/'
    # The output file name is kept as-is because callers receive this path.
    export_path = base_path + 'excport.json'

    # Create the target directory on first use; previously a missing
    # directory made open() fail with FileNotFoundError.
    os.makedirs(base_path, exist_ok=True)

    with open(export_path, 'w') as f:
        json.dump(data, f)

    print('JSON file created successfully')
    return export_path

In [14]:
# testing 😀
# Export the hits of the sample query; write_json() prints a confirmation and
# its returned path is displayed as the cell output.
write_json(results)

JSON file created successfully


'Sample_jsons/excport.json'

-------

In [None]:
# Serialize the list of result dictionaries and store it in results3.json
# as a single JSON document.
with open('results3.json', 'w') as out_file:
    out_file.write(json.dumps(results))

----
## Gradio app

In [5]:
import gradio as gr

# Input widget: a single-line text box for the query string.
query_box = gr.inputs.Textbox(lines=1, placeholder="Enter Query...", label="Query text")

# Output widget: shows what retrieve() returns as JSON, i.e. a list of
# entries with the keys Title, Author, Year and Summary.
results_view = gr.outputs.JSON(label="Similar Documents")

# Sample queries handed to the interface as its `examples`.
example_queries = [
    ["Mathematical models of the spread of infectious diseases in humans and animals"],
    ["A new method for solving the nonlinear eigenvalue problem"],
]

# Wire retrieve() to the widgets; only the query is supplied by the UI, so
# retrieve() runs with its default k.
interface = gr.Interface(
    fn=retrieve,
    inputs=query_box,
    outputs=results_view,
    title="Semantic Search",
    description="Search for similar documents using semantic search.",
    allow_flagging=False,
    examples=example_queries,
)

In [6]:
# Start serving the interface with launch()'s default arguments.
interface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


(<gradio.routes.App at 0x7f800075b4f0>, 'http://127.0.0.1:7860/', None)

-----

-----
## JSON Out

In [5]:
# Serialize `results` to a JSON string for HTML output.
# json is already imported in the first cell; importing it again is harmless.
import json

# indent=4 pretty-prints the JSON so the printed text is easy to read.
x = json.dumps(results, indent = 4) 
print(x)

{
    "Result 1": [
        "Integrative Imaging Informatics for Cancer Research: Workflow Automation\n  for Neuro-oncology (I3CR-WANO)",
        "Daniel S. Marcus",
        2022,
        "Efforts to utilize growing volumes of clinical imaging data to generate tumor\nevaluations continue to require significant manual data wrangling owing to the\ndata heterogeneity. Here, we propose an artificial intelligence-based solution\nfor the aggregation and processing of multisequence neuro-oncology MRI data to\nextract quantitative tumor measurements. Our end-to-end framework i) classifies\nMRI sequences using an ensemble classifier, ii) preprocesses the data in a\nreproducible manner, iii) delineates tumor tissue subtypes using convolutional\nneural networks, and iv) extracts diverse radiomic features. Moreover, it is\nrobust to missing sequences and adopts an expert-in-the-loop approach, where\nthe segmentation results may be manually refined by radiologists. Following the\nimplementation of 

In [6]:
# Persist `results` as one JSON document under the HTML/ folder.
with open('HTML/results.json', 'w') as out_file:
    out_file.write(json.dumps(results))

----

## Reserves



```python

def retrieve(query, k=5):
    """Return the hits for ``query`` as a dictionary of parallel lists.

    Args:
        query: Query text to encode with the module-level ``model``.
        k: Number of hits requested from ``index.search`` (default 5).

    Returns:
        A dictionary with the keys ``'titles'``, ``'authors'``, ``'years'``
        and ``'summary'``; position ``i`` of every list describes the same
        document. The lists can be shorter than ``k`` when the index returns
        fewer valid ids.
    """
    xq = model.encode([query])
    _, I = index.search(xq, k)

    results = {'titles': [], 'authors': [], 'years': [], 'summary': []}
    # Walk the ids that actually came back instead of assuming exactly k.
    for doc_id in I[0]:
        # NOTE(review): presumably a FAISS-style index that fills missing
        # hits with -1; a negative id would otherwise wrap around to the
        # last document. Confirm the index type.
        if doc_id < 0:
            continue
        results['titles'].append(titles[doc_id])
        results['authors'].append(authors[doc_id])
        results['years'].append(years[doc_id])
        results['summary'].append(summary[doc_id])

    return results
    #return [titles[i] for i in I[0]], [authors[i] for i in I[0]], [years[i] for i in I[0]], [summary[i] for i in I[0]]


# Print every hit stored in the parallel lists of `results`, followed by a
# separator line.
for position, title in enumerate(results['titles']):
    print(title)
    print(results['authors'][position])
    print(results['years'][position])
    print(results['summary'][position])
    print('----------------------------------------' * 2)


# Write each item of `results` to results2.json, one JSON value per line.
with open('results2.json', 'w') as f:
    for item in results:
        json.dump(item, f)
        # Plain newline separator. The former "'\n' % item" applied string
        # formatting to a template without placeholders, which raises
        # TypeError whenever `item` is a string (e.g. a dictionary key).
        f.write('\n')


# simple query to return the first 5 similar documents (Titles, Authors, Years, Summary)
def retrieve(query, k=5):
    """Return the hits for ``query`` keyed as ``'Result 1'``, ``'Result 2'``, ...

    Args:
        query: Query text to encode with the module-level ``model``.
        k: Number of hits requested from ``index.search`` (default 5).

    Returns:
        A dictionary with one ``'Result N'`` entry per returned hit; each
        value is the list ``[title, author, year, summary]``. Keys are
        created per hit, so any ``k`` works (the previous five hard-coded
        keys raised KeyError for ``k > 5``), and no empty placeholder
        entries are produced when fewer hits come back.
    """
    xq = model.encode([query])
    _, I = index.search(xq, k)

    results = {}
    for doc_id in I[0]:
        # NOTE(review): presumably a FAISS-style index that fills missing
        # hits with -1; a negative id would otherwise wrap around to the
        # last document. Confirm the index type.
        if doc_id < 0:
            continue
        # Number results consecutively, starting at 1.
        key = 'Result ' + str(len(results) + 1)
        results[key] = [
            titles[doc_id],
            authors[doc_id],
            years[doc_id],
            summary[doc_id],
        ]
    return results


## Draft for json output
# Requested number of hits.
k = 5
xq = model.encode(['Africa Technology'])
D, I = index.search(xq, k)

# Build one dictionary per hit and collect them all in a list.
results = [
    {
        'Title': titles[I[0][pos]],
        'Author': authors[I[0][pos]],
        'Year': years[I[0][pos]],
        'Summary': summary[I[0][pos]],
    }
    for pos in range(k)
]

# Store the list of dictionaries in results3.json as one JSON document.
with open('results3.json', 'w') as out_file:
    out_file.write(json.dumps(results))

```