In [1]:
import pickle
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

import warnings
warnings.filterwarnings('ignore')


## 1

### Files

In [2]:
# paths
titles_path = 'Files/titles.pkl'
embeddings_path = 'Files/embeddings.pkl'
authors_path = 'Files/authors.pkl'
years_path = 'Files/years.pkl'
summary_path = 'Files/summary.pkl'

In [3]:
# load numpy array from pickle file
with open('Files/titles.pkl', 'rb') as f:
    titles = pickle.load(f)

with open('Files/embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)

with open('Files/authors.pkl', 'rb') as f:
    authors = pickle.load(f)

with open('Files/years.pkl', 'rb') as f:
    years = pickle.load(f)

with open('Files/summary.pkl', 'rb') as f:
    summary = pickle.load(f)

In [4]:
# convert to numpy array
titles = np.array(titles)
embeddings = np.array(embeddings)
authors = np.array(authors)
years = np.array(years)
summary = np.array(summary)

In [5]:
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

In [6]:
# save the model to disk as a pickle file
filename = 'model.sav'
pickle.dump(model, open(filename, 'wb'))

# load the model from disk
loaded_model = pickle.load(open(filename , 'rb'))

In [7]:
sample = 'What is the best way to learn Python?'
#print(model.encode([sample]))
#print(loaded_model.encode([sample]))
print('---------------------' * 3)
print(len(model.encode([sample])[0]))
print(len(loaded_model.encode([sample])[0]))

---------------------------------------------------------------
384
384


----

In [8]:
nlist = 50
m = 8
bits = 8
dimensions = embeddings.shape[1]

quantizer = faiss.IndexFlatIP(dimensions)
index = faiss.IndexIVFPQ(quantizer, dimensions, nlist, m, bits)
print(index.is_trained)

False


In [9]:
index.train(embeddings)
index.add(embeddings)
print(index.ntotal)
print(index.is_trained)

: 

: 

In [13]:
# save the index to disk as a pickle file
index_filename = 'index.sav'
pickle.dump(index, open(index_filename, 'wb'))

# load the index from disk
loaded_index = pickle.load(open(index_filename , 'rb'))

In [14]:
%%time
k = 5
D, I = index.search(model.encode([str(input("Enter Query: "))]), k)
print(D)

[[1.174143  1.2508657 1.2596219 1.2784854 1.2793882]]
CPU times: user 109 ms, sys: 29.1 ms, total: 138 ms
Wall time: 4.77 s


In [15]:
print(I)

[[15695  7873 15016 18628 10918]]


In [16]:
[f'{i}: {titles[i]}' for i in I[0]]

['15695: FNTF: First No-reference Then Full-reference image quality assessment using Dark Channel.',
 '7873: Rectification of figures and photos in document images using bounding box interface.',
 '15016: Design and Implementation of QoS-driven Dynamic Slot Assignment and Piconet Partitioning Algorithms over Bluetooth WPANs.',
 '18628: PiCode: 2D barcode with embedded picture and ViCode: 3D barcode with embedded video.',
 '10918: MANTRA: Minimum Maximum Latent Structural SVM for Image Classification and Ranking.']

### Testing

#### 1

In [16]:
# sample function to get the top k similar documents, with default k=5
def get_similar_documents(query, index, model, titles, k=5):
    D, I = index.search(model.encode([query]), k)
    return [titles[i] for i in I[0]]

get_similar_documents('What is the best way to learn Python?', loaded_index, loaded_model, titles)

['Learning Where You Are Going and from Whence You Came: h- and g-Cost Learning in Real-Time Heuristic Search.',
 'Learning by Reading: A Prototype System, Performance Baseline and Lessons Learned.',
 'Integrating learning objects into an open learning environment: evaluation of learning processes in an informatics learning lab.',
 'Typing Tutor: Individualized Tutoring in Text Entry for Older Adults Based on Input Stumble Detection.',
 'Analogy Tutor: A Tutoring System for Promoting Conceptual Learning via Comparison.']

In [17]:
def get_similar_documents(query, index, model, titles, k=5):
    D, I = index.search(model.encode([query]), k)
    for i in I:
        print(titles[i])

In [20]:
query = str(input("Enter Query: "))
get_similar_documents(query, loaded_index, loaded_model, titles)

['Bits of History, Challenges for the Future and Autonomic Computing Technology.'
 'Empirically Studying Software Practitioners - Bridging the Gap between Theory and Practice.'
 'The evolution of C programming practices: a study of the Unix operating system 1973-2015.'
 'A living laboratory for the design and evaluation of ubiquitous computing technologies.'
 '15 Years of Application of Statistical Physics Methods to the Study of Software Systems.']


----

#### 2

In [22]:
import pickle


# load numpy arrays from pickle file
""" with open('titles_np.pkl', 'rb') as f:
    titles = pickle.load(f) """

'''with open('embeddings_np.pkl', 'rb') as f:
    embeddings = pickle.load(f)'''

# load the model
#model = pickle.load(open('model.sav' , 'rb'))

# load the index from disk
#index = pickle.load(open('index.sav' , 'rb'))

def get_embeddings(query):
    xq = model.encode([query])
    return xq


def get_similar_documents(query, index, model, titles, k=5):
    xq = get_embeddings(str(query))
    D, I = index.search(xq, k)
    for i in I:
        print(titles[i])

query = str(input("Enter Query: "))
get_similar_documents(query, index, model, titles)

#get_similar_documents('What is the best way to learn Python?', index, model, titles)

['Brief Announcement: Fast and Simple Node Coloring in the SINR Model.'
 'The Dirac-Motzkin Problem on Ordinary Lines and the Orchard Problem (Invited Talk).'
 'Compressed suffix arrays and suffix trees with applications to text indexing and string matching (extended abstract).'
 'Linear-Time Algorithms for Two Subtree-Comparison Problems on Phylogenetic Trees with Different Species.'
 'Minimal Tail-Biting Trellises for Certain Cyclic Block Codes Are Easy to Construct.']


----

#### 3

In [10]:
with open('titles.pkl', 'rb') as f:
    titles = pickle.load(f)

with open('embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)


# convert to numpy array
titles = np.array(titles)
embeddings = np.array(embeddings)

# ----------------- model  -----------------
#model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

# load model
model = pickle.load(open('model.sav' , 'rb'))

# ----------------- faiss_index  -----------------
nlist = 50
m = 8
bits = 8
dimensions = embeddings.shape[1]

quantizer = faiss.IndexFlatIP(dimensions)
index = faiss.IndexIVFPQ(quantizer, dimensions, nlist, m, bits)
index.train(embeddings)
index.add(embeddings)

# ----------------- search  -----------------

def get_embeddings(query):
    xq = model.encode([query])
    return xq

def get_similar_documents(query, index=index, model=model, titles=titles, k=5):
    xq = get_embeddings(str(query))
    D, I = index.search(xq, k)
   
    print(titles[I[0][0]])


query = 'Quavo'
get_similar_documents(query)


Finding Galaxies in the Shadows of Quasars with Gaussian Processes.


In [19]:
def get_embeddings(query):
    xq = model.encode([query])
    return xq

In [46]:
def get_similar_documents(query):
    k = 10
    xq = get_embeddings(str(query))
    D, I = index.search(xq, k)
    # return list of similar documents
    return [titles[i] for i in I[0]]

In [2]:
# test funct.
def name(text):
    return 'Hello ' + text, 'Hello ' + text

In [48]:
print(name('John'))
print(get_similar_documents('maths'))

('Hello John', 'Hello John')
['Floats and Ropes: A Case Study for Formal Numerical Program Verification.', 'Digital Libraries: Extending and Applying Library and Information Science and Technology.', 'The fractal dimension metric and its use to assess object-oriented software quality.', 'UX research: what theoretical roots do we build on - if any?', "Don't look now, but we've created a bureaucracy: the nature and roles of policies and rules in wikipedia.", 'Academic Software Engineering: What Is and What Could Be? Results of the First Annual Survey for International SE Programs.', "Design requirements for more flexible structured editors from a study of programmers' text editing.", 'Ten steps of integrating user feedback into the product definition process: a closed loop approach.', 'Using knowledge elicitation to improve Web effort estimation: Lessons from six industrial case studies.', 'Java Program Analysis Projects in Osaka University: Aspect-Based Slicing System ADAS and Ranked-Co

#### Deployment

In [None]:
import gradio as gr

iface = gr.Interface(
    fn=get_similar_documents,
    inputs = gr.Textbox(lines=1, placeholder="Enter Query...", label="Query"),

    # variable number of outputs based on the number of results argument 
    outputs = [ gr.outputs.Textbox(label="First similar document"),
                gr.outputs.Textbox(label="Second similar document"),
                gr.outputs.Textbox(label="Third similar document"),
                gr.outputs.Textbox(label="Fourth similar document"),
                gr.outputs.Textbox(label="Fifth similar document"),
                gr.outputs.Textbox(label="Sixth similar document"),
                gr.outputs.Textbox(label="Seventh similar document"),
                gr.outputs.Textbox(label="Eighth similar document"),
                gr.outputs.Textbox(label="Ninth similar document"),
                gr.outputs.Textbox(label="Tenth similar document")],
   
    title="Search Engine",
    description="Search Engine using Sentence Transformers and Faiss Indexing",
    allow_flagging=False,
    allow_screenshot=False,
    allow_output_caching=False,

    )
iface.launch()

## 2, (desiging retrerival)

In [2]:
with open('Files/titles.pkl', 'rb') as f:
    titles = pickle.load(f)

with open('Files/embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)

with open('Files/authors.pkl', 'rb') as f:
    authors = pickle.load(f)

with open('Files/years.pkl', 'rb') as f:
    years = pickle.load(f)

with open('Files/summary.pkl', 'rb') as f:
    summary = pickle.load(f)

index = pickle.load(open('Files/index.sav' , 'rb'))

#model = pickle.load(open('Files/model.sav' , 'rb'))
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

In [4]:
print(index.ntotal)
print(index.is_trained)

28000
True


In [5]:
# simple query to return the first 5 similar documents (Titles, Authors, Years, Summary)

def retrieve(query, k=5):
    xq = model.encode([query])
    D, I = index.search(xq, k)
    return [titles[i] for i in I[0]], [authors[i] for i in I[0]], [years[i] for i in I[0]], [summary[i] for i in I[0]]

In [15]:
#results = retrieve('maths')
results = retrieve(str(input("Enter Query: ")))

# print Title, Author, Year, Summary of the first 5 similar documents
for i in range(len(results[0])):
    print(results[0][i]," by: " , results[1][i], results[2][i])
    print("-------------------------" * 4)
    print(results[3][i])
    print()
    print("=========================" * 4)

Recent Advances of Blockchain and its Applications  by:  Weili Wu 2022
----------------------------------------------------------------------------------------------------
Blockchain is an emerging decentralized data collection, sharing and storage
technology, which have provided abundant transparent, secure, tamper-proof,
secure and robust ledger services for various real-world use cases. Recent
years have witnessed notable developments of blockchain technology itself as
well as blockchain-adopting applications. Most existing surveys limit the
scopes on several particular issues of blockchain or applications, which are
hard to depict the general picture of current giant blockchain ecosystem. In
this paper, we investigate recent advances of both blockchain technology and
its most active research topics in real-world applications. We first review the
recent developments of consensus mechanisms and storage mechanisms in general
blockchain systems. Then extensive literature is conducted o