In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity
import pickle

In [2]:
# Build the English stop-word collection once, at cell scope, as a set so that
# tokenize_clean can filter tokens with a set membership test.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to have been
# downloaded beforehand — confirm for fresh environments.
stop_words = set(stopwords.words("english"))
def tokenize_clean(document):
    """Split ``document`` into tokens and keep only the useful ones.

    A token is kept when it consists solely of alphabetic characters and its
    lowercase form is not in ``stop_words``. Kept tokens are returned
    lowercased, in their original order (duplicates are preserved).
    """
    kept = []
    for token in word_tokenize(document):
        # Drop punctuation, numbers and mixed tokens such as "42" or "n't".
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)
    return kept

In [3]:
# Load the pre-built search artifacts from the current working directory.
# None of these files is created in the visible cells — presumably a separate
# indexing step wrote them (TODO confirm) — so all four must already exist.

# Vocabulary used by generate_results to turn query tokens into a bag-of-words.
my_dict = Dictionary.load_from_text("search_dict.txt")
# TF-IDF model applied to the query bag-of-words before it is scored.
my_tfidf = TfidfModel.load("model_tfidf.tfidf")
# Similarity index that scores a query vector against the indexed documents.
index = SparseMatrixSimilarity.load("model_index.index")
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# load raw_documents.pkl when it comes from a trusted source.
with open("raw_documents.pkl", "rb") as file:
    # generate_results indexes raw_documents by the position of each score in
    # the similarity result, so it is assumed to be ordered like the rows of
    # `index` — TODO confirm against the indexing step.
    raw_documents = pickle.load(file)

In [4]:
def generate_results(query, to_return = 10, snippet_chars = 100):
    """Return previews of the documents most similar to ``query``.

    The query is tokenized with ``tokenize_clean``, converted to a
    bag-of-words with ``my_dict``, transformed with ``my_tfidf`` and scored
    against the documents in ``index``.

    Parameters
    ----------
    query : str
        Free-text search query.
    to_return : int, optional
        Maximum number of previews to return (default 10). It is applied as a
        slice bound on the ranked matches.
    snippet_chars : int, optional
        Number of leading characters of each matching document to return
        (default 100, the previously hard-coded preview length).

    Returns
    -------
    list of str
        Document previews ordered by decreasing similarity; documents with
        equal scores keep their corpus order. Only documents with a strictly
        positive score are included, so the list can be shorter than
        ``to_return`` and is empty when the query has no usable terms.
    """
    query_bow = my_dict.doc2bow(tokenize_clean(query))
    if not query_bow:
        # Nothing in the query survived cleaning / vocabulary lookup. Every
        # document would score 0, and ranking them would just return the first
        # documents of the corpus, so report "no results" instead.
        return []

    sims = index[my_tfidf[query_bow]]

    # Keep only real matches: a score of 0 means the document shares no
    # weighted term with the query and must not pad the result list.
    matches = [(doc_id, score) for doc_id, score in enumerate(sims) if score > 0]
    # list.sort is stable even with reverse=True, so ties stay in corpus order.
    matches.sort(key=lambda pair: pair[1], reverse=True)

    return [raw_documents[doc_id][:snippet_chars] for doc_id, _ in matches[:to_return]]

In [5]:
# Demo query on a common first name; the bare call is the cell's last
# expression, so the returned previews (up to the default to_return of 10) are
# displayed as the cell output.
generate_results("robert")

['Robert Curthose (–1134), sometimes styled Robert II or Robert III, was the Duke of Normandy from 108',
 'Robert the Magnificent () († 1035), was the sixth Duke of Normandy from 1027 until he died returning',
 'Robert Johnson may mean:\n\n In politics\nRobert Johnson (governor), South Carolina\nRobert Johnson (Tex',
 'Robert I of Scotland (11 July 1274 – 7 June 1329) was King of Scotland from 1306 to 1329. He is bett',
 'Robert II Archbishop of Rouen (989–1037) and Count of Évreux. He was an important and influential me',
 'Alain Robert (born Robert Alain Philippe; 7 August 1962) is a French rock climber and urban climber.',
 '\n\nDeaths \n March 21 – Robert I, Duke of Burgundy',
 'Robert Todd Lincoln (August 1, 1843 – July 26, 1926) was an American lawyer and Secretary of War. He',
 'Robert Evans (June 29, 1930 – October 26, 2019) was an American movie producer and studio executive.',
 'A Fnord is the typographic representation of disinformation or useless information with the intent

In [6]:
# Demo query on a surname; the returned previews are displayed as the cell output.
generate_results("einstein")

['Albert Einstein Square  (, Kikar Albert Einstein) is a public square in Jerusalem, Israel. it is loc',
 'Albert Einstein (14 March 1879 – 18 April 1955) was a German-born American scientist. He worked on t',
 'Stewart Robert "Bob" Einstein (November 20, 1942 – January 2, 2019) was an American actor, comedy wr',
 'Meir Einstein (\u200e; 21 October 1951 – 23 March 2017) was an Israeli sports broadcaster. He was born\xa0in',
 'Elsa Einstein, (18 January 1876 - 20 December 1936), was the second wife of the German phyicist, Alb',
 'Arieh "Arik" Einstein (January 3, 1939 – November 26, 2013) was a popular Israel singer, songwriter,',
 'Hendrik Antoon Lorentz (18 July 1853 – 4 February 1928) was a Dutch physicist. In 1902, he was award',
 'Mileva Marić (1875-1948) was a Serbian physicist. She was one of the first women to study physics. S',
 'The EPR paradox is an early and strong criticism of quantum mechanics. Albert Einstein and his co-wo',
 'Space-time is a mathematical model that joins

In [7]:
# Demo query on a common noun; the returned previews are displayed as the cell output.
generate_results("earth")

['Earth is the third planet of the solar system. It is the only planet known to have life on it. The E',
 'The Earth phase, Terra phase, Terrestrial phase or phase of the Earth, is the shape of the directly ',
 "All planets in our solar system follow an elliptical path. This path is known as an orbit. Earth's o",
 'The Blue Marble is a photograph of the Earth. It was made on December 2, 1972, by the crew of the Ap',
 'Earth is a city in Lamb County, Texas, United States. There were 1,109 people living in Earth as of ',
 'The origin of water on Earth is only partly known. Life as we know it depends on water, and most typ',
 'The idea of a flat Earth is that the surface of the Earth is flat (a plane). Belief in a flat Earth ',
 '"Earthrise" is a photo of the Earth from the Moon taken on the Apollo 8 mission. It was taken by ast',
 'The Flat Earth Society (or the International Flat Earth Research Society) is an organization claimin',
 'Young Earth creationism (YEC) is the pseudoscientific