# Search engine Exercise 2

In [1]:
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import os
import re
from natsort import natsorted

# Creating the Search Engine 

First of all lets gather all the documents in one list.
* documents: lists of document. Each document correspond to an anime description.

In [2]:
documents = []

folder = r"./tsv_anime/"
for anime in tqdm(natsorted(os.listdir(folder))):
    df = pd.read_csv(folder+anime, sep = "\t")
    documents.append(df["animeDescription"][0])

100%|██████████████████████████████████████████████████████████████████████████████| 6500/6500 [01:15<00:00, 86.02it/s]


In [3]:
# view a document 
documents[1]



## Clean the documents

Now let's cleaning all the documents. This step is colled preprocessing. We follow this order:
- 1) expand contraction form + Normalization (capital lower words)
- 2) remove number from text. We want only text string 
- 3) Tokanize. We divide the string in words.
- 4) removing stopwords
- 5) removing punctuation
- 6) removing some other words or non-text string
- 7) stemming 

Let's inspect the document to see what we can delete and what no:
- for example: "Philosopher's Stone—a powerful" << This is dash
- but "bio-mechanical engineering" << This is hyphen

We decide to keep the hypen and remove the dash

Also we encounter a lot of contraction form: using wikipedia https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
we store them in a dictionary and restore the long form.

In [4]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import string

In [5]:
contractions = {
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had",
"he'd've": "he would have",
"he'll": "he shall",
"he'll've": "he shall have",
"he's": "he has",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has",
"i'd": "I had",
"i'd've": "I would have",
"i'll": "I shall",
"i'll've": "I shall have",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'd": "it had",
"it'd've": "it would have",
"it'll": "it shall",
"it'll've": "it shall have",
"it's": "it has",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had",
"she'd've": "she would have",
"she'll": "she shall",
"she'll've": "she shall have",
"she's": "she has",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that has",
"there'd": "there had",
"there'd've": "there would have",
"there's": "there has",
"they'd": "they had",
"they'd've": "they would have",
"they'll": "they shall",
"they'll've": "they shall have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall",
"what'll've": "what shall have",
"what're": "what are",
"what's": "what has",
"what've": "what have",
"when's": "when has",
"when've": "when have",
"where'd": "where did",
"where's": "where has",
"where've": "where have",
"who'll": "who shall",
"who'll've": "who shall have",
"who's": "who has",
"who've": "who have",
"why's": "why has",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had",
"you'd've": "you would have",
"you'll": "you shall",
"you'll've": "you shall have",
"you're": "you are",
"you've": "you have"
}

def replace_words(d, contractions):
    for key, value in contractions.items():
        d = d.replace(key, value)
    return d

In [173]:
def pre_processing(documents):
    stop = stopwords.words("english")
    snowball_stemmer = SnowballStemmer("english")
    lmtzr = WordNetLemmatizer()
    remove = ["Written", "MAL", "Rewrite"]+["'s"]+["``",'"',"'","“","''"]

    
    # removing contraction + Normalization
    document_tmp = replace_words(documents.lower(), contractions)
    # remove number ( ordinal number too) 
    document_tmp = re.sub(r'[0-9]+(?:st|nd|rd|th)', '', document_tmp)
    # remove dash "—"
    document_tmp = document_tmp.replace('—',' ')
    # Tokenizing 
    document_tmp =  word_tokenize(document_tmp) 
    # removing stopwords
    document_tmp = [ word for word in document_tmp if word not in stop]
    # removing punctuation
    document_tmp = [ word for word in document_tmp if word not in string.punctuation]
    # removing "Written MAL Rewrite" and other stuff
    document_tmp = [ word for word in document_tmp if word not in remove]
    # lemmatize
    document_tmp = [ lmtzr.lemmatize(word) for word in document_tmp]
    # stemming 
    document_tmp = [ snowball_stemmer.stem(word) for word in document_tmp]

    
    return document_tmp

- documents_clean: is a list of list of the documents cleaned. Each list contain the tokenize cleaning document text.

In [174]:
# cleaning the documents
documents_clean = []
for d in documents:
    documents_clean.append(pre_processing(d))

Let's view how a document is processed:

In [175]:
documents_clean[0][:10]

['horrif',
 'alchemi',
 'experi',
 'go',
 'wrong',
 'elric',
 'household',
 'brother',
 'edward',
 'alphons']

# Creating vocabulary

In [176]:
import itertools
import numpy as np

I will create a list of each unique word among the all documents

In [177]:
# the list of all words
word_list = list(set(list(itertools.chain.from_iterable(documents_clean))))

Creating a dictionary that maps each word to an integer

In [178]:
vocabolary = dict(zip(word_list, range(len(word_list))))

In [179]:
# view the vocabolary
count = 0
for key, mapped_int in vocabolary.items():
    count +=1
    print(key,"-->",mapped_int)
    if count == 10: break

forti --> 0
fabiola --> 1
gamer --> 2
alternate-histori --> 3
savor --> 4
blue-hair --> 5
2002 --> 6
sakuranomori --> 7
limited-edit --> 8
morimoto --> 9


## Search Engine v1 Conjunctive query

For this type of search engine we need only to have a search engine based on the query appear or not in each documents. 


* ### Prepare the mapped document

To do this we will create an array of documents of each len(document_j) in which thare are converted the word into integer based on the vocabolary

We will use numpy array for time optimitation.

In [180]:
# function that map document text to integer

def word_to_int(document, vocabolary):
    int_doc = np.zeros(len(document))
    # iterating over the document that has len(d)<<len(vocabolary)
    # change the value of the document, otherwise remain zero
    for i, word in enumerate(document):
        # vocabolary[word] is the mapping function that return an integer i.e the index
        int_doc[i] = vocabolary[word]
        
    return np.sort(int_doc)

* documents_mapped: is a list of list that have the words mapped

In [181]:
documents_mapped = []
for d in documents_clean:
    documents_mapped.append(word_to_int(d,vocabolary))

In [182]:
# view a doc
documents_mapped[0]

array([  297.,   388.,   388.,   388.,   526.,   620.,   664.,   769.,
         769.,   857.,   904.,  1269.,  1937.,  2145.,  2351.,  2776.,
        2815.,  3174.,  3230.,  3232.,  3252.,  3362.,  3593.,  3738.,
        3850.,  3881.,  3977.,  3983.,  4135.,  4161.,  4455.,  4455.,
        4610.,  5204.,  5993.,  5993.,  6115.,  6361.,  6425.,  7445.,
        7480.,  8026.,  8400.,  8916.,  9194.,  9580.,  9718.,  9718.,
        9718.,  9718.,  9997., 10103., 10142., 10181., 10333., 10928.,
       10928., 11321., 11634., 12500., 12729., 12729., 12729., 12966.,
       13126., 13385., 13521., 13605., 13792., 13842., 13842., 13853.,
       14284., 14444., 14758., 15035., 15073., 15078., 15218., 15768.,
       16319., 16691., 16988., 17068., 17239., 17875., 18036., 18292.,
       18342., 19062., 19077., 19613., 19613., 20114., 20390., 20466.,
       20613., 21175., 21391., 21811., 22554., 22746., 23379., 24180.,
       24623., 24843., 25170., 25587., 25723., 25798., 25898., 25903.,
      

* ### Inverted Index v1

In [183]:
from collections import defaultdict  

In [184]:
# initialize the Inverted_index
Inverted_index = defaultdict(list)

To compute the Inverted Index we iterating over each document. Every time we encounter a word (that is now a integer) we insert in the dictionary the id of the documents, which is the row index in documents_mapped or in the dataframe.

In [185]:
for i,d in enumerate(documents_mapped):
    for word in set(d):
        Inverted_index[word].append(i)

In [186]:
count = 0
for key, lis in Inverted_index.items():
    count +=1
    print(key,"-->", lis)
    if count == 1: break

4610.0 --> [0, 22, 36, 128, 204, 223, 331, 352, 407, 423, 483, 525, 558, 608, 641, 689, 768, 956, 1057, 1218, 1227, 1493, 1560, 1627, 1630, 1647, 1783, 1892, 2010, 2038, 2074, 2116, 2148, 2232, 2357, 2429, 2442, 2715, 2760, 2782, 3008, 3034, 3239, 3261, 3401, 3447, 3605, 3692, 3796, 3857, 3985, 4513, 4620, 4678, 4683, 4768, 4872, 4892, 4911, 4984, 5222, 5232, 5344, 5484, 5608, 5831, 6363, 6462]


Saving the Inverted Index in memory

In [187]:
import json

file = open("Inverted_index_v1.json", "w", encoding = "utf-8")
json.dump(Inverted_index, file)
file.close()

* ### Searching

Function for the search engine: the step by step are explain after

In [480]:
def search_engine_v1(query_text):
    # pre-processing the query
    query_clean = pre_processing(query_text)
    query_int = word_to_int(query_clean, vocabolary)
    
    # finding the anime index 
    index = []
    
    for query in query_int:
        # creating a set of index (set is for intersection pourpose)
        index.append(set(Inverted_index[query]))
    
    index = list(index[0].intersection(*index))
    
    # search the url
    with open("./anime_url.txt", "r", encoding = "utf-8") as f:
        lines = f.readlines()
        
    # we are searching for the anime and the url 
    anime_path = []
    url = []

    for idx in index:
        # we need the +1 because we start indexing from 1
        name = "/anime_"+str(idx+1)+".tsv"
        anime_path.append(name)
        url.append(lines[idx])

    # creating the datafrae for view the result
    animes_df = []
    # folder of the anime_tsv
    folder = r"./tsv_anime/"
    # column I want
    cols = ["animeTitle","animeDescription"]
    for i,anime_tsv in enumerate(anime_path):
        df = pd.read_csv(folder+anime_tsv, sep = "\t", usecols = cols)
        # creating new column
        df["animeURL"] = url[i]
        animes_df.append(df)
    
    frame = pd.concat(animes_df, ignore_index = True)
    display(frame)
    # delate the dataframe and some list to memory space
    #del(frame)
    #del(lines)

In [481]:
# input query
query_text = input()

search_engine_v1(query_text)

 alchemy alchemist


Unnamed: 0,animeTitle,animeDescription,animeURL
0,Fullmetal Alchemist: Brotherhood,After a horrific alchemy experiment goes wrong...,https://myanimelist.net/anime/5114/Fullmetal_A...
1,Fullmetal Alchemist,"Edward Elric, a young, brilliant alchemist, ha...",https://myanimelist.net/anime/121/Fullmetal_Al...
2,Fullmetal Alchemist: Brotherhood Specials,Amazing secrets and startling facts are expose...,https://myanimelist.net/anime/6421/Fullmetal_A...
3,Baccano!,"During the early 1930s in Chicago, the transco...",https://myanimelist.net/anime/2251/Baccano\n


* ### Step by step function

In [None]:
# pre-processing the query
query_clean = pre_processing(query_text)
query_int = word_to_int(query_clean, vocabolary)

In [None]:
query_int

We create a list named index that store a list of list, each one is the output from the inverted_index corrisponding to a query element. Than we intersect for obtain the documents that match ALL the query elements.

In [None]:
index = []
for query in query_int:
    index.append(set(Inverted_index[query]))

index = list(index[0].intersection(*index))
print(index)

Output

In [None]:
# search the url
with open("./anime_url.txt", "r", encoding = "utf-8") as f:
    lines = f.readlines()

In [None]:
# we are searching for the anime and the url 
anime_path = []
url = []

for idx in index:
    # we need the +1 because we start indexing from 1
    name = "/anime_"+str(idx+1)+".tsv"
    anime_path.append(name)
    url.append(lines[idx])

# creating the datafrae for view the result
animes_df = []
# folder of the anime_tsv
folder = r"./tsv_anime/"
# column I want
cols = ["animeTitle","animeDescription"]
for i,anime_tsv in enumerate(anime_path):
    df = pd.read_csv(folder+anime_tsv, sep = "\t", usecols = cols)
    # creating new column
    df["animeURL"] = url[i]
    animes_df.append(df)
    
frame = pd.concat(animes_df, ignore_index = True)
display(frame)

# Searching Engine v2 Ranking

Now we want for our Inverted_index two element:

* $\text{tf}_{i,j}$: occurancy of term $j$ in document $i$
* $\text{idf}_{j}$: Inverse Document Frequency of term $j$

Define:

* n_words = total number of words in vocabolary
* n = number of documents

In [190]:
n = len(documents)
n_words = len(vocabolary)

Creating the $\text{tf}_{i,j}$ matrix:

* ### Prepare the mapped document

To do this we will create an array of documents of each len(document_j) in which thare are converted the word into integer based on the vocabolary

We will use numpy array for time optimitation.

In [191]:
# function that map document text to integer

def word_to_int2(document, vocabolary):
    int_doc = np.zeros(len(vocabolary))
    # iterating over the document that has len(d)<<len(vocabolary)
    # change the value of the document, otherwise remain zero
    for i, word in enumerate(document):
        # vocabolary[word] is the mapping function that return an integer i.e the index
        int_doc[int(vocabolary[word])] += 1
        
    return int_doc

* documents_mapped: is a list of list that have the words mapped with is count é I have the tf

In [192]:
documents_mappedv2 = []
for d in documents_clean:
    documents_mappedv2.append(word_to_int2(d,vocabolary))

In [193]:
# view a doc
documents_mappedv2[0][documents_mappedv2[0]>0]

array([1., 3., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 2., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 4., 1., 1., 1., 1., 1., 2., 1., 1., 1.,
       3., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

Creating the $\text{idf}_{j}$ array:
* I need the $n_j$: number of documents containing term j

In [194]:
nj = np.zeros(n_words, dtype = np.int64)

for d in documents_mapped:
    for word in set(d):
        nj[int(word)] += 1

In [195]:
# view
nj

array([ 3,  1, 14, ..., 30,  3,  1], dtype=int64)

In [196]:
tf = documents_mappedv2

In [197]:
# creating idf_j

idf = np.zeros(n_words, dtype = np.float64)
idf = np.log(n/nj) / np.log(n)
    

In [198]:
tfIdf = np.multiply(tf,idf)

In [199]:
tfIdf.shape

(6500, 26282)

* ### Inverted indexi v2

In [200]:
from collections import defaultdict  

In [201]:
Inverted_indexv2 = defaultdict(list)

To compute the Inverted Index we iterating over each document. Every time we encounter a word (that is now a integer) we insert in the dictionary the __tupla__ of _id_ of the documents and the _tfIdf_.

In [202]:
for i,d in enumerate(documents_mapped):
    for word in set(d):
        Inverted_indexv2[int(word)].append((i,tfIdf[i,int(word)]))

In [203]:
# view
count = 0
for key, lis in Inverted_indexv2.items():
    count +=1
    print(key,"-->",lis)
    if count == 1: break

4610 --> [(0, 0.5193940325148905), (22, 0.5193940325148905), (36, 0.5193940325148905), (128, 0.5193940325148905), (204, 0.5193940325148905), (223, 0.5193940325148905), (331, 0.5193940325148905), (352, 0.5193940325148905), (407, 0.5193940325148905), (423, 0.5193940325148905), (483, 0.5193940325148905), (525, 0.5193940325148905), (558, 0.5193940325148905), (608, 0.5193940325148905), (641, 0.5193940325148905), (689, 0.5193940325148905), (768, 0.5193940325148905), (956, 0.5193940325148905), (1057, 0.5193940325148905), (1218, 0.5193940325148905), (1227, 0.5193940325148905), (1493, 1.038788065029781), (1560, 0.5193940325148905), (1627, 1.038788065029781), (1630, 0.5193940325148905), (1647, 0.5193940325148905), (1783, 0.5193940325148905), (1892, 0.5193940325148905), (2010, 0.5193940325148905), (2038, 0.5193940325148905), (2074, 0.5193940325148905), (2116, 0.5193940325148905), (2148, 0.5193940325148905), (2232, 0.5193940325148905), (2357, 0.5193940325148905), (2429, 0.5193940325148905), (2442,

Saving the Inverted Index in memory

In [545]:
import json

file = open("Inverted_index_v2.json", "w")
json.dump(Inverted_indexv2, file)
file.close()

* ### Searching

In [546]:
# input query
query_text = input()

 alchemy alchemist


In [547]:
query_clean = pre_processing(query_text)
query_int = word_to_int2(query_clean, vocabolary)

In [548]:
query_int

array([0., 0., 0., ..., 0., 0., 0.])

We define the cosine similarity as:
$$
\begin{equation}
\cos(q,d^i) =  \frac{\sum_{j=1}^d q_j*d_{ij}}{||d^i||*||q||}
\end{equation}
$$
Recall: $d^i = [\text{tfIdf}_{i1}, \text{tfIdf}_{i2}, \ldots]$ 

In [549]:
def cosine_sim(a,b):
    cosine = np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cosine

In [550]:
np.where(query_int>0)[0]

array([ 5993, 16691], dtype=int64)

Create the list of each match 

In [567]:
match_list = []
lenMatch = []
# tupla = (#list, len(list))
max_lenMatch = (0,-1)
for i,query in enumerate(np.where(query_int>0)[0]):
    lis = Inverted_indexv2[int(query)]
    match_list.append(lis)
    tmplis, tmplen = i, len(lis)
    if tmplen>max_lenMatch[1]:
        max_lenMatch = (tmplis,tmplen)
    lenMatch.append(len(lis))

In [568]:
def minimum(lists, endlist):
    # get the minimum and the argmin
    #idd = [idd[0] for idd in lists]
    lis = []
    for i,idd in enumerate(lists):
        # be care to take the minimum for only the "still running" pointer
        if i in np.where(endlist == 0)[0]:
            lis.append(idd[0])
    minimum = min(lis)
    
    return minimum

In [569]:
def increase_pointer(match_list, pointer, minimum, lenMatch, endlist):
    for i in range(len(pointer)):
        if  pointer[i] == lenMatch[i]-1:
            # endlist is a list of 0 and 1
            # 0 if the list is still running, 1 if finished
            endlist[i] = 1
        # increase the pointer if I score the min BUT take care of len
        target = match_list[i][0]
        if  target == minimum and pointer[i] < lenMatch[i]-1:
            pointer[i] += 1
    return pointer, endlist

In [570]:
# searching engine:

# initialize 
m = len(query_clean)
pointer = np.zeros(m, dtype = "int")
scores = []
endlist = np.zeros(m, dtype = "int")
# while loop untile escape the maximum list: I want all the score at least 1 match
end = True
while(end):
    # get the list of element pointed by the pointer
    lis = []
    for i in range(m):
        lis.append(match_list[i][pointer[i]])
    # get the minimum and the argmin
    mini = minimum(lis,endlist)
    # compute the score:
    score = cosine_sim(query_int, tfIdf[mini])
    # heappush will heap by first element
    # !!! -score because I heap sort for min value
    heapq.heappush(scores,(-score, mini))
    # increase the pointer
    pointer, endlist = increase_pointer(lis, pointer, mini, lenMatch, endlist)  
    # when finishing the list, escape
    if endlist.all() == 1:
        end = False

In [575]:
k = 10
topscore = []
topk = []
for i in range(k):
    topscoretmp, topktmp = heapq.heappop(scores)
    topscore.append(abs(topscoretmp)), topk.append(topktmp)

In [576]:
 # search the url
with open("./anime_url.txt", "r", encoding = "utf-8") as f:
    lines = f.readlines()
        
# we are searching for the anime and the url 
anime_path = []
url = []

for idx in topk:
    # we need the +1 because we start indexing from 1
    name = "/anime_"+str(idx+1)+".tsv"
    anime_path.append(name)
    url.append(lines[idx])

# creating the datafrae for view the result
animes_df = []
# folder of the anime_tsv
folder = r"./tsv_anime/"
# column I want
cols = ["animeTitle","animeDescription"]
for i,anime_tsv in enumerate(anime_path):
    df = pd.read_csv(folder+anime_tsv, sep = "\t", usecols = cols)
    # creating new column
    df["animeURL"] = url[i]
    df["animeScores"] = topscore[i]
    animes_df.append(df)
    
frame = pd.concat(animes_df, ignore_index = True)
display(frame)

Unnamed: 0,animeTitle,animeDescription,animeURL,animeScores
0,Arcana Famiglia: Capriccio - stile Arcana Fami...,"After toiling away in his lab, the alchemist J...",https://myanimelist.net/anime/15411/Arcana_Fam...,0.114412
1,Fullmetal Alchemist: Premium Collection,1. State Alchemists vs Seven Homunculi\n\nA 10...,https://myanimelist.net/anime/908/Fullmetal_Al...,0.097918
2,Birthday Wonderland,"The day before her birthday, Akane is asked to...",https://myanimelist.net/anime/38985/Birthday_W...,0.086791
3,Gosick,Kazuya Kujou is a foreign student at Saint Mar...,https://myanimelist.net/anime/8425/Gosick\n,0.081709
4,Trinity Seven Movie 1: Eternity Library to Alc...,The film's story begins when Arata inadvertent...,https://myanimelist.net/anime/33581/Trinity_Se...,0.081366
5,Senki Zesshou Symphogear GX,Following the events of Senki Zesshou Symphoge...,https://myanimelist.net/anime/21573/Senki_Zess...,0.078723
6,Fullmetal Alchemist: The Sacred Star of Milos,Chasing a runaway alchemist with strange power...,https://myanimelist.net/anime/9135/Fullmetal_A...,0.076657
7,Garo: Vanishing Line,Corruption looms over the prosperous Russell C...,https://myanimelist.net/anime/36144/Garo__Vani...,0.071497
8,Senki Zesshou Symphogear AXZ,Hibiki Tachibana has defeated many powerful en...,https://myanimelist.net/anime/32836/Senki_Zess...,0.063786
9,Garo: Honoo no Kokuin,"In the name of the king, the Valiante Kingdom ...",https://myanimelist.net/anime/23311/Garo__Hono...,0.057814


# Cheak intersection

In [493]:
index1 = []
for x in match_list:
    tmp = []
    for tupla in x:
        tmp.append((tupla[0]))
    index1.append(set(tmp))

In [495]:
indexxx = list(index1[0].intersection(*index1))

In [497]:
indexxx

[525, 167]

In [505]:
topk

[6270, 393, 525, 1490, 167, 0, 2184, 2420, 4155, 4465]