## Wiki Indexer

In [336]:
import xml.sax
import re
import string
from nltk.corpus import stopwords 
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
import datetime
import sys
import os
# Pages are flushed to a partial index file every time this many have been parsed.
files_to_index_at_a_time = 10000
# Debug switch: when true, Page.process() prints every extracted field.
print_bool = False
# In-memory partial index: term -> list of per-page posting strings.
index_dictionary = {}
STOPWORDS = set(stopwords.words('english'))
# Tokens that are URL or markup residue and should not be indexed.
URL_STOP_WORDS = {
    "http", "https", "www", "ftp", "com", "net", "org",
    "archives", "pdf", "html", "png", "txt", "redirect",
}
EXTENDED_PUNCTUATIONS = set(string.punctuation) | {'\n', '\t', " "}
INT_DIGITS = set("0123456789")

# Scratch directory for the partial index files. An existing directory is
# reused; any other failure (e.g. permissions) is reported immediately
# instead of being swallowed and surfacing later as a failed open().
os.makedirs("tempind_", exist_ok=True)
def cleanText(text):
    """Strip markup, URLs and punctuation from *text* and collapse whitespace.

    The substitutions run in a fixed order: tags, URLs, ``{| ... |}``
    blocks and ``[[file:...]]`` links are deleted, then two punctuation
    classes are replaced by spaces. The result is the remaining tokens
    joined by single spaces.

    NOTE(review): the ``&-^`` inside the last character class is a range,
    so it also blanks digits and upper-case letters. Confirm whether that
    is intended before changing it; the merge step assumes terms made of
    lower-case letters only.
    """
    text = re.sub(r'<(.*?)>', '', text)
    deletions = (
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        r'{\|(.*?)\|}',
        r'\[\[file:(.*?)\]\]',
    )
    blanked = (
        r'[.,;_()"/\'=]',
        r'[~`!@#$%&-^*+{\[}\]()":\|\\<>/?]',
    )
    for pattern in deletions:
        text = re.sub(pattern, '', text, flags=re.MULTILINE)
    for pattern in blanked:
        text = re.sub(pattern, ' ', text, flags=re.MULTILINE)
    tokens = text.split()
    return " ".join(tokens)

def isEnglish(s):
    """Return True when *s* consists solely of ASCII characters."""
    utf8_bytes = s.encode(encoding='utf-8')
    try:
        utf8_bytes.decode('ascii')
        return True
    except UnicodeDecodeError:
        return False

def get_InfoBox_Category_Text(body_text):
    """Split wiki markup into the sections that are indexed separately.

    *body_text* is scanned line by line (the section markers are matched in
    lower case, so callers pass lower-cased text). Each line is routed to
    exactly one bucket:

    * lines inside a ``{{infobox ... }}`` template -> infobox
    * ``[[category:...]]`` lines                   -> category
    * bullet lines following an "external links" heading -> links
    * a multi-line template following a "references" heading -> references
      (lines that start a cite/vcite/reflist template are not kept)
    * everything else                               -> body

    Returns a 5-tuple of cleaned strings in the order
    ``(infobox, body, category, links, references)``.

    Lines are joined with a space before cleaning so that the last word of
    one line cannot fuse with the first word of the next, and the line that
    ends a links/references section is handed back to the main loop instead
    of being dropped.
    """
    infoBox, category, body, links, references = [], [], [], [], []
    all_lines = body_text.split('\n')
    len_all_lines = len(all_lines)
    external_links_headers = (
        "== external links ==", "==external links ==",
        "== external links==", "==external links==",
    )
    references_headers = (
        "==references==", "== references==",
        "==references ==", "== references ==",
    )
    i = 0

    while i < len_all_lines:
        line = all_lines[i]
        if "{{infobox" in line:
            open_curly_brackets = 0
            while i < len_all_lines:
                line = all_lines[i]
                open_curly_brackets += line.count("{{") - line.count("}}")
                if open_curly_brackets <= 0:
                    break
                splitted_line = line.split("{{infobox")
                if len(splitted_line) >= 2 and len(splitted_line[1]) > 0:
                    # Opening line: keep only what follows the marker.
                    infoBox.append(splitted_line[1])
                else:
                    infoBox.append(line)
                i += 1
            # The line that closed the template is consumed with it.
            i += 1
        elif "[[category:" in line:
            # A single line may carry more than one category link.
            for chunk in line.split("[[category:")[1:]:
                category.append(chunk.split("]]")[0])
            i += 1
        elif any(header in line for header in external_links_headers):
            i += 1
            while i < len_all_lines and ("*[" in all_lines[i] or "* [" in all_lines[i]):
                links.append(all_lines[i])
                i += 1
            # i now rests on the first non-link line; the outer loop
            # classifies it on the next iteration.
        elif any(header in line for header in references_headers):
            open_curly_brackets = 0
            i += 1
            while i < len_all_lines:
                line = all_lines[i]
                opened = line.count("{{")
                closed = line.count("}}")
                open_curly_brackets += opened - closed
                if open_curly_brackets <= 0:
                    if opened or closed:
                        # A template line that closes (or opens and closes)
                        # belongs to the section: consume it.
                        i += 1
                    # Otherwise it is ordinary text; leave i on it so the
                    # outer loop can classify it.
                    break
                if "{{vcite" not in line and "{{cite" not in line and "{{reflist" not in line:
                    references.append(line)
                i += 1
        else:
            body.append(line)
            i += 1

    return (
        cleanText(' '.join(infoBox)),
        cleanText(' '.join(body)),
        cleanText(' '.join(category)),
        cleanText(' '.join(links)),
        cleanText(' '.join(references)),
    )
# stemmer = PorterStemmer()
# Module-wide Snowball stemmer; Page.Stemming() applies it to every field token.
stemmer = SnowballStemmer("english")
def write_to_index(filenum,index_dictionary):
    """Write one partial index file, ``tempind_/<filenum>.txt``.

    One line per term, in sorted term order, formatted ``term:postings``.
    The file is closed even if formatting or writing fails part-way.

    NOTE(review): process_line() looks the term up in the module-level
    ``index_dictionary``, not in the parameter of the same name; the
    visible caller passes that same object.
    """
    with open("tempind_/"+str(filenum)+".txt", "w") as outF:
        for key in sorted(index_dictionary.keys()):
            outF.write(key+":"+process_line(key))
            outF.write("\n")

# True while nothing is waiting to be flushed: ParseHandler clears it at every
# <page> start and sets it again right after a partial index file is written.
All_documents_done = True
class Page:
    """One article being indexed.

    The text fields start as strings, become token lists in Tokenize(), and
    are finally folded into the module-level ``index_dictionary`` by
    create_index().
    """

    # (attribute name, one-letter tag) in the order tags appear in a posting.
    _FIELDS = (
        ("title", "t"),
        ("body", "b"),
        ("info", "i"),
        ("category", "c"),
        ("links", "l"),
        ("references", "r"),
    )

    def __init__(self):
        self.title = ""
        self.info = ""
        self.category = ""
        self.links = ""
        self.references = ""
        self.body = ""
        self.pid = -1

    def set_title(self, title):
        """Store the (already cleaned) title text."""
        self.title = title

    def set_info_cat_links_ref_body(self, info, body, cat, links, ref):
        """Store the five text sections extracted from the article body."""
        self.info = info
        self.body = body
        self.category = cat
        self.links = links
        self.references = ref

    def process(self):
        """Run the full pipeline: tokenize, filter, stem, add to the index."""
        if print_bool:
            debug_rows = (
                ("Page id ", self.pid),
                ("TITLE ", self.title),
                ("INFOBOX ", self.info),
                ("CAT ", self.category),
                ("LINKS ", self.links),
                ("REFERENCES ", self.references),
                ("BODY ", self.body),
            )
            for label, value in debug_rows:
                print(label, value)
            print("")

        for step in (self.Tokenize, self.stop_word_removal, self.Stemming, self.create_index):
            step()

    def Tokenize(self):
        """Split every field on whitespace."""
        for name, _tag in self._FIELDS:
            setattr(self, name, getattr(self, name).split())

    def stop_word_removal(self):
        """Drop stop words, URL residue and tokens containing non-ASCII characters."""
        for name, _tag in self._FIELDS:
            kept = []
            for token in getattr(self, name):
                if token in STOPWORDS or token in URL_STOP_WORDS:
                    continue
                if isEnglish(token):
                    kept.append(token)
            setattr(self, name, kept)

    def Stemming(self):
        """Replace every token by its stem."""
        for name, _tag in self._FIELDS:
            setattr(self, name, [stemmer.stem(token) for token in getattr(self, name)])

    def create_index(self):
        """Append this page's posting for every term to ``index_dictionary``.

        A posting is the page id followed by one `` <tag><count>`` part per
        field in which the term occurs, e.g. ``"12 t1 b3"``.
        """
        postings = {}
        for name, tag in self._FIELDS:
            counts = {}
            for word in getattr(self, name):
                counts[word] = counts.get(word, 0) + 1
            for word, count in counts.items():
                if word not in postings:
                    postings[word] = "" + str(self.pid)
                postings[word] += " " + tag + str(count)
        for word, posting in postings.items():
            if word not in index_dictionary:
                index_dictionary[word] = []
            index_dictionary[word].append(posting)
def process_line(key):
    """Serialise all postings of *key* from ``index_dictionary`` into one string.

    Each stored posting ``"pid t1 b2"`` becomes ``"pid-t1b2"``; postings are
    joined with ``"|"``.
    """
    encoded = []
    for posting in index_dictionary[key]:
        pieces = posting.split(" ")
        encoded.append(pieces[0] + "-" + "".join(pieces[1:]))
    return "|".join(encoded)

# Single Page object reused for every article; ParseHandler increments
# page.pid at each <page> start.
page = Page()
# Not filled by the indexing code in this cell (its only append is commented out).
title_pid=[]
# Number used for the next partial index file written under tempind_/.
filenm=1

def Kwaymerge():
    """Merge the partial index files in ``tempind_/`` into the final index.

    Reads ``tempind_/1.txt`` .. ``tempind_/<n>.txt`` (each sorted by term)
    and writes, under the module-level ``index_folder_path``:

    * ``index<k>.txt``   - one ``term:postings`` line per term
    * ``offset<k>.txt``  - ``term offset`` lines, offset = position of the
      term's line inside ``index<k>.txt``
    * ``secondary_index.txt`` - ``first_term k`` for every index file

    A new index/offset pair is started once the current offset file has
    grown past 10 MB. Each partial file is closed and deleted as soon as it
    is exhausted; every handle is closed even if the merge fails.
    """
    import heapq
    import os
    max_offset_file_size = 10*1024*1024  # 10 MB
    offset_file_size = 0
    streams = {}
    heap = []
    outF = outO = outS = None
    try:
        num_files = len(os.listdir("tempind_"))
        for file_num in range(1, num_files + 1):
            fp = open('tempind_/'+str(file_num)+'.txt', 'r')
            streams[file_num] = fp
            line = fp.readline().strip()
            # Order the heap on the term itself (then on the file number) so
            # that equal terms coming from different files are always popped
            # back to back, whatever characters the terms contain.
            heap.append((line.split(":")[0], file_num, line))
        heapq.heapify(heap)

        prev = "...."  # sentinel that cannot equal a real term
        outF = open(index_folder_path+"/index1.txt", "w")
        outO = open(index_folder_path+"/offset1.txt", "w")
        outS = open(index_folder_path+"/secondary_index.txt", "w")
        First = True
        offset = 0
        i_n = 2  # number of the next index/offset pair to open

        while heap:
            term, file_number, line = heapq.heappop(heap)
            if line == '':
                # Partial file exhausted: release it before deleting it.
                streams.pop(file_number).close()
                os.remove('tempind_/'+str(file_number)+'.txt')
                continue

            next_line = streams[file_number].readline().strip()
            heapq.heappush(heap, (next_line.split(":")[0], file_number, next_line))

            if term == prev:
                # Same term as the previous line: extend its posting list.
                continuation = "|"+line.split(":")[1]
                outF.write(continuation)
                offset += len(continuation)
                continue

            if offset_file_size > max_offset_file_size:
                # Roll over to a fresh index/offset pair.
                prev = "...."
                outF.close()
                outO.close()
                outF = open(index_folder_path+"/index"+str(i_n)+".txt", "w")
                outO = open(index_folder_path+"/offset"+str(i_n)+".txt", "w")
                i_n += 1
                offset = 0
                offset_file_size = 0
                First = True

            if First:
                # First term of this index file goes into the secondary index.
                outS.write(term+" "+str(i_n-1)+"\n")
                First = False
            else:
                offset += 1
                outF.write("\n")

            prev = term
            offset_entry = term+" "+str(offset)+"\n"
            outO.write(offset_entry)
            offset_file_size += len(offset_entry)
            outF.write(line)
            offset += len(line)
    finally:
        for fp in streams.values():
            fp.close()
        for out in (outF, outO, outS):
            if out is not None:
                out.close()
# The output folder must be known, and must exist, before the title files are
# opened below (the same value is assigned again after parsing).
index_folder_path = "index_folder"
os.makedirs(index_folder_path, exist_ok=True)
# Number of the title/offset file pair that the next rollover will open.
title_number=0
outF_title = open(index_folder_path+"/title"+str(title_number)+".txt", "w")
outF_offset = open(index_folder_path+"/offset_title"+str(title_number)+".txt", "w")
# Byte position of the next title inside the current title file.
offset_title=0
class ParseHandler( xml.sax.ContentHandler ):
    """SAX handler that feeds every ``<page>`` of the dump into the indexer.

    Title and text are accumulated across ``characters`` calls because a SAX
    parser may deliver a single text node in several chunks. ``self.tag`` is
    cleared when an element ends so that whitespace between elements is not
    attributed to the element that has just closed.
    """
    def __init__(self):
        super().__init__()
        self.tag = ""       # element currently open, "" between elements
        self.title = ""     # title text collected so far
        self.body = ""      # <text> content collected so far
        self.page = False   # True while inside a <page> element

    def startElement(self, tag, attributes):
        global All_documents_done
        self.tag = tag
        if tag == "page":
            self.page = True
            All_documents_done = False
            page.pid+=1
        elif tag == "title":
            # Start a fresh title; characters() appends to it.
            self.title = ""

    def endElement(self, tag):
        global filenm,All_documents_done,outF_title,title_number,offset_title,outF_offset
        if tag=="page" and (page.pid+1)%files_to_index_at_a_time==0:
            # A full batch has been parsed: flush it to a partial index file.
            print(str(page.pid+1)+" articles processed")
            write_to_index(filenm,index_dictionary)
            index_dictionary.clear()
            filenm=filenm+1
            All_documents_done = True
        if tag == "page":
            self.page = False
        elif tag == "text":
            infobox , body , cat , links , ref = get_InfoBox_Category_Text(self.body.lower())
            page.set_info_cat_links_ref_body(infobox,body,cat,links,ref)
            page.process()
            self.body = ""
        elif tag == "title":
            if (page.pid)%files_to_index_at_a_time==0:
                # First page of a batch: switch to a new title/offset file pair.
                outF_title.close()
                outF_offset.close()
                outF_offset = open(index_folder_path+"/offset_title"+str(title_number)+".txt", "w")
                # Offsets below are counted in UTF-8 bytes with one byte per
                # newline, so the file is written with exactly that encoding.
                outF_title = open(index_folder_path+"/title"+str(title_number)+".txt", "w",
                                  encoding="utf-8", newline="\n")
                title_number+=1
                offset_title=0
            outF_offset.write(str(offset_title))
            outF_offset.write("\n")
            outF_title.write(self.title)
            outF_title.write("\n")
            offset_title+=len(self.title.encode('utf-8'))+1

            page.set_title(cleanText(self.title.lower()))
        # No element is open any more.
        self.tag = ""

    def characters(self, content):
        if not self.page:
            return
        if self.tag == "title":
            self.title += content
        elif self.tag == "text":
            self.body += content
# --- Parse the dump ---------------------------------------------------------
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 0)
Handler = ParseHandler()
parser.setContentHandler( Handler )
start = datetime.datetime.now()
parser.parse("input_data/input.xml")
# Flush the last, partially filled batch if there is one.
if not All_documents_done:
    write_to_index(filenm,index_dictionary)
    index_dictionary.clear()
if not outF_title.closed:
    outF_title.close()
    outF_offset.close()
index_folder_path = "index_folder"
if index_folder_path[len(index_folder_path)-1]=="/":
    index_folder_path = index_folder_path[:-1]

# --- Merge the partial index files -------------------------------------------
print()
print("K - way Merging Start")
print()
Kwaymerge()

print()
print("K - way Merging End")
print()
end = datetime.datetime.now()
# total_seconds() keeps whole days; timedelta.seconds alone would drop them.
total_secs = int((end-start).total_seconds())
hr, rm = divmod(total_secs, 60*60)
mn, secs = divmod(rm, 60)
print("Indexing Time : ",hr," hrs ",mn," mns",secs," secs")
print("Total Articles : "+str(page.pid+1))

10000 articles processed

K - way Merging Start


K - way Merging End

Indexing Time :  0  hrs  14  mns 50  secs
Total Articles : 19819


In [201]:
# Re-print the article count from the `page` object left by the indexing cell.
print("Total Articles : "+str(page.pid+1))

Total Articles : 19819


In [303]:
# Scratch check: character length of a title that contains a non-ASCII letter.
data = 'Jagiellonian University in Kraków'
len(data)
# data = data.encode('utf-8')
# data.decode('utf-8')

33

In [107]:
from math import log
import sys
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords 
import re
STOPWORDS = set(stopwords.words('english'))
# Tokens that are URL or markup residue and are dropped from queries.
URL_STOP_WORDS = {
    "http", "https", "www", "ftp", "com", "net", "org",
    "archives", "pdf", "html", "png", "txt", "redirect",
}
# Stemmer applied to query terms.
Pstemmer = SnowballStemmer("english")
def isEnglish(s):
    """Tell whether *s* contains nothing but ASCII characters."""
    encoded = s.encode(encoding='utf-8')
    try:
        encoded.decode('ascii')
    except UnicodeDecodeError:
        return False
    return True
class Query:
    """A free-text query that is normalised in place, step by step."""

    def __init__(self, query):
        self.query = query

    def Tokenize(self):
        """Turn the query string into a list of whitespace-separated tokens."""
        self.query = self.query.split()

    def lower(self):
        """Lower-case the query string."""
        self.query = self.query.lower()

    def stop_word_removal(self):
        """Drop stop words, URL residue and tokens containing non-ASCII characters."""
        kept = []
        for token in self.query:
            if token in STOPWORDS or token in URL_STOP_WORDS:
                continue
            if isEnglish(token):
                kept.append(token)
        self.query = kept

    def Stemming(self):
        """Replace every token by its stem."""
        self.query = [Pstemmer.stem(token) for token in self.query]

    def process(self):
        """Apply lower-casing, tokenising, filtering and stemming, in that order."""
        for step in (self.lower, self.Tokenize, self.stop_word_removal, self.Stemming):
            step()

    def value(self):
        """Return the query in its current form."""
        return self.query
def title_for_docs(doc_ids,title_pid,k):
    """Look up the titles of the first *k* ids in *doc_ids*.

    Ids are converted with ``int`` and used as positions in *title_pid*.
    When *k* is not positive, every id is looked up.
    """
    picked = []
    for doc_id in doc_ids:
        picked.append(title_pid[int(doc_id)])
        if len(picked) == k:
            break
    return picked
def Search(query):
    """Answer a plain query from the module-level ``search_dictionary``.

    The query is normalised with Query; for each term found in the
    dictionary the set of document ids in its posting list is collected,
    and the sets are combined by sorted_results().
    """
    normalised = Query(query)
    normalised.process()
    all_word_docs = []
    for word in normalised.value():
        if word not in search_dictionary:
            continue
        docs = {entry.split("-")[0] for entry in search_dictionary[word].split("|")}
        if docs:
            all_word_docs.append(docs)
    return sorted_results(all_word_docs)
def sorted_results(all_word_docs):
    """Order documents: those matching every term first, then the rest.

    *all_word_docs* is a list of sets of document ids, one set per term.
    """
    if not all_word_docs:
        return []
    in_all = set.intersection(*all_word_docs)
    in_any = set.union(*all_word_docs)
    only_some = set.difference(in_any, in_all)
    return list(in_all) + list(only_some)
def load_index_dictionary(path_to_index_folder):
    """Load ``<folder>/indexfile.txt`` into a ``term -> postings`` dict.

    Every line has the form ``term:postings``. Only the line terminator is
    removed, so a final line without a trailing newline keeps its last
    character. Blank lines and lines without ``:`` are skipped.
    """
    dictionary_search = {}
    with open(path_to_index_folder+"/indexfile.txt") as fp:
        for line in fp:  # iterating the handle streams the file line by line
            word, separator, rest = line.rstrip("\n").partition(":")
            if not separator:
                continue
            dictionary_search[word] = rest
    return dictionary_search
def load_titles(path_to_index_folder):
    """Return the lines of ``<folder>/title.txt`` as a list of titles.

    Position in the list is what title_for_docs() uses as the document id,
    so every line (including an empty one) is kept. Only the line terminator
    is removed, so a final line without a trailing newline is not truncated.
    """
    with open(path_to_index_folder+"/title.txt") as fp:
        return [line.rstrip("\n") for line in fp]

def read_file(testfile):
    """Return the lines of *testfile*, line terminators included."""
    with open(testfile, 'r') as handle:
        return list(handle)


def write_file(outputs, path_to_output):
    """Write search results to *path_to_output*.

    *outputs* is a list with one entry per query; each entry is the list of
    titles found for that query. Titles are stripped and written one per
    line, and each query's block is followed by an empty line.
    """
    with open(path_to_output, 'w') as sink:
        for titles in outputs:
            sink.writelines(title.strip() + '\n' for title in titles)
            sink.write('\n')


def search_help(path_to_index, queries):
    """Return, for every query, the titles of up to ten matching documents.

    Queries containing ``:`` go to Search2 (field search); all others go to
    Search.

    NOTE(review): ``search_dic`` is read as a global here but is not defined
    anywhere in the visible source - confirm where it is set.
    """
    title_pid = load_titles(path_to_index)

    def matching_docs(query):
        if ":" in query:
            return Search2(query,search_dic)
        return Search(query)

    return [title_for_docs(matching_docs(query),title_pid,10) for query in queries]

def get_field_list(query):
    """Group the words of a field query by the field they apply to.

    A word of the form ``<field>:<word>`` switches the current field; the
    words that follow stay in that field until the next switch. Accepted
    field names are ``title``, ``body``, ``category``, ``infobox`` and
    ``ref`` plus the one-letter index tags ``t b c i r l``. Words seen
    before any field prefix are collected under the key ``""``.

    Returns a dict mapping the one-letter tag to its list of words, e.g.
    ``"title:gandhi ref:salt march"`` -> ``{"t": ["gandhi"], "r": ["salt", "march"]}``.
    """
    field_tags = {
        "title": "t", "body": "b", "category": "c", "infobox": "i", "ref": "r",
        "t": "t", "b": "b", "c": "c", "i": "i", "r": "r", "l": "l",
    }
    dictionary_query = {}
    field = ""
    # split() without arguments also discards the trailing newline that
    # queries read from a file still carry.
    for word in query.split():
        prefix, colon, remainder = word.partition(":")
        if colon and prefix in field_tags:
            field = field_tags[prefix]
            word = remainder
            # Register the field even when no word follows the colon.
            dictionary_query.setdefault(field, [])
        if word:
            dictionary_query.setdefault(field, []).append(word)
    return dictionary_query
def page_number_for_field(search_dic,word,field_type):
    """Return the ids of documents whose posting for *word* contains *field_type*.

    Postings are ``|``-separated entries of the form ``docid-<fields>``; an
    entry matches when *field_type* occurs anywhere in it. An unknown word
    yields an empty list.
    """
    if word not in search_dic:
        return []
    return [
        entry.split("-")[0]
        for entry in search_dic[word].split("|")
        if field_type in entry
    ]
def Search2(query,search_dic):
    """Answer a field query: union of the documents matching any field term.

    The query is lower-cased and split by get_field_list(); every word is
    stemmed and looked up for its field in *search_dic*.
    """
    field_dict = get_field_list(query.lower())
    field_results = []
    for field, words in field_dict.items():
        hits = []
        for word in words:
            hits = hits + page_number_for_field(search_dic,Pstemmer.stem(word),field)
        field_results.append(set(hits))
    return set.union(*field_results)

def main():
    """Run every query of a test file and write the resulting titles.

    Reads three values from the ``sys_argv`` list: index folder (one
    trailing ``/`` is removed), query file and output file.
    """
    index_dir = sys_argv[1]
    if index_dir[-1] == "/":
        index_dir = index_dir[:-1]
    query_file = sys_argv[2]
    output_file = sys_argv[3]
    results = search_help(index_dir, read_file(query_file))
    write_file(results, output_file)

# sys_argv=["ss","index_folder","sample_queries.txt","resultslog.txt"]
# main()

In [64]:
import bisect
def lower_bound(list_,word):
    """Return the index of *word* in sorted *list_*, or of the last smaller item.

    The result is ``-1`` when every item is greater than *word*.
    """
    position = bisect.bisect_left(list_,word)
    exact_hit = position < len(list_) and list_[position] == word
    return position if exact_hit else position - 1
def BinarySearch(list_,word):
    """Return the index of *word* in sorted *list_*, or ``-1`` if it is absent."""
    position = bisect.bisect_left(list_,word)
    if position == len(list_) or list_[position] != word:
        return -1
    return position

In [342]:
from math import log
import sys
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords 
import re
STOPWORDS = set(stopwords.words('english'))
# URL and markup residue that is removed from queries before lookup.
URL_STOP_WORDS = {
    "http", "https", "www", "ftp", "com", "net", "org",
    "archives", "pdf", "html", "png", "txt", "redirect",
}
# Stemmer applied to query terms.
Pstemmer = SnowballStemmer("english")
def isEnglish(s):
    """Return False as soon as *s* holds a character outside ASCII, else True."""
    as_bytes = s.encode(encoding='utf-8')
    try:
        as_bytes.decode('ascii')
    except UnicodeDecodeError:
        is_ascii = False
    else:
        is_ascii = True
    return is_ascii
class Query:
    """Holds a query and rewrites it in place through the normalisation steps."""

    def __init__(self, query):
        self.query = query

    def Tokenize(self):
        """Split the query string on whitespace."""
        tokens = self.query.split()
        self.query = tokens

    def lower(self):
        """Lower-case the query string."""
        lowered = self.query.lower()
        self.query = lowered

    def stop_word_removal(self):
        """Keep only ASCII tokens that are neither stop words nor URL residue."""
        def keep(token):
            return token not in STOPWORDS and token not in URL_STOP_WORDS and isEnglish(token)
        self.query = [token for token in self.query if keep(token)]

    def Stemming(self):
        """Stem every token."""
        stems = []
        for token in self.query:
            stems.append(Pstemmer.stem(token))
        self.query = stems

    def process(self):
        """Lower-case, tokenise, filter and stem, in that order."""
        self.lower()
        self.Tokenize()
        self.stop_word_removal()
        self.Stemming()

    def value(self):
        """Return the query in its current form."""
        return self.query
    
# Lines of secondary_index.txt, kept in memory for get_posting_list().
secondary_list=[]
with open(index_folder_path+'/secondary_index.txt') as f:
    secondary_list= f.read().splitlines() 
    
def get_posting_list(word):
    """Return the posting string stored for *word*, or ``""`` if it is not indexed.

    Lookup runs in three steps: the secondary index selects the index file
    whose first term is the greatest one not after *word*; that file's
    offset file gives the position of the term's line; the line is then read
    from the index file.
    """
    global secondary_list
    posting_list = ""
    # Compare against the term only: each secondary-index line also carries a
    # file number, which would otherwise keep the first term of every index
    # file from ever matching.
    first_terms = [entry.split(" ")[0] for entry in secondary_list]
    offset_file_num = lower_bound(first_terms,word)
    if offset_file_num == -1:
        return posting_list

    position = None
    with open(index_folder_path+"/offset"+str(offset_file_num+1)+".txt") as fp:
        for line in fp:
            parts = line.strip().split(" ")
            if len(parts) == 2 and parts[0] == word:
                position = int(parts[1])
                break  # first matching offset line wins
    if position is None:
        return posting_list

    with open(index_folder_path+"/index"+str(offset_file_num+1)+".txt") as fp:
        fp.seek(position)
        posting_list = fp.readline().strip().split(":")[1]
    return posting_list
def process_posting_field_tf(posting,field):
    """Return ``{doc_id: term frequency}`` for one field of a posting string.

    *posting* is a ``|``-separated list of ``docid-<tag><count>...`` entries,
    e.g. ``"1-t1b2|2-b3"``. Documents in which the term does not occur in
    *field* are left out. An empty *field* (words given before any field
    prefix) counts the occurrences in all fields together.
    """
    field = str(field)
    if field:
        # Escape the tag and require at least one digit after it.
        count_pattern = re.compile(re.escape(field) + r'(\d+)')
    else:
        count_pattern = re.compile(r'[a-z](\d+)')
    dict_={}
    for part in posting.split("|"):
        doc_text, _, fields_text = part.partition("-")
        counts = count_pattern.findall(fields_text)
        if not counts:
            continue
        if field:
            dict_[int(doc_text)] = int(counts[0])
        else:
            dict_[int(doc_text)] = sum(int(count) for count in counts)
    return dict_
def process_posting_idf(posting):
    """Return ``1 + ln(N / df)`` for a posting string.

    ``N`` is ``page.pid + 1`` and ``df`` is the number of ``|``-separated
    entries in *posting*.
    """
    document_frequency = posting.count("|") + 1
    total_documents = float(page.pid+1)
    return 1.0 + log(total_documents / document_frequency)
def process_posting_normal_tf(posting):
    """Return ``{doc_id: total occurrences}`` summed over all fields.

    *posting* is a ``|``-separated list of ``docid-<tag><count>...`` entries.
    """
    totals = {}
    for entry in posting.split("|"):
        pieces = entry.split("-")
        doc = int(pieces[0])
        field_counts = re.findall(r'[a-z]\d*',pieces[1])
        totals[doc] = sum(int(field_count[1:]) for field_count in field_counts)
    return totals
def calculate_tf_idf_of_docs_normal(query):
    """Score documents for a plain query.

    The query is normalised with Query. Each term that has a posting list
    adds ``log(1 + tf) * idf`` to every document it occurs in. Returns
    ``{doc_id: score}``.
    """
    Q = Query(query)
    Q.process()
    docs = {}
    for query_part in Q.value():
        posting = get_posting_list(query_part)
        if not posting:
            continue
        term_frequencies = process_posting_normal_tf(posting)
        idf = process_posting_idf(posting)
        for doc, tf in term_frequencies.items():
            # Explicit default instead of a bare except around "+=", so that
            # unrelated errors are no longer swallowed.
            docs[doc] = docs.get(doc, 0) + log(1+tf)*idf
    return docs
def calculate_tf_idf_of_docs_field(query):
    """Score documents for a field query such as ``title:gandhi body:march``.

    The query is lower-cased and grouped by get_field_list(). Every word is
    stemmed, and each document in which it occurs in the requested field
    receives ``log(1 + tf) * idf``, where ``tf`` is the count in that field.
    Returns ``{doc_id: score}``.
    """
    field_dict = get_field_list(query.lower())
    docs = {}
    for field, words in field_dict.items():
        for word in words:
            word = Pstemmer.stem(word)
            posting = get_posting_list(word)
            if not posting:
                continue
            field_frequencies = process_posting_field_tf(posting,field)
            idf = process_posting_idf(posting)
            for doc, tf in field_frequencies.items():
                # Use this document's own field count (the earlier expression
                # referenced a name that does not exist in this function).
                docs[doc] = docs.get(doc, 0) + log(1+tf)*idf
    return docs

def get_field_list(query):
    """Group the words of a field query by the field they apply to.

    A word of the form ``<field>:<word>`` switches the current field; the
    words that follow stay in that field until the next switch. Accepted
    field names are ``title``, ``body``, ``category``, ``infobox`` and
    ``ref`` plus the one-letter index tags ``t b c i r l``. Words seen
    before any field prefix are collected under the key ``""``.

    Returns a dict mapping the one-letter tag to its list of words, e.g.
    ``"title:gandhi ref:salt march"`` -> ``{"t": ["gandhi"], "r": ["salt", "march"]}``.
    """
    field_tags = {
        "title": "t", "body": "b", "category": "c", "infobox": "i", "ref": "r",
        "t": "t", "b": "b", "c": "c", "i": "i", "r": "r", "l": "l",
    }
    dictionary_query = {}
    field = ""
    # split() without arguments also discards stray newlines and empty words.
    for word in query.split():
        prefix, colon, remainder = word.partition(":")
        if colon and prefix in field_tags:
            field = field_tags[prefix]
            word = remainder
            # Register the field even when no word follows the colon.
            dictionary_query.setdefault(field, [])
        if word:
            dictionary_query.setdefault(field, []).append(word)
    return dictionary_query

def get_title_of_doc(doc_id):
    """Return the title stored for document *doc_id*.

    Titles are spread over ``title<k>.txt`` files holding
    ``files_to_index_at_a_time`` titles each; ``offset_title<k>.txt`` lists
    the starting position of every title in the matching file.
    """
    file_num, line_num = divmod(int(doc_id), files_to_index_at_a_time)
    with open(index_folder_path+"/offset_title"+str(file_num)+".txt") as f:
        mylist = f.read().splitlines()
    # The stored offsets are UTF-8 byte counts, so seek on a binary handle
    # and decode afterwards; seeking a text-mode file to an arbitrary number
    # is not supported behaviour.
    with open(index_folder_path+"/title"+str(file_num)+".txt",'rb') as title_file:
        title_file.seek(int(mylist[line_num]))
        return title_file.readline().decode('utf-8').strip()

def get_titles(doc_ids):
    """Return the titles of the first ten documents in *doc_ids*.

    Lookups stop once ten titles have been collected (the earlier
    ``> 10`` test let an eleventh through).
    """
    max_results = 10
    titles=[]
    for doc_id in doc_ids:
        if len(titles) >= max_results:
            break
        titles.append(get_title_of_doc(doc_id))
    return titles
def search_helper(query):
    """Return the titles of the best-scoring documents for *query*.

    Queries containing ``:`` are scored per field, all others over the
    whole document; documents are ranked by descending score.
    """
    if ":" in query:
        scores = calculate_tf_idf_of_docs_field(query)
    else:
        scores = calculate_tf_idf_of_docs_normal(query)
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return get_titles([doc for doc, _score in ranked])
    
# Smoke test: ranked titles for a single-term query.
search_helper("yunost")

['Avicenna', 'Bandy']

In [None]:
2738 John Calvin
2750 Jagiellonian University in Kraków

In [340]:
# Spot check of the title lookup for one document id.
get_title_of_doc(420)

0 420 6787


'Avicenna'

In [337]:
# Scratch check: read the first line of a local test file and print its length.
title_file = open("/home/danish/Desktop/testfile.txt",'r')
title_file.seek(0)
result  = title_file.readline()
title_file.close()
print(len(result))

34


In [326]:
# Sample title containing a non-ASCII letter.
s = 'Jagiellonian University in Kraków'

In [327]:
# Rebind s to its UTF-8 encoded bytes.
s = s.encode('utf-8')
# s.decode('utf)

In [331]:
# Decode the bytes back into a str and take its length.
b = s.decode('utf-8')
len(b)

33