# Self Study 3

This self study concludes our first "miniproject" on crawling and search. The tasks for this self study are:
- modify/extend the inverted index you constructed in the previous self study so that every posting also stores its term frequency (if your documents are just the titles of the web pages, you will see very few term frequencies larger than 1, but do not worry about that).
- calculate the idf values for all terms, and also include them in your index (cf. slide 3.20 for a schematic view)
- implement ranked retrieval as described on slides 3.19 and 3.20 for the ntc.bnc similarity metric 

In [8]:
import requests
from bs4 import BeautifulSoup
from crawler import crawl

In [9]:
import logging

# Configure the root logger: only records at ERROR level or above are
# emitted, each prefixed with a timestamp, the logger name and the level.
logging.basicConfig(
    format="%(asctime)s (%(name)s) %(levelname)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.ERROR,
)

In [10]:
# Crawl starting from the seed URL; only the first of the three returned
# values (the visited pages) is used in this notebook.
visited, _, _ = crawl(["https://notes.bagerbach.com"], timeout=2, host_blacklist=[])

# Preview the first five (url, title) entries.
list(visited.items())[:5]



SSLError: HTTPSConnectionPool(host='rch.ac.ir', port=443): Max retries exceeded with url: /article/Details/7731 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:997)')))

In [None]:
import nltk

# nltk.word_tokenize relies on the Punkt sentence tokenizer models.
# Older NLTK releases load the pickled 'punkt' resource, while recent
# releases (3.9+) load 'punkt_tab' instead and raise LookupError if only
# 'punkt' is present. Fetching both keeps the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
from typing import Dict, List
from nltk.stem.porter import PorterStemmer

# Positional index: stemmed token -> list of (position, url) postings.
vocabulary: Dict[str, List] = {}
stemmer = PorterStemmer()

for url, title in visited.items():
    tokens = nltk.word_tokenize(title)
    for pos, token in enumerate(tokens):
        stemmed = stemmer.stem(token)
        # setdefault creates the postings list the first time a stem is seen.
        vocabulary.setdefault(stemmed, []).append((pos, url))

# Despite its name, `vocabulary` is not just a vocabulary: its keys form the
# vocabulary, while its values record where each term occurs in the titles.
# In other words, it is an inverted index.
vocabulary

In [12]:
# Calculate term frequency
# NOTE: `inverted_index` is the same object as `vocabulary`; the entries are
# rewritten in place from a bare postings list into a dict.
inverted_index = vocabulary

for key, entry in inverted_index.items():
    # Notebook cells get re-run. If this cell already ran, `entry` is the
    # wrapped dict rather than the bare postings list; unwrap it so that we
    # never nest {"postings": {"postings": ...}} and report a bogus
    # frequency of 2 (the number of keys in the wrapper).
    postings = entry["postings"] if isinstance(entry, dict) else entry
    inverted_index[key] = {
        "postings": postings,
        # Total number of positional postings, i.e. how many times the term
        # occurs across all documents.
        "frequency": len(postings),
    }

inverted_index

{'home': {'postings': [(0, 'https://notes.bagerbach.com'),
   (2, 'https://www.wikiwand.com/')],
  'frequency': 2},
 '|': {'postings': [(1, 'https://notes.bagerbach.com'),
   (1, 'https://bagerbach.com/books'),
   (3, 'https://notes.bagerbach.com/the-barbell-strategy'),
   (9, 'https://bagerbach.com/books/the-go-giver'),
   (18, 'https://bagerbach.com/books/why-we-get-fat-and-what-to-do-about-it'),
   (8, 'https://notes.bagerbach.com/pp-6-declaring-types-and-type-classes'),
   (12, 'https://bagerbach.com/books/a-brief-history-of-time'),
   (2, 'https://notes.bagerbach.com/innovation-multiplier'),
   (7,
    'https://notes.bagerbach.com/increase-your-leverage-to-become-more-productive'),
   (13, 'https://bagerbach.com/books/the-richest-man-in-babylon'),
   (2, 'https://notes.bagerbach.com/zero-sum'),
   (3, 'https://notes.bagerbach.com/gall-s-law'),
   (8,
    'https://notes.bagerbach.com/minimize-time-getting-into-position-just-go'),
   (10, 'https://bagerbach.com/books/the-personal-mb

In [15]:
# Calculate the IDF of every term in the vocabulary.
import math

# Collection size: the number of crawled documents.
N = len(visited)

for key, entry in inverted_index.items():
    # idf is defined over the *document* frequency: the number of distinct
    # documents that contain the term. The "frequency" field counts every
    # positional posting, so a term repeated inside one title would be
    # counted several times (and could even exceed N, giving a negative idf).
    document_frequency = len({url for _, url in entry["postings"]})
    # Natural logarithm; choosing another base only rescales every idf value
    # by the same constant factor.
    entry["idf"] = math.log(N / document_frequency)

inverted_index

{'home': {'postings': [(0, 'https://notes.bagerbach.com'),
   (2, 'https://www.wikiwand.com/')],
  'frequency': 2,
  'idf': 6.448097941790546},
 '|': {'postings': [(1, 'https://notes.bagerbach.com'),
   (1, 'https://bagerbach.com/books'),
   (3, 'https://notes.bagerbach.com/the-barbell-strategy'),
   (9, 'https://bagerbach.com/books/the-go-giver'),
   (18, 'https://bagerbach.com/books/why-we-get-fat-and-what-to-do-about-it'),
   (8, 'https://notes.bagerbach.com/pp-6-declaring-types-and-type-classes'),
   (12, 'https://bagerbach.com/books/a-brief-history-of-time'),
   (2, 'https://notes.bagerbach.com/innovation-multiplier'),
   (7,
    'https://notes.bagerbach.com/increase-your-leverage-to-become-more-productive'),
   (13, 'https://bagerbach.com/books/the-richest-man-in-babylon'),
   (2, 'https://notes.bagerbach.com/zero-sum'),
   (3, 'https://notes.bagerbach.com/gall-s-law'),
   (8,
    'https://notes.bagerbach.com/minimize-time-getting-into-position-just-go'),
   (10, 'https://bagerba