In [23]:
import os
import re
import builtins
from bs4 import BeautifulSoup
from string import punctuation
from collections import Counter
from nltk.corpus import stopwords
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import wordpunct_tokenize

In [24]:
# Function to tokenize and process the text
def process_text(text):
    tokens = wordpunct_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stopwords.words("english") and token not in punctuation]
    return tokens

# Question 1 

In [37]:
que1a = ["cranfield0727", "cranfield0877", "cranfield0895"]
que1b = ["semivertex", "replaced", "entrance"]
que1c = ["desired", "eliminated", "comprised"]

# que1a = ['cranfield0633', 'cranfield1346', 'cranfield0092']
# que1b = ['torsion', 'meridional' , 'reducing']
# que1c = ['tolerance', 'convergent', 'constraint']

In [26]:
# Function to tokenize and lemmatize the text
def tokenize_and_lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    soup = BeautifulSoup(text, 'html.parser')
    title = soup.find('title').get_text() if soup.find('title') else ''
    text = soup.find('text').get_text() if soup.find('text') else ''

    # Tokenize title and text
    tokens = process_text(title) + process_text(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

# Function to process each document and update the index
def process_document(filename, doc_id, index):
    with builtins.open(f'Cranfield/{filename}', 'r') as file:
        content = file.read()
        tokens = tokenize_and_lemmatize(content)

        # Update document-level information
        max_tf = max(tokens.count(lemma) for lemma in set(tokens))
        doclen = len(tokens)
        unique_terms = len(set(tokens))
        
        if filename in que1a:
            print(filename)
            print(f"doclen: The total number of lemmas in the document: {doclen}")
            print(f"max_tf: The frequency of the most frequent lemma in the document: {max_tf}")
            print(f"unique_terms: The number of unique lemmas in the document: {unique_terms}")
            print("="*50)

        # Update lemma-level information
        for lemma in set(tokens):
            index[lemma]['df'] += 1
            if doc_id not in index[lemma]['documents']:
                index[lemma]['documents'].append(doc_id)
            index[lemma]['posting_list'].append({
                'doc_id': doc_id,
                'tf': tokens.count(lemma),
                'max_tf': max_tf,
                'doclen': doclen,
                'unique_terms': unique_terms
            })
            
# Initialize the index
index = defaultdict(lambda: {'df': 0, 'documents': [], 'posting_list': []})
print("Question 1A")
print('-'*50)


# Path to the Cranfield collection
cranfield_path = 'Cranfield/'

# Iterate over each document in the Cranfield collection
for filename in os.listdir(cranfield_path):
    if filename.startswith('cranfield'):
        doc_id = int(re.search(r'\d+', filename).group())  # Extract document ID from the filename
        process_document(filename, doc_id, index)

Question 1A
--------------------------------------------------
cranfield0727
doclen: The total number of lemmas in the document: 120
max_tf: The frequency of the most frequent lemma in the document: 6
unique_terms: The number of unique lemmas in the document: 78
cranfield0877
doclen: The total number of lemmas in the document: 38
max_tf: The frequency of the most frequent lemma in the document: 4
unique_terms: The number of unique lemmas in the document: 25
cranfield0895
doclen: The total number of lemmas in the document: 60
max_tf: The frequency of the most frequent lemma in the document: 4
unique_terms: The number of unique lemmas in the document: 41


In [27]:
print("To backtrack Que1B")
print('-'*50)
for lemma in que1b:
    print(lemma)
    print(index[lemma])
    print("="*50)

To backtrack Que1B
--------------------------------------------------
semivertex
{'df': 8, 'documents': [63, 522, 814, 936, 937, 947, 1001, 1231], 'posting_list': [{'doc_id': 63, 'tf': 1, 'max_tf': 4, 'doclen': 94, 'unique_terms': 65}, {'doc_id': 522, 'tf': 1, 'max_tf': 6, 'doclen': 229, 'unique_terms': 125}, {'doc_id': 814, 'tf': 1, 'max_tf': 5, 'doclen': 123, 'unique_terms': 82}, {'doc_id': 936, 'tf': 3, 'max_tf': 8, 'doclen': 90, 'unique_terms': 62}, {'doc_id': 937, 'tf': 1, 'max_tf': 4, 'doclen': 68, 'unique_terms': 51}, {'doc_id': 947, 'tf': 2, 'max_tf': 11, 'doclen': 198, 'unique_terms': 94}, {'doc_id': 1001, 'tf': 2, 'max_tf': 4, 'doclen': 79, 'unique_terms': 53}, {'doc_id': 1231, 'tf': 1, 'max_tf': 5, 'doclen': 102, 'unique_terms': 69}]}
replaced
{'df': 17, 'documents': [25, 101, 115, 180, 227, 332, 435, 493, 601, 624, 683, 765, 808, 842, 962, 1375, 1382], 'posting_list': [{'doc_id': 25, 'tf': 1, 'max_tf': 8, 'doclen': 223, 'unique_terms': 145}, {'doc_id': 101, 'tf': 1, 'max_tf

In [28]:
print("To backtrack Que1C")
print('-'*50)
for lemma in que1c:
    print(lemma)
    print(index[lemma])
    print("="*50)

To backtrack Que1C
--------------------------------------------------
desired
{'df': 8, 'documents': [17, 244, 441, 659, 679, 738, 1224, 1377], 'posting_list': [{'doc_id': 17, 'tf': 1, 'max_tf': 4, 'doclen': 79, 'unique_terms': 60}, {'doc_id': 244, 'tf': 1, 'max_tf': 19, 'doclen': 261, 'unique_terms': 184}, {'doc_id': 441, 'tf': 1, 'max_tf': 7, 'doclen': 161, 'unique_terms': 108}, {'doc_id': 659, 'tf': 1, 'max_tf': 6, 'doclen': 76, 'unique_terms': 54}, {'doc_id': 679, 'tf': 1, 'max_tf': 5, 'doclen': 97, 'unique_terms': 69}, {'doc_id': 738, 'tf': 1, 'max_tf': 7, 'doclen': 84, 'unique_terms': 59}, {'doc_id': 1224, 'tf': 1, 'max_tf': 4, 'doclen': 144, 'unique_terms': 112}, {'doc_id': 1377, 'tf': 1, 'max_tf': 3, 'doclen': 60, 'unique_terms': 52}]}
eliminated
{'df': 9, 'documents': [342, 364, 595, 643, 849, 873, 933, 1196, 1345], 'posting_list': [{'doc_id': 342, 'tf': 1, 'max_tf': 7, 'doclen': 129, 'unique_terms': 97}, {'doc_id': 364, 'tf': 1, 'max_tf': 8, 'doclen': 214, 'unique_terms': 130

In [29]:
print("Question 1B")
print('-'*50)

for term in que1b:
    print(f"lemma: {term}")
    print(f"df: The number of documents that the lemma occurs in: {index[term]['df']}")
    print(f"The first document ID in the posting list of the lemma: {index[term]['documents'][0]}")
    print(f"The first document's tf in the posting list of the lemma: {index[term]['posting_list'][0]['tf']}")
    print(f"The last document ID in the posting list of the lemma: {index[term]['documents'][-1]}")
    print(f"The last document's tf in the posting list of the lemma: {index[term]['posting_list'][-1]['tf']}")
    print('='*50)

Question 1B
--------------------------------------------------
lemma: semivertex
df: The number of documents that the lemma occurs in: 8
The first document ID in the posting list of the lemma: 63
The first document's tf in the posting list of the lemma: 1
The last document ID in the posting list of the lemma: 1231
The last document's tf in the posting list of the lemma: 1
lemma: replaced
df: The number of documents that the lemma occurs in: 17
The first document ID in the posting list of the lemma: 25
The first document's tf in the posting list of the lemma: 1
The last document ID in the posting list of the lemma: 1382
The last document's tf in the posting list of the lemma: 2
lemma: entrance
df: The number of documents that the lemma occurs in: 4
The first document ID in the posting list of the lemma: 340
The first document's tf in the posting list of the lemma: 2
The last document ID in the posting list of the lemma: 1139
The last document's tf in the posting list of the lemma: 2


In [30]:
# Function to convert posting list to compressed format
def compress_posting_list(posting_list):
    compressed_list = [posting_list[0]]  # First element remains unchanged
    for i in range(1, len(posting_list)):
        gap = posting_list[i] - posting_list[i-1]
        compressed_list.append(gap)
    return compressed_list

# Convert all posting lists in the index to compressed format
for lemma in index:
    index[lemma]['documents'] = compress_posting_list(index[lemma]['documents'])

In [31]:
print("To backtrack Que1C")
print('-'*50)
for lemma in que1c:
    print(lemma)
    print(index[lemma])
    print("="*50)

To backtrack Que1C
--------------------------------------------------
desired
{'df': 8, 'documents': [17, 227, 197, 218, 20, 59, 486, 153], 'posting_list': [{'doc_id': 17, 'tf': 1, 'max_tf': 4, 'doclen': 79, 'unique_terms': 60}, {'doc_id': 244, 'tf': 1, 'max_tf': 19, 'doclen': 261, 'unique_terms': 184}, {'doc_id': 441, 'tf': 1, 'max_tf': 7, 'doclen': 161, 'unique_terms': 108}, {'doc_id': 659, 'tf': 1, 'max_tf': 6, 'doclen': 76, 'unique_terms': 54}, {'doc_id': 679, 'tf': 1, 'max_tf': 5, 'doclen': 97, 'unique_terms': 69}, {'doc_id': 738, 'tf': 1, 'max_tf': 7, 'doclen': 84, 'unique_terms': 59}, {'doc_id': 1224, 'tf': 1, 'max_tf': 4, 'doclen': 144, 'unique_terms': 112}, {'doc_id': 1377, 'tf': 1, 'max_tf': 3, 'doclen': 60, 'unique_terms': 52}]}
eliminated
{'df': 9, 'documents': [342, 22, 231, 48, 206, 24, 60, 263, 149], 'posting_list': [{'doc_id': 342, 'tf': 1, 'max_tf': 7, 'doclen': 129, 'unique_terms': 97}, {'doc_id': 364, 'tf': 1, 'max_tf': 8, 'doclen': 214, 'unique_terms': 130}, {'doc_i

In [32]:
# Function to perform Gamma encoding for a given integer
def gamma_encode(number):
    binary_rep = bin(number)[2:]
    offset = binary_rep[1:]
    offset_length = len(offset)
    unary_code = '1' * offset_length + '0'
    gamma_code = unary_code + offset
    return gamma_code

# Function to perform Gamma encoding on a gap compressed posting list
def gamma_encode_posting_list(posting_list):
    gamma_encoded_list = []
    for gap in posting_list:
        # Gamma encode the gap value and append to the encoded list
        gamma_encoded_gap = gamma_encode(gap)
        gamma_encoded_list.append(gamma_encoded_gap)
    return gamma_encoded_list

# Apply gamma encoding to the gap compressed posting lists in the inverted index
for lemma in index:
    index[lemma]['documents'] = gamma_encode_posting_list(index[lemma]['documents'])

In [33]:
print("To backtrack Que1C")
print('-'*50)
for lemma in que1c:
    print(lemma)
    print(index[lemma])
    print("="*50)

To backtrack Que1C
--------------------------------------------------
desired
{'df': 8, 'documents': ['111100001', '111111101100011', '111111101000101', '111111101011010', '111100100', '11111011011', '11111111011100110', '111111100011001'], 'posting_list': [{'doc_id': 17, 'tf': 1, 'max_tf': 4, 'doclen': 79, 'unique_terms': 60}, {'doc_id': 244, 'tf': 1, 'max_tf': 19, 'doclen': 261, 'unique_terms': 184}, {'doc_id': 441, 'tf': 1, 'max_tf': 7, 'doclen': 161, 'unique_terms': 108}, {'doc_id': 659, 'tf': 1, 'max_tf': 6, 'doclen': 76, 'unique_terms': 54}, {'doc_id': 679, 'tf': 1, 'max_tf': 5, 'doclen': 97, 'unique_terms': 69}, {'doc_id': 738, 'tf': 1, 'max_tf': 7, 'doclen': 84, 'unique_terms': 59}, {'doc_id': 1224, 'tf': 1, 'max_tf': 4, 'doclen': 144, 'unique_terms': 112}, {'doc_id': 1377, 'tf': 1, 'max_tf': 3, 'doclen': 60, 'unique_terms': 52}]}
eliminated
{'df': 9, 'documents': ['11111111001010110', '111100110', '111111101100111', '11111010000', '111111101001110', '111101000', '11111011100',

In [38]:
print("Question 1C")
print('-'*50)

for lemma in que1c:
    print(f"lemma: {lemma}")
    print(f"The first gamma compressed gap (the first document ID) in the posting list of the lemma: {index[lemma]['documents'][0]}")
    print(f"The second gamma compressed gap in the posting list of the lemma: {index[lemma]['documents'][1]}")
    print(f"The last gamma compressed gap in the posting list of the lemma: {index[lemma]['documents'][-1]}")
    print('='*50)

Question 1C
--------------------------------------------------
lemma: desired
The first gamma compressed gap (the first document ID) in the posting list of the lemma: 111100001
The second gamma compressed gap in the posting list of the lemma: 111111101100011
The last gamma compressed gap in the posting list of the lemma: 111111100011001
lemma: eliminated
The first gamma compressed gap (the first document ID) in the posting list of the lemma: 11111111001010110
The second gamma compressed gap in the posting list of the lemma: 111100110
The last gamma compressed gap in the posting list of the lemma: 111111100010101
lemma: comprised
The first gamma compressed gap (the first document ID) in the posting list of the lemma: 1111110001110
The second gamma compressed gap in the posting list of the lemma: 1111110101011
The last gamma compressed gap in the posting list of the lemma: 111111100110010


# Question 2

In [39]:
que2a = ["cranfield0697", "cranfield0971", "cranfield0619"]
que2b = ["immedi", "content", "cruciform"]
que2c = ["disagr", "loss", "accommod"]

# que2a = ['cranfield0449', 'cranfield0093', 'cranfield1178']
# que2b = ['reflect', 'acceler', 'unit']
# que2c = ['irrevers', 'crosssect', 'block']

In [40]:
# Function to process each document and update the index
def get_stems(filename, doc_id, index):
    with builtins.open(f'Cranfield/{filename}', 'r') as file:
        content = file.read()
        stemmer = PorterStemmer()
        # Parse the content using BeautifulSoup
        soup = BeautifulSoup(content, 'html.parser')
        title = soup.find('title').get_text() if soup.find('title') else ''
        text = soup.find('text').get_text() if soup.find('text') else ''
        
        # Process title and text separately
        title_tokens = process_text(title)
        text_tokens = process_text(text)

        # Combine title and text tokens for each document
        document_tokens = title_tokens + text_tokens
        
        document_stems = [stemmer.stem(token) for token in document_tokens]

        max_tf = max(document_stems.count(stem) for stem in set(document_stems))
        doclen = len(document_stems)
        unique_terms = len(set(document_stems))
        
        if filename in que2a:
            print(filename)
            print(f"doclen: The total number of stems in the document: {doclen}")
            print(f"max_tf: The frequency of the most frequent stem in the document: {max_tf}")
            print(f"unique_terms: The number of unique stems in the document: {unique_terms}")
            print("="*50)
        
        
        # Update posting list for each stem
        for stem in set(document_stems):
            index_stem[stem]['df'] += 1
            if doc_id not in index_stem[stem]['documents']:
                index_stem[stem]['documents'].append(doc_id)
            index_stem[stem]['posting_list'].append({
                'doc_id': doc_id,
                'tf': document_stems.count(stem),
                'max_tf': max_tf,
                'doclen': doclen,
                'unique_terms': unique_terms
            })
        
# Initialize the index
index_stem = defaultdict(lambda: {'df': 0, 'documents': [], 'posting_list': []})
print("Question 2A")
print('-'*50)

# Path to the Cranfield collection
cranfield_path = 'Cranfield/'

# Iterate over each document in the Cranfield collection
for filename in os.listdir(cranfield_path):
    if filename.startswith('cranfield'):
        doc_id = int(re.search(r'\d+', filename).group())  # Extract document ID from the filename
        get_stems(filename, doc_id, index)

Question 2A
--------------------------------------------------
cranfield0619
doclen: The total number of stems in the document: 25
max_tf: The frequency of the most frequent stem in the document: 2
unique_terms: The number of unique stems in the document: 22
cranfield0697
doclen: The total number of stems in the document: 100
max_tf: The frequency of the most frequent stem in the document: 11
unique_terms: The number of unique stems in the document: 55
cranfield0971
doclen: The total number of stems in the document: 92
max_tf: The frequency of the most frequent stem in the document: 8
unique_terms: The number of unique stems in the document: 49


In [41]:
print("To backtrack Que2B")
print('-'*50)
for stem in que2b:
    print(stem)
    print(index_stem[stem])
    print("="*50)

To backtrack Que2B
--------------------------------------------------
immedi
{'df': 18, 'documents': [58, 135, 149, 213, 352, 439, 579, 625, 667, 689, 721, 869, 907, 968, 989, 1088, 1313, 1342], 'posting_list': [{'doc_id': 58, 'tf': 1, 'max_tf': 6, 'doclen': 107, 'unique_terms': 71}, {'doc_id': 135, 'tf': 1, 'max_tf': 6, 'doclen': 99, 'unique_terms': 57}, {'doc_id': 149, 'tf': 1, 'max_tf': 10, 'doclen': 149, 'unique_terms': 97}, {'doc_id': 213, 'tf': 1, 'max_tf': 8, 'doclen': 168, 'unique_terms': 116}, {'doc_id': 352, 'tf': 1, 'max_tf': 5, 'doclen': 120, 'unique_terms': 88}, {'doc_id': 439, 'tf': 1, 'max_tf': 8, 'doclen': 110, 'unique_terms': 66}, {'doc_id': 579, 'tf': 1, 'max_tf': 6, 'doclen': 108, 'unique_terms': 73}, {'doc_id': 625, 'tf': 1, 'max_tf': 7, 'doclen': 194, 'unique_terms': 130}, {'doc_id': 667, 'tf': 2, 'max_tf': 9, 'doclen': 147, 'unique_terms': 95}, {'doc_id': 689, 'tf': 1, 'max_tf': 11, 'doclen': 160, 'unique_terms': 89}, {'doc_id': 721, 'tf': 1, 'max_tf': 15, 'doclen

In [42]:
print("To backtrack Que2C")
print('-'*50)
for stem in que2c:
    print(stem)
    print(index_stem[stem])
    print("="*50)

To backtrack Que2C
--------------------------------------------------
disagr
{'df': 7, 'documents': [139, 276, 455, 927, 1302, 1352, 1370], 'posting_list': [{'doc_id': 139, 'tf': 1, 'max_tf': 5, 'doclen': 78, 'unique_terms': 54}, {'doc_id': 276, 'tf': 1, 'max_tf': 6, 'doclen': 111, 'unique_terms': 67}, {'doc_id': 455, 'tf': 1, 'max_tf': 6, 'doclen': 118, 'unique_terms': 78}, {'doc_id': 927, 'tf': 1, 'max_tf': 12, 'doclen': 255, 'unique_terms': 140}, {'doc_id': 1302, 'tf': 1, 'max_tf': 6, 'doclen': 104, 'unique_terms': 73}, {'doc_id': 1352, 'tf': 1, 'max_tf': 11, 'doclen': 138, 'unique_terms': 75}, {'doc_id': 1370, 'tf': 1, 'max_tf': 9, 'doclen': 126, 'unique_terms': 83}]}
loss
{'df': 23, 'documents': [14, 177, 213, 214, 216, 219, 245, 258, 274, 426, 427, 511, 548, 603, 877, 1089, 1093, 1094, 1195, 1226, 1277, 1325, 1349], 'posting_list': [{'doc_id': 14, 'tf': 1, 'max_tf': 8, 'doclen': 224, 'unique_terms': 174}, {'doc_id': 177, 'tf': 1, 'max_tf': 11, 'doclen': 125, 'unique_terms': 83}, 

In [43]:
print("Question 2B")
print('-'*50)

for term in que2b:
    print(f"lemma: {term}")
    print(f"df: The number of documents that the lemma occurs in: {index_stem[term]['df']}")
    print(f"The first document ID in the posting list of the lemma: {index_stem[term]['documents'][0]}")
    print(f"The first document's tf in the posting list of the lemma: {index_stem[term]['posting_list'][0]['tf']}")
    print(f"The last document ID in the posting list of the lemma: {index_stem[term]['documents'][-1]}")
    print(f"The last document's tf in the posting list of the lemma: {index_stem[term]['posting_list'][-1]['tf']}")
    print('='*50)

Question 2B
--------------------------------------------------
lemma: immedi
df: The number of documents that the lemma occurs in: 18
The first document ID in the posting list of the lemma: 58
The first document's tf in the posting list of the lemma: 1
The last document ID in the posting list of the lemma: 1342
The last document's tf in the posting list of the lemma: 1
lemma: content
df: The number of documents that the lemma occurs in: 4
The first document ID in the posting list of the lemma: 579
The first document's tf in the posting list of the lemma: 1
The last document ID in the posting list of the lemma: 1096
The last document's tf in the posting list of the lemma: 1
lemma: cruciform
df: The number of documents that the lemma occurs in: 8
The first document ID in the posting list of the lemma: 229
The first document's tf in the posting list of the lemma: 4
The last document ID in the posting list of the lemma: 1202
The last document's tf in the posting list of the lemma: 3


In [44]:
# Function to convert posting list to compressed format
def compress_posting_list(posting_list):
    compressed_list = [posting_list[0]]  # First element remains unchanged
    for i in range(1, len(posting_list)):
        gap = posting_list[i] - posting_list[i-1]
        compressed_list.append(gap)
    return compressed_list

# Convert all posting lists in the index to compressed format
for stem in index_stem:
    index_stem[stem]['documents'] = compress_posting_list(index_stem[stem]['documents'])

In [45]:
print("To backtrack Que2C")
print('-'*50)
for stem in que2c:
    print(stem)
    print(index_stem[stem])
    print("="*50)

To backtrack Que2C
--------------------------------------------------
disagr
{'df': 7, 'documents': [139, 137, 179, 472, 375, 50, 18], 'posting_list': [{'doc_id': 139, 'tf': 1, 'max_tf': 5, 'doclen': 78, 'unique_terms': 54}, {'doc_id': 276, 'tf': 1, 'max_tf': 6, 'doclen': 111, 'unique_terms': 67}, {'doc_id': 455, 'tf': 1, 'max_tf': 6, 'doclen': 118, 'unique_terms': 78}, {'doc_id': 927, 'tf': 1, 'max_tf': 12, 'doclen': 255, 'unique_terms': 140}, {'doc_id': 1302, 'tf': 1, 'max_tf': 6, 'doclen': 104, 'unique_terms': 73}, {'doc_id': 1352, 'tf': 1, 'max_tf': 11, 'doclen': 138, 'unique_terms': 75}, {'doc_id': 1370, 'tf': 1, 'max_tf': 9, 'doclen': 126, 'unique_terms': 83}]}
loss
{'df': 23, 'documents': [14, 163, 36, 1, 2, 3, 26, 13, 16, 152, 1, 84, 37, 55, 274, 212, 4, 1, 101, 31, 51, 48, 24], 'posting_list': [{'doc_id': 14, 'tf': 1, 'max_tf': 8, 'doclen': 224, 'unique_terms': 174}, {'doc_id': 177, 'tf': 1, 'max_tf': 11, 'doclen': 125, 'unique_terms': 83}, {'doc_id': 213, 'tf': 1, 'max_tf': 8

In [46]:
# Function to perform Gamma encoding for a given integer
def gamma_encode(number):
    binary_rep = bin(number)[2:]
    offset = binary_rep[1:]
    offset_length = len(offset)
    unary_code = '1' * offset_length + '0'
    gamma_code = unary_code + offset
    return gamma_code

# Function to perform Gamma encoding on a gap compressed posting list
def gamma_encode_posting_list(posting_list):
    gamma_encoded_list = []
    for gap in posting_list:
        # Gamma encode the gap value and append to the encoded list
        gamma_encoded_gap = gamma_encode(gap)
        gamma_encoded_list.append(gamma_encoded_gap)
    return gamma_encoded_list

# Apply gamma encoding to the gap compressed posting lists in the inverted index
for stem in index_stem:
    index_stem[stem]['documents'] = gamma_encode_posting_list(index_stem[stem]['documents'])

In [47]:
print("To backtrack Que2C")
print('-'*50)
for stem in que2c:
    print(stem)
    print(index_stem[stem])
    print("="*50)

To backtrack Que2C
--------------------------------------------------
disagr
{'df': 7, 'documents': ['111111100001011', '111111100001001', '111111100110011', '11111111011011000', '11111111001110111', '11111010010', '111100010'], 'posting_list': [{'doc_id': 139, 'tf': 1, 'max_tf': 5, 'doclen': 78, 'unique_terms': 54}, {'doc_id': 276, 'tf': 1, 'max_tf': 6, 'doclen': 111, 'unique_terms': 67}, {'doc_id': 455, 'tf': 1, 'max_tf': 6, 'doclen': 118, 'unique_terms': 78}, {'doc_id': 927, 'tf': 1, 'max_tf': 12, 'doclen': 255, 'unique_terms': 140}, {'doc_id': 1302, 'tf': 1, 'max_tf': 6, 'doclen': 104, 'unique_terms': 73}, {'doc_id': 1352, 'tf': 1, 'max_tf': 11, 'doclen': 138, 'unique_terms': 75}, {'doc_id': 1370, 'tf': 1, 'max_tf': 9, 'doclen': 126, 'unique_terms': 83}]}
loss
{'df': 23, 'documents': ['1110110', '111111100100011', '11111000100', '0', '100', '101', '111101010', '1110101', '111100000', '111111100011000', '0', '1111110010100', '11111000101', '11111010111', '11111111000010010', '111111

In [48]:
print("Question 2C")
print('-'*50)

for term in que2c:
    print(f"lemma: {term}")
    print(f"The first gamma compressed gap (the first document ID) in the posting list of the lemma: {index_stem[term]['documents'][0]}")
    print(f"The second gamma compressed gap in the posting list of the lemma: {index_stem[term]['documents'][1]}")
    print(f"The last gamma compressed gap in the posting list of the lemma: {index_stem[term]['documents'][-1]}")
    print('='*50)

Question 2C
--------------------------------------------------
lemma: disagr
The first gamma compressed gap (the first document ID) in the posting list of the lemma: 111111100001011
The second gamma compressed gap in the posting list of the lemma: 111111100001001
The last gamma compressed gap in the posting list of the lemma: 111100010
lemma: loss
The first gamma compressed gap (the first document ID) in the posting list of the lemma: 1110110
The second gamma compressed gap in the posting list of the lemma: 111111100100011
The last gamma compressed gap in the posting list of the lemma: 111101000
lemma: accommod
The first gamma compressed gap (the first document ID) in the posting list of the lemma: 111111100101000
The second gamma compressed gap in the posting list of the lemma: 11111111001011110
The last gamma compressed gap in the posting list of the lemma: 1111110011100
