## Import Required modules

In [8]:
import ir_datasets
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pickle

load the optimized Vocabulary

In [9]:
with open('vocabulary.pkl', 'rb') as file:
    loaded_vocabulary = pickle.load(file)

# Load the dataset
dataset = ir_datasets.load("beir/webis-touche2020/v2")

In [10]:
# Step 2: Preprocess the text data
documents = [query.text for query in dataset.queries_iter()]

In [11]:
# Step 3: Create a vocabulary of unigrams and bigrams using the loaded vocabulary
# You can adjust the parameters of CountVectorizer as needed
vectorizer_unigram = CountVectorizer(ngram_range=(1, 1), vocabulary=loaded_vocabulary)
vectorizer_bigram = CountVectorizer(ngram_range=(1, 2), vocabulary=loaded_vocabulary)

# Fit and transform the documents to create BoW representations
X_unigram = vectorizer_unigram.fit_transform(documents)
X_bigram = vectorizer_bigram.fit_transform(documents)




In [12]:
# Step 4: Calculate the BoW representation for each document

# Get the vocabulary for unigrams and bigrams
vocab_unigram = vectorizer_unigram.get_feature_names_out()
vocab_bigram = vectorizer_bigram.get_feature_names_out()

# Convert BoW representations to dense arrays for analysis
X_unigram_dense = X_unigram.toarray()
X_bigram_dense = X_bigram.toarray()

In [13]:
# Step 5: Compare and analyze the two corpus representations

# Calculate sparsity
sparsity_unigram = 1 - (np.count_nonzero(X_unigram_dense) / np.prod(X_unigram_dense.shape))
sparsity_bigram = 1 - (np.count_nonzero(X_bigram_dense) / np.prod(X_bigram_dense.shape))

# Print sparsity results
print("Sparsity for Unigram BoW:", sparsity_unigram)
print("Sparsity for Bigram BoW:", sparsity_bigram)

# Analyze spatial context (co-occurrence) for unigrams and bigrams
# Find the most common bigrams
bigram_counts = np.sum(X_bigram_dense, axis=0)
most_common_bigrams = [(vocab_bigram[i], bigram_counts[i]) for i in np.argsort(bigram_counts)[::-1][:10]]

print("\nTop 10 most common bigrams:")
for bigram, count in most_common_bigrams:
    print(f"{bigram}: {count}")


Sparsity for Unigram BoW: 0.9999975718057509
Sparsity for Bigram BoW: 0.9999975718057509

Top 10 most common bigrams:
should: 33
be: 25
is: 10
the: 10
to: 8
for: 6
legal: 5
in: 5
have: 3
do: 3
