In [1]:
## Import required modules

In [2]:
import ir_datasets
from collections import Counter
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from spellchecker import SpellChecker

# Identifier handed to ir_datasets for the webis-touche2020 (v2) collection.
DATASET_ID = "beir/webis-touche2020/v2"

# Dataset handle; the cells below iterate over its documents via docs_iter().
dataset = ir_datasets.load(DATASET_ID)

In [None]:
# Vocabulary container; it is not filled in this cell and is re-created
# before it is used further down.
vocabulary_terms = set()

# Baseline figure: number of whitespace-separated tokens in the raw document
# texts, before any preprocessing is applied.
initial_tokens_count = sum(len(doc.text.split()) for doc in dataset.docs_iter())

print(f"Initial Tokens Count: {initial_tokens_count}")

## Define preprocessing functions

In [3]:

def remove_non_alphanum(text):
    """Strip every character that is not an ASCII letter, a digit or whitespace.

    Args:
        text: Input string.

    Returns:
        ``text`` with all other characters deleted. Nothing is inserted in
        their place, so the characters on either side end up adjacent.
    """
    disallowed = r'[^a-zA-Z0-9\s]'
    return re.sub(disallowed, '', text)

def text_lowercase(text):
    """Return a lower-cased copy of ``text``.

    Args:
        text: Input string.

    Returns:
        The result of ``text.lower()``.
    """
    lowered = text.lower()
    return lowered

def remove_punctuation(text):
    """Delete every character that is neither a word character nor whitespace.

    "Word character" is the regex notion (letters, digits and the
    underscore), so underscores survive this step.

    Args:
        text: Input string.

    Returns:
        ``text`` with all remaining characters removed.
    """
    not_word_or_space = r'[^\w\s]'
    return re.sub(pattern=not_word_or_space, repl='', string=text)

def remove_whitespace(text):
    """Collapse each run of whitespace into one space and trim both ends.

    Args:
        text: Input string.

    Returns:
        The whitespace-separated tokens of ``text`` joined by single spaces.
    """
    tokens = text.split()
    return ' '.join(tokens)

# Cache for NLTK's English stop-word list; filled on first use.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    # Load NLTK's English stop-word list once and reuse it on later calls.
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text, stop_words=None):
    """Remove stop words from ``text``.

    The function is called once per document, so the default stop-word list
    is loaded from NLTK only on the first call and then reused, instead of
    being rebuilt every time.

    Args:
        text: Input string; it is tokenized on whitespace.
        stop_words: Optional collection of stop words to use instead of
            NLTK's English list. Tokens are lower-cased before the
            membership test, so entries should be lower-case.

    Returns:
        The remaining tokens, in their original casing and order, joined by
        single spaces.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

# Shared Porter stemmer instance; created on first use.
_DEFAULT_STEMMER = None


def _default_stemmer():
    # Build the Porter stemmer once and hand back the same instance afterwards.
    global _DEFAULT_STEMMER
    if _DEFAULT_STEMMER is None:
        _DEFAULT_STEMMER = PorterStemmer()
    return _DEFAULT_STEMMER


def stem_words(text, stemmer=None):
    """Stem every whitespace-separated token of ``text``.

    The function is called once per document, so the default stemmer is
    constructed a single time and reused rather than re-created per call.

    Args:
        text: Input string; it is tokenized on whitespace.
        stemmer: Optional object exposing ``stem(word) -> str``. Defaults to
            a shared NLTK ``PorterStemmer``.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    if stemmer is None:
        stemmer = _default_stemmer()
    return ' '.join(stemmer.stem(word) for word in text.split())

# Shared SpellChecker instance; created on first use.
_DEFAULT_SPELL_CHECKER = None


def _default_spell_checker():
    # Build the spell checker once; constructing it for every document would
    # repeat its dictionary set-up on each call.
    global _DEFAULT_SPELL_CHECKER
    if _DEFAULT_SPELL_CHECKER is None:
        _DEFAULT_SPELL_CHECKER = SpellChecker()
    return _DEFAULT_SPELL_CHECKER


def spell_correction(text, spell=None):
    """Replace each whitespace-separated token of ``text`` by its spelling correction.

    Args:
        text: Input string; it is tokenized on whitespace.
        spell: Optional object exposing ``correction(word)`` that returns the
            corrected word or ``None``. Defaults to a shared ``SpellChecker``.

    Returns:
        The corrected tokens joined by single spaces. A token for which the
        checker returns ``None`` is kept unchanged.
    """
    if spell is None:
        spell = _default_spell_checker()

    # Words repeat often within a document; look each distinct word up once.
    corrections = {}
    corrected_words = []
    for word in text.split():
        if word not in corrections:
            suggestion = spell.correction(word)
            # If the correction is None, keep the original word
            corrections[word] = word if suggestion is None else suggestion
        corrected_words.append(corrections[word])

    return ' '.join(corrected_words)

In [None]:
# Ordered preprocessing pipeline: (function, label printed in the report).
preprocessing_steps = [
    (text_lowercase, "Text Lowercase"),
    (remove_non_alphanum, "Remove Non-Alphanumeric"),
    (remove_punctuation, "Remove Punctuation"),
    (remove_whitespace, "Remove Whitespace"),
    (remove_stopwords, "Remove Stopwords"),
    (stem_words, "Stem Words"),
    (spell_correction, "Spell Correction"),
]

# True  -> stage k applies steps 1..k in order, so every reported size reflects
#          all preprocessing done so far and the last stage is the fully
#          preprocessed text.
# False -> each step is applied on its own to the raw document text.
APPLY_CUMULATIVELY = True

# Initialize a set to store the vocabulary terms (kept defined even if the
# pipeline above is empty, because the next cell reads it).
vocabulary_terms = set()

for stage, (_, title) in enumerate(preprocessing_steps, start=1):
    # Functions to run at this stage.
    first = 0 if APPLY_CUMULATIVELY else stage - 1
    active_steps = [function for function, _ in preprocessing_steps[first:stage]]

    # Each stage reports its own vocabulary, so start from an empty set.
    vocabulary_terms = set()

    # Documents are re-read and re-processed per stage rather than stored, so
    # the processed corpus never has to be held in memory.
    for doc in dataset.docs_iter():
        preprocessed_text = doc.text
        for preprocessing_function in active_steps:
            preprocessed_text = preprocessing_function(preprocessed_text)

        # Update the vocabulary terms with unique terms in the preprocessed text
        vocabulary_terms.update(preprocessed_text.split())

    # Print the vocabulary size at this stage
    print(f"{title} - Vocabulary Size: {len(vocabulary_terms)}")

In [None]:
# Print the final vocabulary size.
# `vocabulary_terms` is the set left behind by the last iteration of the
# preprocessing loop in the previous cell, so that cell must have been run
# first; the number printed here is the size of that last stage's vocabulary.
final_vocabulary_size = len(vocabulary_terms)
print(f"Final Vocabulary Size: {final_vocabulary_size}")