In [1]:
## Import Required modules

In [2]:
import ir_datasets
from collections import Counter
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from spellchecker import SpellChecker
#for multi threading
from multiprocessing import Pool
num_cores = 15

# Load the webis-touche2020 dataset
dataset = ir_datasets.load("beir/webis-touche2020/v2")

In [3]:
# set to store the vocabulary terms
vocabulary_terms = set()

initial_tokens_count = 0

# initial tokens before preprocessing
for doc in dataset.docs_iter():
    initial_tokens_count += len(doc.text.split())

print(f"Initial Tokens Count: {initial_tokens_count}")

Initial Tokens Count: 109582718


## Define preprocessing functions

In [4]:

def remove_non_alphanum(text):
    # Remove non-alphanumeric characters
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

def text_lowercase(text):
    # Convert text to lowercase
    return text.lower()

def remove_punctuation(text):
    # Remove punctuation
    return re.sub(r'[^\w\s]', '', text)

def remove_whitespace(text):
    # Remove extra whitespaces
    return ' '.join(text.split())

def remove_stopwords(text):
    # Remove stopwords using NLTK's English stopwords list
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

def stem_words(text):
    # Stem words using NLTK's Porter Stemmer
    stemmer = PorterStemmer()
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)

def spell_correction(text):
    # Correct spelling using the SpellChecker library (Not used as resource heavy
    spell = SpellChecker()
    words = text.split()
    corrected_words = []
    
    for word in words:
        corrected_word = spell.correction(word)
        if corrected_word is not None:
            corrected_words.append(corrected_word)
        else:
            # If the correction is None, keep the original word. small fix
            corrected_words.append(word)
    
    return ' '.join(corrected_words)

In [5]:
# Initialize a set to store the vocabulary terms
vocabulary_terms = set()

# Function to apply a specific preprocessing step to a single document
def apply_preprocessing_step(doc, preprocessing_function):
    preprocessed_text = doc.text
    preprocessed_text = preprocessing_function(preprocessed_text)
    return preprocessed_text
with Pool(num_cores) as pool:
    for preprocessing_function, title in [(remove_non_alphanum, "Remove Non-Alphanumeric"), 
                                          (text_lowercase, "Text Lowercase"), 
                                          (remove_punctuation, "Remove Punctuation"), 
                                          (remove_whitespace, "Remove Whitespace"), 
                                          (remove_stopwords, "Remove Stopwords"), 
                                          (stem_words, "Stem Words")
                                          ]:
        # reset set
        vocabulary_terms = set()
        # Apply the current preprocessing step to all documents in parallel
        preprocessed_texts = pool.starmap(apply_preprocessing_step, [(doc, preprocessing_function) for doc in dataset.docs_iter()])
        
        # Calculate and print the vocabulary size at this stage
        vocabulary_terms.update(' '.join(preprocessed_texts).split())
        print(f"{title} - Vocabulary Size: {len(vocabulary_terms)}")

Remove Non-Alphanumeric - Vocabulary Size: 1028959
Text Lowercase - Vocabulary Size: 1810550
Remove Punctuation - Vocabulary Size: 1035503
Remove Whitespace - Vocabulary Size: 2021510
Remove Stopwords - Vocabulary Size: 2020727
Stem Words - Vocabulary Size: 1714552


In [8]:
# Print the final vocabulary size after all preprocessing steps
final_vocabulary_size = len(vocabulary_terms)
print(f"Initial Vocabulary(token) size:{initial_tokens_count}")
print(f"Final Vocabulary Size: {final_vocabulary_size}")

percentage_reduction = ((initial_tokens_count - final_vocabulary_size) / initial_tokens_count) * 100
print(f"Percentage Reduction in Vocabulary Size: {percentage_reduction:.2f}%")



Initial Vocabulary(token) size:109582718
Final Vocabulary Size: 1714552
Percentage Reduction in Vocabulary Size: 98.44%
