In [1]:
## Import Required modules

In [2]:
import ir_datasets
from collections import Counter
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from spellchecker import SpellChecker

import pickle

#for multi threading
from multiprocessing import Pool
num_cores = 15

# Load the webis-touche2020 dataset
dataset = ir_datasets.load("beir/webis-touche2020/v2")

In [3]:
# set to store the vocabulary terms
vocabulary_terms = set()

initial_tokens_count = 0

# initial tokens before preprocessing
for doc in dataset.docs_iter():
    initial_tokens_count += len(doc.text.split())

print(f"Initial Tokens Count: {initial_tokens_count}")

Initial Tokens Count: 109582718


### Define preprocessing functions

In [4]:

def remove_non_alphanum(text):
    # Remove non-alphanumeric characters
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

def text_lowercase(text):
    # Convert text to lowercase
    return text.lower()

def remove_punctuation(text):
    # Remove punctuation
    return re.sub(r'[^\w\s]', '', text)

def remove_whitespace(text):
    # Remove extra whitespaces
    return ' '.join(text.split())

def remove_stopwords(text):
    # Remove stopwords using NLTK's English stopwords list
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

def stem_words(text):
    # Stem words using NLTK's Porter Stemmer
    stemmer = PorterStemmer()
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)

def spell_correction(text):
    # Correct spelling using the SpellChecker library (Not used as resource heavy
    spell = SpellChecker()
    words = text.split()
    corrected_words = []
    
    for word in words:
        corrected_word = spell.correction(word)
        if corrected_word is not None:
            corrected_words.append(corrected_word)
        else:
            # If the correction is None, keep the original word. small fix
            corrected_words.append(word)
    
    return ' '.join(corrected_words)

In [5]:
preprocessing_functions = [
    (text_lowercase, "Text Lowercase"),
    (remove_whitespace, "Remove Whitespace"), 
    (remove_non_alphanum, "Remove Non-Alphanumeric"), 
    (remove_punctuation, "Remove Punctuation"), 
    (remove_stopwords, "Remove Stopwords"), 
    (stem_words, "Stem Words"),
    (text_lowercase, "Text Lowercase"),# rerunning to make sure everything is lower case :) 
]

In [6]:
# Initialize a set to store the vocabulary terms
vocabulary_terms = set()

# Function to apply a specific preprocessing step to a single document
def apply_preprocessing_step(args):
    doc_id, doc_text, preprocessing_function = args
    preprocessed_text = preprocessing_function(doc_text)
    return doc_id, preprocessed_text

#initialize data for preprocessing
preprocessed_data = [(doc.doc_id, doc.text) for doc in dataset.docs_iter()]

with Pool(num_cores) as pool:
    for preprocessing_function, title in preprocessing_functions:
        # Apply the current preprocessing step to the preprocessed data in parallel
        preprocessed_data = pool.map(apply_preprocessing_step, [(doc_id, doc_text, preprocessing_function) for doc_id, doc_text in preprocessed_data])
        
        # Calculate and print the vocabulary size at this stage
        vocabulary_terms = set()
        vocabulary_terms.update((' '.join(text for _, text in preprocessed_data)).split())
        print(f"{title} - Vocabulary Size: {len(vocabulary_terms)}")

Text Lowercase - Vocabulary Size: 1810550
Remove Whitespace - Vocabulary Size: 1810550
Remove Non-Alphanumeric - Vocabulary Size: 891063
Remove Punctuation - Vocabulary Size: 891063
Remove Stopwords - Vocabulary Size: 890914
Stem Words - Vocabulary Size: 786790
Text Lowercase - Vocabulary Size: 786790


In [7]:


# Print the final vocabulary size after all preprocessing steps
final_vocabulary_size = len(vocabulary_terms)
print(f"Initial Vocabulary(token) size:{initial_tokens_count}")
print(f"Final Vocabulary Size: {final_vocabulary_size}")

percentage_reduction = ((initial_tokens_count - final_vocabulary_size) / initial_tokens_count) * 100
print(f"Percentage Reduction in Vocabulary Size: {percentage_reduction:.2f}%")



Initial Vocabulary(token) size:109582718
Final Vocabulary Size: 786790
Percentage Reduction in Vocabulary Size: 99.28%


Store the optimized Vocabulary

In [8]:
# Save the final vocabulary and preprocessed data to files using pickle
with open('final_vocabulary.pkl', 'wb') as file:
    pickle.dump(vocabulary_terms, file)

with open('preprocessed_data.pkl', 'wb') as file:
    pickle.dump(preprocessed_data, file)

