In [2]:
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess_with_pos(text, pos_to_include=None):
    """
    Lemmatize text and keep only the tokens whose POS tag is wanted.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is kept only if its POS tag is in the wanted set, it is not a stop word,
    and it is alphabetic.

    Args:
    - text (str): The raw text to process.
    - pos_to_include (set): POS tags to keep (e.g., {"NOUN", "VERB", "ADJ"}).
      When None, {"NOUN", "VERB", "ADJ", "ADV"} is used.

    Returns:
    - List of tuples: (lowercased lemma, POS tag) for every kept token,
      in the order the tokens appear in the text.
    """
    parsed = nlp(text)

    # Only None triggers the default; an explicitly empty collection is honoured.
    wanted_tags = (
        {"NOUN", "VERB", "ADJ", "ADV"} if pos_to_include is None else pos_to_include
    )

    return [
        (tok.lemma_.lower(), tok.pos_)
        for tok in parsed
        if tok.pos_ in wanted_tags and not tok.is_stop and tok.is_alpha
    ]


# Sample input: two short sentences, including punctuation and stop words.
text = "Text preprocessing involves cleaning and preparing text for analysis. It is essential for NLP tasks!"

# Narrower than the function's default tag set: adverbs are left out here.
pos_to_include = {"NOUN", "VERB", "ADJ"}

processed_tokens = preprocess_with_pos(text, pos_to_include=pos_to_include)

# Report one "lemma (POS)" line per retained token, under a header line.
print("Processed Tokens with POS Tags:")
for lemma, pos in processed_tokens:
    print(f"{lemma} ({pos})")


Processed Tokens with POS Tags:
text (NOUN)
preprocessing (NOUN)
involve (VERB)
clean (VERB)
prepare (VERB)
text (NOUN)
analysis (NOUN)
essential (ADJ)
task (NOUN)
