In [1]:
# download libraries
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import contractions

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Fellah
[nltk_data]     Faizel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to C:\Users\Fellah
[nltk_data]     Faizel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [2]:
# Remove noise, extra spaces, URLs, and symbols.
def clean_text(text):
    #lowercase
    text = text.lower()

    #remove url
    text = re.sub(r'hhtp\S+|www\S+', '', text)

    # remove numbers
    text = re.sub(r'\d+','', text)

    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    #remove extra space
    text = re.sub(r'\s+', ' ', text)

    return text

In [3]:
#converting short forms
def expand_contractions(text):
    expanded = contractions.fix(text)
    return expanded

In [4]:
# stop word removal
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered = [w for w in words if w.lower() not in stop_words]
    return ' '.join(filtered)

In [5]:
def preprocess_text(text, remove_stops=False):
    text = expand_contractions(text)

    text = clean_text(text)

    if remove_stops:
        text = remove_stopwords(text)
    
    return text

In [8]:
samples = [
    "He donâ€™t has any idea about the topic!!!",
    "Iâ€™ll be going to the party at 8pm!!!",
    "This   is   an example with    extra spaces...",
    "Visit my website at https://example.com"
]

for s in samples:
    print("Original:    ", s)
    print("After preprocessing:    ", preprocess_text(s), "\n\n")
    print('-'*40)

Original:     He donâ€™t has any idea about the topic!!!
After preprocessing:     he do not has any idea about the topic 


----------------------------------------
Original:     Iâ€™ll be going to the party at 8pm!!!
After preprocessing:     i will be going to the party at pm 


----------------------------------------
Original:     This   is   an example with    extra spaces...
After preprocessing:     this is an example with extra spaces 


----------------------------------------
Original:     Visit my website at https://example.com
After preprocessing:     visit my website at httpsexamplecom 


----------------------------------------


In [11]:
from textblob import TextBlob
def correct_spelling(text):
     """
    Corrects common spelling mistakes using TextBlob.
    Example: 'Ths is smple txt.' â†’ 'This is simple text.'
    """
     corrected = str(TextBlob(text).correct())
     return corrected 

In [28]:
from nltk.stem import WordNetLemmatizer
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """
    Converts words to their base form (lemma).
    """
    tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(lemmas)


[nltk_data] Downloading package wordnet to C:\Users\Fellah
[nltk_data]     Faizel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Fellah
[nltk_data]     Faizel\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [29]:
import emoji

def remove_emoji_newlines(text):
    text = emoji.replace_emoji(text, replace='')
    text = text.replace('\n', ' ').replace('\r', ' ')
    return text

In [30]:
def full_preprocess(text, remove_stops=False, spell_check=False):
    """
    Applies full preprocessing: cleaning, expanding, correcting, and lemmatizing.
    """
    text = remove_emoji_newlines(text)
    text = expand_contractions(text)
    text = clean_text(text)
    
    if spell_check:
        text = correct_spelling(text)
        
    if remove_stops:
        text = remove_stopwords(text)
        
    text = lemmatize_text(text)
    return text


In [31]:
samples = [
    "He donâ€™t has any idea ðŸ˜… about the topic!!!",
    "Ths is smple txt for spellng corretion.",
    "I'll be going to the party at 8pm!!!",
    "Runnning and ran are both forms of run."
]

for s in samples:
    print("ðŸ”¹ Original:", s)
    print("âœ… Processed:", full_preprocess(s, spell_check=True))
    print("â€”" * 60)


ðŸ”¹ Original: He donâ€™t has any idea ðŸ˜… about the topic!!!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\Fellah Faizel/nltk_data'
    - 'c:\\Users\\Fellah Faizel\\OneDrive\\Desktop\\Fellah Faizel\\PROJECT\\Rephrasinator\\venv\\nltk_data'
    - 'c:\\Users\\Fellah Faizel\\OneDrive\\Desktop\\Fellah Faizel\\PROJECT\\Rephrasinator\\venv\\share\\nltk_data'
    - 'c:\\Users\\Fellah Faizel\\OneDrive\\Desktop\\Fellah Faizel\\PROJECT\\Rephrasinator\\venv\\lib\\nltk_data'
    - 'C:\\Users\\Fellah Faizel\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
