In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import joblib  # For saving models

# Make sure the NLTK corpora used by later cells are installed.
# Each resource is looked up locally first and only downloaded when the lookup
# fails, so re-running this cell (e.g. Restart & Run All) does not contact the
# download server again once the data is present.
for resource_path, package_id in (
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
    ('corpora/omw-1.4', 'omw-1.4'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Not found in any NLTK data directory -> fetch it now.
        nltk.download(package_id)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [2]:
# Tiny hand-written corpus: five sentences, each tagged with a topic label.
data = dict(
    text=[
        "The quick brown fox jumps over the lazy dog!",
        "I am loving the new Python updates...",
        "Data science is amazing and fun.",
        "Dogs are the best pets in the world.",
        "The fox is quick and the dog is lazy.",
    ],
    category=['animals', 'tech', 'tech', 'animals', 'animals'],
)
df = pd.DataFrame(data)

# Show the untouched input followed by a separator rule.
print("Original Data:", df.head(), "-" * 30, sep="\n")

Original Data:
                                           text category
0  The quick brown fox jumps over the lazy dog!  animals
1         I am loving the new Python updates...     tech
2              Data science is amazing and fun.     tech
3          Dogs are the best pets in the world.  animals
4         The fox is quick and the dog is lazy.  animals
------------------------------


In [3]:
# English stop words from NLTK, held in a set so membership checks are cheap.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Single WordNet lemmatizer instance shared by every preprocess_text() call.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one raw document into a space-joined string of lemmas.

    Steps: lowercase the text, replace punctuation/special characters with
    spaces, split on whitespace, drop tokens found in the module-level
    ``stop_words`` set, and pass the remaining tokens through the module-level
    ``lemmatizer``.

    Args:
        text: Raw document. Missing scalars (``None``, ``NaN``, ``pd.NA``)
            produce an empty string; any other non-string value is converted
            with ``str()`` before cleaning.

    Returns:
        str: The surviving lemmatized tokens joined by single spaces, or ``""``
        when no token survives.
    """
    # Guard against missing / non-string cells so that Series.apply() does not
    # fail with AttributeError on `.lower()`.
    if not isinstance(text, str):
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        text = str(text)

    # 1. Text Cleaning: lowercase, then turn punctuation/special characters
    #    into spaces. Substituting a space (rather than '') keeps words that
    #    were separated only by punctuation apart: "fox,dog" -> "fox dog",
    #    not "foxdog". Note that `\w` also matches '_', so underscores are kept.
    text = re.sub(r'[^\w\s]', ' ', text.lower())

    # 2. Tokenization: split() with no argument collapses any run of
    #    whitespace, so the extra spaces introduced above create no empty tokens.
    words = text.split()

    # 3. Stop Word Removal & Lemmatization
    # Keep only words NOT in stop_words, and lemmatize the ones that are kept
    clean_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # Join back into a string
    return " ".join(clean_words)

In [4]:
# Run every raw document through preprocess_text and store the result next to the original.
df['clean_text'] = df['text'].map(preprocess_text)

# Side-by-side view of raw vs. cleaned text, followed by a separator rule.
print("Preprocessed Data:", df[['text', 'clean_text']].head(), "-" * 30, sep="\n")

Preprocessed Data:
                                           text                     clean_text
0  The quick brown fox jumps over the lazy dog!  quick brown fox jump lazy dog
1         I am loving the new Python updates...       loving new python update
2              Data science is amazing and fun.       data science amazing fun
3          Dogs are the best pets in the world.             dog best pet world
4         The fox is quick and the dog is lazy.             fox quick dog lazy
------------------------------


In [5]:
# Fit the encoder on the category strings and get the integer code for each row.
le = LabelEncoder()
encoded_labels = le.fit_transform(df['category'])
df['category_encoded'] = encoded_labels

# Show each label beside its integer code, followed by a separator rule.
print("Label Encoded Data:", df[['category', 'category_encoded']].head(), "-" * 30, sep="\n")

Label Encoded Data:
  category  category_encoded
0  animals                 0
1     tech                 1
2     tech                 1
3  animals                 0
4  animals                 0
------------------------------


In [8]:
# Vectorizer capped at 100 features (limit chosen for this demo).
tfidf = TfidfVectorizer(max_features=100)

# Fit on the cleaned text and transform it in the same call.
tfidf_matrix = tfidf.fit_transform(df['clean_text'])

# Dense, column-labelled copy of the matrix; used only for inspection below.
feature_names = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

print("TF-IDF Matrix (First 5 rows):", tfidf_df.head(), sep="\n")

TF-IDF Matrix (First 5 rows):
   amazing      best     brown  data       dog       fox  fun      jump  \
0      0.0  0.000000  0.476663   0.0  0.319227  0.384569  0.0  0.476663   
1      0.0  0.000000  0.000000   0.0  0.000000  0.000000  0.0  0.000000   
2      0.5  0.000000  0.000000   0.5  0.000000  0.000000  0.5  0.000000   
3      0.0  0.538498  0.000000   0.0  0.360638  0.000000  0.0  0.000000   
4      0.0  0.000000  0.000000   0.0  0.432183  0.520646  0.0  0.000000   

       lazy  loving  new       pet  python     quick  science  update  \
0  0.384569     0.0  0.0  0.000000     0.0  0.384569      0.0     0.0   
1  0.000000     0.5  0.5  0.000000     0.5  0.000000      0.0     0.5   
2  0.000000     0.0  0.0  0.000000     0.0  0.000000      0.5     0.0   
3  0.000000     0.0  0.0  0.538498     0.0  0.000000      0.0     0.0   
4  0.520646     0.0  0.0  0.000000     0.0  0.520646      0.0     0.0   

      world  
0  0.000000  
1  0.000000  
2  0.000000  
3  0.538498  
4  0.000000  

In [9]:
# 1. Save the processed dataset as CSV (row index omitted)
df.to_csv('processed_dataset.csv', index=False)

# 2. Save the TF-IDF Vectorizer and Label Encoder for future use
for fitted_obj, out_path in (
    (tfidf, 'tfidf_vectorizer.pkl'),
    (le, 'label_encoder.pkl'),
):
    joblib.dump(fitted_obj, out_path)

print("\nProcessing complete. Files 'processed_dataset.csv', 'tfidf_vectorizer.pkl', and 'label_encoder.pkl' have been saved.")


Processing complete. Files 'processed_dataset.csv', 'tfidf_vectorizer.pkl', and 'label_encoder.pkl' have been saved.
