 Theory (Simple):
Text Cleaning: Convert the text to lowercase, strip punctuation, and split it into word tokens.

Lemmatization: Reduce each word to its base (dictionary) form, e.g. "books" becomes "book".

Stop Word Removal: Remove common words like "the", "is", etc.

Label Encoding: Convert text labels to numbers.

TF-IDF: Score each word by how often it occurs in a document, weighted down when the word also occurs in many other documents.

Save Output: Use pandas to write the processed data to a CSV file.

In [3]:
import nltk
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources used below. 'punkt_tab' is requested in addition
# to 'punkt' because newer NLTK releases (3.9+) load the tokenizer models
# from 'punkt_tab', and word_tokenize raises LookupError when it is missing.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Sample dataset: one (sentence, topic label) pair per row
data = pd.DataFrame(
    [
        ("I love playing football!!!", 'sports'),
        ("Movies are amazing and I enjoy watching them.", 'entertainment'),
        ("Football is a great sport.", 'sports'),
        ("Reading books is a great habit.", 'education'),
    ],
    columns=['text', 'label'],
)

# 1. Text Cleaning + Tokenization + Lowercase
def clean_text(text):
    """Lowercase ``text``, delete its punctuation, and tokenize the result.

    Any character that is neither a word character nor whitespace is
    removed before the string is handed to NLTK's ``word_tokenize``.

    Returns the tokens produced by ``word_tokenize``.
    """
    lowered = text.lower()
    without_punct = re.sub(r'[^\w\s]', '', lowered)
    return word_tokenize(without_punct)

# 2. Lemmatization + Stopword Removal
# Module-level helpers used by preprocess(): the English stop words held in
# a set for fast membership tests, and one reusable WordNet lemmatizer.
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

def preprocess(tokens):
    """Drop stop words from ``tokens`` and lemmatize the remaining ones.

    Tokens found in the module-level ``stop_words`` set are skipped; every
    other token is passed through ``lemmatizer.lemmatize``. Order is kept.

    Returns a new list of lemmatized tokens.
    """
    kept = []
    for word in tokens:
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return kept

# Apply preprocessing: clean + tokenize every sentence, then remove stop
# words and lemmatize the surviving tokens.
data['clean_tokens'] = data['text'].apply(clean_text).apply(preprocess)

# Join tokens back into one space-separated string per row; this column is
# what the TF-IDF vectorizer is fitted on below.
data['clean_text'] = data['clean_tokens'].apply(' '.join)

# 3. Label Encoding: map each distinct label string to an integer code
encoder = LabelEncoder()
data['label_encoded'] = encoder.fit_transform(data['label'])

# 4. TF-IDF Representation: one row per document, one column per term
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(data['clean_text'])
# Reuse data's index so the axis=1 concat below lines rows up even when
# `data` does not carry the default 0..n-1 index (e.g. after filtering);
# pd.concat aligns on index labels, not on row position.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out(),
    index=data.index,
)

# 5. Save Output to CSV: original/cleaned text and labels, followed by the
# TF-IDF feature columns
output = pd.concat([data[['text', 'clean_text', 'label', 'label_encoded']], tfidf_df], axis=1)
output.to_csv("processed_text_output.csv", index=False)

print("✅ Preprocessing complete. File saved as 'processed_text_output.csv'")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gauri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gauri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gauri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✅ Preprocessing complete. File saved as 'processed_text_output.csv'
