In [13]:
# Perform text cleaning, lemmatization (any method), stop-word removal (any method),
# and label encoding. Create TF-IDF representations and save the outputs.

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import pickle

# Download NLTK resources
nltk.download('stopwords')   # stop-word lists read by stopwords.words('english')
nltk.download('wordnet')     # lexical database behind WordNetLemmatizer
nltk.download('punkt')       # legacy Punkt models used by word_tokenize on older NLTK
# NLTK 3.9+ loads the Punkt sentence models from 'punkt_tab' instead of 'punkt';
# without it nltk.word_tokenize raises LookupError on a fresh environment.
nltk.download('punkt_tab')

# Sample data: four short documents, each paired with its own class label.
data = dict(
    Text=[
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ],
    Label=['A', 'B', 'C', 'D'],
)

# One row per document; column order follows the dict: Text, then Label.
df = pd.DataFrame(data)

# Text Cleaning and Lemmatization
# Both objects are created once here and reused by preprocess_text for every document.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Clean a single document and return it as a space-joined string of lemmas.

    Steps: lower-case the text, tokenize it with ``nltk.word_tokenize``, keep only
    purely alphabetic tokens that are not English stop words, and lemmatize each
    surviving token. No part-of-speech tag is passed to the lemmatizer, so NLTK's
    default is used.

    Parameters
    ----------
    text : str
        Raw document. Any non-string value (e.g. ``None`` or a ``NaN`` from a
        missing DataFrame cell) is treated as an empty document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing survives
        the filtering or when ``text`` is not a string.
    """
    # Missing cells arrive as float NaN / None; .lower() would raise AttributeError.
    if not isinstance(text, str):
        return ''
    words = nltk.word_tokenize(text.lower())
    # isalpha() drops punctuation and numeric tokens; the stop-word test runs on
    # the lower-cased token before it is lemmatized.
    words = [lemmatizer.lemmatize(word) for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)

# Run every raw document through preprocess_text and store the result next to the original.
df['Cleaned_Text'] = df['Text'].map(preprocess_text)

# Label Encoding
# The fitted encoder is kept in `label_encoder`; its integer codes go into a new column.
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# TF-IDF Representation
# Fit the vocabulary on the cleaned text and vectorize it in a single pass.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(raw_documents=df['Cleaned_Text'])

# Save Outputs
# Original text, cleaned text and encoded labels, one row per document.
df.to_csv('cleaned_data.csv', index=False)

# TF-IDF matrix produced by tfidf_vectorizer.fit_transform above.
with open('tfidf_matrix.pkl', 'wb') as tfidf_file:
    pickle.dump(X_tfidf, tfidf_file)

# Persist the fitted transformers as well: without the vectorizer the matrix
# columns cannot be mapped back to terms and new text cannot be transformed, and
# without the encoder the integer labels cannot be decoded.
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)
with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

# NOTE: reload these artifacts with pickle.load only from a trusted location;
# unpickling untrusted files can execute arbitrary code.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
