In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Load the 20 Newsgroups sample corpus. subset='all' requests every
# available post; headers, footers and quoted replies are removed so that
# only the body text of each post is kept.
dataset = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    remove=('headers', 'footers', 'quotes'),
)



In [None]:
# Pipeline: hold-out split -> TF-IDF -> NMF topic features -> SVM -> report.
# Expects `dataset` (the 20 Newsgroups bunch) from the previous cell.

# Tunable settings, named once instead of being repeated inline.
SEED = 42        # shared by the train/test split and the NMF initialisation
TEST_SIZE = 0.2  # fraction of documents held out for evaluation
num_topics = 20  # number of NMF components; adjust based on your needs

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=TEST_SIZE,
    random_state=SEED,
)

# Vectorize the text data using TF-IDF.
# The vocabulary is fitted on the training documents only and then reused
# for the test documents, so nothing from the test set leaks into fitting.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

# Apply NMF to extract features: each document is reduced to `num_topics`
# component weights. As above, the model is fitted on the training matrix
# and only applied to the test matrix.
nmf_model = NMF(n_components=num_topics, random_state=SEED)
nmf_train = nmf_model.fit_transform(tfidf_train)
nmf_test = nmf_model.transform(tfidf_test)

# Train a classifier (SVM with the library's default hyperparameters)
# on the NMF features.
svm_classifier = SVC()
svm_classifier.fit(nmf_train, y_train)

# Predict on test data
y_pred = svm_classifier.predict(nmf_test)

# Evaluate the classifier: per-class precision / recall / F1, labelled with
# the newsgroup names instead of the integer class ids.
print(classification_report(y_test, y_pred, target_names=dataset.target_names))
