In [None]:
from sklearn.datasets import fetch_20newsgroups
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK stop-word corpus
nltk.download('stopwords')

# Stemmer and English stop-word set used by the text preprocessing step
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Load every 20 Newsgroups post, with headers, footers and quoted replies removed
newsgroups_data = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Text preprocessing
def preprocess_text(text):
    """Lowercase, stop-word-filter and stem a piece of text.

    The text is split on whitespace only, so punctuation stays attached to
    its token (e.g. "the," is not recognised as the stop word "the").

    Args:
        text: Raw input string.

    Returns:
        A single string with the surviving, stemmed tokens joined by one
        space. An empty or all-stop-word input yields an empty string.

    Relies on the module-level ``stemmer`` and ``stop_words`` objects.
    """
    lowered_tokens = (token.lower() for token in text.split())
    return ' '.join(
        stemmer.stem(token)
        for token in lowered_tokens
        if token not in stop_words
    )

# Run the preprocessing over every document in the corpus, preserving order
preprocessed_data = list(map(preprocess_text, newsgroups_data.data))


In [None]:
# TF-IDF vectorization of the preprocessed corpus
# (max_features=5000, scikit-learn's built-in English stop-word list)
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_data)


In [None]:
# Apply SVD (LSI)
# TruncatedSVD's default solver is randomized; fixing random_state makes the
# topics, the query result and the clustering scores reproducible on a
# fresh Restart & Run All.
num_topics = 100
lsa = TruncatedSVD(n_components=num_topics, random_state=42)
lsa_matrix = lsa.fit_transform(tfidf_matrix)


In [None]:
# Print the ten highest-weighted vocabulary terms of each LSI component
terms = tfidf_vectorizer.get_feature_names_out()
for topic_idx in range(num_topics):
    component = lsa.components_[topic_idx]
    # argsort is ascending: reverse it, then keep the first ten indices
    ranked_term_idx = component.argsort()[::-1][:10]
    top_terms = [terms[term_idx] for term_idx in ranked_term_idx]
    print(f"Topic {topic_idx + 1}: {', '.join(top_terms)}")


Topic 1: use, like, know, think, peopl, time, make, say, good, work
Topic 2: window, use, file, card, drive, thank, pleas, program, mail, anyon
Topic 3: game, team, year, play, player, car, hockey, season, win, fan
Topic 4: pleas, game, thank, mail, anyon, god, know, thanks, edu, post
Topic 5: drive, god, scsi, card, disk, hard, ide, christian, car, floppi
Topic 6: window, game, god, file, run, team, win, play, program, problem
Topic 7: key, god, game, chip, encrypt, clipper, edu, 00, use, phone
Topic 8: know, key, anyon, chip, game, encrypt, thank, clipper, think, like
Topic 9: drive, file, scsi, disk, game, peopl, israel, armenian, post, hard
Topic 10: card, video, driver, monitor, armenian, israel, peopl, vga, color, arab
Topic 11: edu, geb, pitt, n3jxp, dsl, cadre, chastiti, intellect, gordon, skeptic
Topic 12: window, car, key, god, chip, drive, edu, pleas, run, right
Topic 13: know, anyon, file, armenian, car, god, format, year, program, convert
Topic 14: file, card, think, drive

In [None]:
# Sample query
query = "formula"

# Preprocess the query the same way the corpus was preprocessed
query = preprocess_text(query)

# Vectorize the query with the fitted TF-IDF vocabulary
query_vector = tfidf_vectorizer.transform([query])

print(f"Query: {query}")
if query_vector.nnz == 0:
    # None of the query terms is in the fitted vocabulary, so the query is an
    # all-zero vector. Its similarities carry no ranking information and
    # argmax would simply fall back to document 0, which is not a real match.
    print("No query term appears in the TF-IDF vocabulary; no similar document found.")
else:
    # Project the query into the LSI space
    query_lsa = lsa.transform(query_vector)

    # Compute cosine similarity between the query and every document
    similarities = cosine_similarity(query_lsa, lsa_matrix)
    most_similar_doc_idx = similarities.argmax()
    most_similar_doc = newsgroups_data.data[most_similar_doc_idx]

    print("Most similar document:")
    print(most_similar_doc)


Query: formula
Most similar document:
Disclaimer -- This is for fun.

In my computerized baseball game, I keep track of a category called
"stolen hits", defined as a play made that "an average fielder would not
make with average effort."  Using the 1992 Defensive Averages posted
by Sherri Nichols (Thanks Sherri!), I've figured out some defensive stats
for the leftfielders. Hits Stolen have been redefined as "Plays Kevin
Bass would not have made."

OK, I realize that's unfair.  Kevin's probably the victim of pitching staff,
fluke shots, and a monster park factor.  But let's put it this way:  If we
replaced every leftfielder in the league with someone with Kevin's 49.4% out
making ability, how many extra hits would go by?

To try and correlate it to reality a little more, I've calculated Net
Hits Stolen, based on the number of outs made compared to what a league
average fielder would make.  By the same method I've calculated Net Extra 
Bases (doubles and triples let by).

Finally, I thro

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Load the true labels (categories) for the dataset
true_labels = newsgroups_data.target

# Perform clustering on LSI-transformed data using KMeans.
# n_clusters=20 assumes one cluster per newsgroup; random_state is fixed
# because KMeans initialisation is random and the scores below would
# otherwise change on every run.
kmeans = KMeans(n_clusters=20, random_state=42)
lsi_cluster_labels = kmeans.fit_predict(lsa_matrix)

# Evaluate the clustering quality against the true newsgroup labels
ari_score = adjusted_rand_score(true_labels, lsi_cluster_labels)
nmi_score = normalized_mutual_info_score(true_labels, lsi_cluster_labels)

print(f"Adjusted Rand Index (ARI): {ari_score}")
print(f"Normalized Mutual Information (NMI): {nmi_score}")




Adjusted Rand Index (ARI): 0.06196552113425908
Normalized Mutual Information (NMI): 0.2910276507226431
