**1. Perform a full NLP pipeline on a text dataset (Dataset: 20 Newsgroups)**



In [5]:
!pip -q install nltk

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings, re, random, os
warnings.filterwarnings("ignore")
random.seed(42); np.random.seed(42)
import re
import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as SkLDA, PCA
from sklearn.manifold import TSNE
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('stopwords'); nltk.download('vader_lexicon')
!pip install gensim scikit-learn matplotlib pandas nltk

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!




In [6]:
# Import the dataset loader
from sklearn.datasets import fetch_20newsgroups

# List every category name exposed by the loader
cats = fetch_20newsgroups(subset='train').target_names  # or subset='all'
print(f"{len(cats)} categories:")
for category_name in cats:
    print(category_name)

20 categories:
alt.atheism
comp.graphics
comp.os.ms-windows.misc
comp.sys.ibm.pc.hardware
comp.sys.mac.hardware
comp.windows.x
misc.forsale
rec.autos
rec.motorcycles
rec.sport.baseball
rec.sport.hockey
sci.crypt
sci.electronics
sci.med
sci.space
soc.religion.christian
talk.politics.guns
talk.politics.mideast
talk.politics.misc
talk.religion.misc


In [8]:
# Load only four of the categories; `remove` asks the loader to drop
# headers, footers and quotes from each post.
categories = [
    'rec.sport.baseball',
    'sci.space',
    'rec.motorcycles',
    'sci.crypt',
]
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# newsgroups.data         -> list of raw texts (strings)
# newsgroups.target       -> numeric labels (0, 1, 2, 3)
# newsgroups.target_names -> list of category names, in label order
df = pd.DataFrame({'text': newsgroups.data, 'label': newsgroups.target})

# Translate each numeric label into its category name
df['category'] = df['label'].map(lambda label_id: newsgroups.target_names[label_id])

### **Preprocessing the Files**

**Carry out text preprocessing (list all the preprocessing tasks performed).**

In [9]:
import re  # regular expressions
from nltk.corpus import wordnet  # WordNet POS constants used during lemmatization
from nltk.stem import WordNetLemmatizer

# Tagger resource; presumably the model nltk.pos_tag loads in the
# preprocessing function below.
nltk.download('averaged_perceptron_tagger_eng')

# Matches every character that is NOT an English letter
ALPHA_RE = re.compile(r'[^a-zA-Z]')

# English stop words as a set, for fast membership tests
stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()

# Map an NLTK POS tag to a WordNet POS constant (POS = part of speech, i.e. the
# grammatical role a word plays in the sentence).
def _to_wn_pos(tag: str):
    """Return the WordNet POS constant matching the first letter of ``tag``.

    'J' -> adjective, 'V' -> verb, 'R' -> adverb; 'N', an empty tag and any
    other letter fall back to noun.
    """
    initial = tag[0].upper() if tag else 'N'
    if initial == 'J':
        return wordnet.ADJ
    if initial == 'V':
        return wordnet.VERB
    if initial == 'R':
        return wordnet.ADV
    # 'N' and every unrecognized tag are treated as nouns
    return wordnet.NOUN

# returns list of lemmas (tokens)
def preprocess_to_tokens(text: str):
    """Clean one raw document and return its lemmatized tokens.

    Pipeline:
      1. replace every non-English-letter character with a space, lowercase
      2. tokenize with ``word_tokenize``
      3. keep alphabetic tokens that are not stop words
      4. POS-tag the remaining tokens
      5. lemmatize each token using its tag mapped through ``_to_wn_pos``

    Args:
        text: the raw document. Anything that is not a non-blank string
            (None, NaN, numbers, "", whitespace) yields an empty list.

    Returns:
        A list of lemma strings, possibly empty.
    """
    # Guard first: a NaN or other non-string cell would otherwise reach the
    # regex and raise TypeError. Blank strings produce no tokens anyway.
    if not isinstance(text, str) or not text.strip():
        return []

    # Keep English letters only (everything else becomes a space), lowercased
    text = ALPHA_RE.sub(' ', text).lower()
    tokens = [
        w for w in word_tokenize(text)
        if w.isalpha() and w not in stop_words
    ]
    # NOTE(review): tagging runs after stop-word removal, so the tagger sees a
    # shortened token sequence rather than the original sentence.
    pos_tags = nltk.pos_tag(tokens)
    return [lemmatizer.lemmatize(w, _to_wn_pos(pos)) for w, pos in pos_tags]

# Tokenize and lemmatize every document
df['tokens'] = df['text'].map(preprocess_to_tokens)
# Re-join each token list into one cleaned string so it can be fed to TF-IDF
df['clean_text'] = df['tokens'].map(' '.join)
df.head()

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


Unnamed: 0,text,label,category,tokens,clean_text
0,\n\n ...,3,sci.space,"[jeremy, talk, single, batse, component, whole...",jeremy talk single batse component whole thing...
1,\n\nDC-X as is today isn't suitable for this. ...,3,sci.space,"[dc, x, today, suitable, however, followon, sd...",dc x today suitable however followon sdio fund...
2,"\nIf you do make it into New York state, the P...",0,rec.motorcycles,"[make, new, york, state, palisade, interstate,...",make new york state palisade interstate parkwa...
3,\nThis is a very curious thing to say. STU-III...,2,sci.crypt,"[curious, thing, say, stu, iii, nsa, design, s...",curious thing say stu iii nsa design secure te...
4,Hi.\n\nI'm not sure what the other guy (can't ...,0,rec.motorcycles,"[hi, sure, guy, track, post, name, talk, make,...",hi sure guy track post name talk make claim co...


**Extract top keywords using TF-IDF scores**
1. Top keywords per category
2. Top 10 keywords in the whole corpus

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the cleaned text, keeping at most 5000 terms
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_tfidf = vectorizer.fit_transform(df['clean_text'])
# Term for each column of X_tfidf
feature_names = vectorizer.get_feature_names_out()
print("Shape of TF-IDF: ", X_tfidf.shape)

# Top keywords per category: rank terms by their mean TF-IDF weight over the
# documents belonging to that category.
for cat in sorted(df['category'].unique()):
    mask = (df['category'] == cat).values
    # np.asarray(...).ravel() flattens the (1, n_terms) mean whether the
    # sparse container returns an np.matrix or a plain ndarray.
    mean_cat = np.asarray(X_tfidf[mask].mean(axis=0)).ravel()
    idx = np.argsort(mean_cat)[::-1][:15]
    print(f"\nTop terms for [{cat}]:")
    for t, s in zip(feature_names[idx], mean_cat[idx]):
        print(f"{t:>22s}  {s:.4f}")
    print("\n")

# Top keywords for the whole corpus (not per category)
top_n = 10
mean_scores = np.asarray(X_tfidf.mean(axis=0)).ravel()  # average across docs
top_idx = np.argsort(mean_scores)[::-1][:top_n]
top_overall = pd.DataFrame({
    "term": feature_names[top_idx],
    "mean_tfidf": mean_scores[top_idx]
})
# Print the heading on its own line so the table's header row stays aligned
# with its columns; the heading follows top_n instead of a hard-coded 10.
print(f"Top {top_n} keywords in the whole corpus:")
print(top_overall.to_string(index=True))


Shape of TF-IDF:  (3968, 5000)

Top terms for [rec.motorcycles]:
                  bike  0.0584
                  ride  0.0225
            motorcycle  0.0220
                   dod  0.0219
                  like  0.0206
                   rid  0.0170
                   dog  0.0170
                 drive  0.0168
                  make  0.0165
                 think  0.0164
                  know  0.0160
                   say  0.0141
                helmet  0.0140
                  look  0.0140
                   buy  0.0138



Top terms for [rec.sport.baseball]:
                  game  0.0515
                  year  0.0348
                  team  0.0335
                   hit  0.0271
                player  0.0268
              baseball  0.0260
                 pitch  0.0250
                 think  0.0237
                   run  0.0228
               pitcher  0.0209
                  play  0.0205
                   win  0.0196
                  good  0.0183
                   say  0.01

**Perform named entity recognition (NER, Using spacy) to identify entities**

Install spaCy and download a small English model.

In [11]:
# Upgrade packaging basics FIRST (you removed setuptools)
!pip -q install -U pip setuptools wheel

# Install spaCy + a compatible typer to avoid gradio warnings
!pip -q install -U "spacy==3.7.2" "typer>=0.12,<1.0"

# Download the small English model
!python -m spacy download en_core_web_sm


[31mERROR: Cannot install spacy==3.7.2 and typer<1.0 and >=0.12 because these package versions have conflicting dependencies.[0m[31m
[0m[31mERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts[0m[31m
[0mCollecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m72.5 MB/s[0m  [33m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Smoke test: check that the spaCy model loads and recognizes entities.

In [12]:
# Smoke test: load the model and run it on one known sentence
import spacy

nlp_ner = spacy.load("en_core_web_sm")
doc = nlp_ner("NASA launched the Hubble Space Telescope in April 1990.")
# Show each detected entity with its label
[(entity.text, entity.label_) for entity in doc.ents]

[('NASA', 'ORG'),
 ('the Hubble Space Telescope', 'ORG'),
 ('April 1990', 'DATE')]

Use spaCy’s Named Entity Recognition (NER) to pull entities (such as people, places, dates, and organizations) out of the text. The cell then previews the results for the first four documents in the dataset, showing each document’s category and its detected entities.

In [14]:
def extract_entities(text: str):
    """Return the named entities spaCy finds in ``text``.

    Args:
        text: the document to analyze.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples. Non-string or blank
        input returns an empty list without running the pipeline.
    """
    has_content = isinstance(text, str) and bool(text.strip())
    if not has_content:
        return []
    return [(span.text, span.label_) for span in nlp_ner(text).ents]
# Preview the entities of the first 4 documents (at most 12 entities each)
for i in range(4):
    raw_text = df.loc[i, 'text']
    ents = extract_entities(raw_text)
    doc_category = df.loc[i, 'category']
    print(f"\nDoc {i} | Category: {doc_category}")
    print(ents[:12])


Doc 0 | Category: sci.space
[('BATSE', 'ORG'), ('BATSE', 'ORG'), ('two', 'CARDINAL'), ('three', 'CARDINAL')]

Doc 1 | Category: sci.space
[('today', 'DATE'), ('I.', 'ORG'), ('bush', 'PERSON'), ('Allen\n', 'PERSON')]

Doc 2 | Category: rec.motorcycles
[('New York', 'GPE'), ('the Palisades Interstate Parkway', 'ORG'), ('Seven', 'CARDINAL'), ('Lakes Drive', 'ORG')]

Doc 3 | Category: sci.crypt
[('STU', 'ORG'), ('FBI', 'ORG'), ('several hundred\nthousand', 'CARDINAL'), ('US', 'GPE'), ('DoJ', 'ORG'), ('STU', 'ORG'), ('STU', 'ORG')]


In [15]:
# Run NER on every document
df['entities'] = df['text'].apply(extract_entities)

# Build a long-format table: one row per (document, entity) pair.
# If the frame has no 'category' column, every row gets category None.
category_col = df['category'] if 'category' in df.columns else pd.Series([None] * len(df))
rows = [
    {"doc_id": doc_id, "category": doc_cat, "entity": ent_text, "label": ent_label}
    for doc_id, (doc_cat, doc_ents) in enumerate(zip(category_col, df['entities']))
    for ent_text, ent_label in doc_ents
]
ents_df = pd.DataFrame(rows)
print("Entities table:", ents_df.shape)

Entities table: (51813, 4)


In [16]:
# Number of detected entities per label. This frame used to be computed on the
# first line of the cell and then discarded (only a cell's last expression is
# displayed), so it is printed explicitly.
label_counts = ents_df['label'].value_counts().to_frame('count')
print(label_counts)

# Count each (label, entity) pair; sort by label, most frequent first
top_per_label = (
    ents_df.groupby(['label','entity'])
           .size().reset_index(name='count')
           .sort_values(['label','count'], ascending=[True, False])
)

# Ten most frequent entities for each label
for lbl in top_per_label['label'].unique():
    print(f"\nTop entities for label = {lbl}")
    print(top_per_label[top_per_label['label'] == lbl].head(10))



Top entities for label = CARDINAL
         label entity  count
2721  CARDINAL    one    679
608   CARDINAL      1    657
2805  CARDINAL    two    633
1188  CARDINAL      2    548
1476  CARDINAL      3    439
67    CARDINAL      0    356
1669  CARDINAL      4    301
1843  CARDINAL      5    253
2796  CARDINAL  three    183
2437  CARDINAL    One    168

Top entities for label = DATE
     label     entity  count
5602  DATE      today    154
5057  DATE  last year    116
3663  DATE       1993    103
5579  DATE  this year     98
3662  DATE       1992     96
3643  DATE       1988     71
3651  DATE       1990     69
3655  DATE       1991     53
5667  DATE  yesterday     46
5662  DATE      years     43

Top entities for label = EVENT
      label                 entity  count
5708  EVENT                   WWII     13
5701  EVENT                 Series     10
5741  EVENT       the World Series     10
5715  EVENT           World Series      6
5677  EVENT              Civil War      3
5695  EVENT 

In [17]:
# Ten most frequent entities per newsgroup category (skipped when the frame
# has no 'category' column)
if 'category' in df.columns:
    pair_counts = ents_df.groupby(['category','entity']).size().reset_index(name='count')
    top_per_cat = pair_counts.sort_values(['category','count'], ascending=[True, False])
    for cat in sorted(df['category'].dropna().unique()):
        in_category = top_per_cat['category'] == cat
        print(f"\nTop entities in category = {cat}")
        print(top_per_cat[in_category].head(10))



Top entities in category = rec.motorcycles
             category  entity  count
2562  rec.motorcycles   first    117
2664  rec.motorcycles     one    116
2883  rec.motorcycles     two     95
721   rec.motorcycles     BMW     86
226   rec.motorcycles       2     60
1314  rec.motorcycles   Honda     51
59    rec.motorcycles       1     44
1775  rec.motorcycles     One     31
2710  rec.motorcycles  second     31
309   rec.motorcycles       3     28

Top entities in category = rec.sport.baseball
                category     entity  count
3574  rec.sport.baseball          1    371
3045  rec.sport.baseball          0    331
3936  rec.sport.baseball          2    242
4110  rec.sport.baseball          3    240
8027  rec.sport.baseball      first    209
4251  rec.sport.baseball          4    182
4351  rec.sport.baseball          5    168
8595  rec.sport.baseball        two    144
8179  rec.sport.baseball        one    116
8105  rec.sport.baseball  last year    106

Top entities in category = s

**Represent text with pretrained embeddings (e.g., Word2Vec, GloVe).**

In [None]:
import gensim.downloader as api

# Pretrained vectors fetched through gensim's downloader
glove = api.load("glove-wiki-gigaword-100")    # GloVe, 128 MB
google = api.load("word2vec-google-news-300")  # Word2Vec, 1.16 GB (large download)

# Report dimensionality and vocabulary size of each model
for _model_label, _vectors in (("GloVe", glove), ("GoogleNews", google)):
    print(f"{_model_label} dims/vocab:", _vectors.vector_size, len(_vectors.key_to_index))


In [None]:
import pprint

# Metadata published by the gensim downloader for the two loaded models
info = api.info()
for _model_key in ('glove-wiki-gigaword-100', 'word2vec-google-news-300'):
    pprint.pp(info['models'][_model_key])

# Vector shape of one common word in each model
print(glove['the'].shape)    # (100,)
print(google['the'].shape)   # (300,)


In [None]:
# Pick the embedding model used by the following cells
wv = glove        # wv = google

model_label = "GloVe" if wv is glove else "GoogleNews"
print("Using:", model_label, "| dims:", wv.vector_size, "| vocab:", len(wv.key_to_index))

In [None]:
# Embedding matrix E: one row per TF-IDF term; terms missing from the
# pretrained vocabulary (OOV) keep an all-zero row.
emb_dim = wv.vector_size
E = np.zeros((len(feature_names), emb_dim), dtype=np.float32)
in_vocab = [term in wv.key_to_index for term in feature_names]
for row, (term, known) in enumerate(zip(feature_names, in_vocab)):
    if known:
        E[row] = wv[term]
oov = len(in_vocab) - sum(in_vocab)
print(f"OOV terms: {oov}/{len(feature_names)}")

# Document embeddings: TF-IDF matrix times the embedding matrix
doc_emb = X_tfidf @ E
# Scale each row to unit length; the epsilon avoids dividing by zero for
# all-zero rows
row_norms = np.linalg.norm(doc_emb, axis=1, keepdims=True)
doc_emb = doc_emb / (row_norms + 1e-12)
print("Document embeddings shape:", doc_emb.shape)  # (n_docs, emb_dim)

**Display the TF-IDF matrix (or a sample) and visualize the word embeddings in 2D space (e.g., t-SNE)**

In [None]:
import pandas as pd

# Show only a small top-left corner of the sparse matrix:
# at most 10 documents by 15 terms
rows = min(10, X_tfidf.shape[0])
cols = min(15, len(feature_names))
dense_corner = X_tfidf[:rows, :cols].toarray()
tfidf_sample = pd.DataFrame(dense_corner, columns=feature_names[:cols]).round(3)

print("TF-IDF shape:", X_tfidf.shape)
tfidf_sample  # Colab will render this as a table


In [None]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# choose up to 300 TF-IDF vocab terms that exist in the pretrained model
viz_words = [w for w in feature_names if w in wv.key_to_index][:300]
W = np.vstack([wv[w] for w in viz_words])  # (n_words, emb_dim)

# t-SNE needs perplexity < number of samples, so cap it for small vocabularies
word_perplexity = min(30, max(1, len(viz_words) - 1))
tsne = TSNE(n_components=2, perplexity=word_perplexity, random_state=42, init="pca", learning_rate="auto")
coords = tsne.fit_transform(W)

tsne_df = pd.DataFrame(coords, columns=["x","y"]); tsne_df["word"] = viz_words
print(tsne_df.head())

plt.figure(figsize=(10,8))
plt.scatter(tsne_df.x, tsne_df.y, s=10)
# annotate a light subset (about every 50th part of the words) to avoid clutter
annotate_step = max(1, len(tsne_df)//50)
for r in tsne_df.iloc[::annotate_step].itertuples():
    plt.annotate(r.word, (r.x, r.y), fontsize=8, alpha=0.8)
# Name the model actually held in `wv` instead of always claiming GloVe
embedding_name = "GloVe" if wv is glove else "GoogleNews"
plt.title(f"t-SNE of Pretrained Word Embeddings ({embedding_name})")
plt.show()


In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project at most the first 800 document embeddings to 2D
sub = min(800, doc_emb.shape[0])
# t-SNE needs perplexity < number of samples, so cap it for small corpora
doc_perplexity = min(30, max(1, sub - 1))
coords = TSNE(n_components=2, perplexity=doc_perplexity, random_state=42, init="pca", learning_rate="auto") \
         .fit_transform(doc_emb[:sub])

plt.figure(figsize=(8,6))
plt.scatter(coords[:,0], coords[:,1], s=6)
# Name the model actually held in `wv` instead of always claiming GloVe
embedding_name = "GloVe" if wv is glove else "GoogleNews"
plt.title(f"t-SNE of Document Embeddings (TF-IDF-weighted {embedding_name})")
plt.show()


**Perform sentiment analysis (positive/negative/neutral), topic modeling using LDA, extractive summarization of long reviews**

## Perform topic modeling using lda

### Subtask:
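Apply Latent Dirichlet Allocation (LDA) to identify the main topics within the documents. Note that the LDA cell below fits the model on a term-count matrix built with `CountVectorizer`, not on the TF-IDF matrix.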


**Reasoning**:
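LDA groups words that tend to occur together into topics, which summarizes the main themes of the documents. The LDA cell below fits the model on term counts from `CountVectorizer` rather than on TF-IDF weights.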



Sentiment analysis: apply the `get_sentiment` function to the DataFrame's `text` column, categorize each text as positive, negative, or neutral, store the result in a `sentiment` column, and print the sentiment counts.

In [36]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the VADER lexicon before the analyzer is constructed
nltk.download('vader_lexicon', quiet=True)

# One shared analyzer instance, used by get_sentiment
sia = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Label ``text`` as "positive", "negative" or "neutral".

    The input is converted with ``str`` and scored by the shared analyzer
    ``sia``; the label is chosen from the 'compound' score:
    >= 0.05 is positive, <= -0.05 is negative, anything between is neutral.
    """
    compound = sia.polarity_scores(str(text))['compound']
    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"

# Label every document, then show how many fall into each sentiment class
df['sentiment'] = df['text'].map(get_sentiment)
print("Sentiment distribution:\n", df['sentiment'].value_counts(), "\n")

Sentiment distribution:
 sentiment
positive    2412
negative    1032
neutral      524
Name: count, dtype: int64 



 Topic modeling using LDA

for 4 categories

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Term-count matrix for LDA: unigrams only, terms seen in at least 5
# documents, at most 6000 features
cv = CountVectorizer(
    max_features=6000,
    stop_words='english',
    min_df=5,
    ngram_range=(1,1),
)
X_counts = cv.fit_transform(df['clean_text'])
terms = cv.get_feature_names_out()

# Four topics, one per selected newsgroup category
lda = LatentDirichletAllocation(
    n_components=4,
    learning_method='batch',
    random_state=42,
)
lda.fit(X_counts)

# Top words per topic: the topn terms with the largest weight in each component
topn = 12
for k, comp in enumerate(lda.components_):
    idx = np.argsort(comp)[::-1][:topn]
    top_terms = [terms[i] for i in idx]
    print(f"Topic {k}: " + ", ".join(top_terms))

# Doc -> topic mixture, then the dominant (highest-weight) topic per document
doc_topic = lda.transform(X_counts)                 # (n_docs, 4)
df['topic_id'] = np.argmax(doc_topic, axis=1)


Topic 0: year, game, db, think, team, hit, good, run, time, player, like, make
Topic 1: use, mail, key, post, message, list, edu, know, like, anonymous, com, public
Topic 2: space, launch, nasa, orbit, mission, satellite, use, earth, data, program, shuttle, information
Topic 3: key, use, chip, government, make, bike, like, know, people, think, encryption, right


extractive summarization of long reviews


In [34]:
!pip -q install sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.tokenizers import Tokenizer

def summarize_lexrank(text, n_sent=3):
    """Return an extractive LexRank summary of ``text``.

    Args:
        text: the document to summarize.
        n_sent: sentence count passed to the summarizer (default 3).

    Returns:
        The selected sentences joined by single spaces, or "" when ``text``
        is not a string or is blank.
    """
    if isinstance(text, str) and text.strip():
        parsed = PlaintextParser.from_string(text, Tokenizer("english"))
        picked_sentences = LexRankSummarizer()(parsed.document, n_sent)
        return " ".join(map(str, picked_sentences))
    return ""

# Summarize every document with LexRank (3 sentences each).
# Note: no length filter is applied here, so short documents are summarized too.
df['summary'] = df['text'].apply(summarize_lexrank, n_sent=3)

# Preview sentiment, dominant topic and summary side by side
df[['sentiment','topic_id','summary']].head()


Unnamed: 0,sentiment,topic_id,summary
0,positive,2,-jeremy Are you talking about a single BATSE c...
1,positive,2,DC-X as is today isn't suitable for this. Howe...
2,positive,3,"If you do make it into New York state, the Pal..."
3,positive,3,This is a very curious thing to say. STU-IIIs ...
4,positive,3,"Also, (and this is not applicable to hard-core..."
