In [None]:
# Named-entity recognition (NER) demo with spaCy.
# One-time setup from a terminal:
#   python -m pip install spacy
#   python -m spacy download en_core_web_sm

from collections import defaultdict

import spacy

# Load the small English pipeline.
nlp = spacy.load("en_core_web_sm")

text = """
Apple Inc. plans a new office in Hyderabad, India. Tim Cook announced this in March 2023.
The site will create 5,000 jobs and focus on innovative technologies in supply chain analytics.
"""

# Running the pipeline produces a Doc; its .ents attribute holds the entity spans.
doc = nlp(text)

# Single pass over the entities: print each one and, at the same time,
# collect the entity texts under their label for the summary below.
by_label = defaultdict(list)
print("Named Entities (text, label):")
for ent in doc.ents:
    print(f"{ent.text:25} -> {ent.label_}")
    by_label[ent.label_].append(ent.text)

# Summary: one line per label with its de-duplicated, sorted entity texts.
print("\nEntities grouped by label:")
for label, items in by_label.items():
    print(f"{label}: {sorted(set(items))}")

Named Entities (text, label):
Apple Inc.                -> ORG
Hyderabad                 -> GPE
India                     -> GPE
Tim Cook                  -> PERSON
March 2023                -> DATE
5,000                     -> CARDINAL

Entities grouped by label:
ORG: ['Apple Inc.']
GPE: ['Hyderabad', 'India']
PERSON: ['Tim Cook']
DATE: ['March 2023']
CARDINAL: ['5,000']


In [None]:
# Word embeddings with gensim Word2Vec
# Setup: python -m pip install gensim nltk

import nltk
nltk.download("punkt")
nltk.download("punkt_tab") # Added to resolve LookupError
from nltk.tokenize import sent_tokenize, word_tokenize

from gensim.models import Word2Vec

# Fixed seed so the similarity numbers printed below can be reproduced.
SEED = 42

corpus = """
Manufacturing relies on predictive maintenance and supply chain optimization.
Data engineers build pipelines, while analysts monitor KPIs and anomalies.
Robotics and IoT sensors stream telemetry to cloud databases for real-time insights.
Quality control uses computer vision to detect defects on the shop floor.
"""

# Tokenize sentences -> lower-cased words.
# Tokens with no letter or digit (".", ",") are dropped so punctuation does not
# end up in the vocabulary and show up as a "similar word"; hyphenated words
# such as "real-time" are kept.
sentences = [
    [token for token in word_tokenize(sentence.lower())
     if any(ch.isalnum() for ch in token)]
    for sentence in sent_tokenize(corpus)
]

# Train a small Word2Vec model.
# gensim only guarantees repeatable training with a fixed seed AND a single
# worker thread (multiple workers introduce thread-scheduling jitter). Across
# interpreter launches, PYTHONHASHSEED must also be fixed.
model = Word2Vec(
    sentences,
    vector_size=50,   # embedding dimension
    window=5,         # context window size
    min_count=1,      # keep all words for demo
    workers=1,        # single thread, required for deterministic training
    sg=1,             # skip-gram; use 0 for CBOW
    seed=SEED,
)

# Explore similar words
for target in ["maintenance", "supply", "quality", "telemetry"]:
    print(f"\nTop similar to '{target}':")
    try:
        for w, score in model.wv.most_similar(target, topn=5):
            print(f"{w:15} -> {score:.3f}")
    except KeyError:
        # most_similar raises KeyError for out-of-vocabulary words.
        print("Word not in vocabulary.")

# Cosine similarity between pairs
from numpy import dot
from numpy.linalg import norm

def cosine(u, v):
    """Return the cosine similarity between vectors ``u`` and ``v``.

    Args:
        u: First vector (anything accepted by ``numpy.dot`` / ``numpy.linalg.norm``).
        v: Second vector, same length as ``u``.

    Returns:
        ``dot(u, v) / (norm(u) * norm(v))``. When either vector has zero norm
        the ratio is undefined (0/0 would evaluate to ``nan`` and emit a
        RuntimeWarning), so ``0.0`` is returned instead.
    """
    denom = norm(u) * norm(v)
    if denom == 0:
        # A zero vector has no direction; report "no similarity" rather than nan.
        return 0.0
    return dot(u, v) / denom

# Word pairs whose embedding vectors are compared below.
pairs = [
    ("maintenance", "telemetry"),
    ("quality", "defects"),
    ("supply", "optimization"),
]

print("\nCosine similarities:")
for a, b in pairs:
    try:
        # Indexing model.wv with a word that is not in the vocabulary raises
        # KeyError, which is reported instead of aborting the loop.
        vec_a = model.wv[a]
        vec_b = model.wv[b]
        sim = cosine(vec_a, vec_b)
        print(f"{a:12} ~ {b:12} -> {sim:.3f}")
    except KeyError:
        print(f"Missing word: {a} or {b}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Top similar to 'maintenance':
real-time       -> 0.240
defects         -> 0.240
analysts        -> 0.236
kpis            -> 0.224
detect          -> 0.204

Top similar to 'supply':
optimization    -> 0.286
to              -> 0.252
cloud           -> 0.192
analysts        -> 0.169
maintenance     -> 0.169

Top similar to 'quality':
shop            -> 0.307
cloud           -> 0.267
uses            -> 0.191
floor           -> 0.154
.               -> 0.151

Top similar to 'telemetry':
predictive      -> 0.171
cloud           -> 0.171
manufacturing   -> 0.167
iot             -> 0.166
for             -> 0.164

Cosine similarities:
maintenance  ~ telemetry    -> -0.145
quality      ~ defects      -> 0.119
supply       ~ optimization -> 0.286


In [None]:
# Run this cell before the Word2Vec cell above, which imports gensim.
# %pip (unlike !pip) installs into the environment of the running kernel.
# The version is pinned to the one this notebook was last run with.
%pip install -q gensim==4.4.0

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
# Code Demo (Trigram counts)
# Counts, for every pair of consecutive words, which word follows it and how often.
from collections import defaultdict

corpus = "the cat sat on the mat the cat lay on the rug the dog barked loudly"
tokens = corpus.split()

# trigram_counts[(w1, w2)][w3] == number of times w3 directly follows "w1 w2".
trigram_counts = defaultdict(lambda: defaultdict(int))
# Zipping the token list with itself shifted by one and by two positions
# walks over every window of three consecutive tokens.
for first, second, next_word in zip(tokens, tokens[1:], tokens[2:]):
    trigram_counts[(first, second)][next_word] += 1

# List every continuation observed after "the cat", with its count.
context = ("the", "cat")
print("Next word predictions for context:", context)
followers = trigram_counts[context]
for word, count in followers.items():
    print(f"{word} -> {count}")

Next word predictions for context: ('the', 'cat')
sat -> 1
lay -> 1


In [None]:
# Code Demo (Simple feedforward LM with Keras)
# Architecture: embed the context word ids, flatten the embeddings into one
# vector, pass it through a hidden layer, and output a softmax over the vocabulary.
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten

# Vocabulary size, embedding dimension and number of context words
vocab_size = 50
embed_dim = 8
context_length = 2

model = Sequential([
    # `input_length` is deprecated in Keras 3 (it only triggers a warning there);
    # the sequence length is fixed by model.build() below instead.
    Embedding(input_dim=vocab_size, output_dim=embed_dim),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(vocab_size, activation='softmax')  # predict next word
])

# NOTE: categorical_crossentropy expects one-hot encoded targets. If the
# targets are integer word ids, use 'sparse_categorical_crossentropy' instead.
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.build(input_shape=(None, context_length))  # batch size flexible, sequence length = context_length

model.summary()

In [None]:
# Code Demo (LSTM for next-word prediction)
# Architecture: embed a fixed-length sequence of word ids, run it through an
# LSTM, and output a softmax over the vocabulary from the LSTM's final output.

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np

# Vocabulary size and embedding dimension
vocab_size = 100
embed_dim = 16
seq_length = 5  # input sequence length

# Define the model
model = Sequential([
    # `input_length` is deprecated in Keras 3 (it only triggers a warning there);
    # the sequence length is fixed by model.build() below instead.
    Embedding(input_dim=vocab_size, output_dim=embed_dim),
    LSTM(64),  # LSTM layer with 64 units
    Dense(vocab_size, activation='softmax')  # predict next word
])

# Compile the model
# NOTE: categorical_crossentropy expects one-hot encoded targets. If the
# targets are integer word ids, use 'sparse_categorical_crossentropy' instead.
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Force the model to build by providing input shape
model.build(input_shape=(None, seq_length))

# Show the summary
model.summary()

# Optional: run a dummy prediction to confirm the output shape.
# The ids are random and only the shape is printed.
dummy_input = np.random.randint(0, vocab_size, (1, seq_length))
print("Dummy prediction shape:", model.predict(dummy_input).shape)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 323ms/step
Dummy prediction shape: (1, 100)
