In [5]:
!pip install -q transformers gensim scikit-learn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
from transformers import pipeline, AutoTokenizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api

#Section A: LLM Foundations & Hugging Face

##A1. Text Generation with DistilGPT-2

In [23]:
from transformers import pipeline

# Load generator
generator = pipeline("text-generation", model="distilbert/distilgpt2")

prompt = "AI is transforming industries by"
outputs = generator(
    prompt,
    max_length=40,
    num_return_sequences=3,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    truncation=True,
    max_new_tokens=20
)

for i, out in enumerate(outputs, start=1):
    print(f"--- Generation {i} ---")
    print(out["generated_text"])
    print()


Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=20) and `max_length`(=40) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


--- Generation 1 ---
AI is transforming industries by cutting back on the cost of production and lowering the risk of pollution by providing a way for small firms

--- Generation 2 ---
AI is transforming industries by transforming them into a global force.














--- Generation 3 ---
AI is transforming industries by creating more value for workers.”















##A2. Tokenization Demo

###A2.1 Using DistilGPT2

In [24]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")

sentence = "LLMs are powerful tools for natural language understanding."
encoded = tokenizer(sentence)

tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"])
token_ids = encoded["input_ids"]
seq_length = len(token_ids)

print("Sentence:", sentence)
print("Tokens:", tokens)
print("Token IDs:", token_ids)
print("Sequence length:", seq_length)

Sentence: LLMs are powerful tools for natural language understanding.
Tokens: ['LL', 'Ms', 'Ġare', 'Ġpowerful', 'Ġtools', 'Ġfor', 'Ġnatural', 'Ġlanguage', 'Ġunderstanding', '.']
Token IDs: [3069, 10128, 389, 3665, 4899, 329, 3288, 3303, 4547, 13]
Sequence length: 10


###A2.2 Using NLTK Tokeniser

In [25]:
import nltk
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize

sentence = "LLMs are powerful tools for natural language understanding."

# Tokenise with NLTK
tokens1 = word_tokenize(sentence)

# Since NLTK does not provide token IDs, we can map tokens to indices manually:
token_ids1 = list(range(len(tokens)))   # simple placeholder IDs for demonstration

seq_length1 = len(tokens1)

print("Sentence:", sentence)
print("Tokens:", tokens1)
print("Token IDs:", token_ids1)
print("Sequence length:", seq_length1)

Sentence: LLMs are powerful tools for natural language understanding.
Tokens: ['LLMs', 'are', 'powerful', 'tools', 'for', 'natural', 'language', 'understanding', '.']
Token IDs: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Sequence length: 9


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Section B: Prompt Engineering
## B1. Design Prompts

In [26]:
# Defining three prompts for the three tasks
# Summarisation (≤ 30 words)
prompt_summarisation = (
    "Summarise this in no more than 30 words:\n"
    "Artificial intelligence is being adopted across healthcare, finance, and retail "
    "to automate routine tasks, support decision-making, and personalise user experiences."
    "It is also extensively being utilised by researchers and academecians to summarise the peer-peviwed papers, generate ideas and further research"
)

# Q&A
prompt_qa = (
    #"Question: What is the capital city of France?\n"
    #"Answer:"
    "Question: How do you do?\n"
    "Answer: "
)

# Creative text generation: 4-line poem on AI
prompt_poem = (
    "Write a four-line rhyming poem about artificial intelligence and humans working together:"
)

prompts = {
    "summarisation": prompt_summarisation,
    "qa": prompt_qa,
    "poem": prompt_poem
}

for name, p in prompts.items():
    print(f"--- {name.upper()} PROMPT ---")
    print(p, "\n")


--- SUMMARISATION PROMPT ---
Summarise this in no more than 30 words:
Artificial intelligence is being adopted across healthcare, finance, and retail to automate routine tasks, support decision-making, and personalise user experiences.It is also extensively being utilised by researchers and academecians to summarise the peer-peviwed papers, generate ideas and further research 

--- QA PROMPT ---
Question: How do you do?
Answer:  

--- POEM PROMPT ---
Write a four-line rhyming poem about artificial intelligence and humans working together: 



## B2. Generate Ourputs for each Prompt

In [29]:
for name, p in prompts.items():
    print(f"===== {name.upper()} OUTPUT ====")
    # Correctly apply max_length based on the prompt name using a ternary operator
    current_max_length = 40 if name == "summarisation" or name == "poem" else 60
    out = generator(p, max_length=current_max_length, do_sample=True,
                    #top_k=50,
                    #top_p=0.95,
                    truncation=True, max_new_tokens=100, num_return_sequences=1)
    print(out[0]["generated_text"])
    print("\n")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=100) and `max_length`(=40) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


===== SUMMARISATION OUTPUT ====


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=100) and `max_length`(=60) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Summarise this in no more than 30 words:
Artificial intelligence is being adopted across healthcare, finance, and retail to automate routine tasks, support decision-making, and personalise user experiences.It is also extensively being utilised by researchers and academecians to summarise the peer-peviwed papers, generate ideas and further research. A large number of healthcare professionals, who work in healthcare, finance, and retail, are currently working on developing artificial intelligence solutions to manage health care and patient care.

This is the second major development in a series of recent trends. These trends are all linked to a growing number of important trends and trends that are becoming increasingly important in healthcare.
The rise in the demand for artificial intelligence is likely to lead to a shift in the demand for artificial intelligence solutions. There are a few reasons


===== QA OUTPUT ====


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=100) and `max_length`(=40) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Question: How do you do?
Answer: ...........................................................
Here are some key points:
1. Don't assume that every person is a robot or a robot. There are other things you can do to make sure that you are not going to be "truly robotic."
2. Remember that when you say there are things the robot is not. Think of it as "a robot."
3. Be careful when you say there are things the robot is not.
4. Consider that there are things the robot is not


===== POEM OUTPUT ====
Write a four-line rhyming poem about artificial intelligence and humans working together: (1) "We are not here to be seen." (2) "We are not there to be seen." (3) "We are not there to be seen." (4) "We are not there to be seen." (5) "We are not there to be seen." (6) "We are not there to be seen." (7) "We are not there to be seen." (8) "I am not there to be seen." (9) "




## B3. Reflection
It appears that distilgpt2 is a small size model which restricts its ability to answer factual such as "what is the capital of France" to which it was replying "France". Furthermore, the outputs depend on token size, max_length and truncation flag. When I rephrased the prompts, I noticed that clearer and more constrained instructions produced more focused outputs. For summarisation, explicitly stating “30 words or fewer” encouraged shorter responses, although the model did not always strictly obey the limit. In Q&A, framing the input as “Question/Answer” guided the model to respond with a direct fact, whereas more open-ended phrasing sometimes led to extra explanation. For creative text generation, specifying “four-line rhyming poem” rdid not result in structures verses but a lot of reptition, likely limited due to its small size. Careful prompt engineering and hyperparamter tuning played key roles for obtaining better outputs.

# Section C: Embeddings with Gensim
## C1. Load GloVe Embeddings

In [31]:
model = api.load("glove-wiki-gigaword-50")  # 50-dimensional GloVe



## C2. Word Embeddings for Three Words

In [32]:
# Pick any three words, eg:
words = ["king", "queen", "diamond"]

for w in words:
    print(f"=== Word: {w} ===")
    vec = model[w]
    print("First 10 values of vector:\n", vec[:10])
    print("\nTop 5 most similar words:")
    for similar_word, score in model.most_similar(w, topn=5):
        print(f"{similar_word:<15} {score:.4f}")
    print()


=== Word: king ===
First 10 values of vector:
 [ 0.50451   0.68607  -0.59517  -0.022801  0.60046  -0.13498  -0.08813
  0.47377  -0.61798  -0.31012 ]

Top 5 most similar words:
prince          0.8236
queen           0.7839
ii              0.7746
emperor         0.7736
son             0.7667

=== Word: queen ===
First 10 values of vector:
 [ 0.37854   1.8233   -1.2648   -0.1043    0.35829   0.60029  -0.17538
  0.83767  -0.056798 -0.75795 ]

Top 5 most similar words:
princess        0.8515
lady            0.8051
elizabeth       0.7873
king            0.7839
prince          0.7822

=== Word: diamond ===
First 10 values of vector:
 [-0.4958   0.78421 -0.606    1.3967   0.28888 -0.2058  -0.10745 -0.33252
  1.3608   0.15091]

Top 5 most similar words:
gold            0.7715
diamonds        0.7663
gem             0.7375
silver          0.7210
jewel           0.7102



## C3. Sentence-Level Embeddings (Simple Averaging)

In [34]:
# Create 5 sentences (e.g., all about AI/jewellery):
sentences = [
    "AI is transforming the jewellery design process.",
    "Machine learning helps detect fraud in online payments.",
    "Robots and humans collaborate in modern factories.",
    "Gemstone quality can be predicted using data models.",
    "Customer reviews are analysed by AI to improve services."
]

In [35]:
# Helper: Sentence → Average Vector
import numpy as np

def sentence_to_vec(sentence, model, dim=50):
    tokens = sentence.lower().split()
    vecs = []
    for t in tokens:
        if t in model:
            vecs.append(model[t])
    if not vecs:
        return np.zeros(dim)
    return np.mean(vecs, axis=0)

sentence_vectors = np.vstack([sentence_to_vec(s, model) for s in sentences])
sentence_vectors.shape

(5, 50)

In [36]:
# Similarity Matrix (Cosine Similarity)
sim_matrix = cosine_similarity(sentence_vectors)

print("Similarity matrix:\n")
for i, s in enumerate(sentences):
    print(f"{i}: {s}")
print()

print(np.round(sim_matrix, 3))


Similarity matrix:

0: AI is transforming the jewellery design process.
1: Machine learning helps detect fraud in online payments.
2: Robots and humans collaborate in modern factories.
3: Gemstone quality can be predicted using data models.
4: Customer reviews are analysed by AI to improve services.

[[1.    0.778 0.844 0.787 0.791]
 [0.778 1.    0.813 0.874 0.86 ]
 [0.844 0.813 1.    0.79  0.817]
 [0.787 0.874 0.79  1.    0.916]
 [0.791 0.86  0.817 0.916 1.   ]]


In [37]:
# Display with Pandas
import pandas as pd

df_sim = pd.DataFrame(
    np.round(sim_matrix, 3),
    index=[f"S{i}" for i in range(len(sentences))],
    columns=[f"S{i}" for i in range(len(sentences))]
)

df_sim


Unnamed: 0,S0,S1,S2,S3,S4
S0,1.0,0.778,0.844,0.787,0.791
S1,0.778,1.0,0.813,0.874,0.86
S2,0.844,0.813,1.0,0.79,0.817
S3,0.787,0.874,0.79,1.0,0.916
S4,0.791,0.86,0.817,0.916,1.0


# Section D: Application Exploration

##D1. Run Pipeline

In [47]:
# Sentiment analysis
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

texts = [
    "This product has completely changed the way our team works. Performance has improved a lot.",
    "The new dashboard is confusing and has many bugs.",
    "It may be so",
    "I like it in red colour",
    "It would be better in green colour",
    "It isn't a great way to do things"
]

results = sentiment_analyzer(texts)

for t, r in zip(texts, results):
    print("Text:", t)
    print("Prediction:", r["label"], " | Score:", f"{r['score']:.3f}")
    print()

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


Text: This product has completely changed the way our team works. Performance has improved a lot.
Prediction: POSITIVE  | Score: 0.999

Text: The new dashboard is confusing and has many bugs.
Prediction: NEGATIVE  | Score: 0.999

Text: It may be so
Prediction: POSITIVE  | Score: 0.863

Text: I like it in red colour
Prediction: POSITIVE  | Score: 1.000

Text: It would be better in green colour
Prediction: NEGATIVE  | Score: 0.971

Text: It isn't a great way to do things
Prediction: NEGATIVE  | Score: 1.000



## D2. Reflection on Business application


Sentiment analysis can provide businesses with real-time insight into how customers perceive their products, brand, and market presence. By automatically analysing reviews, social posts, and support interactions, companies can identify emerging issues early and prioritise product improvements based on genuine user sentiment. During go-to-market (GTM) planning, sentiment trends help teams understand audience reactions to messaging, pricing, and feature releases, enabling more informed decisions. In branding and marketing, sentiment analysis reveals which campaigns resonate, which pain points persist, and how public perception evolves over time. Overall, it turns large volumes of unstructured text into actionable intelligence that drives better products and stronger market positioning.