In [1]:
# Install the third-party packages into the kernel's environment (-q keeps pip quiet).
# NOTE(review): no versions are pinned, so a fresh install may resolve to releases whose
# APIs differ from the ones the recorded outputs were produced with.
%pip install -q transformers sentence-transformers chromadb
%pip install -q numpy pandas scikit-learn
%pip install -q kaggle

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# IMPORTS

In [2]:
import numpy as np
import pandas as pd
import os
import zipfile
import chromadb

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# ENV VARS

In [3]:
# Process-wide settings, applied in one step before any library reads them.
#   KAGGLE_CONFIG_DIR      -> "." (presumably where the Kaggle CLI looks for its config; confirm)
#   TOKENIZERS_PARALLELISM -> "false" (explicitly set so tokenizers does not have to guess)
os.environ.update(
    {
        'KAGGLE_CONFIG_DIR': ".",
        'TOKENIZERS_PARALLELISM': "false",
    }
)

# HYPERPARAMS

In [4]:
# Number of rows randomly sampled from the full news table.
MAX_NEWS = 1000
# Column whose text is stored as the document in the vector collection.
DOCUMENT = 'title'
# Column stored alongside each document as metadata.
TOPIC = 'topic'

In [5]:
# Fetch the dataset archive through the Kaggle CLI (shell escape).
!kaggle datasets download -d 'kotartemiy/topic-labeled-news-dataset'

# Unpack every member of the downloaded archive into the current directory.
file_path = 'topic-labeled-news-dataset.zip'
with zipfile.ZipFile(file_path, 'r') as zip_ref:
    zip_ref.extractall('./')

Dataset URL: https://www.kaggle.com/datasets/kotartemiy/topic-labeled-news-dataset
License(s): CC0-1.0
topic-labeled-news-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [6]:
# Load the labelled news table; the file is semicolon-delimited.
news = pd.read_csv('./labelled_newscatcher_dataset.csv', sep=';')
# Draw a reproducible random subset (fixed seed). The requested size is clamped to the
# number of rows actually loaded, because DataFrame.sample raises ValueError when n is
# larger than the population and sampling is without replacement.
subset_news = news.sample(n=min(MAX_NEWS, len(news)), random_state=42)

In [7]:
# Preview the first five rows of the sampled subset.
subset_news.head(n=5)

Unnamed: 0,topic,link,domain,published_date,title,lang
27583,SCIENCE,https://thedigitalwise.com/2020/08/13/nasa-res...,thedigitalwise.com,2020-08-13 13:25:00,NASA Researchers Finds A Dwarf Planet Ceres Wi...,en
94783,WORLD,https://www.theguardian.com/world/live/2020/au...,theguardian.com,2020-08-16 23:13:00,Amazon investigated in Germany over lockdown p...,en
101314,SPORTS,https://madaboutepl.net/2020/08/liverpool-tran...,madaboutepl.net,2020-08-05 17:39:05,Liverpool Asked To Pay At Least £25m For Wing ...,en
15627,ENTERTAINMENT,https://pledgetimes.com/neha-dhupia-gave-a-rob...,pledgetimes.com,2020-08-09 09:46:16,Neha Dhupia gave a robust reply to Suchitra Kr...,en
2888,SCIENCE,https://en.brinkwire.com/news/spacex-brings-na...,en.brinkwire.com,2020-08-04 17:15:37,SpaceX brings NASA astronauts safely home in m...,en


In [8]:
# Open the on-disk Chroma database stored under ./db/.
chroma_client = chromadb.PersistentClient(path='./db/')
collection_name = "news_collection"

# Start from an empty collection on every run. Every existing collection is inspected,
# not only the first one; otherwise a leftover "news_collection" listed at any other
# position would survive and create_collection would be called for a name that already
# exists. Depending on the chromadb release, list_collections() yields either Collection
# objects (with a .name attribute) or bare name strings, so both shapes are accepted.
existing_names = [getattr(entry, "name", entry) for entry in chroma_client.list_collections()]
if collection_name in existing_names:
    chroma_client.delete_collection(collection_name)

collection = chroma_client.create_collection(collection_name)

In [9]:
# Index the sampled rows: the DOCUMENT column supplies the text, the TOPIC column is
# attached as per-document metadata, and ids are "id0", "id1", ... by row position.
collection.add(
    ids=[f"id{position}" for position in range(len(subset_news))],
    documents=subset_news[DOCUMENT].tolist(),
    metadatas=[{TOPIC: label} for label in subset_news[TOPIC].tolist()],
)

In [10]:
# Ask the collection for the five entries nearest to the query text, then dump the raw response.
results = collection.query(n_results=5, query_texts=["laptop"])
print(results)

{'ids': [['id384', 'id912', 'id9', 'id899', 'id988']], 'embeddings': None, 'documents': [['The best back to school laptop sales: top deals from Apple, Best Buy, Dell and more', "Dell's latest business Chromebook offers high-end specs and extra security", 'Redmi G gaming laptop unveiled: 144Hz display for just $760 news', 'Apple just made it a whole lot easier to get your MacBook fixed', "New Microsoft Surface accessories 'Brydge' the gap between tablet and laptop"]], 'uris': None, 'data': None, 'metadatas': [[{'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}]], 'distances': [[1.0497058629989624, 1.2200576066970825, 1.2559947967529297, 1.3180276155471802, 1.3815689086914062]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [11]:
# Load the tokenizer and causal-LM weights for the chosen checkpoint.
model_id = 'meta-llama/Llama-3.2-3B'
tokenizer = AutoTokenizer.from_pretrained(model_id)
# trust_remote_code is deliberately not enabled: it allows Python code shipped in the
# model repository to run locally, and the Llama architecture is implemented inside
# transformers itself, so the flag is not needed to load this checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_id)
# Wrap both in a text-generation pipeline; max_new_tokens caps each completion at 256 new tokens.
# NOTE(review): the model is passed in already instantiated, so device_map="auto" may
# have no effect on placement here; confirm against the installed transformers version.
pipe = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    device_map="auto"
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use mps:0


In [12]:
# The user question that the language model is asked to answer.
question = "Can I buy a new Toshiba laptop?"
# Retrieved documents for the first (only) query, each prefixed with "#" and joined by spaces.
context = " ".join(f"#{doc}" for doc in results["documents"][0])
# Prompt layout: retrieved context, an instruction, the question, then an open "Answer: " slot.
prompt_template = f"""
Relevant context: {context}
Considering the relevant context, answer the question.
Question: {question}
Answer: """
prompt_template

"\nRelevant context: #The best back to school laptop sales: top deals from Apple, Best Buy, Dell and more #Dell's latest business Chromebook offers high-end specs and extra security #Redmi G gaming laptop unveiled: 144Hz display for just $760 news #Apple just made it a whole lot easier to get your MacBook fixed #New Microsoft Surface accessories 'Brydge' the gap between tablet and laptop\nConsidering the relevant context, answer the question.\nQuestion: Can I buy a new Toshiba laptop?\nAnswer: "

In [13]:
# Run the text-generation pipeline on the assembled prompt. return_full_text=False asks
# for the newly generated text only, without the prompt echoed back.
lm_response = pipe(prompt_template, return_full_text=False)
lm_response

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': "1. Yes, you can buy a new Toshiba laptop. 2. No, you cannot buy a new Toshiba laptop. 3. It depends on the context. 4. I don't know. 5. I don't think so.\n"}]