In [None]:
# Dependency installation for the LangChain community integrations and the
# CPU build of FAISS. Presumably left commented out so that re-running the
# notebook does not reinstall packages; uncomment on a fresh environment.
#!pip install langchain-community
#!pip install faiss-cpu

Collecting langchain-community
  Downloading langchain_community-0.3.25-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core<1.0.0,>=0.3.65 (from langchain-community)
  Downloading langchain_core-0.3.65-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain<1.0.0,>=0.3.25 (from langchain-community)
  Downloading langchain-0.3.25-py3-none-any.whl.metadata (7.8 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain<1.0.0,>=0.3.25->langchain-community)
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting langsmith<0.4,>=0.1.125 (from langchain-community)
  Downloading langsmith-0.3.45-py3-none-any.whl.metadata (15 kB)
Collecting packaging<25,>=23.2 (from langchai

In [2]:
import pandas as pd
from langchain.vectorstores import FAISS
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain_core.documents import Document
from cuml.cluster import KMeans
import numpy as np
import torch
from tqdm import tqdm
import warnings
warnings.simplefilter('ignore')

# Number of times each article's formatted text is repeated when the text to
# embed is assembled in the embedding cell.
REPEAT = 4
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Loading data...")
DATA_DIR = '/kaggle/input/ag-news-classification-dataset'
# ignore_index=True gives the combined frame a unique 0..N-1 index. Without it
# the test rows reuse the labels of the first train rows, so the positional
# "index" stored in each document's metadata could not be used for a
# label-based lookup (df.loc / rows.loc) without hitting duplicates.
df = pd.concat([pd.read_csv(f'{DATA_DIR}/train.csv'),
                pd.read_csv(f'{DATA_DIR}/test.csv')],
               ignore_index=True)
# Replace the numeric class ids with readable category names.
classmap = {1: 'World', 2: 'Sports', 3: 'Business', 4: 'Science/Technology'}
df['Class Index'] = df['Class Index'].map(classmap)

# Only the columns used to build the embedded text. Missing values (including
# class ids absent from classmap, which .map turns into NaN) become "".
rows = df[['Title', 'Description', 'Class Index']].fillna("")

Loading data...


In [3]:
# Build one text per article, embed all texts in batches, and store the
# vectors in a FAISS index that is saved to the local "faiss_index" directory.
# Names left in scope for later cells: embedding_model, texts, documents,
# embeddings (float32 array, one row per text) and vectorstore.
print("Initializing embedding model...")
embedding_model = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", 
                                               model_kwargs={"device": device})

print("Creating augmented text chunks...")
texts = []
# Iterate the three columns in lockstep instead of rows.iterrows(): the values
# are identical, but no per-row Series has to be constructed.
for title, description, category in tqdm(
        zip(rows['Title'], rows['Description'], rows['Class Index']),
        total=len(rows), desc="Formatting text"):
    article_text = f"Title: {title}\nDescription: {description}\nCategory: {category}"
    # NOTE(review): the article text is repeated REPEAT times; the embedding
    # model presumably truncates long inputs, so confirm the repeats have the
    # intended effect.
    texts.append("\n".join([article_text] * REPEAT))

print("Generating embeddings for augmented texts...")
batch_size = 512
embedding_rows = []  # one vector per text, in the same order as `texts`
documents = []
for i in tqdm(range(0, len(texts), batch_size), desc="Embedding texts"):
    batch = texts[i:i+batch_size]
    embedding_rows.extend(embedding_model.embed_documents(batch))
    # metadata["index"] is the position of the text in `texts` / `rows`.
    documents.extend(Document(page_content=text, metadata={"index": i+j})
                     for j, text in enumerate(batch))
embeddings = np.array(embedding_rows, dtype=np.float32)

print("Creating FAISS index...")
# Reuse the vectors computed above. FAISS.from_documents would run the
# embedding model over the whole corpus a second time.
vectorstore = FAISS.from_embeddings(
    text_embeddings=zip(texts, embedding_rows),
    embedding=embedding_model,
    metadatas=[doc.metadata for doc in documents],
)
# The list-of-lists copy is no longer needed; `embeddings` holds the array.
del embedding_rows

print("Saving FAISS index...")
vectorstore.save_local("faiss_index")

print("Embedding generation complete.")

Initializing embedding model...


2025-06-13 14:49:05.475865: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749826145.657859      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749826145.711740      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Creating augmented text chunks...


Formatting text: 100%|██████████| 127600/127600 [00:05<00:00, 24977.50it/s]


Generating embeddings for augmented texts...


Embedding texts: 100%|██████████| 250/250 [05:19<00:00,  1.28s/it]


Creating FAISS index...
Saving FAISS index...
Embedding generation complete.
