In [1]:
import pandas as pd

# Load the merged CSV into a DataFrame. The relative path is resolved against
# the notebook's current working directory.
df = pd.read_csv("merged_karnataka.csv")

# Function to check if a string has only ASCII characters
def is_ascii(s):
    """Return True if the string ``s`` consists solely of ASCII characters.

    Args:
        s: The text to inspect (a ``str``).

    Returns:
        bool: ``True`` when every character is in the ASCII range (the empty
        string counts as ASCII), ``False`` otherwise.
    """
    # str.isascii() gives the same answer as attempting s.encode('ascii'),
    # without building a throwaway bytes object or using exceptions for
    # control flow.
    return s.isascii()

# Check across all columns in a row
def row_is_ascii(row):
    """Return True only if every value in ``row`` is ASCII.

    Each value is converted with ``str()`` before being checked, so
    non-string values are judged by their text representation.

    Args:
        row: An iterable of cell values (e.g. one DataFrame row).

    Returns:
        bool: ``False`` as soon as one value contains a non-ASCII character,
        ``True`` if none do (including for an empty row).
    """
    for cell in row:
        if not is_ascii(str(cell)):
            return False
    return True

# Split the dataset
# Evaluate the row-wise ASCII check once and reuse the resulting boolean mask
# for both halves, instead of running the slow Python-level apply twice.
ascii_mask = df.apply(row_is_ascii, axis=1)
ascii_df = df[ascii_mask]
non_ascii_df = df[~ascii_mask]

# Save outputs (index=False keeps the pandas row index out of the files)
ascii_df.to_csv("ascii_only.csv", index=False)
non_ascii_df.to_csv("non_ascii.csv", index=False)

print("✅ Done! Files saved as ascii_only.csv and non_ascii.csv")

✅ Done! Files saved as ascii_only.csv and non_ascii.csv


In [1]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle

# 1. Load your CSV
df = pd.read_csv("ascii_only.csv")

# 2. Extract QA pairs, discarding rows where either field is missing.
#    The index is reset so that row position i lines up with the i-th vector
#    added to the FAISS index and the i-th record in the saved metadata.
qa_pairs = df[["QueryText", "KccAns"]].dropna().reset_index(drop=True)
if qa_pairs.empty:
    # Fail with a clear message instead of an IndexError further down.
    raise ValueError("ascii_only.csv has no rows with both QueryText and KccAns")

# 3. Combine Q + A into a single text for embeddings.
#    astype(str) guards against values that pandas parsed as numbers, and
#    assign() builds a new frame rather than writing into a frame derived
#    from a slice of df (avoids SettingWithCopyWarning).
qa_pairs = qa_pairs.assign(
    combined=qa_pairs["QueryText"].astype(str) + " " + qa_pairs["KccAns"].astype(str)
)

# 4. Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# 5. Encode combined text and cast to float32 before indexing
embeddings = model.encode(qa_pairs["combined"].tolist(), show_progress_bar=True)
embeddings = np.array(embeddings).astype("float32")

# 6. Build FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
# Search results are positions into the metadata list, so the two must have
# exactly the same length.
assert index.ntotal == len(qa_pairs), "FAISS index and metadata are out of sync"

# 7. Save FAISS + metadata
#    NOTE: pickle files can execute arbitrary code when loaded; only load
#    qa_metadata.pkl from a source you trust.
faiss.write_index(index, "qa_index.faiss")
with open("qa_metadata.pkl", "wb") as f:
    pickle.dump(qa_pairs.to_dict(orient="records"), f)

print("✅ FAISS index created with both Q & A stored")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2467 [00:00<?, ?it/s]

✅ FAISS index created with both Q & A stored
