In [101]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings from the
# libraries used later in the notebook are hidden too — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

In [102]:
# load credentials
import os
from dotenv import load_dotenv

# Read key/value pairs from a local .env file (if present) into os.environ.
load_dotenv()

# Fail fast with a readable message. Previously a missing GOOGLE_API_KEY
# surfaced as an opaque TypeError from `os.environ[...] = None`, and missing
# Astra values were silently carried forward as None.
_REQUIRED_ENV_VARS = (
    "GOOGLE_API_KEY",
    "ASTRA_DB_APPLICATION_TOKEN",
    "ASTRA_DB_API_ENDPOINT",
)
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.getenv(name)]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )

# GOOGLE_API_KEY is already in os.environ at this point, so re-assigning it to
# itself is unnecessary. The Astra values are kept as module-level names
# because later cells pass them explicitly.
ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
ASTRA_DB_API_ENDPOINT = os.environ["ASTRA_DB_API_ENDPOINT"]

## Load documents

In [103]:
import os
import pprint
from langchain_unstructured import UnstructuredLoader

folder_path = "D:\\NextCountry\\db"
# File types handed to the loader; matched case-insensitively below.
SUPPORTED_EXTENSIONS = ('.pdf', '.docx', '.doc', '.txt')

# sorted(): os.listdir returns entries in arbitrary order, so sort for a
# reproducible document order across runs and machines.
# lower(): also pick up files such as "REPORT.PDF".
file_paths = [
    os.path.join(folder_path, f)
    for f in sorted(os.listdir(folder_path))
    if f.lower().endswith(SUPPORTED_EXTENSIONS)
]

print("Found files:", len(file_paths))
for file in file_paths:
    print(f"Processing {os.path.basename(file)}...")

loader = UnstructuredLoader(file_paths,
                            partition_via_api=False,
                            split_pdf_pages=True)

print("\nLoading documents...")
docs = loader.load()
print(f"✅ Total documents loaded: {len(docs)}")

# Print the sample with two separate statements: the previous tuple expression
# `pprint.pp(...), pprint.pp(...)` made the cell display a stray `(None, None)`.
# The guard avoids an IndexError when nothing was loaded.
if docs:
    print("\nSample content from first document:")
    pprint.pp(docs[0].page_content)
    pprint.pp(docs[0].metadata)
else:
    print("\nNo documents were loaded; check folder_path and SUPPORTED_EXTENSIONS.")

Found files: 4
Processing CANADA.pdf...
Processing UAE visa and relocation programs_ - Google Docs.pdf...
Processing UNITED KINGDOM – VISA OPTIONS AND GUIDE.docx...
Processing USA.pdf...

Loading documents...
✅ Total documents loaded: 743

Sample content from first document:
'CANADA'
{'source': 'D:\\NextCountry\\db\\CANADA.pdf',
 'coordinates': {'points': ((264.844875, 89.39440880345694),
                            (264.844875, 108.39440930345688),
                            (352.433940458916, 108.39440930345688),
                            (352.433940458916, 89.39440880345694)),
                 'system': 'PixelSpace',
                 'layout_width': 612.0,
                 'layout_height': 792.0},
 'file_directory': 'D:\\NextCountry\\db',
 'filename': 'CANADA.pdf',
 'languages': ['eng'],
 'last_modified': '2025-05-10T14:00:52',
 'page_number': 1,
 'filetype': 'application/pdf',
 'category': 'Title',
 'element_id': '1cb2764cd029c4b05e748ceda6a613ee'}


(None, None)

In [104]:
from langchain.chat_models import init_chat_model

# Chat model instance used by extract_country_from_filename (via llm.invoke).
# NOTE(review): temperature=0.5 presumably makes replies non-deterministic; the
# country labels produced with this model are later used as exact-match
# metadata filters, so a lower temperature may be preferable — confirm.
llm = init_chat_model(
    model="gemini-2.0-flash",
    model_provider="google_genai",
    temperature=0.5
)

In [105]:
# Ask the LLM which country a document filename refers to
def extract_country_from_filename(filename):
    """Return the LLM's reply naming the country referenced by ``filename``.

    The prompt asks for the full country name in capital letters, or 'NA' when
    no country is found. The reply text is returned with surrounding
    whitespace stripped. Uses the module-level ``llm`` chat model.
    """
    instruction = f"""
    Extract the country name from this filename: {filename}. Then recall the full form of the country name.
    you must return the country name in full form and all capital letters, nothing else.
    example1:'USA visa details' will return 'UNITED STATES OF AMERICA'.
    example2:'BD visa details' will return 'BANGLADESH'.
    If no country is found, return 'NA'.
    """
    return llm.invoke(instruction).content.strip()

# update metadata
from langchain.text_splitter import RecursiveCharacterTextSplitter  
def update_metadata(splits, country_resolver=None):
    """Attach a ``country`` entry to the metadata of every document in ``splits``.

    The country is resolved once per distinct ``metadata["filename"]`` and then
    reused. The earlier implementation compared each document only with its
    predecessor, so a filename that reappeared later in the list triggered
    another LLM call and could receive a different label.

    Args:
        splits: Sequence of document-like objects exposing a mutable
            ``metadata`` dict that contains a ``"filename"`` key. No text
            splitting is performed here; the documents are annotated as given.
        country_resolver: Optional callable ``filename -> country``. Defaults
            to ``extract_country_from_filename`` (the LLM-backed lookup).

    Returns:
        The same ``splits`` object; its documents are modified in place.

    Raises:
        KeyError: If a document's metadata has no ``"filename"`` entry.
    """
    if country_resolver is None:
        # Looked up at call time so the default stays the notebook's LLM helper.
        country_resolver = extract_country_from_filename

    country_by_filename = {}
    for split in splits:
        filename = split.metadata["filename"]
        if filename not in country_by_filename:
            country_by_filename[filename] = country_resolver(filename)
        split.metadata["country"] = country_by_filename[filename]

    return splits

In [106]:
# update_metadata annotates the Document objects in place and returns the list
# it was given, so `documents` and `docs` refer to the same objects.
documents = update_metadata(docs)
# Inspect the first element to confirm the new 'country' key is present.
documents[0].metadata

{'source': 'D:\\NextCountry\\db\\CANADA.pdf',
 'coordinates': {'points': ((264.844875, 89.39440880345694),
   (264.844875, 108.39440930345688),
   (352.433940458916, 108.39440930345688),
   (352.433940458916, 89.39440880345694)),
  'system': 'PixelSpace',
  'layout_width': 612.0,
  'layout_height': 792.0},
 'file_directory': 'D:\\NextCountry\\db',
 'filename': 'CANADA.pdf',
 'languages': ['eng'],
 'last_modified': '2025-05-10T14:00:52',
 'page_number': 1,
 'filetype': 'application/pdf',
 'category': 'Title',
 'element_id': '1cb2764cd029c4b05e748ceda6a613ee',
 'country': 'CANADA'}

In [107]:
# Number of annotated documents; matches the loader's count because
# update_metadata returns the same list it received.
len(documents)
# Spot-check another element if needed: documents[100].metadata

743

## Embedding Engine
---

In [108]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Embedding model passed as `embedding=` to the AstraDBVectorStore instances
# created later in the notebook.
# presumably authenticates through the GOOGLE_API_KEY environment variable — confirm
google_embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
)


## Vector Database Connection

In [109]:
from astrapy import DataAPIClient

# Initialize the client with the application token read from the environment
client = DataAPIClient(ASTRA_DB_APPLICATION_TOKEN)
# Database handle for the configured API endpoint
db = client.get_database_by_api_endpoint(
  ASTRA_DB_API_ENDPOINT
)

# Connectivity check: print the database name and the collections it already holds
print(f"Connected Database: {db.info().name}\nCollections found: {db.list_collection_names()}")

INFO: getting database info
INFO: HTTP Request: GET https://api.astra.datastax.com/v2/databases/d21836b8-de1d-4618-962c-f8dd6c98eea5 "HTTP/1.1 200 OK"
INFO: finished getting database info
INFO: findCollections
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace "HTTP/1.1 200 OK"
INFO: finished findCollections


Connected Database: next_country
Collections found: ['visa_information', 'visa_information_dev']


In [110]:
# create collection
from langchain_astradb import AstraDBVectorStore

# Default (non-autodetect) initialisation: per the log output below, this
# issues a createCollection call for the named collection.
vector_store = AstraDBVectorStore(
    collection_name="visa_information_v3",      # this collection gets created automatically
    embedding=google_embedding_model,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    namespace="default_keyspace",
    # autodetect_collection=True,   # set to True while using an existing collection
)

print(f"Connected to collection: {vector_store.collection_name}")

INFO: vector store default init, collection 'visa_information_v3'
INFO: Detecting API environment 'prod' from supplied endpoint
INFO: createCollection('visa_information_v3')
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace "HTTP/1.1 200 OK"
INFO: finished createCollection('visa_information_v3')


Connected to collection: visa_information_v3


### Pushing data to collection

In [111]:
# push embedding to collection
BATCH_SIZE = 100  # documents sent per add_documents call

# Best-effort ingestion: a failing batch is reported and skipped, but its
# number is remembered so the run ends with an explicit summary instead of
# leaving failures buried in the log output.
failed_batches = []
for batch_no, start in enumerate(range(0, len(documents), BATCH_SIZE)):
    chunk = documents[start:start + BATCH_SIZE]
    try:
        # Add the chunk to the vector store
        vector_store.add_documents(documents=chunk)
        print(f"Chunk {batch_no} added successfully")
    except Exception as e:
        print(f"Error adding chunk {batch_no}: {e}")
        failed_batches.append(batch_no)

if failed_batches:
    print(f"WARNING: {len(failed_batches)} chunk(s) failed and were not stored: {failed_batches}")
else:
    print(f"All {len(documents)} documents were submitted without errors")
    

INFO: inserting 100 documents in 'visa_information_v3'
INFO: insertMany(chunk) on 'visa_information_v3'
INFO: insertMany(chunk) on 'visa_information_v3'
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/visa_information_v3 "HTTP/1.1 200 OK"
INFO: finished insertMany(chunk) on 'visa_information_v3'
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/visa_information_v3 "HTTP/1.1 200 OK"
INFO: finished insertMany(chunk) on 'visa_information_v3'
INFO: finished inserting 100 documents in 'visa_information_v3'


Chunk 0 added successfully


INFO: inserting 100 documents in 'visa_information_v3'
INFO: insertMany(chunk) on 'visa_information_v3'
INFO: insertMany(chunk) on 'visa_information_v3'
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/visa_information_v3 "HTTP/1.1 200 OK"
INFO: finished insertMany(chunk) on 'visa_information_v3'
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/visa_information_v3 "HTTP/1.1 200 OK"
INFO: finished insertMany(chunk) on 'visa_information_v3'
INFO: finished inserting 100 documents in 'visa_information_v3'


Chunk 1 added successfully


INFO: inserting 100 documents in 'visa_information_v3'
INFO: insertMany(chunk) on 'visa_information_v3'
INFO: insertMany(chunk) on 'visa_information_v3'
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/visa_information_v3 "HTTP/1.1 200 OK"
INFO: finished insertMany(chunk) on 'visa_information_v3'
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/visa_information_v3 "HTTP/1.1 200 OK"
INFO: finished insertMany(chunk) on 'visa_information_v3'
INFO: finished inserting 100 documents in 'visa_information_v3'


Chunk 2 added successfully


INFO: inserting 100 documents in 'visa_information_v3'
INFO: insertMany(chunk) on 'visa_information_v3'
INFO: insertMany(chunk) on 'visa_information_v3'
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/visa_information_v3 "HTTP/1.1 200 OK"
INFO: finished insertMany(chunk) on 'visa_information_v3'
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/visa_information_v3 "HTTP/1.1 200 OK"
INFO: finished insertMany(chunk) on 'visa_information_v3'
INFO: finished inserting 100 documents in 'visa_information_v3'


Chunk 3 added successfully


INFO: inserting 100 documents in 'visa_information_v3'
INFO: insertMany(chunk) on 'visa_information_v3'
INFO: insertMany(chunk) on 'visa_information_v3'
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/visa_information_v3 "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/visa_information_v3 "HTTP/1.1 200 OK"
INFO: finished insertMany(chunk) on 'visa_information_v3'
INFO: finished insertMany(chunk) on 'visa_information_v3'
INFO: finished inserting 100 documents in 'visa_information_v3'


Chunk 4 added successfully


INFO: inserting 100 documents in 'visa_information_v3'
INFO: insertMany(chunk) on 'visa_information_v3'
INFO: insertMany(chunk) on 'visa_information_v3'
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/visa_information_v3 "HTTP/1.1 200 OK"
INFO: finished insertMany(chunk) on 'visa_information_v3'
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/visa_information_v3 "HTTP/1.1 200 OK"
INFO: finished insertMany(chunk) on 'visa_information_v3'
INFO: finished inserting 100 documents in 'visa_information_v3'


Chunk 5 added successfully


INFO: inserting 100 documents in 'visa_information_v3'
INFO: insertMany(chunk) on 'visa_information_v3'
INFO: insertMany(chunk) on 'visa_information_v3'
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/visa_information_v3 "HTTP/1.1 200 OK"
INFO: finished insertMany(chunk) on 'visa_information_v3'
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/visa_information_v3 "HTTP/1.1 200 OK"
INFO: finished insertMany(chunk) on 'visa_information_v3'
INFO: finished inserting 100 documents in 'visa_information_v3'


Chunk 6 added successfully


INFO: inserting 43 documents in 'visa_information_v3'
INFO: insertMany(chunk) on 'visa_information_v3'
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/visa_information_v3 "HTTP/1.1 200 OK"
INFO: finished insertMany(chunk) on 'visa_information_v3'
INFO: finished inserting 43 documents in 'visa_information_v3'


Chunk 7 added successfully


In [112]:
# NOTE(review): printed unconditionally — this line does not itself verify
# that the preceding inserts succeeded.
print(f"Embeddings loaded to: {vector_store.collection_name}")

Embeddings loaded to: visa_information_v3


## Check Embeddings

In [113]:
# as data is already loaded
# Reconnect to the populated collection. With autodetect_collection=True the
# store takes the "autodetect init" path (see the log output below) and
# inspects the existing collection, rather than the default-init path used
# when the collection was created.
vector_store = AstraDBVectorStore(
    embedding = google_embedding_model,
    collection_name="visa_information_v3",
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    autodetect_collection=True,
)

INFO: vector store autodetect init, collection 'visa_information_v3'
INFO: Attempting to fetch keyspace from environment variable 'ASTRA_DB_KEYSPACE'
INFO: Detecting API environment 'prod' from supplied endpoint
INFO: findCollections
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace "HTTP/1.1 200 OK"
INFO: finished findCollections
INFO: cursor fetching a page: (empty page state) from visa_information_v3
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/visa_information_v3 "HTTP/1.1 200 OK"
INFO: cursor finished fetching a page: (empty page state) from visa_information_v3
INFO: vector store autodetect: has_vectorize = False
INFO: vector store autodetect: inspecting 15 documents
INFO: vector store autodetect: is_flat = False
INFO: vector store autodetect: inferring content_field from 15 documents
INFO: vector store autodetec

In [116]:
# sanity check for similarity search
# The metadata filter restricts hits to documents labelled CANADA regardless
# of the query text, so this also exercises the 'country' field.
results = vector_store.similarity_search_with_score(
    query="US tourist Visa",
    k=5,
    filter={"country": "CANADA"}
)
if not results:
    print("No results returned for the sanity-check query.")
for res, score in results:
    # `.3f` = three decimal places. The previous spec `3f` set a minimum field
    # width of 3 and left the default six-decimal precision.
    print(f" --->[Similarity score={score:.3f}]\n{res.page_content}\n[{res.metadata}]\n")

INFO: cursor fetching a page: (empty page state) from visa_information_v3
INFO: HTTP Request: POST https://d21836b8-de1d-4618-962c-f8dd6c98eea5-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace/visa_information_v3 "HTTP/1.1 200 OK"
INFO: cursor finished fetching a page: (empty page state) from visa_information_v3


 --->[Similarity score=-7.964844]
support, processing typically takes 3-6 months2.
[{'source': 'D:\\NextCountry\\db\\CANADA.pdf', 'coordinates': {'points': [[108, 392.28601131], [108, 404.95983939], [368.92968756, 404.95983939], [368.92968756, 392.28601131]], 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': 'D:\\NextCountry\\db', 'filename': 'CANADA.pdf', 'languages': ['eng'], 'last_modified': '2025-05-10T14:00:52', 'page_number': 2, 'parent_id': '0e03826a6727f53c007f317ab93efcb0', 'filetype': 'application/pdf', 'category': 'NarrativeText', 'element_id': '3e1516238431d5bafd993b505aa94e6e', 'country': 'CANADA'}]

 --->[Similarity score=-7.964844]
and compile essential documents2.
[{'source': 'D:\\NextCountry\\db\\CANADA.pdf', 'coordinates': {'points': [[108, 308.08435131], [108, 320.75817939], [300.94922256, 320.75817939], [300.94922256, 308.08435131]], 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': 'D:\\NextCou