In [7]:
import warnings
# Suppress all warnings raised from here on (no category or module filter is given).
warnings.filterwarnings("ignore")

In [8]:
# Load credentials from the project's .env file into the process environment.
import os
from dotenv import load_dotenv

# NOTE(review): machine-specific absolute path; consider making it configurable.
load_dotenv(
    "D:\\NextCountry\\src\\.env"
)

# Fail fast with a readable message: os.getenv returns None for an unset
# variable, and assigning None into os.environ raises an opaque TypeError.
_REQUIRED_ENV_VARS = (
    "GOOGLE_API_KEY",
    "ASTRA_DB_APPLICATION_TOKEN",
    "ASTRA_DB_API_ENDPOINT",
)
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.getenv(name)]
if _missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Check that the .env file exists and defines them."
    )

os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_API_ENDPOINT = os.getenv("ASTRA_DB_API_ENDPOINT")

## Load documents

In [30]:
# Load the quiz question flow and the visa guide documents from disk.
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader, Docx2txtLoader

folder_path = "D:\\NextCountry\\db"
quiz_path = "D:\\NextCountry\\db\\ignore"

# Quiz flow: .docx files found in quiz_path.
quiz_loader = DirectoryLoader(quiz_path, glob="*.docx", loader_cls=Docx2txtLoader, show_progress=True)
quiz_docs = quiz_loader.load()

# Visa guides: one loader per file type, both reading from folder_path.
pdf_loader = DirectoryLoader(folder_path, glob="*.pdf", loader_cls=PyMuPDFLoader, show_progress=True)
doc_loader = DirectoryLoader(folder_path, glob="*.docx", loader_cls=Docx2txtLoader, show_progress=True)

pdfs = pdf_loader.load()
docs = doc_loader.load()

100%|██████████| 1/1 [00:00<00:00, 13.50it/s]
100%|██████████| 3/3 [00:00<00:00, 55.54it/s]
100%|██████████| 1/1 [00:00<00:00, 28.14it/s]


In [31]:
# Combined corpus: PDF-derived documents first, then the .docx ones.
Documents = [*pdfs, *docs]

In [32]:
from langchain.chat_models import init_chat_model

# Chat model handle; the country-extraction helper in this notebook calls llm.invoke().
llm = init_chat_model(model="gemini-2.0-flash", model_provider="google_genai", temperature=0.5)

In [33]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking configuration: newline is the only separator supplied.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n"],
    chunk_size=1000,
    chunk_overlap=300,
)

# Split every loaded document into chunks.
splits = text_splitter.split_documents(Documents)

In [34]:
# Number of chunks produced by the splitter.
len(splits)

70

In [35]:
# Spot-check one chunk's metadata (index 50 appears to be an arbitrary sample).
splits[50].metadata

{'source': 'D:\\NextCountry\\db\\UNITED KINGDOM – VISA OPTIONS AND GUIDE.docx'}

In [36]:
# Function to extract country from filename using LLM
def extract_country_from_filename(filename):
    """Ask the LLM which country a document's file name refers to.

    Only the file stem (no directories, no extension) is placed in the prompt,
    so unrelated folder names cannot influence the answer and local paths are
    not sent to the model.

    Args:
        filename: File name or full path of the source document.

    Returns:
        str: The model's answer with surrounding whitespace and quotes removed
        and converted to upper case, or 'NA' when the answer is empty. The
        prompt also instructs the model to answer 'NA' if it finds no country.
    """
    from pathlib import PureWindowsPath

    # PureWindowsPath treats both "\\" and "/" as separators, so this works for
    # Windows-style and POSIX-style paths regardless of the host OS.
    raw_name = str(filename)
    name_only = PureWindowsPath(raw_name).stem or raw_name

    prompt = f"""
    Extract the country name from this filename: {name_only}. Then recall the full form of the country name.
    you must return the country name in full form and all capital letters, nothing else.
    example1:'USA visa details' will return 'UNITED STATES OF AMERICA'.
    example2:'BD visa details' will return 'BANGLADESH'.
    If no country is found, return 'NA'.
    """
    response = llm.invoke(prompt)

    # Normalise the reply: the value is later matched exactly in metadata
    # filters, so stray quotes or mixed case would make chunks unfindable.
    country = response.content.strip().strip("'\"").strip().upper()
    return country or "NA"

# update metadata
def update_metadata(splits):
    """Tag every chunk with the country derived from its source file.

    The country is resolved once per distinct ``source`` value and reused, so
    all chunks of one file share the same label and the LLM is called at most
    once per file, even when chunks of a file are not adjacent.

    Args:
        splits: Sequence of chunk objects exposing a mutable ``metadata`` dict.

    Returns:
        The same ``splits`` object; each chunk's ``metadata["country"]`` is set
        in place. Chunks without a ``source`` entry are labelled 'NA'.
    """
    country_by_source = {}

    for split in splits:
        source = split.metadata.get("source")
        if not source:
            # Nothing to derive a country from; use the 'NA' sentinel.
            split.metadata["country"] = "NA"
            continue
        if source not in country_by_source:
            country_by_source[source] = extract_country_from_filename(source)
        split.metadata["country"] = country_by_source[source]

    return splits

In [37]:
# update_metadata sets the country on each chunk in place and returns the same
# list, so `documents` and `splits` refer to the same tagged chunks.
documents = update_metadata(splits)
documents[0].metadata

{'producer': 'Skia/PDF m137 Google Docs Renderer',
 'creator': '',
 'creationdate': '',
 'source': 'D:\\NextCountry\\db\\CANADA.pdf',
 'file_path': 'D:\\NextCountry\\db\\CANADA.pdf',
 'total_pages': 5,
 'format': 'PDF 1.4',
 'title': 'CANADA',
 'author': '',
 'subject': '',
 'keywords': '',
 'moddate': '',
 'trapped': '',
 'modDate': '',
 'creationDate': '',
 'page': 0,
 'country': 'CANADA'}

## Embedding Engine
---

In [40]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Embedding model handed to the vector store below.
google_embedding_model = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")


## Vector Database Connection

In [41]:
from astrapy import DataAPIClient

# Open a client with the application token and resolve the database by endpoint.
client = DataAPIClient(ASTRA_DB_APPLICATION_TOKEN)
db = client.get_database_by_api_endpoint(ASTRA_DB_API_ENDPOINT)

# Report the database name and the collections it currently holds.
database_name = db.info().name
collection_names = db.list_collection_names()
print(f"Connected Database: {database_name}\nCollections found: {collection_names}")

Connected Database: nc-vec-db
Collections found: ['quiz_flow']


In [42]:
# create collection
from langchain_astradb import AstraDBVectorStore

# Vector store bound to the "visa_details" collection (this collection gets
# created automatically).
# To attach to an existing collection instead, add: autodetect_collection=True
# (set to True while using it).
vector_store = AstraDBVectorStore(
    embedding=google_embedding_model,
    collection_name="visa_details",
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    namespace="default_keyspace",
)

connected_collection = vector_store.collection_name
print(f"Connected to collection: {connected_collection}")

Connected to collection: visa_details


### Pushing data to collection

In [43]:
# push embedding to collection
BATCH_SIZE = 10  # number of chunks sent per add_documents call

# (batch number, error) for every batch that could not be stored, so failures
# are not lost in the scrolling output and can be retried afterwards.
failed_batches = []

for i in range(0, len(splits), BATCH_SIZE):
    batch_no = i // BATCH_SIZE
    chunk = splits[i:i + BATCH_SIZE]
    try:
        # Add the chunk to the vector store
        vector_store.add_documents(documents=chunk)
        print(f"Chunk {batch_no} added successfully")
    except Exception as e:
        # Best-effort upload: keep going, but record what failed.
        failed_batches.append((batch_no, e))
        print(f"Error adding chunk {batch_no}: {e}")

if failed_batches:
    failed_numbers = [batch_no for batch_no, _ in failed_batches]
    print(f"{len(failed_batches)} chunk(s) were NOT stored: {failed_numbers}")
    

Chunk 0 added successfully
Chunk 1 added successfully
Chunk 2 added successfully
Chunk 3 added successfully
Chunk 4 added successfully
Chunk 5 added successfully
Chunk 6 added successfully


In [44]:
# Report the name of the collection the vector store is bound to.
print(f"Embeddings loaded to: {vector_store.collection_name}")

Embeddings loaded to: visa_details


## Check Embeddings

In [45]:
# sanity check for similarity search
results = vector_store.similarity_search_with_score(
    query="1. Expand/Relocate Existing Business",
    k=5,
    filter={"country": "CANADA"}  # restrict matches to chunks tagged CANADA
)
for res, score in results:
    # ".3f" prints three decimal places (a bare "3f" would only set a minimum
    # field width and keep the default six decimals).
    print(f" --->[Similarity score={score:.3f}]\n{res.page_content}\n[{res.metadata}]\n")

 --->[Similarity score=0.782067]
●​ Work Permit Option: While awaiting permanent residence processing, applicants 
can apply for a temporary work permit to begin establishing their business in 
Canada3. This allows for a quicker entry into the Canadian market. 
Program Limitations 
It's important to note that effective until December 31, 2026, IRCC has implemented 
annual caps on applications. Each calendar year, only 10 complete group applications 
per designated organization will be considered, with processing on a first-come, 
first-served basis3. Applications submitted after a cap is reached will be returned with a 
refund of processing fees. 
Work Permit Options 
Canada offers two primary types of work permits: employer-specific (closed) work 
permits and open work permits, each designed for different circumstances and needs5. 
Employer-Specific Work Permits 
Employer-specific work permits restrict employment to a specific employer under 
conditions outlined in the permit, includi