In [1]:
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
# load credentials
import os
from dotenv import load_dotenv

# Copy key/value pairs from a local .env file (if present) into os.environ.
load_dotenv()

# Fail fast with a readable message. Previously a missing GOOGLE_API_KEY raised
# an opaque "str expected, not NoneType" TypeError, and missing Astra settings
# silently became None and only failed later when the database was contacted.
_REQUIRED_ENV_VARS = (
    "GOOGLE_API_KEY",
    "ASTRA_DB_APPLICATION_TOKEN",
    "ASTRA_DB_API_ENDPOINT",
)
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.getenv(name)]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Define them in the environment or in a .env file."
    )

# GOOGLE_API_KEY is already in os.environ at this point, so the old
# self-assignment (os.environ[...] = os.getenv(...)) was a no-op and is dropped.
ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
ASTRA_DB_API_ENDPOINT = os.environ["ASTRA_DB_API_ENDPOINT"]

## Load documents

In [3]:
# Load the source documents (PDF and DOCX files) from one local folder.
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader, Docx2txtLoader

folder_path = "D:\\NextCountry_personal\\db"

# Disabled: a separate loader for .docx files kept under the "ignore" sub-folder.
# quiz_path = "D:\\NextCountry_personal\\db\\ignore"
# quiz_loader = DirectoryLoader(
#     quiz_path,
#     glob="*.docx",
#     loader_cls=Docx2txtLoader,
#     show_progress=True
# )
# quiz_docs = quiz_loader.load()

# Both loaders scan the same folder; they differ only in file pattern and parser.
shared_loader_options = {"show_progress": True}
pdf_loader = DirectoryLoader(
    folder_path, glob="*.pdf", loader_cls=PyMuPDFLoader, **shared_loader_options
)
doc_loader = DirectoryLoader(
    folder_path, glob="*.docx", loader_cls=Docx2txtLoader, **shared_loader_options
)

# Read everything eagerly: PDFs first, then Word documents.
pdfs, docs = pdf_loader.load(), doc_loader.load()

100%|██████████| 6/6 [00:00<00:00, 29.86it/s]
100%|██████████| 2/2 [00:00<00:00, 29.66it/s]


In [4]:
# Single corpus for the splitter: all PDF documents followed by all DOCX documents.
Documents = [*pdfs, *docs]

In [6]:
from langchain.chat_models import init_chat_model

# Chat model settings gathered in one place so they are easy to tune.
llm_settings = {
    "model": "gemini-2.0-flash",
    "model_provider": "google_genai",
    "temperature": 0.5,
}

# Chat model used later in the notebook to infer a country from a file name.
llm = init_chat_model(**llm_settings)

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters: target chunk size, overlap between neighbouring chunks,
# and the only separator the splitter may cut on (a single newline).
splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 300,
    "separators": ["\n"],
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break every loaded document into chunks; each chunk keeps its source metadata.
splits = text_splitter.split_documents(Documents)

In [8]:
# Number of chunks produced by the splitter.
len(splits)

231

In [9]:
# Spot-check the metadata carried by one chunk (index 50 is an arbitrary sample;
# it raises IndexError if fewer than 51 chunks exist).
splits[50].metadata

{'producer': 'Skia/PDF m139 Google Docs Renderer',
 'creator': '',
 'creationdate': '',
 'source': 'D:\\NextCountry_personal\\db\\Master Document  - STARTUP VISA PROGRAMS (GROUP 1 COUNTRIES).pdf',
 'file_path': 'D:\\NextCountry_personal\\db\\Master Document  - STARTUP VISA PROGRAMS (GROUP 1 COUNTRIES).pdf',
 'total_pages': 24,
 'format': 'PDF 1.4',
 'title': 'Practioner Notes For LLM',
 'author': '',
 'subject': '',
 'keywords': '',
 'moddate': '',
 'trapped': '',
 'modDate': '',
 'creationDate': '',
 'page': 10}

In [10]:
# Function to extract country from filename using LLM
def extract_country_from_filename(filename):
    """Ask the chat model which country a file name refers to.

    Args:
        filename: File name (or path) text that is embedded into the prompt.

    Returns:
        The model's reply with surrounding whitespace removed. The prompt asks
        for the full country name in capital letters, or 'NA' when no country
        can be found.
    """
    instruction = f"""
    Extract the country name from this filename: {filename}. Then recall the full form of the country name.
    you must return the country name in full form and all capital letters, nothing else.
    example1:'USA visa details' will return 'UNITED STATES OF AMERICA'.
    example2:'BD visa details' will return 'BANGLADESH'.
    If no country is found, return 'NA'.
    """
    # `llm` is the module-level chat model created earlier in the notebook.
    return llm.invoke(instruction).content.strip()

# update metadata
def update_metadata(splits):
    """Add a "country" entry to the metadata of every chunk.

    The country is inferred once per distinct ``metadata["source"]`` value via
    ``extract_country_from_filename`` and then reused for every chunk from that
    source. Caching per source (instead of comparing with the previous chunk
    only) keeps the number of LLM calls minimal even when chunks of one file
    are not adjacent, and guarantees that all chunks of a file get the same
    label.

    Args:
        splits: Sequence of chunk objects exposing a mutable ``metadata`` dict.

    Returns:
        The same ``splits`` object; chunks are modified in place.
    """
    country_by_source = {}

    for split in splits:
        source = split.metadata.get("source")
        if source is None:
            # No file name to reason about: use the prompt's "not found" marker
            # instead of failing with a KeyError.
            split.metadata["country"] = "NA"
            continue
        if source not in country_by_source:
            country_by_source[source] = extract_country_from_filename(source)
        split.metadata["country"] = country_by_source[source]

    return splits

In [11]:
# Tag each chunk with its country. update_metadata edits the chunks in place and
# returns the list it was given, so `documents` and `splits` are the same object.
documents = update_metadata(splits)
# Spot-check one chunk's metadata (index 50 is an arbitrary sample).
documents[50].metadata

{'producer': 'Skia/PDF m139 Google Docs Renderer',
 'creator': '',
 'creationdate': '',
 'source': 'D:\\NextCountry_personal\\db\\Master Document  - STARTUP VISA PROGRAMS (GROUP 1 COUNTRIES).pdf',
 'file_path': 'D:\\NextCountry_personal\\db\\Master Document  - STARTUP VISA PROGRAMS (GROUP 1 COUNTRIES).pdf',
 'total_pages': 24,
 'format': 'PDF 1.4',
 'title': 'Practioner Notes For LLM',
 'author': '',
 'subject': '',
 'keywords': '',
 'moddate': '',
 'trapped': '',
 'modDate': '',
 'creationDate': '',
 'page': 10,
 'country': 'NA'}

## Embedding Engine
---

In [12]:
# Alternative (disabled): Google-hosted embeddings.
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
#
# google_embedding_model = GoogleGenerativeAIEmbeddings(
#     model="models/text-embedding-004",
# )

from langchain_ollama import OllamaEmbeddings

# Embeddings are produced by the bge-m3 model on the Ollama server at `base_url`.
ollama_settings = {
    "model": "bge-m3:latest",
    "base_url": "http://54.80.168.47:11434",
}
embedding_engine = OllamaEmbeddings(**ollama_settings)


## Vector Database Connection

In [13]:
from astrapy import DataAPIClient

# Create the Data API client with the Astra token, then select the database by endpoint.
client = DataAPIClient(ASTRA_DB_APPLICATION_TOKEN)
db = client.get_database_by_api_endpoint(ASTRA_DB_API_ENDPOINT)

# Connectivity check: show the database name and the collections it already holds.
database_name = db.info().name
collection_names = db.list_collection_names()
print(f"Connected Database: {database_name}\nCollections found: {collection_names}")

Connected Database: nc-vec-db
Collections found: ['quiz_flow', 'visa_details']


In [15]:
# create collection
from langchain_astradb import AstraDBVectorStore

# Connection settings for the vector store, kept together for readability.
store_settings = {
    "collection_name": "nc_visa_details_vector_db",  # this collection gets created automatically
    "embedding": embedding_engine,
    "api_endpoint": ASTRA_DB_API_ENDPOINT,
    "token": ASTRA_DB_APPLICATION_TOKEN,
    "namespace": "default_keyspace",
    # "autodetect_collection": True,   # set to True while using it
}
vector_store = AstraDBVectorStore(**store_settings)

print(f"Connected to collection: {vector_store.collection_name}")

Connected to collection: nc_visa_details_vector_db


### Pushing data to collection

In [16]:
# push embedding to collection
BATCH_SIZE = 10       # number of chunks sent per add_documents call
failed_batches = []   # (batch number, chunks) pairs that could not be stored

for start in range(0, len(splits), BATCH_SIZE):
    batch_no = start // BATCH_SIZE
    chunk = splits[start:start + BATCH_SIZE]
    try:
        # Embed and store this batch in the vector store.
        vector_store.add_documents(documents=chunk)
        print(f"Chunk {batch_no} added successfully")
    except Exception as e:
        # Best effort: keep uploading the remaining batches, but remember the
        # failed one so it can be retried instead of being lost silently.
        print(f"Error adding chunk {batch_no}: {e}")
        failed_batches.append((batch_no, chunk))

if failed_batches:
    failed_numbers = [batch_no for batch_no, _ in failed_batches]
    print(f"{len(failed_batches)} batch(es) failed and were not stored: {failed_numbers}")
    

Chunk 0 added successfully
Chunk 1 added successfully
Chunk 2 added successfully
Chunk 3 added successfully
Chunk 4 added successfully
Chunk 5 added successfully
Chunk 6 added successfully
Chunk 7 added successfully
Chunk 8 added successfully
Chunk 9 added successfully
Chunk 10 added successfully
Chunk 11 added successfully
Chunk 12 added successfully
Chunk 13 added successfully
Chunk 14 added successfully
Chunk 15 added successfully
Chunk 16 added successfully
Chunk 17 added successfully
Chunk 18 added successfully
Chunk 19 added successfully
Chunk 20 added successfully
Chunk 21 added successfully
Chunk 22 added successfully
Chunk 23 added successfully


In [17]:
# Report the target collection name.
# NOTE(review): this prints unconditionally, even if some batches in the upload loop failed.
print(f"Embeddings loaded to: {vector_store.collection_name}")

Embeddings loaded to: nc_visa_details_vector_db


## Check Embeddings

In [18]:
# sanity check for similarity search
# Retrieve the 15 best-matching chunks for a sample question, with their scores.
results = vector_store.similarity_search_with_score(
    query="How to get PR/Citizenship from Startup in England",
    k=15,
    # filter={"country": "CANADA"}   # optional: restrict the search to one country tag
)
for res, score in results:
    # ".3f" prints three decimals; the previous "3f" meant "minimum width 3"
    # and therefore fell back to the default six decimals.
    print(f" --->[Similarity score={score:.3f}]\n{res.page_content}\n[{res.metadata}]\n")

 --->[Similarity score=0.816783]
3.​ If successful -  receive endorsement letter. 
4.​ Apply online for visa (outside or inside UK). 
5.​ Attend biometric appointment and provide documents. 
6.​ Visa granted for 3 years; leads to Indefinite Leave to Remain (subject to conditions). 
7.​ Apply for British Citizenship after one year from ILR. 
🧠 Business Requirements: 
●​ Business plan must show: 
○​ Market research, customer pain points. 
○​ Differentiation and UK market fit. 
○​ Operational structure and founder commitment. 
○​ Realistic financials and team plan. 
○​ How it is innovative, viable, scalable and original.  
●​ Founders must maintain day-to-day involvement in operations. 
👨‍👩‍👧 Family Relocation: 
●​ Spouse and children under 18 can join. 
●​ Spouse can work without restriction; children can attend school. 
📈 PR & Citizenship: 
●​ ILR after 3 years if business meets success criteria (2 of 7 benchmarks, e.g., job 
creation, customer base growth). 
●​ Citizenship eligible 1 y