In [1]:
# Prerequisites (run once in the kernel's environment): pip install --upgrade rank-bm25 sentence-transformers torch numpy
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
import torch, numpy as np
import re

In [2]:
# Simple heuristic tokenizer, good enough for this demo
def tokenize(text: str):
    """Split ``text`` into lowercase word tokens with common stopwords removed.

    The text is lowercased, every character that is neither a word character
    nor whitespace is replaced by a space, and the result is split on
    whitespace. Tokens found in a small fixed English stopword set are dropped.

    Args:
        text: Raw input string.

    Returns:
        List of remaining tokens, in their original order.
    """
    stopwords = {"the", "a", "an", "and", "or", "of", "in",
                 "on", "for", "to", "is", "are"}
    # Punctuation becomes a separator, so "e-commerce" yields "e" and "commerce"
    cleaned = re.sub(r"[^\w\s]", " ", text.lower())
    kept = []
    for word in cleaned.split():
        if word in stopwords:
            continue
        kept.append(word)
    return kept

In [3]:
# Demo corpus: short document titles mixing unrelated topics with many
# closely related EU VAT titles. A document's position in this list is the
# index that bm25_candidates() and rerank() return.
# NOTE: the "Beginner's ..." entry previously contained mojibake
# (a mis-decoded curly apostrophe); it now uses a plain ASCII apostrophe so
# the tokenizer produces the token "beginner" instead of a garbled one.
docs = [
    "Guide to VAT reverse charge in Poland for B2B services",
    "Solar energy incentives and photovoltaic power regulations",
    "Understanding reverse charge mechanism for EU VAT rules",
    "Troubleshooting error code E1234 on GPU drivers",
    "Comprehensive overview of GDPR compliance for small businesses",
    "Beginner's guide to neural networks and deep learning concepts",
    "Managing supply chain risk in global manufacturing industries",
    "Effective marketing automation strategies for e-commerce brands",
    "Impact of climate change on coastal erosion and flood defenses",
    "Setting up Kubernetes clusters on AWS for microservices",
    "Understanding ISO 27001 information security certification",
    "Optimizing SQL queries for better database performance",
    "Corporate income tax obligations for foreign subsidiaries",
    "Developing blockchain smart contracts using Solidity",
    "European Green Deal targets and sustainability reporting",
    "Best practices for remote team collaboration and productivity",
    "Machine learning approaches for credit risk assessment",
    "Introduction to quantum computing and Qiskit basics",
    "Employee data protection under EU labor regulations",
    "Financial forecasting with Python and time series models",
    "Troubleshooting slow website performance and SEO issues",
    "Renewable energy project financing and investment options",
    "Writing unit tests in JavaScript with Jest framework",
    "Understanding inflation trends in post-pandemic economies",
    "Guide to containerization and Docker image optimization",
    "Comparing renewable and fossil energy efficiency ratios",
    "Compliance checklist for import/export customs documentation",
    "Deploying machine learning models using Flask and FastAPI",
    "Healthcare data interoperability under HL7 and FHIR standards",
    "Investing in ETFs versus individual stocks: pros and cons",
    "AI-powered fraud detection in fintech applications",
    "Principles of agile software development and scrum sprints",
    "European patent application process and documentation",
    "Managing cloud costs and resource allocation in Azure",
    "Psychological impact of remote learning on students",
    "Understanding carbon offset programs and their limitations",
    "Best practices for cybersecurity incident response plans",
    "Ethical implications of AI-driven decision making",
    "Overview of the US SEC reporting requirements for IPOs",
    "Introduction to RESTful API design and HTTP methods",
    "Electric vehicle charging infrastructure regulations",
    "Predictive maintenance in industrial IoT environments",
    "Effective data visualization techniques using Tableau",
    "Corporate sustainability KPIs and ESG performance metrics",
    "Understanding fiscal policy and central bank interventions",
    "Cross-border e-commerce tax and customs considerations",
    "Deep reinforcement learning for autonomous systems",
    "User experience design principles for mobile apps",
    "Biodegradable packaging innovations in food industry",
    "Cloud data backup and disaster recovery best practices",
    "Introduction to digital twins and industrial simulation models",
    "Advancements in 5G networks and mobile connectivity",
    "Telemedicine adoption in rural healthcare systems",
    "Electric scooter regulations in European cities",
    "Carbon capture technologies for industrial emissions",
    "Analyzing social media trends with Python scripts",
    "Introduction to edge computing and IoT devices",
    "Smart home automation using voice assistants",
    "Financial literacy programs for young adults",
    "AI ethics in autonomous vehicle decision-making",
    "Water conservation strategies in urban areas",
    "Best practices for remote software testing",
    "Improving mental health with mindfulness techniques",
    "Circular economy initiatives in manufacturing",
    "Cryptocurrency tax reporting guidelines",
    "Microbial research for sustainable agriculture",
    "Quantum encryption methods for secure communication",
    "Travel planning using AI-powered recommendation engines",
    "Impact of teleworking on employee productivity",
    "E-sports tournament organization and management",
    "Preventing phishing attacks in corporate networks",
    "Mobile payment solutions in emerging markets",
    "Digital art marketplaces and NFT trading",
    "Space debris tracking and satellite safety",
    "Genetic testing for personalized medicine",
    "Urban mobility planning with autonomous shuttles",
    "Carbon footprint measurement tools for companies",
    "Smart grid technology for renewable energy integration",
    "Crowdfunding strategies for startups",
    "Wildlife conservation using drone monitoring",
    "Personalized learning platforms in K-12 education",
    "Blockchain adoption in supply chain management",
    "Open source software licensing compliance",
    "Electric aviation technology and safety standards",
    "IoT-based predictive maintenance in factories",
    "Developing mobile health applications for diabetes",
    "Automated content moderation on social platforms",
    "Climate-resilient infrastructure design",
    "Telecommunications regulations in developing nations",
    "3D printing for rapid prototyping and production",
    "Fintech regulations for digital banks",
    "Virtual reality therapy for PTSD patients",
    "Machine learning for natural language processing",
    "Smart agriculture with sensor-driven irrigation",
    "Legal challenges in AI-generated intellectual property",
    "Water quality monitoring using IoT sensors",
    "Renewable energy microgrids for remote communities",
    "Ethical hacking techniques for cybersecurity training",
    "Data governance frameworks for multinational firms",
    "Introduction to bioinformatics and genomic data analysis",
    "Smart city initiatives for traffic optimization",
    "AI-powered recommendation engines for e-commerce",
    "Advances in battery storage technology",
    "Sustainable fashion and ethical sourcing practices",
    "Voice user interface design for mobile apps",
    "Corporate governance principles in multinational companies",
    "Drone delivery systems in urban logistics",
    "Predictive analytics in healthcare diagnostics",
    "Digital twin technology in manufacturing processes",
    "Carbon-neutral transportation initiatives",
    "Cybersecurity strategies for small businesses",
    "High-performance computing in climate modeling",
    "Cloud-native application deployment best practices",
    "Marine conservation using satellite imaging",
    "Employee engagement strategies for hybrid teams",
    "E-learning platforms for vocational training",
    "IoT-enabled smart warehouses and logistics",
    "Privacy-preserving data analysis methods",
    "AI-assisted content creation tools",
    "Robotic process automation for finance teams",
    "Mental health apps for stress management",
    "Green building certifications and standards",
    "Autonomous drones for agriculture monitoring",
    "Data visualization dashboards for business intelligence",
    "Electric bus adoption in public transport",
    "Introduction to federated learning models",
    "Biometric authentication systems for security",
    "Crowdsourced mapping for disaster response",
    "Renewable energy certificate trading systems",
    "Industrial automation using collaborative robots",
    "Carbon offset investment opportunities",
    "Micro-mobility solutions in smart cities",
    "AI-driven market sentiment analysis",
    "Developing voice recognition applications",
    "Smart water management in agriculture",
    "Regenerative agriculture practices for soil health",
    "Digital marketing analytics using machine learning",
    "Telehealth platforms for mental health counseling",
    "Fintech solutions for microfinance institutions",
    "AI-based fraud detection in insurance claims",
    "Optimizing logistics with route planning algorithms",
    "Energy-efficient data centers and cooling methods",
    "E-commerce customer segmentation using AI",
    "Urban air quality monitoring and mitigation strategies",
    "Blockchain for identity verification solutions",
    "Personal finance management apps and tools",
    "Sustainable packaging solutions for consumer goods",
    "Robotics in elderly care and assistance",
    "Open data initiatives for civic engagement",
    "Machine vision for quality control in factories",
    "Edge AI applications in smart factories",
    "Predictive maintenance for renewable energy systems",
    "Telematics solutions for fleet management",
    "AI-assisted drug discovery in pharmaceutical research",
    "Water scarcity solutions with desalination technology",
    "Developing cybersecurity awareness programs",
    "Smart sensors for structural health monitoring",
    "Digital transformation strategies for SMEs",
    "Precision agriculture using drone and satellite data",
    "Adaptive learning systems in higher education",
    "Impact of automation on labor markets",
    "AI-powered chatbots for customer support",
    "Renewable energy adoption incentives by governments",
    "Blockchain in cross-border payment systems",
    "Sustainable urban planning for low-carbon cities",
    "Wearable technology for health monitoring",
    "Predictive modeling for stock market trends",
    "EU VAT compliance for digital services",
    "Intracommunity VAT reporting obligations",
    "Cross-border VAT registration in the EU",
    "EU VAT reverse charge for B2B sales",
    "VAT rates comparison across EU member states",
    "EU VAT invoicing requirements for businesses",
    "Impact of Brexit on EU VAT rules",
    "VAT refund procedures for foreign companies",
    "EU VAT Mini One Stop Shop explained",
    "Differences between standard and reduced VAT rates",
    "EU VAT treatment for e-commerce platforms",
    "VAT on imported goods from non-EU countries",
    "European Commission VAT directives overview",
    "EU VAT exemptions for small enterprises",
    "VAT penalties and compliance audits in the EU",
    "Applying VAT to digital products in the EU",
    "VAT obligations for cross-border services",
    "EU VAT rules for electronic marketplaces",
    "VAT implications of intra-EU supply chains",
    "EU VAT reporting using SAF-T standards",
    "VAT on telecommunication services in the EU",
    "Understanding VAT triangulation in Europe",
    "EU VAT treatment for B2C transactions",
    "Impact of EU VAT changes on SMEs",
    "EU VAT invoicing software compliance",
    "VAT liability for non-established EU businesses",
    "EU VAT on imports under reverse charge",
    "VAT compliance for EU marketplaces and platforms",
    "EU VAT MOSS scheme for digital services",
    "VAT on cross-border e-commerce sales",
    "EU VAT and luxury goods taxation",
    "Financial reporting under EU VAT rules",
    "EU VAT on consultancy and professional services",
    "VAT implications for EU franchise operations",
    "EU VAT threshold limits for remote sellers",
    "Cross-border VAT audits in EU member states",
    "EU VAT compliance checklist for exporters",
    "VAT treatment for EU supply of software",
    "EU VAT and electronic invoicing mandates",
    "EU VAT on telecommunication and broadcasting",
    "VAT obligations for EU online retailers",
    "EU VAT compliance risk management",
    "VAT reporting under EU OSS scheme",
    "EU VAT impact on multinational corporations",
    "Reverse charge VAT on construction services EU",
    "EU VAT on financial services transactions",
    "VAT for EU business-to-business digital services",
    "EU VAT penalties and late filing consequences",
    "VAT on EU intra-community acquisitions",
    "EU VAT compliance training for accountants",
    "EU VAT for shipping and logistics companies",
    "VAT rules for EU cross-border leasing",
    "EU VAT on imports under special regimes",
    "EU VAT on professional and legal services",
    "Financial planning considering EU VAT changes",
    "EU VAT obligations for foreign digital suppliers",
    "EU VAT reporting for e-commerce marketplaces",
    "VAT treatment of EU cross-border telemedicine",
    "EU VAT on software-as-a-service subscriptions",
    "EU VAT invoicing requirements for freelancers",
    "VAT treatment for EU cross-border training services",
    "EU VAT reverse charge mechanism in construction",
    "Financial audits and EU VAT compliance",
    "EU VAT and energy sector transactions",
    "EU VAT for supply of goods via platforms",
    "VAT recovery procedures in EU member states",
    "EU VAT registration for non-resident entities",
    "VAT implications for intra-EU trade",
    "EU VAT on transport and logistics services",
    "EU VAT for e-books and digital publications",
    "VAT on cross-border EU hospitality services",
    "EU VAT compliance in online marketplaces",
    "Financial software solutions for EU VAT",
    "EU VAT obligations for telecommunication companies",
    "EU VAT treatment for subscription services",
    "EU VAT threshold exemptions explained",
    "VAT accounting for cross-border B2B EU sales",
    "EU VAT compliance for foreign service providers",
    "VAT reporting requirements under EU law",
    "EU VAT treatment for electronic services",
    "EU VAT and travel agent margin schemes",
    "Reverse charge rules in EU VAT legislation",
    "EU VAT on intra-community triangular transactions",
    "VAT implications for EU cross-border workshops",
    "EU VAT compliance strategy for multinational firms",
    "EU VAT treatment of import-export transactions",
    "VAT refund schemes for EU tourists",
    "EU VAT for international freight forwarding",
    "EU VAT on professional training and seminars",
    "VAT on EU cross-border digital advertising",
    "EU VAT reporting timelines and deadlines",
    "VAT compliance risk assessment in EU operations",
    "EU VAT treatment for consulting firms",
    "Cross-border VAT planning for EU businesses",
    "EU VAT and electronic invoicing standards",
    "VAT rules for EU intra-community deliveries",
    "EU VAT implications for SaaS providers",
    "Financial reporting adjustments for EU VAT",
    "EU VAT and international trade agreements",
    "VAT considerations for EU e-commerce startups",
    "EU VAT compliance for import-export companies",
    "VAT treatment of cross-border EU maintenance services",
    "EU VAT and cross-border logistics solutions",
    "EU VAT obligations for small online vendors",
    "VAT implications for intra-EU wholesale trading",
    "EU VAT on B2B digital marketplace transactions",
    "EU VAT for telecom and broadcasting providers",
    "VAT recovery for EU business travel expenses",
    "EU VAT and supply chain financial management",
    "EU VAT reverse charge on goods and services",
    "VAT reporting automation for EU financial teams",
]

In [5]:
# Build the BM25 index from the tokenized form of every document in `docs`
bm25 = BM25Okapi(list(map(tokenize, docs)))

In [6]:
def bm25_candidates(query, k=50):
    """Return up to ``k`` BM25-scored candidates for ``query``, best first.

    Args:
        query: Raw query string; it is passed through ``tokenize`` before
            being scored against the ``bm25`` index.
        k: Maximum number of candidates to return. Values less than or
            equal to zero yield an empty list.

    Returns:
        A list of ``(doc_index, score)`` tuples ordered by descending BM25
        score, where ``doc_index`` is an ``int`` and ``score`` a ``float``.
        The list is shorter than ``k`` when fewer scores are available.
    """
    if k <= 0:
        # Without this guard a negative k would act as a negative slice bound
        # and silently return "all but the last |k|" documents.
        return []
    scores = bm25.get_scores(tokenize(query))
    # Slicing already clamps to the number of scores, so no explicit min() is needed
    idx = np.argsort(scores)[::-1][:k]
    return [(int(i), float(scores[i])) for i in idx]

In [7]:
# Cross-encoder used for the re-ranking stage (MiniLM here; monoT5 is a
# higher-accuracy alternative if the extra cost is acceptable).
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
reranker = CrossEncoder(
    "cross-encoder/ms-marco-MiniLM-L-6-v2",
    device=device,
    max_length=512,
)

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [8]:
def rerank(query, candidates, topk=5, batch_size=16):
    """Re-score candidates with the cross-encoder and keep the best ``topk``.

    Args:
        query: Query string paired with each candidate document.
        candidates: Sequence of ``(doc_index, score)`` tuples; only the
            index is used, to look the text up in ``docs``.
        topk: Maximum number of results to return. Values less than or
            equal to zero yield an empty list.
        batch_size: Batch size forwarded to ``reranker.predict``.

    Returns:
        A list of ``(doc_index, reranker_score)`` tuples ordered by
        descending cross-encoder score. Empty when ``candidates`` is empty.
    """
    if len(candidates) == 0 or topk <= 0:
        # Nothing to score: skip the model call entirely, and avoid a negative
        # topk being interpreted as a "drop the last |topk|" slice bound.
        return []
    pairs = [(query, docs[i]) for i, _ in candidates]
    scores = reranker.predict(pairs, batch_size=batch_size)
    # `order` holds positions within `candidates`, not document indices
    order = np.argsort(scores)[::-1][:topk]
    return [(candidates[i][0], float(scores[i])) for i in order]

In [9]:
# Two-stage retrieval: BM25 gathers a broad candidate pool (recall-oriented),
# then the cross-encoder narrows it to the final top results (precision-oriented).
query = "VAT reverse charge Poland"
cands = bm25_candidates(query=query, k=50)
reranked = rerank(query=query, candidates=cands, topk=5)

In [10]:
# Show the final ranking: 1-based position, cross-encoder score, document text
print("Top-5 after re-ranking:")
for rank, (idx, score) in enumerate(reranked, start=1):
    print(f"{rank:2d}. {score:6.3f} :: {docs[idx]}")

Top-5 after re-ranking:
 1.  7.801 :: Guide to VAT reverse charge in Poland for B2B services
 2.  2.820 :: EU VAT on imports under reverse charge
 3.  2.807 :: EU VAT reverse charge on goods and services
 4.  2.537 :: Reverse charge VAT on construction services EU
 5.  2.497 :: EU VAT reverse charge mechanism in construction
