In [93]:
from typing import Any, Dict, List, Optional, Union, Tuple
import os
from dotenv import load_dotenv

In [91]:
from qdrant_client import QdrantClient

In [92]:
from langchain_huggingface import HuggingFaceEmbeddings

In [12]:
from fastembed import TextEmbedding

In [94]:
from langchain_qdrant import QdrantVectorStore

In [95]:
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", None)

In [96]:
qdrant_client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

In [97]:
embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [19]:
from langchain_qdrant import RetrievalMode

In [98]:
vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name="security_reports",
    embedding=embeddings
)

In [40]:
results = vector_store.similarity_search(
    "market maker", k=5
)

In [41]:
results

[Document(metadata={'keywords': [], '_id': 3795654633996337262, '_collection_name': 'security_reports'}, page_content=''),
 Document(metadata={'keywords': ['Clober', 'LOBSTER', 'order book', 'DEX', 'on-chain order matching', 'smart contract platforms', 'limit orders', 'market orders', 'decentralized', 'trustless'], '_id': 2862050251125280971, '_collection_name': 'security_reports'}, page_content=''),
 Document(metadata={'keywords': ['LooksRare', 'NFT', 'Marketplace', 'Exchange V2'], '_id': 3190201343570982494, '_collection_name': 'security_reports'}, page_content=''),
 Document(metadata={'keywords': [], '_id': 822428441679300944, '_collection_name': 'security_reports'}, page_content=''),
 Document(metadata={'keywords': ['LooksRare', 'NFT', 'Marketplace', 'Exchange V2'], '_id': 6693479944826529634, '_collection_name': 'security_reports'}, page_content='')]

In [18]:
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

*  [{'keywords': ['UniswapX', 'Dutch auctions', 'on-chain liquidity', 'off-chain liquidity', 'MEV', 'gas-less swaps', 'signed orders', 'fillers'], '_id': 532385020180679105, '_collection_name': 'security_reports'}]
*  [{'keywords': ['DeFi bonds', 'ERC20', 'DAOs', 'smart contracts', 'zero coupon bonds', 'collateral', 'payment tokens', 'maturity', 'bondholders', 'lenders', 'borrowers'], '_id': 6017722462144705737, '_collection_name': 'security_reports'}]
*  [{'keywords': ['Sudoswap', 'AMM', 'NFT-to-token swaps', 'bonding curves', 'ERC721', 'ERC20'], '_id': 1191993240635384991, '_collection_name': 'security_reports'}]
*  [{'keywords': ['zkEVM-Contracts', 'Polygon-Hermez', 'zkEVM'], '_id': 7156953667558977319, '_collection_name': 'security_reports'}]
*  [{'keywords': ['Beethoven X', 'Sonic Staking', 'decentralized investment platform', 'capital-efficient', 'sustainable solutions'], '_id': 6347570626470248028, '_collection_name': 'security_reports'}]


In [17]:
def chat_with_vectorstore(query: str) -> Dict[str, Any]:
    """
    Tool to retrieve relevant docs relating to the user's query.
    """
    try:
        search_results =  qdrant_client.query(
            collection_name="security_reports",
            query_text=query,
            limit=3
        )

        results = [r.metadata["document"] for r in search_results]

        return {"ai_response": results}
    except Exception as e:
        print(f"Error retrieving docs: {str(e)}")
        return {"ai_response": "An error occurred while retrieving documents."}

In [18]:
result = chat_with_vectorstore("I need security reports of contracts involving MEVs and auctionung systems")

Error retrieving docs: Unexpected Response: 400 (Bad Request)
Raw response content:
b'{"status":{"error":"Wrong input: Vector with name fast-bge-small-en-v1.5 is not configured in this collection"},"time":0.000029284}'


In [42]:
from filehandlers import extract_text_from_doc

In [43]:
from mistralai import Mistral
api_key = os.environ["MISTRAL_API_KEY"]
model = "mistral-small-latest"
client = Mistral(api_key=api_key)

In [44]:
import re

In [61]:
from filehandlers import extract_from_txt

In [56]:
def extract_metadata(doc: Any):
    full_text = extract_text_from_doc(doc)
    
    system_prompt = """
    You are an expert document classifier and keyword extractor.

    Instructions:
    - Given an input text, extract the most important keywords that will serve as metadata for auditors to find similar projects.
    - Focus only on niche or sector-specific web3 keywords that uniquely describe the project's core offering.
    - Exclude generic crypto terms (e.g., "token", "hack").
    - Also extract the project's name by analyzing the text and include it among the keywords.
    - Return your output as a comma-separated list of keywords.

    Examples:

    Example 1:
    Text:
    "Collar is a completely non-custodial lending protocol that does not rely on liquidations to remain solvent. Collar is powered by solvers instead of liquidators as well as other DeFi primitives like Uniswap v3.
    Keywords:
    "Collar", "lending", "Uniswap v3"

    Example 2:
    Text:
    "Astaria is a NFT Collateralized Lending Market leveraging a novel 3AM Model.
    Keywords:
    "Astaria", "NFT", "Lending", "Market", "3AM Model"

    Example 3:
    Text:
    "Base is a secure and low-cost Ethereum layer-2 solution built to scale the userbase on-chain.
    Solady is an open source project for gas optimized Solidity snippets.
    Keywords:
    "Base", "Solady", "layer-2"

    Example 4:
    Text:
    "Royco Protocol allows anyone to create a market around any on-chain transaction (or series of transactions). Using Royco, incentive providers may create intents to offer incentives to users to perform the transaction(s) and users may create intents to complete the transaction(s) and/or negotiate for more incentives.
    Keywords:
    "Royco", "Market", "Incentive providers"
    """

    chat_response = client.chat.complete(
        model=model,
        messages=[
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": full_text,
            },
        ]
    )

    keywords = chat_response.choices[0].message.content.replace('\n', ' ').replace('Keywords:', '').strip()
    metadata = re.findall(r'"([^"]+)"', keywords)
    return keywords, metadata, full_text

In [63]:
keywords, metadata, full_text = extract_metadata("/home/j/web3/scoping-agent/files/storage-proofs.txt")

In [64]:
print(full_text)

## About the Project

### Context

At Curve, we offer a Savings Vault for crvUSD, an ERC4626 token that allows earning
a "risk-free" interest rate on the crvUSD stablecoin.

When bridging scrvUSD cross-chain, the token loses its ERC4626 capabilities and becomes
a plain ERC20 token that cannot be minted with nor redeemed using crvUSD.

To address this problem, we opted to have secondary scrvUSD markets on all chains where scrvUSD can be redeemed.
Since the price of the asset is not stable, we cannot use a "simple" [stableswap-ng](https://github.com/curvefi/stableswap-ng/blob/fd54b9a1a110d0e2e4f962583761d9e236b70967/contracts/main/CurveStableSwapNG.vy#L17) pool as the price
of the asset would go up as the yield accrues. Fortunately, stableswap-ng supports "oraclized" assets,
which means that we can use an oracle to provide the rate at which the price of the asset is increasing, ensuring that the pool works as expected.

### Problem

It is a hard problem to guarantee the correctness of th

In [None]:
extract_text_from_doc("files/storage-proofs.txt")

'files/storage-proofs.txt'

In [65]:
metadata

['Curve',
 'crvUSD',
 'ERC4626',
 'scrvUSD',
 'stableswap-ng',
 'oracle',
 'MEV',
 'vault',
 'USDC',
 'FRAX',
 'DAO',
 'LayerZero',
 'OP stack',
 'blockhash']

In [67]:
from qdrant_client.http import models

In [87]:
filter_condition = models.Filter(
    should=[
        models.FieldCondition(
            key="metadata.keywords", 
            match=models.MatchAny(any=metadata)
        )
    ]
)

results = qdrant_client.scroll(
    collection_name="security_reports",
    scroll_filter=filter_condition
)

In [85]:
def flatten_records(obj):
    """
    Recursively yields individual record objects from a nested structure
    (e.g., a tuple containing lists, etc.)
    """
    if isinstance(obj, (list, tuple)):
        for item in obj:
            yield from flatten_records(item)
    else:
        yield obj

def get_file_names(records: Any) -> List[str]:
    file_names = set()
    for record in flatten_records(records):
        # Check if record has a 'payload' attribute or key.
        if hasattr(record, 'payload'):
            payload = record.payload
        elif isinstance(record, dict) and "payload" in record:
            payload = record["payload"]
        else:
            continue
        
        if isinstance(payload, dict) and "file_name" in payload:
            file_names.add(payload["file_name"])
    return list(file_names)


In [72]:
results

([Record(id=119691268391443, payload={'mongo_id': '67ca0c10ed3fdfebeaf700d9', 'file_name': 'Fastlane-Spearbit-Security-Review-April-2024.pdf', 'metadata': {'keywords': ['Fastlane', 'Atlas', 'MEV', 'Execution Abstraction', 'Solvers', 'User Operation', 'bundler', 'auction system']}, 'chunk_index': 248, 'chunk_text': 'can be retrieved from function arguments already.\ncontrol.codehash is used for the following check:\nmodifier validControlHash() {\nif (_control().codehash != _controlCodeHash()) {\nrevert("ERR-EV008 InvalidCodeHash");\n}\n_;\n}\nThis check is to account for the possibility of changing the code via selfdestruct? With Dencun upgrade, selfdestruct\ncan destroy the code only when called in the creation tx (rollups and other EVM chains haven\'t upgraded to Dencun\nyet). However, there are cases where this check doesn\'t provide any protection against:\n• If control is a proxy, the implementation can change without changing its codehash.\n• control can change its execution witho

In [88]:
print(get_file_names(results))

['Fastlane-Spearbit-Security-Review-April-2024.pdf', 'Uniswapx-Spearbit-Security-Review-July-2024.pdf']


In [89]:
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

In [112]:
vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name="security_reports",
    embedding=embeddings,
    content_payload_key="chunk_text"
)

In [90]:
chat_with_vectorstore_prompt = (
    "You are a seasoned security expert specializing in web3 security and smart contract auditing."
    "The context a comprehensive vector store of security reports, which include detailed risk classifications, vulnerabilities, and technical assessments."
    "When you receive a question, provide a precise, evidence-based answer that directly references the retrieved context."
    "Your answer should be structured clearly (using bullet points, headers, or numbered lists as needed) and avoid generic or vague responses."
    "Context: {context}"
)

In [113]:
retriever = vector_store.as_retriever()

In [101]:
from langchain.chat_models import init_chat_model

In [102]:
llm = init_chat_model("mistral-large-latest", model_provider="mistralai")

In [114]:
def chat_with_vectorstore(query: str) -> Dict[str, Any]:
        """
        Tool to retrieve relevant docs relating to the user's query.
        """
        try:
            prompt = ChatPromptTemplate.from_messages(
                [
                    ("system", chat_with_vectorstore_prompt),
                    ("human", "{input}"),
                ]
            )

            question_answer_chain = create_stuff_documents_chain(llm, prompt)
            chain = create_retrieval_chain(retriever, question_answer_chain)
            results = chain.invoke({"input": query})

            return {"ai_response": results}
        except Exception as e:
            print(f"Error retrieving docs: {str(e)}")
            return {"ai_response": "An error occurred while retrieving documents."}

In [139]:
query = "Was there any reentrancy vulnerability in the fastlane report?"
result = chat_with_vectorstore(query)

In [140]:
result['ai_response']['answer']

"Based on the provided context, there is no explicit mention of a reentrancy vulnerability in the Fastlane Atlas security review report. Here’s a breakdown of the issues found and their severities:\n\n- **Critical Risk**: 0 issues\n- **High Risk**: 15 issues\n- **Medium Risk**: 25 issues\n- **Low Risk**: 35 issues\n- **Gas Optimizations**: 34 issues\n- **Informational**: 81 issues\n\nGiven that reentrancy vulnerabilities are typically classified as high or critical risk due to their severe impact, the absence of any critical risk issues and the lack of specific mention of reentrancy in the high risk issues suggests that no reentrancy vulnerabilities were identified during the audit.\n\nHowever, without the detailed descriptions of each high risk issue, it's not possible to definitively conclude that no reentrancy vulnerabilities exist. The report would need to be reviewed in full to ensure that none of the high risk issues are related to reentrancy."

In [132]:
import re
from textwrap import dedent

In [133]:
def format_security_report(report_text):
    """
    Format a security report into a clean, structured output
    regardless of input format.
    
    Args:
        report_text (str): The raw text of the security report
    
    Returns:
        str: Formatted report as Markdown
    """

    
    # Remove excessive newlines and whitespace
    cleaned_text = re.sub(r'\n{3,}', '\n\n', report_text.strip())
    
    # Extract title/headline if present
    title_match = re.search(r'^(.+?)(?=\n|$)', cleaned_text)
    title = title_match.group(0) if title_match else "Security Vulnerability Report"
    
    # Extract severity if present
    severity_match = re.search(r'(?i)severity[\s:]*(critical|high|medium|low|informational|info)', cleaned_text)
    severity = severity_match.group(1).capitalize() if severity_match else None
    
    # Extract location/file path if present
    location_match = re.search(r'(?i)(?:location|file|path|contract)[\s:]*([\w\.\/#\-]+\.(?:sol|js|ts|py|jsx|vue)(?:[#:][L\d\-]+)?)', cleaned_text)
    location = location_match.group(1) if location_match else None
    
    # Build formatted output
    formatted_output = f"# {title}\n\n"
    
    if severity or location:
        formatted_output += "## Overview\n\n"
        if severity:
            formatted_output += f"**Severity:** {severity}\n\n"
        if location:
            formatted_output += f"**Location:** `{location}`\n\n"
    
    # Process the main content
    # Remove markdown-like artifacts but keep structure
    content = re.sub(r'(^|\n)#+ ', r'\1### ', cleaned_text)
    
    # Extract description section
    description_match = re.search(r'(?i)## *description\s*(.*?)(?=\n##|\Z)', content, re.DOTALL)
    if description_match:
        description = description_match.group(1).strip()
        formatted_output += f"## Description\n\n{description}\n\n"
    
    # Extract impact section
    impact_match = re.search(r'(?i)## *impact\s*(.*?)(?=\n##|\Z)', content, re.DOTALL)
    if impact_match:
        impact = impact_match.group(1).strip()
        formatted_output += f"## Impact\n\n{impact}\n\n"
    
    # Extract technical details
    technical_match = re.search(r'(?i)## *(technical details|vulnerability details)\s*(.*?)(?=\n##|\Z)', content, re.DOTALL)
    if technical_match:
        technical = technical_match.group(2).strip()
        formatted_output += f"## Technical Details\n\n{technical}\n\n"
    
    # Extract mitigation
    mitigation_match = re.search(r'(?i)## *(mitigation|recommendation|fix|remediation)\s*(.*?)(?=\n##|\Z)', content, re.DOTALL)
    if mitigation_match:
        mitigation = mitigation_match.group(2).strip()
        formatted_output += f"## Mitigation\n\n{mitigation}\n\n"
    
    # If we didn't extract structured sections, include the whole content
    if not any([description_match, impact_match, technical_match, mitigation_match]):
        # Remove the title part we already used
        main_content = re.sub(r'^.+?\n', '', cleaned_text, 1)
        formatted_output += f"## Details\n\n{main_content}\n"
    
    return formatted_output.strip()

In [141]:
formatted_report = format_security_report(result['ai_response']['answer'])

In [142]:
print(formatted_report)

# Based on the provided context, there is no explicit mention of a reentrancy vulnerability in the Fastlane Atlas security review report. Here’s a breakdown of the issues found and their severities:

## Details


- **Critical Risk**: 0 issues
- **High Risk**: 15 issues
- **Medium Risk**: 25 issues
- **Low Risk**: 35 issues
- **Gas Optimizations**: 34 issues
- **Informational**: 81 issues

Given that reentrancy vulnerabilities are typically classified as high or critical risk due to their severe impact, the absence of any critical risk issues and the lack of specific mention of reentrancy in the high risk issues suggests that no reentrancy vulnerabilities were identified during the audit.

However, without the detailed descriptions of each high risk issue, it's not possible to definitively conclude that no reentrancy vulnerabilities exist. The report would need to be reviewed in full to ensure that none of the high risk issues are related to reentrancy.
