In [85]:
from pymongo import MongoClient
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.llms import openai
from langchain.chains import retrieval_qa
import os
from dotenv import load_dotenv, find_dotenv
import utils
from utils import logger
from pathlib import Path
import json
import pickle
load_dotenv(find_dotenv())

ATLAS_DB_URI = ''.join(["mongodb+srv://",
                f"{os.environ.get('ATLAS_DB_USERNAME')}:",
                f"{os.environ.get('ATLAS_DB_PASSWORD')}",
                "@ai-auditor-prod.cwtxo73.mongodb.net/?retryWrites=true&w=majority",
                "&appName=ai-auditor-prod"])

def ping_mongodb():
    client = MongoClient(ATLAS_DB_URI)
    try:
        client.admin.command('ping')
        print("Pinged your deployment. You successfully connected to MongoDB!")
    except Exception as e:
        print(e)


In [2]:
def load_data(
        save_to_pickle=True, 
        force_reload=False, 
        suffix="_vulnerabilities_formatted.txt"
):
    if os.path.exists(Path(utils.DATADIR) / 'data.pickle') and not force_reload:
        logger.info("Loading data from pickle file")
        with open(Path(utils.DATADIR) / 'data.pickle', 'rb') as f:
            data = pickle.load(f)
        return data

    datafiles = [file for file in os.listdir(utils.DATADIR)
                if os.path.isfile(Path(utils.DATADIR) / file)
                and file.endswith('.txt')]
    data = {}
    logger.info("Starting data load, loading data from %s files", len(datafiles))
    for file in datafiles:
        logger.info("Reading file: %s", file)
        section_data = []
        with open(Path(utils.DATADIR) / file, 'r', encoding='utf-8') as f:
            read_data = False
            json_data = ""
            for line in f:
                if "----Start JSON----" in line:
                    read_data = True
                    continue

                if "----End JSON----" in line:
                    read_data = False
                    try:
                        section_data.append(json.loads(json_data))
                    except json.JSONDecodeError as e:
                        pass
                    logger.debug("Read JSON Data: %s", section_data[-1].keys())
                    json_data = ""
                    continue

                if read_data:
                    json_data += line
            
            data[file.removesuffix(suffix)] = section_data
            
    logger.info("Loaded data from files")
    if save_to_pickle:
        logger.info("Saving data to pickle file")
        with open(Path(utils.DATADIR) / 'data.pickle', 'wb') as f:
            pickle.dump(data, f)
    
    return data

### Load data

In [49]:
vuln_data = load_data(force_reload=True)

2024-05-29 20:12:45,670 - INFO - 517238322 - load_data - Starting data load, loading data from 5 files
2024-05-29 20:12:45,671 - INFO - 517238322 - load_data - Reading file: ConsenSys_vulnerabilities_formatted.txt
2024-05-29 20:12:45,674 - DEBUG - 517238322 - load_data - Read JSON Data: dict_keys(['code', 'Resolution', 'Description', 'Example', 'Recommendation'])
2024-05-29 20:12:45,676 - DEBUG - 517238322 - load_data - Read JSON Data: dict_keys(['code', 'Resolution', 'Description', 'Example', 'Recommendation'])
2024-05-29 20:12:45,678 - DEBUG - 517238322 - load_data - Read JSON Data: dict_keys(['code', 'Resolution', 'Description', 'Examples', 'Recommendation'])
2024-05-29 20:12:45,680 - DEBUG - 517238322 - load_data - Read JSON Data: dict_keys(['code', 'Resolution', 'Description', 'Examples', 'Recommendation'])
2024-05-29 20:12:45,681 - DEBUG - 517238322 - load_data - Read JSON Data: dict_keys(['code', 'Resolution', 'Description', 'Examples', 'Recommendation'])
2024-05-29 20:12:45,684

In [4]:
vuln_data.keys()

dict_keys(['ConsenSys', 'Cyfrin', 'Pashov_Audit_Group', 'Sherlock', 'Trust_Security'])

In [50]:
import itertools
no_sherlock_data = [vuln_data[k] for k in vuln_data.keys() 
                    if k.lower() != "sherlock"]

no_sherlock_data = list(itertools.chain(*no_sherlock_data))
sherlock_data = vuln_data["Sherlock"]

In [36]:
keys = set(list(no_sherlock_data[0].keys()))
for data in no_sherlock_data:
    keys.update(list(data.keys()))
keys

{'Addition Overflows',
 'Affected Assets',
 'Beanstalk',
 'Beanstalk Farms',
 'Beefy',
 'Client',
 'Client response',
 'Conclusion',
 'Cyfrin',
 'Cyrin',
 'Descriptio',
 'Description',
 'Description and Recommendation',
 'Details',
 'Dexe',
 'Discussion',
 'Division Overflows',
 'Example',
 'Example: RocketNetworkPrices',
 'Examples',
 'Explanation',
 'Impact',
 'Likelihood',
 'Mitigating factors',
 'Mitigation',
 'Mitigation Review',
 'Mitigation Review 2',
 'Mitigation review',
 'Mitigation review 2',
 'Mitigations',
 'Mitigiation review',
 'Mode',
 'Multiplication Overflows',
 'Non-exhaustive Examples',
 'Note',
 'POC',
 'Pedantic Note',
 'Proof of Concept',
 'Protocol',
 'Recommendation',
 'Recommendations',
 'Recommended Mitigation',
 'Recommended mitigation',
 'References',
 'Remark',
 'Remediation',
 'Resolution',
 'RocketMinipoolBondReducer',
 'RocketNetworkPenalties',
 'Severity',
 'Solidly',
 'Solidly Labs',
 'Sudoswap',
 'Swell',
 'Team Response',
 'Team response',
 'Wormhol

In [34]:
keys = {}
for data in no_sherlock_data:
    for key in data:
        if keys.get(key):
            keys[key] += 1
        else:
            keys[key] = 1

keys

{'code': 1379,
 'Resolution': 518,
 'Description': 1349,
 'Example': 12,
 'Recommendation': 672,
 'Examples': 284,
 'Remark': 3,
 'preamble': 581,
 'Description and Recommendation': 2,
 'Example: RocketNetworkPrices': 2,
 'RocketMinipoolBondReducer': 2,
 'RocketNetworkPenalties': 2,
 'borrowCurrencyId': 1,
 'liquidationRate and minCollateralRatioBPS': 1,
 'maxBorrowMarketIndex': 1,
 'secondaryBorrowCurrencies': 1,
 'Descriptio': 1,
 'Recommendations': 408,
 '_updatePrices()': 1,
 'virtualPrice()': 1,
 'Conclusion': 3,
 'Affected Assets': 2,
 'Remediation': 20,
 'Non-exhaustive Examples': 1,
 'Mitigating factors': 2,
 'References': 2,
 'Explanation': 1,
 'Addition Overflows': 1,
 'Multiplication Overflows': 1,
 'Division Overflows': 1,
 'Mitigations': 1,
 'Details': 1,
 'Pedantic Note': 1,
 'Mitigation': 1,
 'Impact': 535,
 'Proof of Concept': 62,
 'Recommended Mitigation': 210,
 'Solidly Labs': 2,
 'Cyfrin': 100,
 'Severity': 338,
 'Client': 37,
 'Wormhole Foundation': 12,
 'Beefy': 6,

In [110]:
include_keys = {'Addition Overflows',
 'Affected Assets',
 'Beanstalk',
 'Beanstalk Farms',
 'Beefy',
 'Client',
 'Cyfrin',
 'Cyrin',
 'Descriptio',
 'Description',
 'Description and Recommendation',
 'Details',
 'Dexe',
 'Discussion',
 'Division Overflows',
 'Impact',
 'Likelihood',
 'Mode',
 'Multiplication Overflows',
 'Protocol',
 'RocketMinipoolBondReducer',
 'RocketNetworkPenalties',
 'Severity',
 'Solidly',
 'Solidly Labs',
 'Sudoswap',
 'Swell',
 'Wormhole',
 'Wormhole Foundation',
 '_updatePrices()',
 'borrowCurrencyId',
 'liquidationRate and minCollateralRatioBPS',
 'lpSupply',
 'maxBorrowMarketIndex',
 'pashov',
 'secondaryBorrowCurrencies',
 'virtualPrice()'}

include_keys_lower = set([k.lower() for k in include_keys])


### Clean the dataset

Clean the dataset, split the code into functions and concat all of the language sections into a tex field that our model can understand/learn

In [111]:
def get_cleaned_explanations(entry, exclude_keys=[]):
    text = ''
    EXCLUDE_KEYS = [
        'code',
        'Recommendation',
        "Mitigation Review",
        "Recommendations",
        "Cyfrin",
        "Client",
        "Recommended Mitigation",
        "Resolution",
        "Example",
        "preamble"
    ] + exclude_keys
    
    for k in entry:
        if k.lower() in include_keys_lower:
            text += k + ': ' + ''.join(entry[k]) + ' '
    
    text.replace('\n', ' ')

    return text

In [52]:
def split_code_by_function(code):
    snippets = []
    parentheses = []
    opened = False
    curr_function = ''

    opposite = {
        '}': '{',
        ']': '[',
        ')': '('
    }
    for char in code:
        if char in '{([':
            if char == '{':
                
                opened = True
            parentheses.append(char)
        elif char in '}])':
            if parentheses:
                if opposite[char] == parentheses[-1]:  
                    parentheses.pop()
        
        curr_function += char

        if opened:
            if not parentheses:
                opened = False
                snippets.append(curr_function)
                curr_function = ''
    
    snippets.append(curr_function)

    return snippets

In [53]:
def split_and_combine_code(code, min_snippet_len=10, max_snippet_len=1000):
    return_code = []
    for snippet in code:
        return_code += [s for s in split_code_by_function(snippet) if s
                        and len(s) > min_snippet_len
                        and len(s) < max_snippet_len]
    
    return return_code


In [149]:
split_and_combine_code(no_sherlock_data[1]['code'])[3]

';\nconst json = await response.json();\n\n'

In [152]:
print(no_sherlock_data[1]['code'][0])

if (assetName.startsWith('W')) {
 // Assume this is a wrapped token
 assetName = assetName.slice(1); // remove W
}
try {




In [157]:
split_and_combine_code(no_sherlock_data[1]['code'])

["if (assetName.startsWith('W')) {\n // Assume this is a wrapped token\n assetName = assetName.slice(1); // remove W\n}",
 'const response = await fetch(\n `https://api.binance.com/api/v3/ticker/price?symbol=${assetName.toUpperCase()}USDT`,\n)',
 ';\nconst json = await response.json();\n\n']

In [48]:
no_sherlock_data[1006]

{'code': [],
 'Severity': [''],
 'Impact': [' High, because the accounting would go wrong for multiple scenarios'],
 'Likelihood': [' Medium, because it would happen when admin calls changeAsset()'],
 'Description': ['',
  'In general updating underlying asset is very risky move in a pool.\nAll the cached prices will be wrong.',
  'In the current code we have two cached prices(that I know of):\nIn requestRedeem() code caches pool prices for requests. Code use it later in the withdraw and cancel request. (the price impact withdraw price and also burning tokens in cancel requests)',
  'In calculating fee, code caches pool price and use it to calculate fee later.',
  '(there may be other places the pool price is cached)',
  '———\nAnother place that is asset amount is cached is claimableAssetFees. updateAsset() calls the _collectFees() to handle the claimableAssetFees and set it to zero but because of this line in the _collectFees()\nIf (profit == 0) return;\nclaimableAssetFees (which show

In [54]:
cleaned_no_sherlock_data = []

for i,entry in enumerate(no_sherlock_data):
    if not entry['code']:
        logger.debug(entry)
        continue

    cleaned_no_sherlock_data.append(
        (
            split_and_combine_code(entry['code']),
            get_cleaned_explanations(entry)
        )
    )
    


2024-05-29 20:13:33,746 - DEBUG - 1169941704 - <module> - {'code': [], 'Resolution': ['Addressed with the following changesets: fort-major/msq@7f9cde2 and fort-major/msq@0b9f8d1 (removing whitelisted method names, only allowing  icrc1_transfer)', 'The client provided the following statement:'], 'Description': ['Identities are bound to their origin (URL). Third-party origins are outside the scope of this Snap and are therefore in a lower trust zone where it is unsure what security measures are in place to protect the dApp from impersonating the users’ wallet identity. dApps may be hosted on integrity protecting endpoints (ipfs/IC), however, this is not enforced.', 'Protected RPC functions can only be invoked by the MSQ administrative origin. User consent may not consistently be enforced on the administrative origin.', 'The administrative origin is identified by the origin URL. According to the client the dApp is hosted on an integrity protecting endpoint (IC). This already protects from

2024-05-29 20:13:33,859 - DEBUG - 1169941704 - <module> - {'code': [], 'preamble': [], 'Description': ['As part of the process of bringing the application to production readiness, dev comments (especially TODOs) should be resolved. In many cases, these comments indicate a missing functionality that should be implemented, or some missing necessary validation checks.']}
2024-05-29 20:13:33,862 - DEBUG - 1169941704 - <module> - {'code': [], 'preamble': [], 'Resolution': ['Comment from Connext:', 'Indeed, since the user has to sign messages, it has to be an EOA, and, consequently, the suggested solution would exclude contracts from calling prepare. A slight modification of the recommendation should work, though: Instead of checking msg.sender == invariantData.user, add a new member initiator (or msgSender or something similar) to the InvariantTransactionData struct, and check msg.sender == invariantData.initiator in the prepare function.\nThat would fix the issue and still allow prepare ca

In [145]:
code = "    function getFees(...)\n        internal\n        view\n        returns (uint256 feeInToken, uint256 nativeFees)\n    {\n@>      nativeFees = controller.getMinFees(connector, gasLimit, payloadSize);\n        feeInToken = Configuration.getStaticWithdrawFee(token, connector);\n    }\n    function executeBridging(...)\n        internal\n    {\n        ISocketControllerWithPayload socketController =\n            ISocketControllerWithPayload(Configuration.getController(withdrawToken));\n\n        (uint256 tokenFees, uint256 nativeFees) =\n            getFees(withdrawToken, socketController, socketConnector, socketMsgGasLimit, socketPayloadSize);\n        if (tokenAmount > tokenFees) {\n            uint256 tokensToWithdraw = tokenAmount - tokenFees;\n@>          socketController.bridge{ value: nativeFees }({\n                receiver_: receiver,\n                amount_: tokensToWithdraw,\n                msgGasLimit_: socketMsgGasLimit,\n                connector_: socketConnector,\n                execPayload_: abi.encode(),\n                options_: abi.encode()\n            });\n            withdrawToken.safeTransfer(OwnableStorage.getOwner(), tokenFees);\n        } else {\n            revert Errors.NotEnoughFees(tokenAmount, tokenFees);\n        }\n    }\n"
print(split_code_by_function(code)[0])

    function getFees(...)
        internal
        view
        returns (uint256 feeInToken, uint256 nativeFees)
    {
@>      nativeFees = controller.getMinFees(connector, gasLimit, payloadSize);
        feeInToken = Configuration.getStaticWithdrawFee(token, connector);
    }


In [55]:
cleaned_no_sherlock_data[850]


(['  function deposit(uint256 side) external payable isInitialized nonReentrant {\n    require(!isStarted(), "DAS");\n    //@audit this check can be bypassed by frontrunning `initiatingAdminSettleDebt()`\n    // don\'t allow deposits once settle debt process has been initialized to prevent vault from starting\n    require(!isAdminSettleDebtInitialized(), "AAI");\n'],
 'Severity:  Impact:  Medium, participants will incur loss on withdrawal as admin underpaid for debt settlement. But admin (trusted) can make up the loss by refunding them the difference separately. Likelihood:  Medium, occurs when admin settles debt Description: Within LidoVault.deposit(), there is a check require(!isAdminSettleDebtInitialized(), "AAI") that prevents fixed/variable participants from starting the vault with the last deposit when the admin settle debt process has been initialized.However, the check can be bypassed due to a race condition, where an unexpected vault-starting deposit() occurs before initiating

## Data embedding

Now that the data is cleaned and split into functions and put together with all the explanations of the data, we will try to create some vector embeddings

In [75]:
from openai import OpenAI

client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

In [77]:
test_embedding = get_embedding(text=''.join(cleaned_no_sherlock_data[0][0]))

In [78]:
len(test_embedding)

1536

The above simple model allows us to get all the embeddings, we can now simply store these into a vector DB

#### Create documents for the RAG to embed and so LangChain can understand

1. The first version of the documents will simply have all the concatenated code as the code embedding entry
2. For the second embedding, we can try creating a document for each distinct piece of code, i.e each split function

In [59]:
with open(Path(utils.DATADIR) / "training_data.pkl", "rb") as pkl:
    other = pickle.load(pkl)

In [60]:
from langchain_core.documents import Document

def make_document(entry):
        return Document(
                page_content=''.join(entry[0]),
                metadata={"explanation": entry[1]}
        )

docs = [make_document(entry) for entry in cleaned_no_sherlock_data]

In [69]:
with open(Path(utils.DATADIR) / 'cleaned_data.txt', "w", encoding='utf-8') as f:
    for line in cleaned_no_sherlock_data:
        f.write("".join(line[0])+'\n'+"".join(line[1])+'\n')
        f.write("----------------------------------")


In [61]:
other[0]

{'text': 'unchecked {\nuint256 share = points * _PRECISION / pool.totalPoints * totalReward;\nuint256 daoShare = share * pool.daoTax / (100 * _DIVISOR);\nshare /= _PRECISION;\ndaoShare /= _PRECISION;\nreturn ((share - daoShare), daoShare);\n}\n}\n',
 'label': 'Updating a pool’s total points doesn’t affect existing stake positions for rewards calculation'}

In [70]:
other2 = [(entry["text"],entry["label"]) for entry in other]
docs += [make_document(entry) for entry in other2]

In [71]:
docs[0].metadata

{'explanation': 'Description: The Snap does not validate the origin of RPC requests, allowing any arbitrary dApp to connect to the Snap and initiate arbitrary RPC requests. Specifically, any dApp can access the privileged getToken and deleteToken RPC endpoints. Consequently, a malicious dApp could potentially extract a user’s Tezoro token from the Snap and impersonate the user in interactions with the Tezoro API. Depending on the permissions associated with this token, the implications could be critical. Example: packages/snap/src/index.ts:L14-L18packages/snap/src/index.ts:L64-L65packages/snap/src/index.ts:L34-L35 '}

In [72]:
docs[0].page_content

"export const onRpcRequest: OnRpcRequestHandler = async ({ request }) => {\n switch (request.method) {\n case 'requestAccounts': {\n const data = await ethereum.request({\n method: 'eth\\_requestAccounts',\n\ncase 'getToken': {\n const state = await snap.request({\n\ncase 'saveToken': {\n const result = await snap.request({\n\n"

In [73]:
len(docs)

2489

In [87]:
ATLAS_DB_URI="mongodb+srv://aiauditor:YJ2NwLaQOHOAHPJt@ai-auditor-prod.cwtxo73.mongodb.net/?retryWrites=true&w=majority&appName=ai-auditor-prod"

In [89]:
mongoclient = MongoClient(ATLAS_DB_URI)

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=os.environ.get('OPENAI_API_KEY')
)

dbName = "code_snippets"
collectionName = "v1"
collection = mongoclient[dbName][collectionName]

vectorStore = MongoDBAtlasVectorSearch.from_documents(
    docs,
    embedding=embeddings,
    collection=collection,
)

In [70]:
## Test vector search
search_code = "unchecked {\nuint256 share = points * _PRECISION / pool.totalPoints * totalReward;\nuint256 daoShare = share * pool.daoTax / (100 * _DIVISOR);\nshare /= _PRECISION;\ndaoShare /= _PRECISION;\nreturn ((share - daoShare), daoShare);\n}\n}\n"
vectorSearch = MongoDBAtlasVectorSearch( collection, embeddings )
context = vectorStore.similarity_search_with_relevance_scores(search_code, k=5)


In [71]:
context

[(Document(page_content='unchecked {\nuint256 share = points * _PRECISION / pool.totalPoints * totalReward;\nuint256 daoShare = share * pool.daoTax / (100 * _DIVISOR);\nshare /= _PRECISION;\ndaoShare /= _PRECISION;\nreturn ((share - daoShare), daoShare);\n}\n}\n', metadata={'_id': ObjectId('6654c804146df6f8e1a0d672'), 'embedding': [0.024287099588522192, -0.01688334069682082, 0.02142246884813528, 0.027594574189565936, -0.015001262670932004, -0.052864228355231274, -0.017063244268982924, 0.047633161095422503, 0.0004964670833423616, -0.019208258858076366, 0.0412949897719068, -0.005518222865277082, 0.012288857528585514, -0.008081861341441152, 0.060503248629983165, 0.01154156247184784, 0.004148181307042995, -0.0586211724667394, 0.01531955518238, 0.03658979563850728, -0.01965110085608815, 0.03528895041256119, 0.0889004719618851, 0.0032901755179717443, -0.03063911036476, -0.0014020437792548492, -0.0004011090864187642, -0.03988343264947398, 0.01034450569073733, -0.018336414314774906, 0.03127569

In [69]:
context[1][0]

Document(page_content='if (!router.withdraws(transferId)) {\n    router.withdraw(\\_request, \\_sigs, \\_signers, \\_powers);\n}if (delayThreshold > 0 && wdmsg.amount > delayThreshold) {\r\n     _addDelayedTransfer(wdId, wdmsg.receiver, wdmsg.token, wdmsg. // <--- here\r\n} else {\r\n      _sendToken(wdmsg.receiver, wdmsg.token, wdmsg.\r\n}\r\n\nfunction bridgeAfterSwap(\n    uint256 amount,\n    bytes calldata bridgeData\n) external payable override {\n    CelerBridgeData memory celerBridgeData = abi.decode(\n        bridgeData,\n        (CelerBridgeData)\n    );\n\nfunction swapAndBridge(\n    uint32 swapId,\n    bytes calldata swapData,\n    StargateBridgeDataNoToken calldata stargateBridgeData\n\n', metadata={'_id': ObjectId('6654c7df146df6f8e1a0d337'), 'embedding': [0.02510660906774174, -0.009398478158217186, 0.03212906818366353, 0.035270694924361995, -0.040761940506122814, 0.012388303043333779, -0.03564029587897927, 0.02785223185862215, 0.007695664247364091, -0.016447337475372012

In [73]:
print(other[0]["text"])

unchecked {
uint256 share = points * _PRECISION / pool.totalPoints * totalReward;
uint256 daoShare = share * pool.daoTax / (100 * _DIVISOR);
share /= _PRECISION;
daoShare /= _PRECISION;
return ((share - daoShare), daoShare);
}
}



In [79]:
len(docs)

2426

In [80]:
docs

[Document(page_content="export const onRpcRequest: OnRpcRequestHandler = async ({ request }) => {\n switch (request.method) {\n case 'requestAccounts': {\n const data = await ethereum.request({\n method: 'eth\\_requestAccounts',\n\ncase 'getToken': {\n const state = await snap.request({\n\ncase 'saveToken': {\n const result = await snap.request({\n\n", metadata={'explanation': 'Description: The Snap does not validate the origin of RPC requests, allowing any arbitrary dApp to connect to the Snap and initiate arbitrary RPC requests. Specifically, any dApp can access the privileged getToken and deleteToken RPC endpoints. Consequently, a malicious dApp could potentially extract a user’s Tezoro token from the Snap and impersonate the user in interactions with the Tezoro API. Depending on the permissions associated with this token, the implications could be critical. '}),
 Document(page_content="if (assetName.startsWith('W')) {\n // Assume this is a wrapped token\n assetName = assetName.slic

In [82]:
with open(Path(utils.DATADIR) / "docs.pkl", "wb") as pkl:
    pickle.dump(docs, pkl)

## Modify Dataset so that it maps vulnerability details to recommendations

In [91]:
## Get all keys
 
keys = set(list(no_sherlock_data[0].keys()))
for data in no_sherlock_data:
    keys.update(list(data.keys()))
keys

{'Addition Overflows',
 'Affected Assets',
 'Beanstalk',
 'Beanstalk Farms',
 'Beefy',
 'Client',
 'Client response',
 'Conclusion',
 'Cyfrin',
 'Cyrin',
 'Descriptio',
 'Description',
 'Description and Recommendation',
 'Details',
 'Dexe',
 'Discussion',
 'Division Overflows',
 'Example',
 'Example: RocketNetworkPrices',
 'Examples',
 'Explanation',
 'Impact',
 'Likelihood',
 'Mitigating factors',
 'Mitigation',
 'Mitigation Review',
 'Mitigation Review 2',
 'Mitigation review',
 'Mitigation review 2',
 'Mitigations',
 'Mitigiation review',
 'Mode',
 'Multiplication Overflows',
 'Non-exhaustive Examples',
 'Note',
 'POC',
 'Pedantic Note',
 'Proof of Concept',
 'Protocol',
 'Recommendation',
 'Recommendations',
 'Recommended Mitigation',
 'Recommended mitigation',
 'References',
 'Remark',
 'Remediation',
 'Resolution',
 'RocketMinipoolBondReducer',
 'RocketNetworkPenalties',
 'Severity',
 'Solidly',
 'Solidly Labs',
 'Sudoswap',
 'Swell',
 'Team Response',
 'Team response',
 'Wormhol

In [94]:
rem_keys = {
 'Discussion',
 'Mitigating factors',
 'Mitigation',
 'Mitigation Review',
 'Mitigation Review 2',
 'Mitigation review',
 'Mitigation review 2',
 'Mitigations',
 'Mitigiation review',
 'Recommendation',
 'Recommendations',
 'Recommended Mitigation',
 'Recommended mitigation',
 'Remediation',
 'Resolution',
}

rem_keys = set([k.lower() for k in rem_keys])

In [112]:
def get_cleaned_remediations(entry, exclude_keys=[]):
    text = ''
    rem_keys
    
    for k in entry:
        if k.lower() in rem_keys:
            text += k + ': ' + ''.join(entry[k]) + ' '
    
    text.replace('\n', ' ')

    return text

In [113]:
cleaned_no_sherlock_remediations = []

for i,entry in enumerate(no_sherlock_data):
    if not entry['code']:
        logger.debug(entry)
        continue

    cleaned_no_sherlock_remediations.append(
        (
            get_cleaned_explanations(entry),
            get_cleaned_remediations(entry),
            split_and_combine_code(entry['code']),
        )
    )

2024-05-29 22:52:31,978 - DEBUG - 2564363785 - <module> - {'code': [], 'Resolution': ['Addressed with the following changesets: fort-major/msq@7f9cde2 and fort-major/msq@0b9f8d1 (removing whitelisted method names, only allowing  icrc1_transfer)', 'The client provided the following statement:'], 'Description': ['Identities are bound to their origin (URL). Third-party origins are outside the scope of this Snap and are therefore in a lower trust zone where it is unsure what security measures are in place to protect the dApp from impersonating the users’ wallet identity. dApps may be hosted on integrity protecting endpoints (ipfs/IC), however, this is not enforced.', 'Protected RPC functions can only be invoked by the MSQ administrative origin. User consent may not consistently be enforced on the administrative origin.', 'The administrative origin is identified by the origin URL. According to the client the dApp is hosted on an integrity protecting endpoint (IC). This already protects from

--- Logging error ---
Traceback (most recent call last):
  File "C:\Python39\lib\logging\__init__.py", line 1086, in emit
    stream.write(msg + self.terminator)
  File "C:\Python39\lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\u03b1' in position 127: character maps to <undefined>
Call stack:
  File "C:\Python39\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Python39\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "d:\Documents\Programming\hackathon-chainlink\auditor\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "d:\Documents\Programming\hackathon-chainlink\auditor\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "d:\Documents\Programming\hackathon-chainlink\auditor\li

In [117]:
cleaned_no_sherlock_remediations[0][1]

'Resolution: Addressed by tezoroproject/metamask-snap#41 Recommendation: Validate the origin of all incoming RPC requests. Specifically, restrict access to the RPC endpoints to only the Tezoro management dApp. Additionally, consider removing any endpoints that are not essential for the Snap’s functionality. For example, the getToken endpoint for extracting the API token might be unnecessary and could be removed to enhance security. '

In [118]:
from langchain_core.documents import Document

def make_document_vulnerability(entry):
        return Document(
                page_content=entry[0],
                metadata={
                        "resolution": entry[1],
                        "code": ''.join(entry[2])
                }
        )

docs_remediations = [make_document_vulnerability(entry) for entry in cleaned_no_sherlock_remediations]

In [119]:
docs_remediations[0]

Document(page_content='Description: The Snap does not validate the origin of RPC requests, allowing any arbitrary dApp to connect to the Snap and initiate arbitrary RPC requests. Specifically, any dApp can access the privileged getToken and deleteToken RPC endpoints. Consequently, a malicious dApp could potentially extract a user’s Tezoro token from the Snap and impersonate the user in interactions with the Tezoro API. Depending on the permissions associated with this token, the implications could be critical. ', metadata={'resolution': 'Resolution: Addressed by tezoroproject/metamask-snap#41 Recommendation: Validate the origin of all incoming RPC requests. Specifically, restrict access to the RPC endpoints to only the Tezoro management dApp. Additionally, consider removing any endpoints that are not essential for the Snap’s functionality. For example, the getToken endpoint for extracting the API token might be unnecessary and could be removed to enhance security. ', 'code': "export co

In [121]:
mongoclient = MongoClient(ATLAS_DB_URI)

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=os.environ.get('OPENAI_API_KEY')
)

dbName = "vulnerability_details"
collectionName = "v1"
collection = mongoclient[dbName][collectionName]

vectorStore = MongoDBAtlasVectorSearch.from_documents(
    docs_remediations,
    embedding=embeddings,
    collection=collection,
)