In [None]:
import transformers
from transformers import BloomForCausalLM
from transformers import BloomForTokenClassification
from transformers import BloomForTokenClassification
from transformers import BloomTokenizerFast
import torch
from transformers import pipeline
import tensorflow as tf

# Bloom for Causal Language Modeling

In [None]:
# Pick the GPU when torch reports one, otherwise fall back to the CPU so this
# cell does not crash on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the BLOOMZ-560m tokenizer and causal-LM weights, then move the model to
# the selected device.
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloomz-560m")
model = BloomForCausalLM.from_pretrained("bigscience/bloomz-560m").to(device)

In [None]:

# The question is spliced between the triple-backtick delimiters; its leading
# and trailing spaces are part of the prompt text.
question = ' Tell me about some CVEs for Trojan Horses  '
prompt = (
    'Given the question delimited by triple backticks ```{'
    + question
    + '}```, what is the answer? Answer:'
)

result_length = 50

# Tokenize the prompt into PyTorch tensors.
inputs = tokenizer(prompt, return_tensors="pt")

In [None]:
#outputs = model(**inputs, labels=inputs["input_ids"])
#loss = outputs.loss
#logits = outputs.logits

## Attempting to Fine-Tune on Our Data


In [None]:
import sqlite3
import pandas as pd

# Connect to the SQLite database
conn = sqlite3.connect("C:/Users/Andrew/Downloads/mitre (1).sqlite")

try:
    # Fetch CVE number and description for up to 1000 rows, starting 100000
    # rows before the end of the MITRE table.
    query = "SELECT cve_number, description FROM MITRE LIMIT 1000 OFFSET (SELECT COUNT(*) FROM MITRE) - 100000"
    df = pd.read_sql_query(query, conn)
finally:
    # Close the connection even when the query raises, so the database file
    # handle is not leaked.
    conn.close()

# Convert CVE number and description to lowercase
df['cve_number'] = df['cve_number'].str.lower()
df['description'] = df['description'].str.lower()

# Display the DataFrame
print(df)

In [None]:
# Function to check if a string can be decoded in UTF-8
def is_utf8(s):
    """Report whether ``s`` is representable as valid UTF-8.

    Args:
        s: A ``bytes``/``bytearray`` value to decode, or a ``str`` to encode.

    Returns:
        True when the bytes decode as UTF-8 (or the string encodes to UTF-8),
        False otherwise. Any other type, including ``None``, yields False
        instead of raising ``AttributeError``.
    """
    if isinstance(s, (bytes, bytearray)):
        try:
            s.decode('utf-8')
        except UnicodeDecodeError:
            return False
        return True
    if isinstance(s, str):
        try:
            # A str only fails to encode when it holds lone surrogates.
            s.encode('utf-8')
        except UnicodeEncodeError:
            return False
        return True
    return False

def _utf8_or_none(value):
    """Return ``value`` unchanged when it is a UTF-8-encodable string, else None.

    Missing cells (None/NaN) and strings that cannot be encoded are mapped to
    None so the dropna step below removes their rows, instead of ``.encode``
    raising AttributeError or UnicodeEncodeError part-way through the column.
    """
    if not isinstance(value, str):
        return None
    try:
        encoded = value.encode('utf-8')
    except UnicodeEncodeError:
        return None
    return value if is_utf8(encoded) else None


# Replace every unreadable or missing cell with None, column by column
for column in df.columns:
    df[column] = df[column].apply(_utf8_or_none)

# Drop rows with NaN values
df = df.dropna()

# Display the DataFrame after dropping rows with unreadable text
print(df)

In [None]:
# Let pandas render up to 4000 rows before truncating the displayed frame.
pd.set_option("display.max_rows", 4000)

In [None]:
import json


# Build one training string per row: the description goes inside the
# triple-backtick question slot and the CVE number inside the answer braces.
prompt_head = "Given the question delimited by triple backticks ```{"
prompt_mid = "}```, what is the answer? Answer: {"
prompt_tail = "}"
df['prompt'] = prompt_head + df["description"] + prompt_mid + df["cve_number"] + prompt_tail
print(df.head())

# Single-column frame whose 'text' column holds the assembled prompts.
df_2 = df['prompt'].to_frame(name='text')

# Serialize as a JSON array of {"text": ...} records and preview the start.
result = df_2.to_json(orient="records")
print(result[:1000])

# Persist the records to result.json in the current working directory.
with open('result.json', 'w') as out_file:
    out_file.write(result)

In [None]:
import pandas as pd
import torch
import json
from transformers import BloomTokenizerFast, BloomForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset

# Load the BLOOMZ-560m tokenizer and model; the model is placed on the CPU.
base_checkpoint = "bigscience/bloomz-560m"
tokenizer = BloomTokenizerFast.from_pretrained(base_checkpoint)
model = BloomForCausalLM.from_pretrained(base_checkpoint)
model = model.to("cpu")

# Read the exported prompt records back in as plain Python objects.
with open("C:/Users/Andrew/Downloads/result.json", "r") as records_file:
    dataset = json.loads(records_file.read())

# Pretty-print the parsed records for inspection.
pretty_records = json.dumps(dataset, indent=2)
print(pretty_records)

In [None]:
# Prepare the data for training: load the exported prompt records through the
# `datasets` JSON loader.
training_json_path = "C:/Users/Andrew/result.json"
dataset = load_dataset("json", data_files=training_json_path)

def prepare_train_data(data, max_length=1500):
    """Tokenize a batch of prompt+completion strings for causal-LM training.

    Args:
        data: Batch mapping whose 'text' entry holds the strings to tokenize.
        max_length: Token length every example is padded and truncated to.
            Defaults to 1500, the value previously hard-coded here.

    Returns:
        The tokenizer output with an added 'labels' entry: a copy of
        'input_ids' in which padding positions are set to -100.
    """
    # prompt + completion
    text_input = data['text']
    # Pad AND truncate to max_length so every row in the batch has the same
    # length; without truncation an over-long text yields a ragged batch that
    # cannot be returned as a single tensor.
    tokenized_input = tokenizer(
        text_input,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    # Generative models: labels are the input ids. Clone so the labels do not
    # alias input_ids, then mask padding with -100 so those positions are
    # ignored by the loss instead of training the model to emit pad tokens.
    labels = tokenized_input['input_ids'].clone()
    labels[tokenized_input['attention_mask'] == 0] = -100
    tokenized_input['labels'] = labels
    return tokenized_input

# Tokenize the 'train' split in batches and drop the raw 'text' column so only
# the tokenizer outputs and labels remain.
train_split = dataset['train']
train_dataset = train_split.map(
    prepare_train_data,
    batched=True,
    remove_columns=["text"],
)

In [None]:
import os
# Set CUDA memory management options.
# PYTORCH_CUDA_ALLOC_CONF takes comma-separated `option:value` pairs. The
# previous value used `=` and named `max_merge_size`, which is not a documented
# allocator option, so only the split-size limit is kept.
# NOTE(review): the split size was 8; it is raised to 32 because PyTorch rejects
# values at or below 20 MB -- confirm the intended limit.
# This must be set before the first CUDA allocation in the process to apply.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"


In [None]:
import torch
# Ask PyTorch to release its cached, currently unused GPU memory before the
# training cell below runs.
torch.cuda.empty_cache()

In [None]:
# Training configuration. Checkpoints and the final model are written to the
# 'Purdue-bloom-560m' output directory.
training_arguments = TrainingArguments(
    'Purdue-bloom-560m',
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    # fp16 mixed precision is only usable on CUDA devices; enabling it
    # unconditionally makes TrainingArguments fail on CPU-only hosts.
    fp16=torch.cuda.is_available(),
    optim="adamw_torch",
    gradient_checkpointing=True,
    gradient_accumulation_steps=1
)


trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset
)

print(trainer)
# Run the single training epoch, then save the fine-tuned weights into the
# output directory so the inference cell can load "Purdue-bloom-560m".
trainer.train()
trainer.save_model()

In [None]:
import torch
from transformers import pipeline
from transformers import BloomTokenizerFast, BloomForCausalLM

# Inference with the fine-tuned checkpoint saved in "Purdue-bloom-560m"; the
# tokenizer still comes from the base BLOOMZ-560m checkpoint.
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloomz-560m")
model = BloomForCausalLM.from_pretrained("Purdue-bloom-560m", low_cpu_mem_usage=True)
model = model.to("cpu")

prompt = 'Given the question delimited by triple backticks ```{What is the CVE number for a denial of service vulnerability}```, what is the answer? Answer:'

# Greedy decoding (sampling disabled); max_length caps the generation at 45.
generator = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    do_sample=False,
)
result = generator(prompt, max_length=45)
print(result)

In [None]:
import torch
from transformers import pipeline
from transformers import BloomTokenizerFast, BloomForCausalLM

# Baseline inference with the unmodified BLOOMZ-560m checkpoint, using the same
# prompt and decoding settings as the fine-tuned run for comparison.
base_checkpoint = "bigscience/bloomz-560m"
tokenizer = BloomTokenizerFast.from_pretrained(base_checkpoint)
model = BloomForCausalLM.from_pretrained(base_checkpoint, low_cpu_mem_usage=True)
model = model.to("cpu")

prompt = 'Given the question delimited by triple backticks ```{What is the CVE number for a denial of service vulnerability}```, what is the answer? Answer:'

# Greedy decoding (sampling disabled); max_length caps the generation at 45.
generator = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    do_sample=False,
)
result = generator(prompt, max_length=45)
print(result)

# Pseudo SQL RAG Implementation

In [1]:
from langchain.utilities import SQLDatabase
from langchain_experimental.sql import SQLDatabaseChain
from langchain.prompts import PromptTemplate
from langchain.prompts.chat import HumanMessagePromptTemplate
from langchain.schema import HumanMessage, SystemMessage
import pandas as pd

from transformers import BloomForCausalLM
from transformers import BloomForTokenClassification
from transformers import BloomForTokenClassification
from transformers import BloomTokenizerFast
import torch
from transformers import pipeline
import tensorflow as tf
import keras




In [2]:
import pyodbc
#Add your own SQL Server IP address, PORT, UID, PWD and Database
# The password can be overridden through the PGPASSWORD environment variable so
# a real credential never has to be written into the notebook; the fallback is
# the original placeholder value.
import os

conn = pyodbc.connect(
    'DRIVER={PostgreSQL Unicode};SERVER=localhost;PORT=5432;DATABASE=postgres;UID=postgres;PWD='
    + os.environ.get('PGPASSWORD', 'mysecretpassword'),
    autocommit=True)

# Text columns, per table, whose empty-string values are rewritten to the
# literal string 'None'.
EMPTY_TEXT_COLUMNS = {
    'Vulnerabilities': ('vulnerability_id', 'description', 'severity', 'required_action'),
    'AffectedProducts': ('vulnerability_id', 'product_name', 'version'),
    'ReferenceData': ('vulnerability_id', 'url', 'description'),
}

try:
    cur = conn.cursor()

    # Table and column names come only from the constant mapping above (SQL
    # identifiers cannot be bound as parameters), so the f-string is not fed
    # by any external input.
    for table, columns in EMPTY_TEXT_COLUMNS.items():
        for column in columns:
            cur.execute(f"UPDATE {table} SET {column} = 'None' WHERE {column} = ''")

    # Store published_date as TEXT.
    cur.execute("ALTER TABLE Vulnerabilities ALTER COLUMN published_date TYPE TEXT")
finally:
    # Release the connection even if one of the statements fails.
    conn.close()
#cur.execute("UPDATE Vulnerabilities SET published_date = TO_CHAR(published_date, 'YYYY-MM-DD') WHERE published_date IS NOT NULL")



In [3]:
import pyodbc
#Add your own SQL Server IP address, PORT, UID, PWD and Database
# The password can be overridden through the PGPASSWORD environment variable;
# the fallback is the original placeholder value.
import os

conn = pyodbc.connect(
    'DRIVER={PostgreSQL Unicode};SERVER=localhost;PORT=5432;DATABASE=postgres;UID=postgres;PWD='
    + os.environ.get('PGPASSWORD', 'mysecretpassword'),
    autocommit=True)

try:
    cur = conn.cursor()
    # Up to 100 vulnerabilities whose published_date compares greater than
    # '2024-01-01'.
    cur.execute("SELECT * FROM Vulnerabilities WHERE published_date > '2024-01-01' LIMIT 100")
    rows = cur.fetchall()
finally:
    # All rows are already fetched, so the connection can be released before
    # printing (and is released even if the query fails).
    conn.close()

for row in rows:
    print(row)
    print("\n\n\n")

('CVE-2018-15133', 'Laravel Framework contains a deserialization of untrusted data vulnerability, allowing for remote command execution. This vulnerability may only be exploited if a malicious user has accessed the application encryption key (APP_KEY environment variable).', '2024-01-16', '6.8', 'Apply mitigations per vendor instructions or discontinue use of the product if mitigations are unavailable.')




('CVE-2020-3259', 'Cisco Adaptive Security Appliance (ASA) and Firepower Threat Defense (FTD) contain an information disclosure vulnerability. An attacker could retrieve memory contents on an affected device, which could lead to the disclosure of confidential information due to a buffer tracking issue when the software parses invalid URLs that are requested from the web services interface. This vulnerability affects only specific AnyConnect and WebVPN configurations.', '2024-02-15', '5.0', 'Apply mitigations per vendor instructions or discontinue use of the product if mitigations a

In [4]:
from langchain_community.utilities import SQLDatabase
# LangChain handle on the local Postgres instance, built from a connection URI.
postgres_uri = "postgresql://postgres:mysecretpassword@localhost"
db = SQLDatabase.from_uri(postgres_uri)


In [5]:

#tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloomz-560m")
#model = BloomForCausalLM.from_pretrained("bigscience/bloomz-560m").to("cuda")

In [None]:
import torch
import time
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain_experimental.sql import SQLDatabaseSequentialChain
from langchain.chains import create_sql_query_chain
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from huggingface_hub import notebook_login
from transformers import LlamaForCausalLM, LlamaTokenizer
 
#from transformers import BloomTokenizerFast
#tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom", add_prefix_space=True, is_split_into_words=True)

#tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
# Hugging Face access token for the gated Llama-2 weights. It is read from the
# HF_TOKEN environment variable rather than committed in the notebook; when the
# variable is unset this is None and the loaders receive token=None.
# NOTE(review): the token literal previously stored on this line was committed
# in source and should be revoked.
import os

token = os.environ.get("HF_TOKEN")


#CHANGE THE BELOW LINE WITH WHATEVER MODEL YOU FIND ON HUGGINGFACE OR WHEREVER
model_id="meta-llama/Llama-2-7b-chat-hf"  # "bigscience/bloomz-560m"

#use_fast=False comes from https://github.com/langchain-ai/langchain/discussions/18192 attempting to fix an error (Rust vs. Python)
tokenizer = LlamaTokenizer.from_pretrained(model_id, use_fast=False, token=token)


# Pass the token through so it is actually used when fetching the weights.
model = AutoModelForCausalLM.from_pretrained(model_id, token=token)
pipe = pipeline(
    "text-generation", model=model, tokenizer=tokenizer, max_new_tokens=100
)
# Wrap the transformers pipeline so LangChain can drive it as an LLM.
llm = HuggingFacePipeline(pipeline=pipe)

#instruct_pipeline = pipeline(model="meta-llama/Llama-2-70b-chat-hf",token = token, trust_remote_code=True, use_auth_token=True, use_fast=False, device_map="auto", return_full_text=True, do_sample=False, max_new_tokens=128)
#hf_pipe = HuggingFacePipeline(pipeline=instruct_pipeline)
#chain = SQLDatabaseSequentialChain.from_llm(llm=hf_pipe, db=db, verbose=True)
# Build the SQL-query chain over `db` and invoke it with a natural-language
# question.
chain = create_sql_query_chain(llm=llm, db=db)
chain.invoke({"question": "How many new CVES since January of 2024?"})

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
import torch
import time
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain_experimental.sql import SQLDatabaseSequentialChain
from langchain.chains import create_sql_query_chain
from transformers import AutoTokenizer

#tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")


# Options for the Llama-2 chat pipeline: sampling disabled, full text returned,
# at most 128 new tokens per call.
llama_pipeline_options = dict(
    model="meta-llama/Llama-2-7b-chat-hf",
    use_fast=False,
    trust_remote_code=True,
    device_map="auto",
    return_full_text=True,
    do_sample=False,
    max_new_tokens=128,
)
instruct_pipeline = pipeline(**llama_pipeline_options)

# Wrap the pipeline for LangChain and build the SQL-query chain over `db`.
hf_pipe = HuggingFacePipeline(pipeline=instruct_pipeline)
chain = create_sql_query_chain(llm=hf_pipe, db=db)

# Invoke the chain with a natural-language question.
question_payload = {"question": "How many new CVES since January of 2024?"}
chain.invoke(question_payload)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



