In [None]:
import csv
from urllib.parse import urljoin

import google.generativeai as genai
import requests
from bs4 import BeautifulSoup
from google.colab import userdata
from tqdm import tqdm

def scrape_content(url, timeout=30):
    """Fetch *url* and extract its description paragraphs and tables.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block the
            whole scrape indefinitely.

    Returns:
        A ``(paragraphs, tables)`` tuple. ``paragraphs`` is a list with the
        stripped text of every element whose class is ``description-body``.
        ``tables`` is a list of tables, each a list of rows, each row a list
        of stripped cell texts. When a table's first row has an ``ID``
        header, the cell at that position is dropped from every row.

    Raises:
        requests.RequestException: If the request fails or times out. The
            HTTP status is deliberately not checked, so an error page simply
            yields whatever matching elements it contains (usually none).
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    # Scrape paragraphs
    paragraphs = [
        element.get_text(strip=True)
        for element in soup.find_all(class_="description-body")
    ]

    # Scrape tables
    tables = []
    table_classes = ["tables-mobile", "table datasources-table table-bordered"]
    for table_class in table_classes:
        for table_element in soup.find_all(class_=table_class):
            rows = table_element.find_all("tr")
            if not rows:
                # A matched element without any <tr> has no header row to
                # inspect; indexing rows[0] would raise IndexError.
                continue

            # Locate the "ID" column in the header row (None when absent, so
            # the filter below keeps every cell).
            headers = [th.get_text(strip=True) for th in rows[0].find_all("th")]
            id_index = headers.index("ID") if "ID" in headers else None

            table = []
            for row in rows:
                cells = row.find_all(["td", "th"])
                table.append([
                    cell.get_text(strip=True)
                    for idx, cell in enumerate(cells)
                    if idx != id_index
                ])
            tables.append(table)

    return paragraphs, tables

def extract_links(url, timeout=30):
    """Return the links found inside the page's matrix container.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list of ``(link_text, href)`` tuples, one per ``<a href=...>``
        inside the ``<div class="matrix-container p-3">`` element. The list
        is empty when the page has no such container.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find('div', class_='matrix-container p-3')
    if container is None:
        # soup.find returns None when nothing matches; calling .find_all on
        # it would raise an opaque AttributeError.
        return []

    return [(link.text, link['href']) for link in container.find_all('a', href=True)]

# URL of the main page
main_url = 'https://attack.mitre.org/'

# Extract links from the main page
extracted_links = extract_links(main_url)

all_paragraphs = []
all_tables = []
seen_urls = set()

# Scrape content from each extracted link
for _link_text, href in tqdm(extracted_links, desc="Scraping Content"):
    # urljoin resolves relative hrefs against the main page and leaves
    # absolute hrefs untouched, so both forms produce a valid URL.
    url = urljoin(main_url, href)

    # Skip pages that were already scraped in case the same page is linked
    # more than once; re-fetching it would only duplicate the stored data.
    if url in seen_urls:
        continue
    seen_urls.add(url)

    paragraphs, tables = scrape_content(url)

    # Store the scraped data (both lists are kept in the same order and are
    # keyed by the page URL).
    all_paragraphs.append({
        "url": url,
        "paragraphs": paragraphs
    })
    all_tables.append({
        "url": url,
        "tables": tables
    })

# Generate questions and save to TSV
gemini_key = userdata.get('m')

genai.configure(api_key=gemini_key)
model = genai.GenerativeModel(model_name="gemini-1.5-flash")

# Index the scraped tables by URL once so each lookup in the loop is O(1).
# setdefault keeps the first entry for a URL, matching a first-match search.
tables_by_url = {}
for item in all_tables:
    tables_by_url.setdefault(item["url"], item["tables"])

with open('CTI_MCQ.tsv', 'a', encoding='utf-8') as file:
    # The file is opened in append mode so rows from earlier runs are kept;
    # the header is only needed when the file is still empty.
    if file.tell() == 0:
        file.write('\t'.join(['URL', 'Question', 'Option A', 'Option B', 'Option C', 'Option D', 'GT', 'Prompt']) + '\n')

    for entry in tqdm(all_paragraphs, desc="Processing URLs"):
        url = entry["url"]
        paragraphs = entry["paragraphs"]
        tables = tables_by_url.get(url, [])

        content = " ".join(paragraphs) + " " + " ".join([" ".join(row) for table in tables for row in table])

        # The joined text always contains at least the separating space, so
        # test for non-whitespace content rather than for a non-zero length.
        if content.strip():
            prompt = f"""You are a cybersecurity expert specializing in cyber threat intelligence. Given the text below, please
            generate a maximum of 5 multiple-choice questions with four possible options each.
            Follow these requirements:
            1. Question Format: Each question must have four options. The options should be challenging and require
            careful consideration. Avoid creating options that could be interpreted as correct under different circumstances.
            2. Target Audience: The questions should be suitable for security professionals with three to five years of
            experience in cyber threat intelligence. Avoid generic questions such as “What is the objective?”, “Which
            operating system can be targeted?”.
            3. Content Coverage: Aim to cover various sections of the document to ensure a comprehensive evaluation of
            the candidate’s knowledge. Include context-specific questions that require an understanding of the document’s
            content.
            4. Technical Precision: Use precise terminology and concepts relevant to cyber threat intelligence. Incorporate
            situational or scenario-based questions where applicable.
            5. Include Technique IDs and Names: Ensure that all questions, where applicable, mention both the ID and the
            full name of the MITRE ATT&CK pattern technique.
            6. Premise Inclusion: Each question should include a premise indicating it pertains to MITRE ATT&CK,
            specifying the relevant platform (Enterprise, ICS, or Mobile) where necessary.
            7. Output Format: Return the output in TSV format (must be tab-separated) with the following columns:
            Question, Option A, Option B, Option C, Option D, Correct Answer should be one letter (A, B, C, D).
            Important: Only return the TSV (tab separator \t) content as specified. Do not include any additional text or commentary outside the TSV format.
            """ + content

            response = model.generate_content(prompt, safety_settings="BLOCK_NONE")

            try:
                response_text = response.text
            except ValueError as err:
                # `.text` raises ValueError when the response carries no
                # usable text (for example a blocked or empty candidate).
                # Skip this page instead of aborting the whole run.
                tqdm.write(f"Skipping {url}: no usable model response ({err})")
                continue

            for line in response_text.strip().split('\n'):
                options = [field.strip() for field in line.split('\t')]
                if len(options) == 6:
                    question, option_a, option_b, option_c, option_d, correct_answer = options

                    # Drop a header row wherever it appears rather than
                    # blindly discarding the first line, which would lose a
                    # real question when the model omits the header.
                    if question.casefold() == 'question':
                        continue

                    question_prompt = f"""You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D.
                    **Question:** {question}
                    **Options:**
                    A) {option_a}
                    B) {option_b}
                    C) {option_c}
                    D) {option_d}
                    **Important:** The last line of your answer should contain only the single letter corresponding to the best option, with no additional text."""
                    # Flatten the prompt to a single TSV-safe cell: remove the
                    # source indentation, newlines and any tab characters.
                    flat_prompt = question_prompt.replace('  ', '').replace('\n', ' ').replace('\t', '')
                    file.write('\t'.join([url, question, option_a, option_b, option_c, option_d, correct_answer, flat_prompt]) + '\n')

print("TSV file created successfully.")
