<a href="https://colab.research.google.com/github/dorinhazan/FinalProject-DataScience/blob/main/classification_grouping_types.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import openai
import zipfile
import json
import os

In [None]:
!pip install openai

In [None]:
# Upgrade the OpenAI SDK to the newest published release.
# NOTE(review): a later cell installs openai==0.28 and the API call uses
# `openai.ChatCompletion`, which looks like the pre-1.0 interface — confirm
# whether this upgrade is still wanted.
!pip install --upgrade openai

In [None]:
# Read the API key from the environment instead of embedding it in the notebook,
# so the secret is never saved or committed with the file. Falls back to an
# empty string (the previous literal value) when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [None]:
# 1. Update these paths
zip_path        = '/content/json_results.zip'
extract_dir     = 'extracted_jsons'
merged_json_path = 'merged_results.json'

# 2. Extract JSON files
with zipfile.ZipFile(zip_path, 'r') as zf:
    zf.extractall(extract_dir)

# 3. Merge into one dictionary, extending duplicates' observables.
#    Each JSON file is read as a mapping of document name -> document dict.
merged = {}
# sorted() makes the merge order (and therefore the order of the combined
# observables lists) independent of the filesystem's listing order.
for filename in sorted(os.listdir(extract_dir)):
    # only process .json files
    if not filename.lower().endswith('.json'):
        continue

    full_path = os.path.join(extract_dir, filename)
    with open(full_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for md_name, doc in data.items():
        if md_name not in merged:
            merged[md_name] = doc
        else:
            # Extend the observables list from both documents. setdefault()
            # covers the case where the first-seen document had no
            # "observables" key, which previously raised KeyError.
            merged[md_name].setdefault("observables", []).extend(
                doc.get("observables", [])
            )
            # you can merge other fields here if needed

# 4. Save the merged JSON
with open(merged_json_path, 'w', encoding='utf-8') as f:
    json.dump(merged, f, indent=2)

In [None]:
# Collect every observable's 'classification' value across all merged
# documents, lowercased; the set removes duplicates.
unique_classifications = set()
for document in merged.values():
    for observable in document.get('observables', []):
        # Observables without a 'classification' entry are skipped.
        if 'classification' not in observable:
            continue
        unique_classifications.add(observable['classification'].lower())

In [2]:
# Coarse ("gross") classification labels that the fine-grained observable
# classifications are grouped under.
gross_classifications = {
    "System", "Module", "Hardware", "Device",
    "PLC", "POU",
    "Identifier", "Information", "Credential", "Data",
    "Command", "API", "Code", "Process",
    "Configuration", "Log", "File", "Payload",
    "Software", "Service", "Firmware", "Server",
    "Communication", "Protocol", "Protocol Field", "Connection Port",
    "Vulnerability", "Attack Technique",
}

In [None]:
# Prompt asking the model to group every observable classification under one
# gross classification (or a new "– new" category).
# This is an f-string: the two label sets and the label count are interpolated,
# so the literal braces of the JSON example must be doubled ({{ and }}).
# With single braces Python parses the example as a replacement field and the
# cell fails instead of building the prompt.
prompt = f"""
You are a cybersecurity analyst expert. Your job is to group a list of observable “classification” labels under gross classifications from MITRE ATT&CK® Enterprise, or if not matched to any gross classification - create a new category of gross classification.

---
### Input
   - A set of unique observable classifications:
     {sorted(unique_classifications)}

   - A set of gross classifications from MITRE ATT&CK® Enterprise:
     {sorted(gross_classifications)}

---
### Task
   - For each unique observable classifications, choose exactly one gross classification it best fits.
   - If it does not clearly belong to any of the gross classification labels given, assign it to a new category of your own choosing, appending “– new” to that category name (e.g. “Data Management – new”).

---
### Output
   - Return **only** a single JSON object.
   - Each key must be either one of the {len(gross_classifications)} gross classification label names or one of your “– new” categories.
   - Each value must be an array of the unique observable classifications assigned to that key.

---
### Response format (return *only* this JSON)

```json
{{
  "Code": [
    "script file"
  ],
  "Command": [
    "command string"
  ],
  "Identifier": [
    "ip address",
    "domain"
  ],
  "Data": [
    "file format",
    "file signature"
  ]
}}

"""

In [None]:
!pip install openai==0.28

In [None]:
# Send the grouping prompt to the o3-mini snapshot and print the raw reply.
# NOTE(review): the previous comment said "o3-mini-high", but the request only
# names the model snapshot and passes no reasoning-effort setting — confirm
# whether high reasoning effort was intended.
chat_model = "o3-mini-2025-01-31"
chat_messages = [
    {"role": "system", "content": "You are a helpful cybersecurity expert."},
    {"role": "user", "content": prompt},
]

if hasattr(openai, "OpenAI"):
    # openai>=1.0: `openai.ChatCompletion` was removed; requests go through a
    # client object. Passing None when no key was set lets the client fall
    # back to the OPENAI_API_KEY environment variable.
    client = openai.OpenAI(api_key=openai.api_key or None)
    response = client.chat.completions.create(
        model=chat_model,
        messages=chat_messages,
    )
else:
    # openai<1.0 (e.g. the pinned 0.28): module-level ChatCompletion interface.
    response = openai.ChatCompletion.create(
        model=chat_model,
        messages=chat_messages,
    )

# Print the model's reply (expected to be the JSON grouping requested by the prompt)
print(response.choices[0].message.content)