In [None]:
import polars as pl
import datetime as dt
from io import StringIO
import json
from typing import Any, Dict, Iterable
import os
import re
import pathlib
import shutil
import zstandard as zstd
import csv
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

from rapidfuzz import fuzz, distance  # well-maintained Levenshtein scoring
from jaro import jaro_winkler_metric
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sentence_transformers import util

In [None]:
import csv

# csv.DictReader is lazy: it must be consumed while the file is still open,
# otherwise the variable ends up bound to a reader over a closed handle.
# newline='' is what the csv module expects for files it parses itself.
with open('../data/csvs/data_mappings.csv', 'r', newline='') as f:
    data_mapping_csv = list(csv.DictReader(f))

# Preview the first few mapping rows (each row is a dict keyed by the CSV header).
data_mapping_csv[:5]

In [None]:
import boto3
import re
from urllib.parse import urlparse
from botocore.exceptions import BotoCoreError, ClientError
# Confirm that credentials resolve without echoing the access key, secret key,
# or session token into the notebook output (outputs are saved and shared).
# get_credentials() returns None when nothing is configured, so guard for it.
_aws_credentials = boto3.Session().get_credentials()
print("AWS credentials resolved" if _aws_credentials is not None else "No AWS credentials found")

# Okta system-log objects to download for 2024-11-20.
s3_okta = [
    "s3://reach-sandbox1/customers/48508c3f-af34-4272-913f-331cb2b3db4b/okta/system_logs/2024-11-20/okta_logs_1.zst",
    "s3://reach-sandbox1/customers/48508c3f-af34-4272-913f-331cb2b3db4b/okta/system_logs/2024-11-20/okta_logs_2.zst",
    "s3://reach-sandbox1/customers/48508c3f-af34-4272-913f-331cb2b3db4b/okta/system_logs/2024-11-20/okta_logs.zst",
]
def parse_s3_url(s3_url: str) -> tuple[str, str]:
    """Split an ``s3://bucket/key`` URL into its bucket and object key.

    Args:
        s3_url: URL of the form ``s3://<bucket>/<key>``.

    Returns:
        ``(bucket, key)`` with the leading slash removed from the key.

    Raises:
        ValueError: If the scheme is not ``s3`` or the bucket or key is missing.
    """
    parsed = urlparse(s3_url)
    bucket = parsed.netloc
    # Strip before validating: a bare "s3://bucket/" has path "/", which is
    # truthy but leaves an empty object key.
    key = parsed.path.lstrip("/")
    if parsed.scheme != "s3" or not bucket or not key:
        raise ValueError(f"Invalid S3 URL format: {s3_url}")
    return bucket, key

def download_s3_url_object(s3_url: str, destination_path: str, region: str = "us-west-1") -> bool:
    """Download one S3 object, given as an ``s3://`` URL, to a local path.

    Failures are reported on stdout rather than raised, so a batch of
    downloads can continue past a bad URL or a missing object.

    Args:
        s3_url: ``s3://bucket/key`` URL of the object.
        destination_path: Local filename handed to ``download_file``.
        region: AWS region used to build the S3 client.

    Returns:
        True when the download completed, False when any error was caught.
    """
    succeeded = False
    try:
        bucket, key = parse_s3_url(s3_url)
        boto3.client("s3", region_name=region).download_file(
            Bucket=bucket, Key=key, Filename=destination_path
        )
        print(f"Downloaded '{key}' from '{bucket}' to '{destination_path}'")
        succeeded = True
    except (ValueError, ClientError, BotoCoreError) as known_error:
        # Malformed URL or an error reported by botocore.
        print(f"[ERROR] {type(known_error).__name__}: {known_error}")
    except Exception as unexpected_error:
        print(f"[UNEXPECTED ERROR] {unexpected_error}")
    return succeeded
# download_file() needs a target *file* path, not a directory, so derive a
# per-object filename from the last segment of each S3 key.
download_dir = pathlib.Path('../data/test')
download_dir.mkdir(parents=True, exist_ok=True)

# One success flag per URL, in the same order as s3_okta.
data_list = []
for s in s3_okta:
    local_path = download_dir / pathlib.PurePosixPath(urlparse(s).path).name
    data_list.append(download_s3_url_object(s, str(local_path)))

#### Load Data:
S3 - 48508c3f-af34-4272-913f-331cb2b3db4b
- https://us-west-1.console.aws.amazon.com/s3/buckets/reach-sandbox1?region=us-west-1&bucketType=general&prefix=customers%2F48508c3f-af34-4272-913f-331cb2b3db4b%2F&showversions=false

*"...Also found out Caesars (the hotel/casino) runs BloodHound... Their Okta and CrowdStrike are still connected. AD has been disconnected for a while so it'll be a little out of date. Did a POV with them last year and all of Vegas ended up having budget issues, finally coming back to them and Wynn." - Colt*

We will use the date 2024-11-20 for each data source. The CrowdStrike data is a non-empty JSON that contains only keys, all the way back to 2024-10-10.

`aws s3 cp s3://reach-sandbox1/customers/48508c3f-af34-4272-913f-331cb2b3db4b/okta/system_logs/2024-11-20/okta_logs.zst ./ --region us-west-1`

Okta:
- s3://reach-sandbox1/customers/48508c3f-af34-4272-913f-331cb2b3db4b/okta/system_logs/2024-11-20/okta_logs_1.zst
- s3://reach-sandbox1/customers/48508c3f-af34-4272-913f-331cb2b3db4b/okta/system_logs/2024-11-20/okta_logs_2.zst
- s3://reach-sandbox1/customers/48508c3f-af34-4272-913f-331cb2b3db4b/okta/system_logs/2024-11-20/okta_logs.zst

TAP:
- s3://reach-sandbox1/customers/48508c3f-af34-4272-913f-331cb2b3db4b/tap/attacks/2024/11-20/users.zst

MS Exchange:
- s3://reach-sandbox1/customers/48508c3f-af34-4272-913f-331cb2b3db4b/exchange_org_details/2024-11-20.zst

Azure AD:
- s3://reach-sandbox1/customers/48508c3f-af34-4272-913f-331cb2b3db4b/ad/azure/2024-11-20/azure_ad.zst

Customer data source: https://us-west-1.console.aws.amazon.com/s3/buckets/reach-sandbox1?bucketType=general&prefix=customers%2F297ede4a-3dc4-4571-8aed-93f7813fccc1%2F&region=us-west-1&tab=objects

In [None]:
# Decompress target files
def normalize(name: str) -> str:
    """Turn a file name into a lowercase, filesystem-friendly token.

    The name is lowercased and trimmed, each run of whitespace becomes a single
    underscore, and every character outside ``a-z 0-9 . _ -`` is dropped.
    """
    cleaned = name.lower().strip()
    # Order matters: whitespace is converted first so it survives the filter.
    for pattern, replacement in ((r'\s+', '_'), (r'[^a-z0-9._-]', '')):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def unzst_directory(src_dir: pathlib.Path):
    """Decompress every ``.zst`` file found under ``src_dir``, in place.

    The tree is walked recursively. Each ``*.zst`` file is written next to its
    source, with the ``.zst`` suffix removed and the remaining name passed
    through ``normalize``. One line is printed per file decompressed.
    """
    src_dir = src_dir.resolve()
    for dirpath, _, filenames in os.walk(src_dir):
        current_dir = pathlib.Path(dirpath)
        # Output directory mirrors the input location under src_dir.
        output_dir = src_dir / current_dir.relative_to(src_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for filename in (entry for entry in filenames if entry.endswith('.zst')):
            compressed_path = current_dir / filename
            # filename[:-4] drops the ".zst" suffix before normalising.
            decompressed_path = output_dir / pathlib.Path(normalize(filename[:-4]))
            print(f"Decompressing {compressed_path} → {decompressed_path}")
            with open(compressed_path, 'rb') as ifh, open(decompressed_path, 'wb') as ofh:
                zstd.ZstdDecompressor().copy_stream(ifh, ofh)

def dict_with_most_keys(dicts: Iterable[Dict[Any, Any]]) -> Dict[Any, Any]:
    """
    Pick the dictionary with the largest number of keys.

    Ties go to the first such dictionary in iteration order.
    Raises ValueError when the iterable yields nothing.
    """
    try:
        # max() keeps the first maximal element, which gives the tie-break rule.
        largest = max(dicts, key=len)
    except ValueError as empty_error:
        # max() raises ValueError on an empty iterable; re-raise with a clearer message.
        raise ValueError("The input iterable must contain at least one dictionary") from empty_error
    else:
        return largest



In [None]:
# Use the same repo-relative data directory that the loader cells below read from
# ("../data/customers/caesars/..."), rather than an absolute "/data/..." path.
unzst_directory(pathlib.Path("../data/customers/caesars"))

##### Okta Data

In [None]:
# Placeholders for the raw Okta payloads; the loader cells below fill them in.
okta_data = okta_data_1 = okta_data_2 = None  # System logs
okta_classic_config_data = okta_classic_group_data = None

In [None]:
# These are Okta System Logs:https://developer.okta.com/docs/api/openapi/okta-management/management/tag/SystemLog/
# Load the un-numbered export here; the _1 and _2 exports are loaded into
# okta_data_1 / okta_data_2 below, so the combined log does not repeat one file.
with open("../data/customers/caesars/okta/2024-11-20_okta_logs.json", 'r') as f:
    okta_data = json.load(f)

In [None]:
# First event pretty-printed, followed by the number of events loaded.
print(json.dumps(okta_data[0], indent=2), len(okta_data), sep="\n")

In [None]:
# Second system-log export (the "_1" file).
with open("../data/customers/caesars/okta/2024-11-20_okta_logs_1.json", 'r') as okta_logs_1_file:
    okta_data_1 = json.load(okta_logs_1_file)

In [None]:
# First event pretty-printed, followed by the number of events loaded.
print(json.dumps(okta_data_1[0], indent=2), len(okta_data_1), sep="\n")

In [None]:
# Third system-log export: read the "_2" file rather than re-reading "_1".
with open("../data/customers/caesars/okta/2024-11-20_okta_logs_2.json", 'r') as f:
    okta_data_2 = json.load(f)

In [None]:
# First event pretty-printed, followed by the number of events loaded.
print(json.dumps(okta_data_2[0], indent=2), len(okta_data_2), sep="\n")

In [None]:
# Classic Okta configuration export.
with open("../data/customers/caesars/okta/2024-11-20_classic_okta_configs.json", 'r') as classic_config_file:
    okta_classic_config_data = json.load(classic_config_file)

In [None]:
# Top-level sections of the classic config export.
print(okta_classic_config_data.keys())
# First five keys under "devices", then how many entries "devices" holds.
print(list(okta_classic_config_data.get("devices").keys())[:5])
print(len(okta_classic_config_data.get("devices")))

In [None]:
# Classic Okta groups export.
with open("../data/customers/caesars/okta/2024-11-20_classic_okta_groups.json", 'r') as classic_groups_file:
    okta_classic_group_data = json.load(classic_groups_file)

In [None]:
# Group names present in the classic groups export.
print(okta_classic_group_data.keys())
# First five keys under the "Everyone" group, then how many entries it holds.
print(list(okta_classic_group_data.get("Everyone").keys())[:5])
print(len(okta_classic_group_data.get("Everyone")))


In [None]:
# Concatenate the three system-log exports into one list of events.
okta_system_logs = [*okta_data, *okta_data_1, *okta_data_2]
print(len(okta_system_logs))

#### Join Okta log data #1 (unsure yet how it differs from the other log files) on AD user data
Multiple ID fields for Okta JSON log data:
    - uuid = the event ID, generated by Okta to identify an event
    - actor.id = (initiator ID) a 20-character string beginning with `00u` for the system, or user that triggered the event
    - actor.alternateId = can contain username, or email
    - target.id = Generated by okta
    - client.id = client/application identifier OAuth client, API token, or agent. Okta internal ID
    - transaction.id = group events that occur together in the same flow
    - externalSessionId = correlates events within the same okta session

Name fields:
    - displayName
    - alternateId

The IDs generated by Okta will likely not help correlate the data; the candidate join fields are username, IP, email, and computer name (when it is not None).

Links:
- ipChain - https://www.obsidiansecurity.com/blog/how-to-use-client-ip-addresses-in-okta-audit-logs

Example Device data from AD:
```JSON
{
    "displayName": "Kendell Douglas",
    "mailNickname": "kdouglas1",
    "givenName": "Kendell",
    "surname": "Douglas",
    "jobTitle": "Retail Planning Manager",
    "department": "CLV_HEC: 08800_CES 9IF_CEI MERCHANDISING ADMIN",
    "mail": "kdouglas1@caesars.com",
    "otherMails": [
        "kdouglas1@lvrio.harrahs.com"
    ],
...
    "devices": [
       {
        "id": "d357df7c-ab45-4f00-a65b-ec9ec751638a",
        "displayName": "LVR053335284353",
        "operatingSystem": "Windows",
        "operatingSystemVersion": "10.0.16299.0",
        "profileType": "RegisteredDevice",
        "accountEnabled": true,
        "approximateLastSignInDateTime": "2020-03-17T17:51:55Z",
        "createdDateTime": "2019-01-30T19:58:32Z",
        "deviceId": "cd926deb-8e85-4628-850d-34ba6f485856",
        "deviceVersion": 2,
        "registrationDateTime": "2019-01-30T11:58:32Z",
        "trustType": "Workplace"
        }
      ]
}
```

Okta security context:
```JSON
 'securityContext': {'asNumber': 394089,
  'asOrg': 'palo alto networks  inc',
  'isp': 'google',
  'domain': None,
  'isProxy': False}
```

In [None]:
# Show which fields the first event's securityContext carries.
okta_data[0].get("securityContext").keys()

In [None]:
# Index of the first Okta event with a non-empty "device" entry.
# n stays 0 (and nothing is printed) when no event has one.
first_with_device = next(
    (position for position, event in enumerate(okta_data) if event.get("device")),
    None,
)
n = 0
if first_with_device is not None:
    print(first_with_device)
    n = first_with_device

In [None]:
# Pull the candidate correlation fields out of the sample event chosen above.
sample_event = okta_data[n]
sample_actor = sample_event.get("actor")

okta_alternateId = sample_actor.get("alternateId")
okta_displayname = sample_actor.get("displayName")
okta_client = sample_event.get("client")
okta_event_publish_time = sample_event.get("published")
okta_transaction_data = sample_event.get("transaction")
okta_request_data = sample_event.get("request")
okta_security_context_data = sample_event.get("securityContext")
okta_device = sample_event.get("device")
okta_target = sample_event.get("target")

# Display the full sample event.
sample_event

In [None]:
def _without_id(event: dict, field: str) -> dict:
    """Return ``event[field]`` minus its "id" key, or {} when the value is not a dict."""
    nested = event.get(field, {})
    if not isinstance(nested, dict):
        return {}
    return {k: v for k, v in nested.items() if k != "id"}


# One row per Okta event: actor identifiers plus the nested device / client /
# securityContext / eventOutcome objects with their "id" keys removed.
okta_data_list_df = pl.DataFrame(
    [
        {
            "index": idx,
            "alternateId": x.get("actor", {}).get("alternateId", ""),
            "displayName": x.get("actor", {}).get("displayName", ""),
            "device": _without_id(x, "device"),
            "client": _without_id(x, "client"),
            "securityContext": _without_id(x, "securityContext"),
            "eventType": x.get("eventType", ""),
            "eventOutcome": _without_id(x, "eventOutcome"),
        }
        for idx, x in enumerate(okta_data)
        # if x.get("actor", {}).get("type") == "User"
    ]
)
okta_data_list_df.head(2)

In [None]:
# Expected column types for the per-event Okta frame.
okta_schema = pl.Schema(
        {
            "index": pl.Int64,
            # alternateId holds a username or email, so it is a string, not an integer.
            "alternateId": pl.String,
            "displayName": pl.String,
            "device": pl.Struct(
                {
                    "id": pl.Utf8,
                    "name": pl.Utf8,
                    "os_platform": pl.Utf8,
                    "os_version": pl.Utf8,
                    "managed": pl.Boolean,
                    "registered": pl.Boolean,
                    "device_integrator": pl.Utf8,
                    "disk_encryption_type": pl.Utf8,
                    "screen_lock_type": pl.Utf8,
                    "jailbreak": pl.Utf8,
                    "secure_hardware_present": pl.Boolean,
                }
            ),
            "client": pl.Struct(
                {
                    "userAgent": pl.Struct(
                        {
                            "rawUserAgent": pl.String,
                            "os": pl.String,
                            "browser": pl.String,
                        }
                    ),
                    "zone": pl.String,
                    "device": pl.String,
                    "id": pl.String,
                    "ipAddress": pl.String,
                    "geographicalContext": pl.Struct(
                        {
                            "city": pl.String,
                            "state": pl.String,
                            "country": pl.String,
                            "postalCode": pl.String,
                            "geolocation": pl.Struct(
                                {
                                    "lat": pl.Float64,
                                    "lon": pl.Float64,
                                }
                            ),
                        }
                    ),
                }
            ),
            # Field names mirror the raw securityContext payload keys
            # (asNumber, asOrg, isp, domain, isProxy), with no prefix.
            "securityContext": pl.Struct(
                {
                    "asNumber": pl.Int64,
                    "asOrg": pl.Utf8,
                    "isp": pl.Utf8,
                    "domain": pl.Utf8,
                    "isProxy": pl.Boolean,
                }
            ),
            "eventType": pl.String,
            "eventOutcome": pl.Struct(
                {
                    "result": pl.Utf8,
                    "reason": pl.Utf8,
                }
            ),
        }
    )

In [None]:
from polars_ds import str_fuzz, str_jw
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Replace null struct entries (device / client / securityContext / eventOutcome)
# with a struct whose fields are all None, and strip stray double quotes from
# the two identifier columns.
# NOTE(review): the fallback field names below carry a "<column>." prefix, while
# the sample securityContext payload uses unprefixed keys (asNumber, isp, ...).
# Confirm the fallback struct fields line up with the real struct fields.
device_fields = [ "device.name", "device.os_platform", "device.os_version", "device.managed",
                 "device.registered", "device.device_integrator", "device.disk_encryption_type",
                 "device.screen_lock_type", "device.jailbreak", "device.secure_hardware_present"]
client_fields = ['client.userAgent', 'client.zone', 'client.device', 'client.ipAddress', 'client.geographicalContext']
# Fixed typo: 'isecurityContext.sp' -> 'securityContext.isp'.
securityContext_fields = ['securityContext.asNumber', 'securityContext.asOrg', 'securityContext.isp', 'securityContext.domain', 'securityContext.isProxy']
eventOutcome_fields = ['eventOutcome.result', 'eventOutcome.reason']

okta_data_list_df = okta_data_list_df.with_columns(
    pl.col("alternateId").str.strip_chars('"').alias("alternateId"),
    pl.col("displayName").str.strip_chars('"').alias("displayName"),
    pl.when(pl.col("device").is_null())
      .then({field: None for field in device_fields})
      .otherwise(pl.col("device"))
      .alias("device"),
    pl.when(pl.col("client").is_null())
      .then({field: None for field in client_fields})
      .otherwise(pl.col("client"))
      .alias("client"),
    pl.when(pl.col("securityContext").is_null())
      .then({field: None for field in securityContext_fields})
      .otherwise(pl.col("securityContext"))
      .alias("securityContext"),
    pl.when(pl.col("eventOutcome").is_null())
      .then({field: None for field in eventOutcome_fields})
      .otherwise(pl.col("eventOutcome"))
      .alias("eventOutcome"),
# Add proxyAddresses list for emails
)
# okta_data_list_vec_df = okta_data_list_df.with_columns(
#     okta_altIt_vec= pl.lit(model.encode(pl.col("alternateId"), convert_to_tensor=True)).alias("okta_altId_vec")
# )


In [None]:
# Flatten the four struct columns, one after another, into top-level columns.
okta_flat_df = okta_data_list_df
for struct_column in ("device", "client", "securityContext", "eventOutcome"):
    okta_flat_df = okta_flat_df.unnest(struct_column)
print(okta_flat_df.shape)
print(okta_flat_df.schema)

In [None]:
# Preview the first five rows of the flattened Okta frame.
okta_flat_df.head(5)

In [None]:
# The `.candle` expression namespace only exists once polars_candle has been
# imported; import it here so this cell runs on a fresh kernel.
import polars_candle  # noqa: F401  (pip install polars-candle)

# Embed each alternateId with the MiniLM sentence-transformer model.
okta_data_list_vec_df = okta_data_list_df.with_columns(
    pl.col("alternateId").candle.embed_text("sentence-transformers/all-MiniLM-L6-v2").alias("okta_altId_vec"),
)

In [None]:
# # username + email + device info + domain + security context + datetime (event publish time) + proxyAddresses
# Pick the first AD user that has at least one device and a companyName, and
# copy its identifying fields into ad_* variables for the matching experiments.
# NOTE: reads `ad_users`, which is assigned in the Azure AD Data section further down.
n = 0
for idx, i in enumerate(ad_users):
    if not (i.get("devices") and len(i.get("devices")) > 0 and i.get("companyName")):
        continue
    # print(i)
    n = idx
    print(idx)
    ad_display_name = i["displayName"]
    ad_mail_nickname = i["mailNickname"]
    ad_given_name = i["givenName"]
    ad_surname = i["surname"]
    ad_email = i["mail"]
    ad_department = i["department"]
    ad_usr_principal_name = i["userPrincipalName"]
    ad_onpremis_sam_acc_name = i["onPremisesSamAccountName"]
    ad_devices = i["devices"]
    ad_proxy_addresses = i["proxyAddresses"]
    ad_onpremis_distinguished_name = i["onPremisesDistinguishedName"]
    ad_company_name = i["companyName"]
    # ad_transitive_memberships = i["transitivelyMembership"]
    break
# {'displayName': 'Kendell Douglas',
#  'mailNickname': 'kdouglas1',
#  'givenName': 'Kendell',
#  'surname': 'Douglas',
#  'jobTitle': 'Retail Planning Manager',
#  'department': 'CLV_HEC: 08800_CES 9IF_CEI MERCHANDISING ADMIN',
#  'mail': 'kdouglas1@caesars.com',
#  'otherMails': ['kdouglas1@lvrio.harrahs.com'],
# #  'proxyAddresses': ['X500:/O=Casino Windsor/OU=EXCHANGE/cn=Recipients/cn=kdouglas1',
#  'city': 'Las Vegas',
#  'state': 'NV',
#  'country': 'United States',
#  'streetAddress': '3570 S. Las Vegas Blvd',
#  'userPrincipalName': 'kdouglas1@caesars.com',
#  'onPremisesSamAccountName': 'kdouglas1',
# transitiveMemberOf
# manager@delta
ad_users[n]

In [None]:
# Show the AD user record that has the most keys.
dict_with_most_keys(ad_users)

#### String Matching

Encode the Strings:
- Pass your two strings to the model’s encode method to get their embeddings.
- Calculate Similarity: Use cosine similarity to measure how similar the embeddings are (values near 1 indicate high similarity, near 0 indicate low similarity).
- High cosine + high edit-based → genuine match (e.g., "apple" ~ "appel": cos≈0.8, lev≈91).
- High cosine but low edit similarity → likely anagram or jumbled order ("string" vs "gnirts").
- Low cosine but moderate Jaro-Winkler → prefix or truncation match ("darth vader" vs "vader": cos low, JW ≈ 0.9).
- All low → clearly unrelated.

Jaro-Winkler is very good at name matching due to shared prefixes.

##### Minimums:
- Jaro >= 80.0
- Lev >= 75.0
- vector_cosine

Consider Damerau–Levenshtein for adjacent-swapped letters (e.g., "form" → "from")

In [None]:
# # Levenshtein ratio (0–100) quantifies insertions, deletions, substitutions.
# # Jaro‑Winkler (0–1) gives extra weight to common prefixes which helps with truncated or prefix‑similar strings
#
# import warnings
# warnings.filterwarnings("ignore", category=UserWarning)
#
# from rapidfuzz import fuzz, distance  # well-maintained Levenshtein scoring
# from jaro import jaro_winkler_metric
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# from sentence_transformers import util
#
# def compare_strings(s1: str, s2: str) -> dict:
#     """
#     Compute multiple string-similarity metrics between s1 and s2.
#
#     Returns dict with:
#       - levenshtein_ratio: normalized edit distance (0–100)
#       - jaro_winkler: Jaro-Winkler similarity (0–1)
#       - tfidf_cosine: cosine similarity on char‑ngram TF-IDF (0–1)
#       - vector_cosine: vector cosine similarity using `sentence-transformers/all-MiniLM-L6-v2`
#     """
#     # 1. Edit‐distance / Levenshtein Ratio (0–100)
#     lev_ratio = fuzz.ratio(s1, s2)
#
#     # 2. Jaro‑Winkler (0–1)
#     jw = jaro_winkler_metric(s1, s2)
#
#     # 3. Cosine similarity using TF‑IDF of char 3‑grams
#     tfidf = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,3))
#     tfidf_matrix = tfidf.fit_transform([s1, s2])
#     cos = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0,0]
#     # # Example strings to match
#
#     # Encode both strings
#     embedding1 = model.encode(s1, convert_to_tensor=True)
#     embedding2 = model.encode(s2, convert_to_tensor=True)
#
#     # Calculate cosine similarity
#     vector_cosine_similarity = util.cos_sim(embedding1, embedding2)
#     return {
#         'levenshtein_ratio': lev_ratio,
#         'jaro_winkler': jw,
#         'tfidf_cosine': cos,
#         'vector_cosine': vector_cosine_similarity.item()
#     }
#
#
# string1 = ad_display_name
# # string2 = "filomena@silva"
# # tests = [(string1, string2)]
# okta_data_list = [{"displayName": x.get("displayName", []),"email": x.get("main", None), "idx": idx} for idx, x in enumerate(ad_users)]
# for a in okta_data_list:
#     string2 = a.get("displayName")
#     scores = compare_strings(string2, string1)
#     # print(f"{string2!r} ≈ {string1!r} →", scores)
#     lev, jaro, tfid, vec_cos = scores.values()
#     if vec_cos >= 0.6:
#         print(f"Vector similarity score: {vec_cos:.4f}")
#         if float(lev) >= 75.0 and float(jaro) >= 85.0 and (vec_cos) >= 0.8:
#             print(f"{string2!r} ≈ {string1!r} →", scores)
#             print("Match!")
#             print(a)
#             break
#     if a.get("email"):
#         string2 = a.get("email")
#         scores = compare_strings(string2, string1)
#         # print(f"{string2!r} ≈ {string1!r} →", scores)
#         lev, jaro, tfid, vec_cos = scores.values()
#         if vec_cos >= 0.6:
#             print(f"Vector similarity score: {vec_cos:.4f}")
#             if float(lev) >= 75.0 and float(jaro) >= 85.0 and (vec_cos) >= 0.8:
#                 print(f"{string2!r} ≈ {string1!r} →", scores)
#                 print("Match!")
#                 print(a)
#                 break
# ad_data_list_df = pl.from_records([{"vector_displayName": model.encode(x.get("displayName", ''), convert_to_tensor=True),
#                                       "displayName": x.get("displayName", None),"email": x.get("mail", None), "idx": idx} for idx, x in enumerate(ad_users[:1000])], schema=["idx", "displayName", "email", "vector_displayName"])


In [None]:
# df.select( # Column "word", compared to string in pl.lit(). It also supports column vs column comparison
#     pds.str_leven("word", pl.lit("asasasa"), return_sim=True).alias("Levenshtein"),
#     pds.str_osa("word", pl.lit("apples"), return_sim=True).alias("Optimal String Alignment"),
#     pds.str_jw("word", pl.lit("apples")).alias("Jaro-Winkler"),
# )

In [None]:
# https://polars-ds-extension.readthedocs.io/en/latest/string.html
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
from sentence_transformers.util import cos_sim
import polars as pl
from polars_ds import str_fuzz, str_jw
import polars_candle  # pip install polars-candle
import numpy as np
# Reference string that every Okta alternateId is scored against.
t_string = 'kdouglas1@caesars.com'
target = pl.lit(t_string)

# Sentence embedding of the reference string, flattened and wrapped as a literal
# (only used by the commented-out vector-cosine experiment).
target_embedding = model.encode(t_string, convert_to_tensor=True).cpu().numpy().flatten()
vector_target = pl.lit(target_embedding, dtype=pl.List(pl.Float64))

# Add the two edit-based similarity scores as new columns.
alternate_id = pl.col("alternateId")
sim_df = okta_flat_df.with_columns(
    lev_sim=str_fuzz(alternate_id, target, parallel=True),
    jw_sim=str_jw(alternate_id, target, parallel=True),
    # vector_cos= pl.lit(cos_sim(pl.col("alternateId")
    #   .candle.embed_text("sentence-transformers/all-MiniLM-L6-v2"), vector_target).item()
    #   .alias("vector_cos"), dtype=pl.Float64)
)

# df = df.with_columns([
#     pl.col("displayName")
#       .candle.embed_text("sentence-transformers/all-MiniLM-L6-v2")
#       .alias("vector"),
#     pl.lit("filomena@silva")
#       .candle.embed_text("sentence-transformers/all-MiniLM-L6-v2")
#       .alias("target_vector")
# ])
# df = df.with_columns(
#     (pl.col("vector_displayName").dot(pl.col("target_vector"))
#      / (pl.col("vector").norm() * pl.col("target_vector").norm()))
#     .alias("vector_cosine")
# )

In [None]:
# Five rows with the highest jw_sim score (nulls sorted last).
sim_df.sort(by="jw_sim", descending=True, nulls_last=True).head(5)

In [None]:
# Frame shape, plus the alternateId column of the first five rows.
print(sim_df.shape, sim_df.head(5).to_dict().get("alternateId"))

In [None]:
# result_df = sim_df.with_columns(
#     contains=pl.col("alternateId").is_in([t_string])
# )

In [None]:
# Rows whose alternateId equals the target string exactly.
# `result_df` only exists in the commented-out cell above, so filter sim_df directly.
sim_df.filter(pl.col("alternateId") == t_string)

| Feature             | Levenshtein                                    | Jaro‑Winkler                                                                                        |
| ------------------- | ---------------------------------------------- | --------------------------------------------------------------------------------------------------- |
| **Edit operations** | insertion, deletion, substitution              | matching within window, transposition                                                               |
| **Transposition**   | Not counted (unless using Damerau–Levenshtein) | Explicitly considered                                                                               |
| **Prefix emphasis** | None                                           | Yes (up to 4 chars, via Winkler bonus) ([en.wikipedia.org][1], [dev.to][2], [geeksforgeeks.org][3]) |
| **Range/Output**    | 0…∞ (distance), normalized 0–1 similarity      | 0–1 similarity                                                                                      |
| **Best for**        | Longer strings, quantifying edit count         | Short-to-medium strings (names), with typos and prefix variations                                   |

[1]: https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance?utm_source=chatgpt.com "Jaro–Winkler distance"
[2]: https://dev.to/fatemasamir/unveiling-string-distances-a-dive-into-levenshtein-and-jaro-distances-in-databases-b6j?utm_source=chatgpt.com "Unveiling String Distances: A Dive into Levenshtein and Jaro ..."
[3]: https://www.geeksforgeeks.org/jaro-and-jaro-winkler-similarity/?utm_source=chatgpt.com "Jaro and Jaro-Winkler similarity - GeeksforGeeks"




#### MS Exchange Data

In [None]:
# MS Exchange export; the name stays None if the file cannot be opened.
exchange_data = None
with open("../data/customers/caesars/2024-11-20_exchange.json", 'r') as exchange_file:
    exchange_data = json.load(exchange_file)

#### TAP Data

In [None]:
# TAP export pulled from the `/api/data/v1/people` API.
tap_users_data = None
with open("../data/customers/caesars/2024-11-20_tap_users.json", 'r') as tap_users_file:
    tap_users_data = json.load(tap_users_file)

In [None]:
# tap_users_data[0:2]
# Show the TAP record that has the most keys.
dict_with_most_keys(tap_users_data)

#### Azure AD Data

In [None]:
#### Azure AD Data
# Full Azure AD export; the name stays None if the file cannot be opened.
azure_ad_data = None
with open("../data/customers/caesars/2024-11-20_azure_ad.json", 'r') as azure_ad_file:
    azure_ad_data = json.load(azure_ad_file)

In [None]:
# Split the export into its three collections; a missing section becomes [].
# Notes on groups:
# - Microsoft Entra group, a Microsoft 365 group, or a security group
# - onPremisesSecurityIdentifier field is particularly important for hybrid environments where groups are synchronized between on-premises Active Directory and Azure AD, and enables correlation between cloud and on-premises representations of the same security principal.
# - onPremisesSyncEnabled property indicates whether a group is synchronized from on-premises Active Directory,
ad_users, ad_groups, ad_devices = (
    azure_ad_data.get(section, []) for section in ("users", "groups", "devices")
)

In [None]:
# Use the AD user with the most keys as the test subject, and pretty-print it.
t_user = dict_with_most_keys(ad_users)
print(json.dumps(t_user, indent=4))

In [None]:
# Join AD user and group data by user devices.
# Print every group whose "members@delta" list contains the test user's id or
# the id of one of the test user's devices.
# The device ids are collected once up front (a user without "devices" yields an
# empty set), and the loop variables are named so they do not overwrite the
# notebook-level index `n` used by earlier cells.
t_user_device_ids = {device.get("id") for device in (t_user.get("devices") or [])}
for group in ad_groups:
    if 'members@delta' in group.keys():
        for member in group.get("members@delta"):
            if member.get("id") == t_user.get("id"):
                print(group)
            if member.get("id") in t_user_device_ids:
                print(f"Found in device IDs:\n{group}")

In [None]:
# groups[1]
# Show the AD group record that has the most keys.
dict_with_most_keys(ad_groups)

#### BloodHound Data

In [None]:
# Load the three generated BloodHound user exports, in timestamp order.
bh_users_data: list[dict] = []
for users_export_path in (
    "../data/bloodhound/generated/ad_example_data/20240305110018_users.json",
    "../data/bloodhound/generated/ad_example_data/20240305110427_users.json",
    "../data/bloodhound/generated/ad_example_data/20240305111414_users.json",
):
    with open(users_export_path, 'r') as f:
        bh_users_data.append(json.load(f))
print(len(bh_users_data))

In [None]:
# Size of the first entry under "data" in the first users export.
len(bh_users_data[0].get("data")[0])

In [None]:
# Show the first entry under "data" in the first users export.
bh_users_data[0].get("data")[0]

In [None]:
# Load the three generated BloodHound computer exports, in timestamp order.
bh_computers_data: list[dict] = []
for computers_export_path in (
    "../data/bloodhound/generated/ad_example_data/20240305110018_computers.json",
    "../data/bloodhound/generated/ad_example_data/20240305110427_computers.json",
    "../data/bloodhound/generated/ad_example_data/20240305111414_computers.json",
):
    with open(computers_export_path, 'r') as f:
        bh_computers_data.append(json.load(f))
print(len(bh_computers_data))

In [None]:
# Number of computer exports loaded (same value the loading cell printed).
print(len(bh_computers_data))

In [None]:
# Show the first entry under "data" in the first computers export.
bh_computers_data[0].get("data")[0]

#### AD transforms to Bloodhound Data

In [None]:
#!/usr/bin/env python3.11
"""
Microsoft Graph API to BloodHound CE Group Mapper
Transforms Microsoft Graph API v1 group objects to BloodHound CE format
"""

import json
import uuid
from datetime import datetime
from typing import Dict, List, Optional, Any
import re


class GraphToBloodHoundMapper:
    """Maps Microsoft Graph API group data to BloodHound CE format.

    Graph returns JSON ``null`` for unset properties, and ``dict.get`` defaults
    only apply when a key is absent. Every field read here is therefore
    normalised with ``or <default>`` so a ``null`` behaves like a missing key.
    """

    # Active Directory ``groupType`` bit flags used by _map_group_type.
    _GROUP_TYPE_GLOBAL = 0x00000002
    _GROUP_TYPE_UNIVERSAL = 0x00000008
    _GROUP_TYPE_SECURITY_ENABLED = 0x80000000

    def __init__(self):
        # Fallback domain used when a group carries no mail / on-prem domain.
        self.domain_suffix = ""
        # Single collection timestamp shared by everything this mapper emits.
        self.collected_timestamp = datetime.utcnow().isoformat() + "Z"

    def set_domain_suffix(self, domain: str):
        """Set the (upper-cased) fallback domain suffix for group objects."""
        self.domain_suffix = domain.upper()

    def extract_domain_from_upn(self, upn: str) -> str:
        """Extract the upper-cased domain from a user principal name or email.

        Args:
            upn: Address of the form ``local@domain``; may be empty or None.

        Returns:
            The text after the last ``@`` upper-cased, or ``self.domain_suffix``
            when there is no ``@`` or nothing follows it.
        """
        _, separator, domain_part = (upn or "").rpartition("@")
        if separator and domain_part:
            return domain_part.upper()
        return self.domain_suffix

    def generate_distinguished_name(self, group_name: str, domain: str) -> str:
        """Generate a distinguished name for the group.

        Simple DN generation - in practice, you might want to get the actual
        DN from Graph API.

        Args:
            group_name: Value used for the leading ``CN=`` component.
            domain: Dotted DNS domain; each label becomes one ``DC=`` component.

        Returns:
            ``CN=<group_name>,CN=Users,DC=...,DC=...``
        """
        # RDNs in a DN are comma-separated; joining the DC components with "."
        # produced an invalid DN such as "DC=caesars.DC=com".
        domain_components = ",".join(f"DC={part}" for part in domain.lower().split("."))
        return f"CN={group_name},CN=Users,{domain_components}"

    def _resolve_domain(self, graph_group: Dict[str, Any]) -> str:
        """Pick the group's domain: mail domain, else on-prem domain, else suffix."""
        mail = graph_group.get("mail") or ""
        if "@" in mail:
            return self.extract_domain_from_upn(mail)
        on_premises_domain = graph_group.get("onPremisesDomainName")
        if on_premises_domain:
            return on_premises_domain.upper()
        return self.domain_suffix

    def map_group_to_bloodhound(self, graph_group: Dict[str, Any]) -> Dict[str, Any]:
        """
        Map a Microsoft Graph API group object to BloodHound CE format

        Args:
            graph_group: Group object from Microsoft Graph API

        Returns:
            Dictionary in BloodHound CE group format
        """
        # Extract basic properties (null-safe, see class docstring)
        object_id = graph_group.get("id") or ""
        display_name = graph_group.get("displayName") or ""
        mail = graph_group.get("mail") or ""
        description = graph_group.get("description") or ""

        domain = self._resolve_domain(graph_group)

        # Security properties
        security_enabled = bool(graph_group.get("securityEnabled"))
        security_identifier = graph_group.get("securityIdentifier") or ""
        on_premises_sid = graph_group.get("onPremisesSecurityIdentifier") or ""
        is_assignable_to_role = bool(graph_group.get("isAssignableToRole"))

        # Use on-premises SID if available, otherwise use Azure AD SID
        primary_sid = on_premises_sid or security_identifier

        group_types = graph_group.get("groupTypes") or []

        # Mail properties
        mail_enabled = bool(graph_group.get("mailEnabled"))
        mail_nickname = graph_group.get("mailNickname") or ""

        # Built once and reused for both the top-level and Properties entries.
        distinguished_name = self.generate_distinguished_name(display_name, domain)
        object_identifier = object_id.upper()

        # Generate BloodHound group object
        bloodhound_group = {
            "ObjectIdentifier": object_identifier,
            "Name": f"{display_name}@{domain}",
            "Domain": domain,
            "DistinguishedName": distinguished_name,
            "Description": description,
            # "AdminCount": False,  # Would need additional logic to determine
            "Properties": {
                "name": display_name,
                "domain": domain,
                "objectid": object_identifier,
                "description": description,
                "distinguishedname": distinguished_name,
                "domainsid": self._extract_domain_sid(primary_sid),
                "securityenabled": security_enabled,
                "mailnickname": mail_nickname,
                "mailenabled": mail_enabled,
                "mail": mail,
                "isassignabletorole": is_assignable_to_role,
                "grouptype": self._map_group_type(group_types, security_enabled),
                "onpremisessecurityidentifier": on_premises_sid,
                # Original-case Graph id, kept alongside the upper-cased objectid.
                "azuread_objectid": object_id,
                "collected": True,
                "crossref": None
            }
        }

        # Add optional properties if they exist
        if graph_group.get("createdDateTime"):
            bloodhound_group["Properties"]["whencreated"] = graph_group["createdDateTime"]

        # NOTE(review): this stores the directory *sync* time under "lastlogon";
        # kept for compatibility with existing output — confirm it is intended.
        if graph_group.get("onPremisesLastSyncDateTime"):
            bloodhound_group["Properties"]["lastlogon"] = graph_group["onPremisesLastSyncDateTime"]

        return bloodhound_group

    def _extract_domain_sid(self, sid: str) -> str:
        """Extract domain SID from full SID.

        Returns the SID with its last dash-separated part (the RID) removed,
        or "" when the SID is empty or has fewer than four parts.
        """
        if not sid:
            return ""

        # SID format: S-1-5-21-domain-domain-domain-rid
        # We want everything except the last part (RID)
        parts = sid.split("-")
        if len(parts) >= 4:
            return "-".join(parts[:-1])
        return ""

    def _map_group_type(self, group_types: List[str], security_enabled: bool) -> int:
        """Map Graph API group types to an AD-style ``groupType`` integer.

        Simplified mapping: "Unified" (Microsoft 365) groups are treated as
        universal scope, everything else as global scope; the security-enabled
        bit is OR-ed in for security groups.

        The previous constants did not match the AD flag values their comments
        described (universal scope is 0x8, and 0x4 is domain-local, not global).
        """
        if "Unified" in (group_types or []):
            scope = self._GROUP_TYPE_UNIVERSAL
        else:
            scope = self._GROUP_TYPE_GLOBAL

        if security_enabled:
            return self._GROUP_TYPE_SECURITY_ENABLED | scope
        return scope

    def map_group_members(self, graph_group: Dict[str, Any], members_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Map group membership relationships to BloodHound format

        Args:
            graph_group: The parent group object
            members_data: List of member objects from Graph API

        Returns:
            List of membership relationship objects
        """
        relationships = []
        # Upper-cased to match the ObjectIdentifier emitted for the group node.
        group_id = (graph_group.get("id") or "").upper()

        for member in members_data or []:
            member_id = (member.get("id") or "").upper()
            member_type = (member.get("@odata.type") or "").lower()

            # Determine member type for BloodHound
            if "user" in member_type:
                member_label = "User"
            elif "group" in member_type:
                member_label = "Group"
            elif "device" in member_type:
                member_label = "Computer"
            else:
                member_label = "Base"  # Default

            relationships.append({
                "SourceObjectIdentifier": member_id,
                "SourceObjectType": member_label,
                "TargetObjectIdentifier": group_id,
                "TargetObjectType": "Group",
                "RelationshipType": "MemberOf"
            })

        return relationships

    def create_bloodhound_data_structure(self, groups: List[Dict[str, Any]],
                                       membership_data: Optional[Dict[str, List[Dict[str, Any]]]] = None) -> Dict[str, Any]:
        """
        Create complete BloodHound data structure for ingestion

        Args:
            groups: List of Graph API group objects
            membership_data: Optional dictionary mapping group IDs to member lists

        Returns:
            Complete BloodHound data structure; a "relationships" key is only
            present when at least one membership was mapped.
        """
        bloodhound_groups = []
        relationships = []

        for group in groups:
            # Map the group
            bloodhound_groups.append(self.map_group_to_bloodhound(group))

            # Map memberships if provided (keyed by the group's original-case id)
            group_id = group.get("id")
            if membership_data and group_id in membership_data:
                relationships.extend(self.map_group_members(group, membership_data[group_id]))

        # Create BloodHound data structure
        bloodhound_data = {
            "data": bloodhound_groups,
            "meta": {
                "type": "groups",
                "count": len(bloodhound_groups),
                "version": 6,
                "collected_by": "graph-api-mapper",
                "collected_time": self.collected_timestamp
            }
        }

        # Add relationships if any exist
        if relationships:
            bloodhound_data["relationships"] = relationships

        return bloodhound_data


# Build the group mapper with a fixed fallback domain (set_domain_suffix upper-cases it).
mapper = GraphToBloodHoundMapper()
mapper.set_domain_suffix("CAESARS.COM")

# No membership_data is passed, so only group nodes are mapped and the result
# has no "relationships" key.
bloodhound_data = mapper.create_bloodhound_data_structure(ad_groups)



In [None]:
# Pretty-print the first mapped group as a spot check of the output shape.
print(json.dumps(bloodhound_data["data"][0], indent=2))


In [None]:
#!/usr/bin/env python3.11
"""
Microsoft Graph API to BloodHound CE User Mapper
Transforms Microsoft Graph API v1 user objects to BloodHound CE format
"""

import json
import uuid
from datetime import datetime
from typing import Dict, List, Optional, Any
import re


class GraphUserToBloodHoundMapper:
    """Maps Microsoft Graph API user data to BloodHound CE format.

    Graph returns JSON ``null`` for unset properties, and ``dict.get`` defaults
    only apply when a key is absent. Every field read here is therefore
    normalised with ``or <default>`` so a ``null`` behaves like a missing key.
    """

    def __init__(self):
        # Fallback domain used when a user carries no UPN / on-prem domain.
        self.domain_suffix = ""
        # Single collection timestamp shared by everything this mapper emits.
        self.collected_timestamp = datetime.utcnow().isoformat() + "Z"

    def set_domain_suffix(self, domain: str):
        """Set the (upper-cased) fallback domain suffix for user objects."""
        self.domain_suffix = domain.upper()

    def extract_domain_from_upn(self, upn: str) -> str:
        """Extract the upper-cased domain from a user principal name or email.

        Returns the text after the last ``@`` upper-cased, or
        ``self.domain_suffix`` when there is no ``@`` or nothing follows it.
        """
        _, separator, domain_part = (upn or "").rpartition("@")
        if separator and domain_part:
            return domain_part.upper()
        return self.domain_suffix

    def generate_distinguished_name(self, user_name: str, domain: str) -> str:
        """Generate a distinguished name for the user.

        Args:
            user_name: Value used for the leading ``CN=`` component.
            domain: Dotted DNS domain; each label becomes one ``DC=`` component.

        Returns:
            ``CN=<user_name>,CN=Users,DC=...,DC=...``
        """
        # RDNs in a DN are comma-separated; joining the DC components with "."
        # produced an invalid DN such as "DC=caesars.DC=com".
        domain_components = ",".join(f"DC={part}" for part in domain.lower().split("."))
        return f"CN={user_name},CN=Users,{domain_components}"

    def _primary_sid(self, graph_user: Dict[str, Any]) -> str:
        """Return the on-premises SID if present, otherwise the cloud SID ("" if neither)."""
        return (graph_user.get("onPremisesSecurityIdentifier")
                or graph_user.get("securityIdentifier")
                or "")

    def _object_identifier(self, graph_user: Dict[str, Any]) -> str:
        """Return the upper-cased identifier used for the user node.

        Shared by node and relationship mapping so that edge endpoints always
        match the node's ``ObjectIdentifier``.
        """
        return (self._primary_sid(graph_user) or graph_user.get("id") or "").upper()

    def map_user_to_bloodhound(self, graph_user: Dict[str, Any]) -> Dict[str, Any]:
        """
        Map a Microsoft Graph API user object to BloodHound CE format

        Args:
            graph_user: User object from Microsoft Graph API

        Returns:
            Dictionary in BloodHound CE user format
        """
        # Extract core properties (null-safe, see class docstring)
        user_principal_name = graph_user.get("userPrincipalName") or ""
        display_name = graph_user.get("displayName") or ""
        mail = graph_user.get("mail") or ""
        description = graph_user.get("description") or ""

        # Determine domain: UPN domain, else on-prem domain, else the suffix
        domain = self.domain_suffix
        if user_principal_name:
            domain = self.extract_domain_from_upn(user_principal_name)
        elif graph_user.get("onPremisesDomainName"):
            domain = graph_user["onPremisesDomainName"].upper()

        # Security identifiers
        primary_sid = self._primary_sid(graph_user)

        # Account status and password properties
        account_enabled = bool(graph_user.get("accountEnabled"))
        password_policies = graph_user.get("passwordPolicies") or ""
        last_password_change = graph_user.get("lastPasswordChangeDateTime") or ""
        # "DisablePasswordExpiration" means the password never expires. It says
        # nothing about the password being optional or already expired, which
        # is what the previous mapping reported.
        password_never_expires = "DisablePasswordExpiration" in password_policies

        sign_in_activity = graph_user.get("signInActivity") or {}

        # Generate BloodHound user object
        bloodhound_user = {
            "ObjectIdentifier": self._object_identifier(graph_user),
            "PrimaryGroupSid": self._get_primary_group_sid(primary_sid),
            "Properties": {
                "name": user_principal_name if user_principal_name else display_name,
                "domain": domain,
                "distinguishedname": graph_user.get("onPremisesDistinguishedName")
                                   or self.generate_distinguished_name(display_name, domain),
                "domainsid": self._extract_domain_sid(primary_sid),
                "enabled": account_enabled,
                "email": mail,
                "title": graph_user.get("jobTitle") or "",
                "homedirectory": "",
                "scriptpath": "",
                "profilepath": "",
                "userpassword": "",
                "admincount": False,  # Requires additional logic to determine
                "sidhistory": [],
                "passwordnotreqd": False,  # Not derivable from passwordPolicies
                "pwdneverexpires": password_never_expires,
                "pwdlastset": last_password_change,
                "lastlogon": sign_in_activity.get("lastSignInDateTime") or "",
                "whencreated": graph_user.get("createdDateTime") or "",
                "badpwdcount": 0,  # Not available in Graph API
                "userprincipalname": user_principal_name,
                "serviceprincipalnames": [],
                "displayname": display_name,
                "upndomain": domain,
                "description": description,
                "trustedtoauth": False,
                "hasspn": False,
                "sensitive": False,
                "dontreqpreauth": False,
                "passwordexpired": False,  # Not derivable from passwordPolicies
                "unconstraineddelegation": False,
                "reversiblepassword": False,
                "trustedfordelegation": False,
                "collected": True
            }
        }

        # Add optional properties
        if graph_user.get("employeeId"):
            bloodhound_user["Properties"]["employeeid"] = graph_user["employeeId"]

        if graph_user.get("department"):
            bloodhound_user["Properties"]["department"] = graph_user["department"]

        return bloodhound_user

    def _get_primary_group_sid(self, user_sid: str) -> str:
        """Derive primary group SID (typically Domain Users).

        Replaces the last dash-separated part of the SID with RID 513; returns
        "" for an empty SID or one with five or fewer parts.
        NOTE(review): this is applied to whatever SID is passed in, including a
        cloud-only securityIdentifier — confirm that is meaningful downstream.
        """
        if not user_sid:
            return ""

        # Extract domain SID and append Domain Users RID (513)
        parts = user_sid.split("-")
        if len(parts) > 5:
            domain_sid = "-".join(parts[:-1])
            return f"{domain_sid}-513"
        return ""

    def _extract_domain_sid(self, sid: str) -> str:
        """Extract domain SID from full user SID (the SID minus its last part)."""
        if not sid:
            return ""

        parts = sid.split("-")
        if len(parts) >= 4:
            return "-".join(parts[:-1])
        return ""

    def map_user_relationships(self, graph_user: Dict[str, Any],
                             member_data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Map user relationships to BloodHound format

        Args:
            graph_user: The user object
            member_data: Dictionary containing memberOf, ownedObjects and,
                optionally, manager (used when the user object has none)

        Returns:
            List of relationship objects, in the order: memberships, manager,
            owned objects. Identifiers are upper-cased so they match the
            ObjectIdentifier values emitted for nodes.
        """
        member_data = member_data or {}
        relationships = []
        # Same identifier the node mapping uses, so edges attach to the node.
        user_identifier = self._object_identifier(graph_user)

        # Map group memberships
        for group in member_data.get("memberOf") or []:
            relationships.append({
                "SourceObjectIdentifier": user_identifier,
                "SourceObjectType": "User",
                "TargetObjectIdentifier": (group.get("id") or "").upper(),
                "TargetObjectType": "Group",
                "RelationshipType": "MemberOf"
            })

        # Map manager relationship: prefer an expanded manager on the user
        # object, fall back to one supplied with the relationship data.
        manager = graph_user.get("manager") or member_data.get("manager") or {}
        if manager.get("id"):
            relationships.append({
                "SourceObjectIdentifier": user_identifier,
                "SourceObjectType": "User",
                "TargetObjectIdentifier": manager["id"].upper(),
                "TargetObjectType": "User",
                "RelationshipType": "ReportsTo"
            })

        # Map owned objects
        for obj in member_data.get("ownedObjects") or []:
            relationships.append({
                "SourceObjectIdentifier": user_identifier,
                "SourceObjectType": "User",
                "TargetObjectIdentifier": (obj.get("id") or "").upper(),
                "TargetObjectType": self._get_object_type(obj),
                "RelationshipType": "Owns"
            })

        return relationships

    def _get_object_type(self, directory_object: Dict[str, Any]) -> str:
        """Determine BloodHound object type from Graph API @odata.type ("Base" if unknown)."""
        type_map = {
            "#microsoft.graph.user": "User",
            "#microsoft.graph.group": "Group",
            "#microsoft.graph.device": "Computer",
            "#microsoft.graph.application": "ServicePrincipal"
        }
        return type_map.get(directory_object.get("@odata.type") or "", "Base")


class BloodHoundUserIngestor:
    """Handles complete user data ingestion workflow"""

    def __init__(self):
        # Mapper that converts individual Graph users / relationships.
        self.mapper = GraphUserToBloodHoundMapper()

    def process_users(self, graph_users: List[Dict[str, Any]],
                    relationship_data: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        Process complete user dataset with relationships

        Args:
            graph_users: List of Graph API user objects
            relationship_data: Optional dictionary of user IDs to relationship
                data; users without an entry contribute no relationships

        Returns:
            BloodHound CE compatible JSON structure with "data", "meta" and
            "relationships" keys
        """
        relationship_data = relationship_data or {}
        bloodhound_users = []
        relationships = []

        for user in graph_users:
            # Map user object
            bloodhound_users.append(self.mapper.map_user_to_bloodhound(user))

            # Map relationships (keyed by the user's Graph id)
            user_id = user.get("id", "")
            if user_id in relationship_data:
                relationships.extend(
                    self.mapper.map_user_relationships(user, relationship_data[user_id])
                )

        return {
            "data": bloodhound_users,
            "meta": {
                "type": "users",
                "count": len(bloodhound_users),
                "version": 6,
                "collected_by": "graph-api-user-mapper",
                "collected_time": self.mapper.collected_timestamp
            },
            "relationships": relationships
        }


# The demo below used to be indented into the class body, where it ran while
# the class was still being created and failed with a NameError on
# BloodHoundUserIngestor. It now runs at cell level, after the class exists.

# Minimal sample user whose id matches the key used in sample_relationships
sample_user = {
    "id": "a1b2c3d4-e5f6-7g8h-9i0j-k1l2m3n4o5p6",
    "userPrincipalName": "sample.user@caesars.com",
    "displayName": "Sample User",
    "accountEnabled": True
}

# Sample relationship data
sample_relationships = {
    "a1b2c3d4-e5f6-7g8h-9i0j-k1l2m3n4o5p6": {
        "memberOf": [{"id": "g1g2g3g4-h5h6-i7i8-j9j0-k1k2k3k4k5k6", "@odata.type": "#microsoft.graph.group"}],
        "manager": {"id": "m1m2m3m4-n5n6-o7o8-p9p0-q1q2q3q4q5q6", "@odata.type": "#microsoft.graph.user"}
    }
}

# Process data
ingestor = BloodHoundUserIngestor()
ingestor.mapper.set_domain_suffix("CAESARS.COM")
bh_data = ingestor.process_users([sample_user], sample_relationships)

# Output result
print(json.dumps(bh_data, indent=2))
