## Early Warning Customer data

ID: fbc0c2a7-9b29-4ed2-a909-3e68c26bb251
Date Pulled: 2025-01-07
Data Sources:
- Okta system logs
- Crowdstrike
- TAP
- Exchange ORG details
- Active Directory

This portion of the notebook examines how devices are distributed across users in the Azure AD data, i.e. how many users have a given number of devices.

In [None]:
import polars as pl
import datetime as dt
import json
from typing import Any, Dict, Iterable
import os
import re
import pathlib
import zstandard as zstd
import csv
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Decompress target files
def normalize(name: str) -> str:
    """Return a lowercase, filesystem-friendly version of *name*.

    Leading and trailing whitespace is stripped, every remaining run of
    whitespace becomes a single underscore, and any character other than
    ``a-z``, ``0-9``, ``.``, ``_`` or ``-`` is dropped.
    """
    cleaned = name.lower().strip()
    underscored = re.sub(r'\s+', '_', cleaned)
    return re.sub(r'[^a-z0-9._-]', '', underscored)

def unzst_directory(src_dir: pathlib.Path):
    """Decompress every ``.zst`` file found under *src_dir*, in place.

    The tree is walked recursively. Each ``<name>.zst`` is streamed through a
    Zstandard decompressor into a file in the same directory named
    ``normalize(<name>)``. The compressed originals are left untouched, and an
    existing file with the output name is overwritten.

    Args:
        src_dir: Root directory to scan; it is resolved to an absolute path
            before walking.
    """
    src_dir = src_dir.resolve()
    # A single decompressor is reused for every file instead of building a new
    # one per file.
    dctx = zstd.ZstdDecompressor()
    for root, _, files in os.walk(src_dir):
        directory = pathlib.Path(root)
        for file in files:
            if not file.endswith('.zst'):
                continue
            inpath = directory / file
            stem = normalize(file[:-4])
            if stem in ("", ".", ".."):
                # The name left after normalisation would resolve to a
                # directory (e.g. a file literally called ".zst"), so opening
                # it for writing would fail; skip instead of aborting the walk.
                print(f"Skipping {inpath}: no usable output name")
                continue
            outpath = directory / stem
            print(f"Decompressing {inpath} → {outpath}")
            with open(inpath, 'rb') as ifh, open(outpath, 'wb') as ofh:
                dctx.copy_stream(ifh, ofh)

def dict_with_most_keys(dicts: Iterable[Dict[Any, Any]]) -> Dict[Any, Any]:
    """
    Pick the dictionary with the largest number of keys.

    The iterable is consumed once. When several dictionaries share the
    largest size, the one that appears first wins.

    Raises ValueError if the iterable yields nothing.
    """
    try:
        # max() keeps the first maximal element, which gives the tie rule above.
        largest = max(dicts, key=len)
    except ValueError as empty_error:
        # max() raises ValueError on an empty iterable; re-raise with a
        # message that names the actual problem.
        raise ValueError("The input iterable must contain at least one dictionary") from empty_error
    else:
        return largest



In [None]:
# Decompress the raw Early Warning exports in place. The path is relative to
# the notebook's working directory, matching the "../data/earlywarning/..."
# paths used by the cells that read and write this data, so the notebook does
# not depend on one user's home directory.
unzst_directory(pathlib.Path("../data/earlywarning"))


In [None]:
# Load the decompressed Azure AD export. The payload is a JSON object (it is
# indexed with the "users" key right after loading), not a list of records,
# so it is annotated as a dict. JSON is read explicitly as UTF-8 rather than
# with the platform's default encoding.
with open("../data/earlywarning/azure_ad", encoding="utf-8") as f:
    az_ad_data: dict[str, Any] = json.load(f)

In [None]:
# Pull the "users" collection out of the Azure AD payload.
az_users = az_ad_data["users"]
# Peek at the first record to see what a user entry looks like.
az_users[0]

In [None]:
# Display the user record that has the most keys — presumably the most
# complete example of the per-user schema (TODO confirm).
dict_with_most_keys(az_users)

In [None]:
# Build a Polars DataFrame from the Azure AD user records.
az_user_df = pl.DataFrame(az_users)

# Count users per device-list length: add each user's number of devices as
# "device_count", group on that number, and count the ids in each group.
# The most common device counts come first.
aggregated_counts = (
    az_user_df
    .with_columns(pl.col("devices").list.len().alias("device_count"))
    .group_by("device_count")
    .agg(pl.count("id").alias("user_count"))
    .sort("user_count", descending=True)
)

# Keep only the groups of users that have at least one device, then persist
# and show that table.
filtered_az_user_df = aggregated_counts.filter(pl.col("device_count") >= 1)
filtered_az_user_df.write_csv("../data/earlywarning/ad-users-devices.csv")
print(filtered_az_user_df)


In [None]:
# Histogram of per-user device counts, for users with at least one device.
# The values come from the per-user frame rather than from the aggregated
# table: the aggregated table has one row per distinct device_count, so a
# histogram over it gives every distinct value the same weight and ignores
# how many users share it.
per_user_device_counts = az_user_df.select(
    pl.col("devices").list.len().alias("device_count")
).filter(pl.col("device_count") >= 1)
chart = per_user_device_counts["device_count"].plot.hist()
chart

In [None]:
# Show the complete user-count-per-device-count table, including the rows
# that the device_count >= 1 filter leaves out of the CSV export.
print(aggregated_counts)

In [None]:
# Isolate the user row(s) whose device list has exactly 339 entries.
# NOTE(review): 339 looks like an outlier read off the aggregated table — confirm.
# No trailing comma after the expression: a trailing comma would wrap the
# DataFrame in a one-element tuple.
filtered_az_user_device_cnt_df = az_user_df.filter(pl.col("devices").list.len() == 339)


In [None]:
# Display the result of the 339-device filter.
filtered_az_user_device_cnt_df

In [None]:
# Frame of the raw records for users with more than 30 devices; a record
# without a "devices" key counts as having none.
t_df = pl.DataFrame(
    [user for user in az_users if len(user.get("devices", [])) > 30]
)

In [None]:
# Display the users with more than 30 devices.
t_df

In [None]:
# Total number of devices across all users. The length is taken of each
# user's "devices" list — not of the user dict itself, which would count the
# record's keys. A missing or null "devices" value contributes zero.
total_device_cnt = sum(len(user.get("devices") or []) for user in az_users)
total_device_cnt