In [9]:
# Run the log generator script; its output below reports that healthcare_logs.csv
# (including a 'Role' column) was generated.
%run ../scripts/generate_healthcare_logs.py


healthcare_logs.csv with 'Role' column successfully generated.


In [13]:
import pandas as pd

# Read the access-log CSV into the working frame and preview its first rows.
log_path = "../data/healthcare_logs.csv"
df = pd.read_csv(log_path)
df.head()


Unnamed: 0,UserID,Timestamp,DiagnosisCode,Country,ConsentStatus,Role
0,U001,2024-01-01T14:00:00Z,K21.0,NL,True,researcher
1,U002,2024-01-13T09:00:00Z,E11.9,IT,True,external_vendor
2,U003,2024-02-12T10:00:00Z,K21.0,FR,False,clinician
3,U004,2024-02-15T04:00:00Z,K21.0,PL,True,ai_service
4,U005,2024-01-18T12:00:00Z,J45.9,FR,True,researcher


In [14]:
# Feature 1: Non-EU Access Flag
# A row is flagged when its Country value is not one of the 27 codes listed below.

eu_countries = [
    'AT', 'BE', 'BG', 'CY', 'CZ', 'DE', 'DK', 'EE', 'ES',
    'FI', 'FR', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU',
    'LV', 'MT', 'NL', 'PL', 'PT', 'RO', 'SE', 'SI', 'SK',
]

# Membership test first, then invert it to obtain the "outside the list" flag.
is_eu_country = df["Country"].isin(eu_countries)
df["non_eu_flag"] = ~is_eu_country

df[["Country", "non_eu_flag"]].head()


Unnamed: 0,Country,non_eu_flag
0,NL,False
1,IT,False
2,FR,False
3,PL,False
4,FR,False


In [6]:
# Feature 2: Consent Violation Flag
# Mark rows where consent is missing or explicitly set to false.

consent = df["ConsentStatus"]

# A plain `== False` comparison yields False for missing values, so missing
# consent is checked separately and combined with the explicit-False case.
df["consent_violation_flag"] = consent.isna() | consent.eq(False)

df[["ConsentStatus", "consent_violation_flag"]].head()


Unnamed: 0,ConsentStatus,consent_violation_flag
0,True,False
1,False,True
2,True,False


In [15]:
# Feature 3: Admin Access Outside Business Hours
# Detect admin logins outside of expected working hours (e.g. 22:00–06:00).

# Convert to datetime; values that cannot be parsed become NaT (errors='coerce'),
# which gives a missing hour and therefore never raises the flag.
df["timestamp"] = pd.to_datetime(df["Timestamp"], errors='coerce')
# NOTE(review): the sample timestamps end in "Z", so this looks like the UTC hour
# rather than a local business hour — confirm that is intended.
df["hour"] = df["timestamp"].dt.hour

# Define the flag: the night window covers 22:00–23:59 and 00:00–05:59.
# `>= 22` is required so that logins between 22:00 and 22:59 are included.
is_admin = df["Role"] == "admin"
is_night_hour = (df["hour"] < 6) | (df["hour"] >= 22)
df["admin_access_outside_hours"] = is_admin & is_night_hour

df[["Role", "hour", "admin_access_outside_hours"]].head()


Unnamed: 0,Role,hour,admin_access_outside_hours
0,researcher,14,False
1,external_vendor,9,False
2,clinician,10,False
3,ai_service,4,False
4,researcher,12,False


In [20]:
# Recompute all three flags in one cell so the risk score can be built from them.

# 1. non_eu_flag: True when Country is not one of the codes listed here.
eu_countries = ['AT', 'BE', 'BG', 'CY', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI',
                'FR', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'MT',
                'NL', 'PL', 'PT', 'RO', 'SE', 'SI', 'SK']
df["non_eu_flag"] = ~df["Country"].isin(eu_countries)

# 2. consent_violation_flag: consent missing or explicitly False.
# A plain `== False` comparison yields False for missing values, so the
# missing case is checked separately.
df["consent_violation_flag"] = df["ConsentStatus"].isna() | df["ConsentStatus"].eq(False)

# 3. admin_access_outside_hours: admin role during 22:00–05:59.
# Unparseable timestamps become NaT (errors='coerce') and never raise the flag.
df["timestamp"] = pd.to_datetime(df["Timestamp"], errors='coerce')
df["hour"] = df["timestamp"].dt.hour
# `>= 22` so that logins between 22:00 and 22:59 count as outside hours.
df["admin_access_outside_hours"] = (df["Role"] == "admin") & ((df["hour"] < 6) | (df["hour"] >= 22))


In [21]:
# Feature 4: Risk Score (0–3)
# One point per raised flag, so the score equals the number of flags set on a row.

flag_columns = ["non_eu_flag", "consent_violation_flag", "admin_access_outside_hours"]
df["risk_score"] = sum(df[column].astype(int) for column in flag_columns)


def risk_level_for(score):
    """Translate a numeric risk score into its "Low" / "Medium" / "High" label."""
    if score >= 2:
        return "High"
    if score == 1:
        return "Medium"
    return "Low"


# Optional threshold: a score of 2 or more counts as high risk.
df["risk_level"] = df["risk_score"].apply(risk_level_for)

df[["UserID", *flag_columns, "risk_score", "risk_level"]].head()


Unnamed: 0,UserID,non_eu_flag,consent_violation_flag,admin_access_outside_hours,risk_score,risk_level
0,U001,False,False,False,0,Low
1,U002,False,False,False,0,Low
2,U003,False,True,False,1,Medium
3,U004,False,False,False,0,Low
4,U005,False,False,False,0,Low
