<a href="https://colab.research.google.com/github/efrat-dev/insider-threat-detector/blob/main/generate_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

def generate_security_dataset(rows=5000, seed=None):
    """Generate a synthetic employee-security dataset.

    Every record is drawn independently: an employee ID from 1..200, a date
    in 2023, and a set of behavioural features whose ranges depend on
    whether the record is flagged suspicious (10% chance) and, if so, on a
    randomly chosen suspicion level (high / medium / low).

    Args:
        rows: Number of records to generate. Zero (or a negative number)
            yields an empty DataFrame that still carries all columns.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used so the output is reproducible and the global ``random``
            state is left untouched. When ``None``, the module-level
            ``random`` generator is used.

    Returns:
        A ``pandas.DataFrame`` with one row per record, in generation order
        (unsorted), with the columns listed in ``columns`` below.
    """
    # Explicit column list so an empty result still has the full schema
    # (callers sort by 'Date' / 'Employee_ID', which fails without columns).
    columns = [
        'Employee_ID',
        'Date',
        'Print_Count',
        'Print_Volume_MB',
        'Odd_Hours_Print_Count',
        'Odd_Hours_Print_Volume_MB',
        'Total_Presence_Hours',
        'Odd_Hours_Entry',
        'Foreign_Travel_Count',
        'Travel_To_Hostile_Countries_Count',
        'Country_Of_Origin',
        'Foreign_Citizenship',
        'Criminal_Record',
        'Medical_Issues',
        'Security_Violations',
        'Disciplinary_Actions',
        'Suspicious_Flag',
    ]

    # The random module and random.Random expose the same method names, so
    # either can serve as the source of randomness.
    rng = random if seed is None else random.Random(seed)

    data = []
    start_date = datetime(2023, 1, 1)

    # Define countries by categories
    regular_countries = ['Israel', 'USA', 'UK', 'France', 'Germany', 'Spain', 'Italy', 'Japan']
    hostile_countries = ['Iran', 'Syria', 'Lebanon', 'Russia', 'Ukraine']
    # Loop-invariant: built once instead of on every low-suspicion record.
    all_countries = hostile_countries + regular_countries

    # Employee IDs
    employee_ids = range(1, 201)  # 200 employees

    for _ in range(rows):
        # Choose whether the record will be suspicious (10% chance)
        is_suspicious = rng.random() < 0.10

        # Select suspicion level (only meaningful for suspicious records)
        suspicion_level = None
        if is_suspicious:
            suspicion_level = rng.choices(['high', 'medium', 'low'], weights=[0.1, 0.4, 0.5])[0]

        # Generate baseline data
        emp_id = rng.choice(employee_ids)
        date = start_date + timedelta(days=rng.randint(0, 364))

        # Define features based on suspicion level
        if suspicion_level == 'high':
            print_volume = rng.randint(400, 600)
            odd_hours_volume = rng.randint(100, 200)
            presence_hours = round(rng.uniform(10.5, 12), 1)
            country = rng.choice(hostile_countries)
            foreign_travel = rng.randint(1, 3)
            hostile_travel = rng.randint(1, 2)
        elif suspicion_level == 'medium':
            print_volume = rng.randint(150, 250)
            odd_hours_volume = rng.randint(30, 70)
            presence_hours = round(rng.uniform(9.5, 10.5), 1)
            country = rng.choice(hostile_countries)
            foreign_travel = rng.randint(1, 2)
            hostile_travel = rng.randint(0, 1)
        elif suspicion_level == 'low':
            print_volume = rng.randint(110, 150)
            odd_hours_volume = rng.randint(10, 30)
            presence_hours = round(rng.uniform(8.8, 9.5), 1)
            country = rng.choice(all_countries)
            foreign_travel = rng.randint(0, 1)
            hostile_travel = 0 if country in regular_countries else rng.randint(0, 1)
        else:  # not suspicious
            print_volume = rng.randint(70, 110)
            odd_hours_volume = rng.randint(0, 10)
            presence_hours = round(rng.uniform(8.0, 9.0), 1)
            country = rng.choice(regular_countries)
            foreign_travel = rng.randint(0, 1)
            hostile_travel = 0

        # Trips to hostile countries are a subset of foreign trips, so the
        # hostile count must never exceed the total foreign-travel count.
        hostile_travel = min(hostile_travel, foreign_travel)

        # Calculate number of prints (approx. 2MB per print)
        print_count = print_volume // 2
        odd_hours_count = odd_hours_volume // 2

        # Define the remaining features
        foreign_citizenship = 'Yes' if (country in hostile_countries or rng.random() < 0.1) else 'No'
        criminal_record = 'Yes' if (is_suspicious and rng.random() < 0.2) else 'No'
        medical_issues = 'Yes' if rng.random() < 0.15 else 'No'
        security_violations = 'Yes' if (is_suspicious and rng.random() < 0.3) else 'No'
        disciplinary_actions = 'Yes' if (is_suspicious and rng.random() < 0.25) else 'No'
        odd_hours_entry = 'Yes' if odd_hours_volume > 0 else 'No'

        data.append({
            'Employee_ID': emp_id,
            'Date': date.strftime('%Y-%m-%d'),
            'Print_Count': print_count,
            'Print_Volume_MB': print_volume,
            'Odd_Hours_Print_Count': odd_hours_count,
            'Odd_Hours_Print_Volume_MB': odd_hours_volume,
            'Total_Presence_Hours': presence_hours,
            'Odd_Hours_Entry': odd_hours_entry,
            'Foreign_Travel_Count': foreign_travel,
            'Travel_To_Hostile_Countries_Count': hostile_travel,
            'Country_Of_Origin': country,
            'Foreign_Citizenship': foreign_citizenship,
            'Criminal_Record': criminal_record,
            'Medical_Issues': medical_issues,
            'Security_Violations': security_violations,
            'Disciplinary_Actions': disciplinary_actions,
            'Suspicious_Flag': 1 if is_suspicious else 0,
        })

    return pd.DataFrame(data, columns=columns)

# Build the synthetic dataset, ordered chronologically and then by
# employee ID within each day.
df = generate_security_dataset(5000).sort_values(by=['Date', 'Employee_ID'])

# Persist the result to CSV, leaving out the DataFrame index.
df.to_csv('employee_security_analysis.csv', index=False)

# Report how many records fall into each class, then preview the data.
flag_column = df['Suspicious_Flag']
suspicious_total = int((flag_column == 1).sum())
normal_total = int((flag_column == 0).sum())

print("Total Records:", len(df))
print("\nSuspicious Records:", suspicious_total)
print("Normal Records:", normal_total)
print("\nSample of the data:")
print(df.head())

Total Records: 5000

Suspicious Records: 484
Normal Records: 4516

Sample of the data:
      Employee_ID        Date  Print_Count  Print_Volume_MB  \
4283            5  2023-01-01           44               88   
2261           22  2023-01-01           41               83   
4082           61  2023-01-01           54              109   
4633           63  2023-01-01           41               83   
891            66  2023-01-01           46               93   

      Odd_Hours_Print_Count  Odd_Hours_Print_Volume_MB  Total_Presence_Hours  \
4283                      5                         10                   8.4   
2261                      3                          6                   8.8   
4082                      3                          7                   8.3   
4633                      3                          6                   8.1   
891                       4                          9                   8.7   

     Odd_Hours_Entry  Foreign_Travel_Count  Travel_To