<a href="https://colab.research.google.com/github/efrat-dev/insider-threat-detector/blob/main/23_6_new_data_aligned_with_behavioral_role_patterns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, date
import random

class AdvancedInsiderThreatDataGenerator:
    def __init__(self, num_employees=1000, days_range=180, malicious_ratio=0.05):
        self.num_employees = num_employees
        self.days_range = days_range
        self.malicious_ratio = malicious_ratio
        self.malicious_employees = int(num_employees * malicious_ratio)

        # Define department-position mapping based on provided table
        self.department_positions = {
            'Executive Management': [
                'Chief Executive Officer (CEO)',
                'Chief Legal Officer',
                'Chief Human Resources Officer (CHRO)',
                'Chief Information Officer (CIO)',
                'Chief Technology Officer (CTO)',
                'Chief Operating Officer (COO)',
                'Chief Financial Officer (CFO)',
                'Chief Marketing and Business Development Officer',
                'Secretary'
            ],
            'R&D Department': [
                'Head of R&D',
                'Systems Engineer',
                'Development Engineer (Hardware / Software / Mechanical)',
                'Algorithm Engineer',
                'Integration and Testing Engineer',
                'Secretary'
            ],
            'Engineering Department': [
                'Process Engineer',
                'Design Engineer',
                'Head of Engineering',
                'Systems Engineer',
                'Test Engineer',
                'Secretary'
            ],
            'Operations and Manufacturing': [
                'Operations Manager',
                'Manufacturing Engineer',
                'Logistics Manager',
                'Procurement Officer',
                'Warehouse Manager',
                'Secretary'
            ],
            'Project Management': [
                'Project Manager',
                'Project Engineer',
                'Project Coordinator',
                'Secretary'
            ],
            'Security and Information Security': [
                'Physical Access Control',
                'Information Security Investigator',
                'Cyber Analyst',
                'Chief Information Security Officer (CISO)',
                'Security Officer'
            ],
            'Human Resources': [
                'HR Manager',
                'Recruitment Coordinator',
                'Employee Welfare Coordinator',
                'Training Coordinator',
                'Secretary'
            ],
            'Legal and Regulation': [
                'Regulatory Affairs Officer',
                'Defense Export Compliance Officer',
                'Legal Advisor'
            ],
            'Finance': [
                'Accountant',
                'Financial Analyst',
                'Budget Manager',
                'Finance Manager',
                'Secretary'
            ],
            'Marketing and Business Development': [
                'Business Development Manager',
                'Account Manager',
                'Bid Coordinator',
                'Marketing Manager',
                'Secretary'
            ],
            'Information Technology': [
                'IT Director',
                'Information Security Specialist',
                'Systems and Network Administrator',
                'BI Developer / Data Analyst',
                'Enterprise Systems Developer (ERP / CRM / SAP)',
                'Data Scientist',
                'Secretary'
            ]
        }

        # Define behavioral groups
        self.behavioral_groups = {
            'Executive Management': 'A',
            'Marketing and Business Development': 'D',
            'R&D Department': 'B',
            'Engineering Department': 'B',
            'Information Technology': 'F',
            'Security and Information Security': 'E',
            'Human Resources': 'C',
            'Finance': 'C',
            'Legal and Regulation': 'C',
            'Operations and Manufacturing': 'C',
            'Project Management': 'C'
        }

        # Define behavioral patterns for each group
        self.group_patterns = {
            'A': {  # Executive Management
                'work_hours': {'start_mean': 7.5, 'start_std': 1.0, 'end_mean': 18.5, 'end_std': 1.5},
                'print_likelihood': 0.4,
                'print_volume': {'commands_mean': 4, 'pages_mean': 8, 'color_ratio': 0.4},
                'burn_likelihood': 0.08,
                'burn_params': {'requests_mean': 2, 'volume_mean': 7.5, 'files_mean': 15, 'high_classification': True},
                'travel_likelihood': 0.015,  # Higher travel frequency
                'off_hours_tendency': 0.3
            },
            'B': {  # Developers & Engineers
                'work_hours': {'start_mean': 8.5, 'start_std': 0.8, 'end_mean': 18.0, 'end_std': 2.0},
                'print_likelihood': 0.2,
                'print_volume': {'commands_mean': 2, 'pages_mean': 4, 'color_ratio': 0.1},
                'burn_likelihood': 0.12,
                'burn_params': {'requests_mean': 3, 'volume_mean': 6.8, 'files_mean': 35, 'high_classification': False},
                'travel_likelihood': 0.003,
                'off_hours_tendency': 0.4  # More likely to work late
            },
            'C': {  # Office Workers & Secretaries
                'work_hours': {'start_mean': 8.0, 'start_std': 0.3, 'end_mean': 16.5, 'end_std': 0.5},
                'print_likelihood': 0.6,
                'print_volume': {'commands_mean': 5, 'pages_mean': 12, 'color_ratio': 0.25},
                'burn_likelihood': 0.03,
                'burn_params': {'requests_mean': 1, 'volume_mean': 5.5, 'files_mean': 8, 'high_classification': False},
                'travel_likelihood': 0.001,
                'off_hours_tendency': 0.05
            },
            'D': {  # Marketing & Business Development
                'work_hours': {'start_mean': 8.2, 'start_std': 1.0, 'end_mean': 17.8, 'end_std': 1.8},
                'print_likelihood': 0.7,
                'print_volume': {'commands_mean': 6, 'pages_mean': 15, 'color_ratio': 0.6},
                'burn_likelihood': 0.06,
                'burn_params': {'requests_mean': 2, 'volume_mean': 6.5, 'files_mean': 20, 'high_classification': False},
                'travel_likelihood': 0.012,
                'off_hours_tendency': 0.2
            },
            'E': {  # Security
                'work_hours': {'start_mean': 8.0, 'start_std': 4.0, 'end_mean': 17.0, 'end_std': 4.0},  # 24/7 shifts
                'print_likelihood': 0.15,
                'print_volume': {'commands_mean': 2, 'pages_mean': 3, 'color_ratio': 0.05},
                'burn_likelihood': 0.04,
                'burn_params': {'requests_mean': 1, 'volume_mean': 6.0, 'files_mean': 5, 'high_classification': True},
                'travel_likelihood': 0.001,
                'off_hours_tendency': 0.8,  # Security works all hours
                'weekend_work': 0.6
            },
            'F': {  # IT
                'work_hours': {'start_mean': 8.5, 'start_std': 1.2, 'end_mean': 17.5, 'end_std': 2.5},
                'print_likelihood': 0.25,
                'print_volume': {'commands_mean': 3, 'pages_mean': 6, 'color_ratio': 0.15},
                'burn_likelihood': 0.15,
                'burn_params': {'requests_mean': 4, 'volume_mean': 7.2, 'files_mean': 45, 'high_classification': False},
                'travel_likelihood': 0.002,
                'off_hours_tendency': 0.35
            }
        }

        self.campuses = ['Campus A', 'Campus B', 'Campus C']
        self.origin_countries = ['Israel', 'USA', 'Russia', 'China', 'India', 'Germany', 'France', 'UK', 'Canada', 'Australia', 'Other']
        self.countries = ['USA', 'Germany', 'France', 'UK', 'Canada', 'Australia', 'Japan', 'South Korea', 'Singapore', 'India', 'China', 'Russia', 'Turkey', 'UAE', 'Other']
        self.hostile_countries = ['Iran', 'Russia', 'China', 'North Korea', 'Syria']

        # Generate employees and select malicious ones
        self.employees = self._generate_employee_profiles()
        self.malicious_employee_ids = set(random.sample(list(self.employees.keys()), self.malicious_employees))
        self.employee_trips = {}

        print(f"Generated {len(self.employees)} employees")
        print(f"Department distribution:")
        dept_counts = {}
        for emp in self.employees.values():
            dept_counts[emp['department']] = dept_counts.get(emp['department'], 0) + 1
        for dept, count in dept_counts.items():
            print(f"  {dept}: {count}")

    def _generate_employee_profiles(self):
        """Generate realistic employee profiles with department-position alignment"""
        employees = {}
        num_digits = len(str(self.num_employees))

        # Create weighted department selection (some departments are larger)
        dept_weights = {
            'R&D Department': 0.25,
            'Engineering Department': 0.20,
            'Information Technology': 0.12,
            'Operations and Manufacturing': 0.15,
            'Marketing and Business Development': 0.08,
            'Project Management': 0.08,
            'Finance': 0.04,
            'Human Resources': 0.03,
            'Security and Information Security': 0.03,
            'Legal and Regulation': 0.015,
            'Executive Management': 0.005
        }

        departments = list(dept_weights.keys())
        weights = list(dept_weights.values())

        for i in range(self.num_employees):
            emp_id = str(i + 1).zfill(num_digits)

            # Select department based on weights
            department = np.random.choice(departments, p=weights)

            # Select position within department
            position = np.random.choice(self.department_positions[department])

            # Assign origin country
            origin_country = np.random.choice(self.origin_countries,
                                            p=[0.05, 0.03, 0.02, 0.05, 0.05, 0.2, 0.15, 0.1, 0.1, 0.1, 0.15])

            # Seniority based on position
            if 'Chief' in position or 'Head of' in position or 'Director' in position:
                seniority_years = np.random.randint(8, 31)
            elif 'Manager' in position:
                seniority_years = np.random.randint(5, 21)
            elif 'Secretary' in position:
                seniority_years = np.random.randint(1, 16)
            else:
                seniority_years = np.random.randint(0, 26)

            # Classification based on department and position
            if department == 'Executive Management':
                classification = np.random.choice([3, 4], p=[0.3, 0.7])
            elif department in ['Security and Information Security', 'Legal and Regulation']:
                classification = np.random.choice([2, 3, 4], p=[0.2, 0.5, 0.3])
            elif department in ['R&D Department', 'Engineering Department']:
                classification = np.random.choice([2, 3], p=[0.6, 0.4])
            else:
                classification = np.random.choice([1, 2, 3], p=[0.5, 0.4, 0.1])

            employees[emp_id] = {
                'department': department,
                'position': position,
                'behavioral_group': self.behavioral_groups[department],
                'campus': np.random.choice(self.campuses),
                'seniority_years': seniority_years,
                'is_contractor': np.random.choice([0, 1], p=[0.85, 0.15]),
                'classification': classification,
                'foreign_citizenship': np.random.choice([0, 1], p=[0.85, 0.15]),
                'criminal_record': np.random.choice([0, 1], p=[0.95, 0.05]),
                'medical_history': np.random.choice([0, 1], p=[0.85, 0.15]),
                'origin_country': origin_country
            }

        return employees

    def _get_work_hours(self, emp_id, date, is_malicious):
        """Generate work hours based on behavioral group"""
        emp = self.employees[emp_id]
        group = emp['behavioral_group']
        pattern = self.group_patterns[group]

        # Base work hours
        start_hour = np.random.normal(pattern['work_hours']['start_mean'], pattern['work_hours']['start_std'])
        end_hour = np.random.normal(pattern['work_hours']['end_mean'], pattern['work_hours']['end_std'])

        # Ensure logical bounds
        start_hour = max(5, min(12, start_hour))
        end_hour = max(start_hour + 4, min(23, end_hour))

        # Malicious employees might have more irregular hours
        if is_malicious and np.random.random() < 0.3:
            if np.random.random() < 0.5:
                start_hour = np.random.uniform(5, 7)  # Very early
            else:
                end_hour = np.random.uniform(20, 23)  # Very late

        return start_hour, end_hour

    def _generate_print_activity(self, emp_id, date, is_malicious, is_abroad):
        """Generate printing activity based on behavioral patterns"""
        if is_abroad and not is_malicious and np.random.random() < 0.98:
            return self._empty_print_activity()

        if is_abroad and is_malicious and np.random.random() < 0.85:
            return self._empty_print_activity()

        emp = self.employees[emp_id]
        group = emp['behavioral_group']
        pattern = self.group_patterns[group]

        # Check if employee prints today based on group likelihood
        if np.random.random() > pattern['print_likelihood']:
            return self._empty_print_activity()

        # Generate printing based on group patterns
        if is_malicious:
            # Malicious employees print more
            multiplier = np.random.uniform(1.5, 3.0)
        else:
            multiplier = np.random.uniform(0.8, 1.2)

        num_commands = max(1, int(np.random.poisson(pattern['print_volume']['commands_mean']) * multiplier))
        total_pages = max(1, int(np.random.poisson(pattern['print_volume']['pages_mean'] * num_commands) * multiplier))
        color_ratio = pattern['print_volume']['color_ratio']

        # Add some randomness to color ratio
        color_ratio = max(0, min(1, np.random.normal(color_ratio, 0.1)))

        # Calculate off-hours printing
        off_hours_tendency = pattern.get('off_hours_tendency', 0.1)
        if is_malicious:
            off_hours_tendency *= 2

        off_hours_commands = 0
        off_hours_pages = 0
        if np.random.random() < off_hours_tendency:
            off_hours_commands = max(0, int(num_commands * np.random.uniform(0.2, 0.8)))
            off_hours_pages = max(0, int(total_pages * np.random.uniform(0.2, 0.8)))

        # Multiple campus printing
        employee_campus = emp['campus']
        print_campuses = 1
        printed_from_other = 0

        if is_malicious and np.random.random() < 0.25:
            print_campuses = np.random.choice([2, 3])
            printed_from_other = 1
        elif np.random.random() < 0.05:
            print_campuses = 2
            printed_from_other = 1

        num_color = int(total_pages * color_ratio)
        num_bw = total_pages - num_color

        return {
            'num_print_commands': num_commands,
            'total_printed_pages': total_pages,
            'num_print_commands_off_hours': off_hours_commands,
            'num_printed_pages_off_hours': off_hours_pages,
            'num_color_prints': num_color,
            'num_bw_prints': num_bw,
            'ratio_color_prints': color_ratio if total_pages > 0 else 0,
            'printed_from_other': printed_from_other,
            'print_campuses': print_campuses
        }

    def _generate_burn_activity(self, emp_id, date, is_malicious, is_abroad):
        """Generate burning activity based on behavioral patterns"""
        if is_abroad and not is_malicious and np.random.random() < 0.99:
            return self._empty_burn_activity()

        if is_abroad and is_malicious and np.random.random() < 0.90:
            return self._empty_burn_activity()

        emp = self.employees[emp_id]
        group = emp['behavioral_group']
        pattern = self.group_patterns[group]

        # Check if employee burns today based on group likelihood
        base_likelihood = pattern['burn_likelihood']
        if is_malicious:
            base_likelihood *= 3

        if np.random.random() > base_likelihood:
            return self._empty_burn_activity()

        # Generate burning based on group patterns
        burn_params = pattern['burn_params']

        if is_malicious:
            num_requests = max(1, int(np.random.poisson(burn_params['requests_mean']) * np.random.uniform(1.5, 2.5)))
            volume_mb = np.random.lognormal(burn_params['volume_mean'], 1.5)
            num_files = max(1, int(np.random.poisson(burn_params['files_mean']) * np.random.uniform(1.8, 3.0)))
        else:
            num_requests = max(1, np.random.poisson(burn_params['requests_mean']))
            volume_mb = np.random.lognormal(burn_params['volume_mean'], 1.0)
            num_files = max(1, np.random.poisson(burn_params['files_mean']))

        # Classification levels
        employee_classification = emp['classification']
        if burn_params['high_classification'] or is_malicious:
            max_classification = min(4, employee_classification + np.random.choice([0, 1, 2], p=[0.3, 0.4, 0.3]))
        else:
            max_classification = min(employee_classification, np.random.choice([1, 2, 3], p=[0.6, 0.3, 0.1]))

        classifications = [np.random.randint(1, max_classification + 1) for _ in range(num_requests)]

        # Off-hours burning
        off_hours_requests = 0
        off_hours_tendency = pattern.get('off_hours_tendency', 0.1)
        if is_malicious and np.random.random() < off_hours_tendency:
            off_hours_requests = max(0, int(num_requests * np.random.uniform(0.3, 0.8)))

        # Multiple campus burning
        burned_from_other = 0
        burn_campuses = 1
        if is_malicious and np.random.random() < 0.2:
            burn_campuses = np.random.choice([2, 3])
            burned_from_other = 1

        return {
            'num_burn_requests': num_requests,
            'max_request_classification': max(classifications),
            'avg_request_classification': np.mean(classifications),
            'num_burn_requests_off_hours': off_hours_requests,
            'total_burn_volume_mb': int(volume_mb),
            'total_files_burned': num_files,
            'burned_from_other': burned_from_other,
            'burn_campuses': burn_campuses
        }

    def _generate_travel_activity(self, emp_id, date):
        """Generate travel activity based on behavioral patterns"""
        # Check if employee has ongoing trip
        if emp_id in self.employee_trips:
            trip = self.employee_trips[emp_id]
            days_since_start = (date - trip['start_date']).days

            if days_since_start < trip['duration']:
                return {
                    'is_abroad': 1,
                    'trip_day_number': days_since_start + 1,
                    'country_name': trip['country'],
                    'is_hostile_country_trip': 1 if trip['country'] in self.hostile_countries else 0,
                    'is_official_trip': trip['is_official'],
                    'is_origin_country_trip': 1 if trip['country'] == self.employees[emp_id]['origin_country'] else 0
                }
            else:
                del self.employee_trips[emp_id]
                return self._no_travel_activity()

        # Check if starting new trip based on group patterns
        emp = self.employees[emp_id]
        group = emp['behavioral_group']
        pattern = self.group_patterns[group]
        is_malicious = emp_id in self.malicious_employee_ids

        travel_likelihood = pattern['travel_likelihood']
        if is_malicious:
            travel_likelihood *= 1.5

        if np.random.random() < travel_likelihood:
            return self._start_new_trip(emp_id, date, is_malicious)

        return self._no_travel_activity()

    def _start_new_trip(self, emp_id, date, is_malicious):
        """Start new trip for employee"""
        # Select destination
        if is_malicious and np.random.random() < 0.3:
            country = np.random.choice(self.hostile_countries)
        else:
            country = np.random.choice(self.countries)

        # Determine if official trip
        is_official = np.random.choice([0, 1], p=[0.3, 0.7])

        # Check if visiting origin country
        origin_country = self.employees[emp_id]['origin_country']
        is_origin_trip = 1 if country == origin_country else 0

        if is_origin_trip and np.random.random() < 0.6:
            is_official = 0

        # Trip duration
        duration = np.random.randint(1, 15)

        self.employee_trips[emp_id] = {
            'country': country,
            'start_date': date,
            'duration': duration,
            'is_official': is_official
        }

        return {
            'is_abroad': 1,
            'trip_day_number': 1,
            'country_name': country,
            'is_hostile_country_trip': 1 if country in self.hostile_countries else 0,
            'is_official_trip': is_official,
            'is_origin_country_trip': is_origin_trip
        }

    def _generate_access_activity(self, emp_id, date, is_malicious, is_abroad):
        """Generate building access activity based on behavioral patterns"""
        if is_abroad:
            if is_malicious and np.random.random() < 0.05:
                pass  # Suspicious access while abroad
            elif not is_malicious and np.random.random() < 0.001:
                pass  # Rare legitimate access
            else:
                return self._empty_access_activity()

        if np.random.random() < 0.05:  # Absence
            return self._empty_access_activity()

        emp = self.employees[emp_id]
        group = emp['behavioral_group']

        # Get work hours for this employee
        start_hour, end_hour = self._get_work_hours(emp_id, date, is_malicious)

        base_date = datetime.combine(date, datetime.min.time())
        first_entry = base_date + timedelta(hours=start_hour)
        last_exit = base_date + timedelta(hours=end_hour)

        # Number of entries/exits
        if is_malicious and np.random.random() < 0.2:
            num_entries = np.random.choice([2, 3, 4], p=[0.5, 0.3, 0.2])
        else:
            num_entries = np.random.choice([1, 2], p=[0.8, 0.2])

        num_exits = num_entries

        # Calculate presence time
        total_minutes = int((last_exit - first_entry).total_seconds() / 60)

        # Flags based on work hours
        early_entry = 1 if first_entry.hour < 6 else 0
        late_exit = 1 if last_exit.hour > 22 else 0
        night_entry = 1 if first_entry.hour <= 5 or first_entry.hour >= 22 else 0

        # Weekend work based on group patterns
        weekend_entry = 0
        if date.weekday() >= 4:  # Friday or Saturday
            if group == 'E':  # Security works weekends
                weekend_entry = 1 if np.random.random() < self.group_patterns[group].get('weekend_work', 0.1) else 0
            elif is_malicious and np.random.random() < 0.3:
                weekend_entry = 1
            elif np.random.random() < 0.05:
                weekend_entry = 1

            if weekend_entry == 0:
                return self._empty_access_activity()

        # Multiple campus access
        num_unique_campus = 1
        if is_malicious and np.random.random() < 0.15:
            num_unique_campus = np.random.choice([2, 3])

        return {
            'num_entries': num_entries,
            'num_exits': num_exits,
            'first_entry_time': first_entry.strftime('%H:%M'),
            'last_exit_time': last_exit.strftime('%H:%M'),
            'total_presence_minutes': total_minutes,
            'entered_during_night_hours': night_entry,
            'num_unique_campus': num_unique_campus,
            'early_entry_flag': early_entry,
            'late_exit_flag': late_exit,
            'entry_during_weekend': weekend_entry
        }

    def _empty_print_activity(self):
        return {
            'num_print_commands': 0, 'total_printed_pages': 0, 'num_print_commands_off_hours': 0,
            'num_printed_pages_off_hours': 0, 'num_color_prints': 0, 'num_bw_prints': 0,
            'ratio_color_prints': 0, 'printed_from_other': 0, 'print_campuses': 0
        }

    def _empty_burn_activity(self):
        return {
            'num_burn_requests': 0, 'max_request_classification': 0, 'avg_request_classification': 0,
            'num_burn_requests_off_hours': 0, 'total_burn_volume_mb': 0, 'total_files_burned': 0,
            'burned_from_other': 0, 'burn_campuses': 0
        }

    def _no_travel_activity(self):
        return {
            'is_abroad': 0, 'trip_day_number': None, 'country_name': None,
            'is_hostile_country_trip': 0, 'is_official_trip': 0, 'is_origin_country_trip': 0
        }

    def _empty_access_activity(self):
        return {
            'num_entries': 0, 'num_exits': 0, 'first_entry_time': None, 'last_exit_time': None,
            'total_presence_minutes': 0, 'entered_during_night_hours': 0, 'num_unique_campus': 0,
            'early_entry_flag': 0, 'late_exit_flag': 0, 'entry_during_weekend': 0
        }

    def generate_dataset(self):
        """Generate the complete dataset with behavioral patterns"""
        print(f"Generating advanced dataset with {self.num_employees} employees over {self.days_range} days...")
        print(f"Malicious employees: {self.malicious_employees} ({self.malicious_ratio:.1%})")

        data = []
        start_date = datetime.now() - timedelta(days=self.days_range)

        for emp_id in list(self.employees.keys()):
            is_malicious = emp_id in self.malicious_employee_ids

            for day in range(self.days_range):
                current_date = start_date + timedelta(days=day)

                # Employee static info
                emp_info = self.employees[emp_id]

                # Generate travel activity
                travel_activity = self._generate_travel_activity(emp_id, current_date.date())
                is_abroad = travel_activity['is_abroad'] == 1

                # Generate daily activities based on behavioral patterns
                print_activity = self._generate_print_activity(emp_id, current_date.date(), is_malicious, is_abroad)
                burn_activity = self._generate_burn_activity(emp_id, current_date.date(), is_malicious, is_abroad)
                access_activity = self._generate_access_activity(emp_id, current_date.date(), is_malicious, is_abroad)

                # Calculate risk travel indicator
                risk_travel_indicator = 0
                if (travel_activity['is_abroad'] == 1 and
                    travel_activity['is_official_trip'] == 0 and travel_activity['is_hostile_country_trip'] == 1 and
                    (burn_activity['total_files_burned'] > 0 or print_activity['total_printed_pages'] > 0)):
                    risk_travel_indicator = 1

                # Combine all data
                row = {
                    'employee_id': emp_id,
                    'date': current_date.date(),
                    'employee_department': emp_info['department'],
                    'employee_campus': emp_info['campus'],
                    'employee_position': emp_info['position'],
                    'employee_seniority_years': emp_info['seniority_years'],
                    'is_contractor': emp_info['is_contractor'],
                    'employee_classification': emp_info['classification'],
                    'has_foreign_citizenship': emp_info['foreign_citizenship'],
                    'has_criminal_record': emp_info['criminal_record'],
                    'has_medical_history': emp_info['medical_history'],
                    'employee_origin_country': emp_info['origin_country'],
                    'behavioral_group': emp_info['behavioral_group'],
                    'is_malicious': 1 if is_malicious else 0,  # Target variable
                    'risk_travel_indicator': risk_travel_indicator
                }

                # Add all activity data
                row.update(print_activity)
                row.update(burn_activity)
                row.update(travel_activity)
                row.update(access_activity)

                data.append(row)

        df = pd.DataFrame(data)

        # Convert trip_day_number to int where not null
        df['trip_day_number'] = df['trip_day_number'].astype('Int64')  # Nullable integer

        print(f"Dataset generated: {len(df)} records")
        print(f"Malicious records: {df['is_malicious'].sum()}")
        print(f"Behavioral group distribution:")
        print(df['behavioral_group'].value_counts().sort_index())

        return df

    def generate_summary_statistics(self, df):
        """Generate summary statistics by behavioral group"""
        print("\n=== BEHAVIORAL GROUP ANALYSIS ===")

        for group in sorted(df['behavioral_group'].unique()):
            group_data = df[df['behavioral_group'] == group]
            group_name = [k for k, v in self.behavioral_groups.items() if v == group][0]

            print(f"\nGroup {group} - {group_name}:")
            print(f"  Employees: {group_data['employee_id'].nunique()}")
            print(f"  Total records: {len(group_data)}")

            # Work hours analysis
            work_data = group_data[group_data['first_entry_time'].notna()]
            if len(work_data) > 0:
                entry_times = pd.to_datetime(work_data['first_entry_time'], format='%H:%M').dt.hour
                exit_times = pd.to_datetime(work_data['last_exit_time'], format='%H:%M').dt.hour

                print(f"  Average entry time: {entry_times.mean():.1f}:00")
                print(f"  Average exit time: {exit_times.mean():.1f}:00")
                print(f"  Off-hours work: {work_data['early_entry_flag'].mean():.1%} early, {work_data['late_exit_flag'].mean():.1%} late")
                print(f"  Weekend work: {work_data['entry_during_weekend'].mean():.1%}")

            # Printing analysis
            print_data = group_data[group_data['total_printed_pages'] > 0]
            if len(print_data) > 0:
                print(f"  Printing frequency: {len(print_data)/len(group_data):.1%}")
                print(f"  Avg pages per print day: {print_data['total_printed_pages'].mean():.1f}")
                print(f"  Color print ratio: {print_data['ratio_color_prints'].mean():.1%}")
                print(f"  Off-hours printing: {print_data['num_print_commands_off_hours'].sum()}/{print_data['num_print_commands'].sum():.0f} commands")

            # Burning analysis
            burn_data = group_data[group_data['num_burn_requests'] > 0]
            if len(burn_data) > 0:
                print(f"  Burning frequency: {len(burn_data)/len(group_data):.1%}")
                print(f"  Avg classification: {burn_data['avg_request_classification'].mean():.1f}")
                print(f"  Avg volume (MB): {burn_data['total_burn_volume_mb'].mean():.0f}")
                print(f"  Avg files per burn: {burn_data['total_files_burned'].mean():.0f}")

            # Travel analysis
            travel_data = group_data[group_data['is_abroad'] == 1]
            if len(travel_data) > 0:
                print(f"  Travel frequency: {len(travel_data)/len(group_data):.1%}")
                print(f"  Official travel ratio: {travel_data['is_official_trip'].mean():.1%}")
                print(f"  Hostile country visits: {travel_data['is_hostile_country_trip'].mean():.1%}")

    def export_dataset(self, df, filename_prefix="insider_threat_advanced"):
        """Write the dataset to a CSV file and a multi-sheet Excel workbook.

        Both files are created in the current working directory and share a
        timestamp suffix. The workbook holds the full dataset, the rows of
        malicious employees only, and one summary row per behavioral group.

        Args:
            df: Dataset with the columns produced by ``generate_dataset``.
            filename_prefix: Leading part of both file names.

        Returns:
            tuple: ``(csv_filename, excel_filename)``.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # CSV export
        csv_filename = f"{filename_prefix}_{timestamp}.csv"
        df.to_csv(csv_filename, index=False)
        print(f"Dataset exported to {csv_filename}")

        # One behavioral group can cover several departments (for example the
        # groups 'B' and 'C'), so the summary lists all of them rather than
        # only the first match.
        departments_by_group = {}
        for department, group_code in self.behavioral_groups.items():
            departments_by_group.setdefault(group_code, []).append(department)

        # Excel export with multiple sheets
        excel_filename = f"{filename_prefix}_{timestamp}.xlsx"
        with pd.ExcelWriter(excel_filename, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name='Full_Dataset', index=False)

            # Malicious employees only
            malicious_df = df[df['is_malicious'] == 1]
            malicious_df.to_excel(writer, sheet_name='Malicious_Only', index=False)

            # Summary by behavioral group
            summary_stats = []
            for group in sorted(df['behavioral_group'].unique()):
                group_data = df[df['behavioral_group'] == group]
                group_name = ', '.join(departments_by_group.get(group, ['Unknown']))

                stats = {
                    'Behavioral_Group': group,
                    'Department': group_name,
                    'Total_Employees': group_data['employee_id'].nunique(),
                    'Total_Records': len(group_data),
                    'Malicious_Employees': group_data[group_data['is_malicious'] == 1]['employee_id'].nunique(),
                    # Frequencies are shares of the group's rows (employee-days).
                    'Print_Frequency': len(group_data[group_data['total_printed_pages'] > 0]) / len(group_data),
                    'Burn_Frequency': len(group_data[group_data['num_burn_requests'] > 0]) / len(group_data),
                    'Travel_Frequency': len(group_data[group_data['is_abroad'] == 1]) / len(group_data),
                    'Avg_Pages_Per_Day': group_data['total_printed_pages'].mean(),
                    'Avg_Burn_Volume_MB': group_data['total_burn_volume_mb'].mean(),
                    'Weekend_Work_Rate': group_data['entry_during_weekend'].mean()
                }
                summary_stats.append(stats)

            summary_df = pd.DataFrame(summary_stats)
            summary_df.to_excel(writer, sheet_name='Group_Summary', index=False)

        print(f"Dataset exported to {excel_filename} with multiple sheets")

        return csv_filename, excel_filename

def main():
    """Generate the insider-threat dataset, report on it, and export it.

    Builds an ``AdvancedInsiderThreatDataGenerator`` (1000 employees, 180 days,
    5% malicious ratio), generates the dataset, prints the generator's summary
    statistics, exports the dataset, and then prints overall statistics along
    with missing-value and logical-consistency checks.

    Returns:
        tuple: ``(df, generator)`` -- the generated DataFrame and the generator
        instance that produced it.
    """
    print("בס\"ד")
    print("=== Advanced Insider Threat Dataset Generator ===")
    print("Generating realistic behavioral patterns by job role...")

    # Initialize generator
    generator = AdvancedInsiderThreatDataGenerator(
        num_employees=1000,
        days_range=180,
        malicious_ratio=0.05
    )

    # Generate dataset
    print("\nGenerating dataset...")
    df = generator.generate_dataset()

    # Generate analysis
    print("\nGenerating behavioral analysis...")
    generator.generate_summary_statistics(df)

    # Export datasets
    print("\nExporting datasets...")
    csv_file, excel_file = generator.export_dataset(df)

    # Final statistics
    total_records = len(df)
    malicious_employee_count = df.loc[df['is_malicious'] == 1, 'employee_id'].nunique()
    print("\n=== FINAL DATASET STATISTICS ===")
    print(f"Total records: {total_records:,}")
    print(f"Total employees: {df['employee_id'].nunique():,}")
    print(f"Date range: {df['date'].min()} to {df['date'].max()}")
    print(f"Malicious employees: {malicious_employee_count}")
    print(f"Malicious records: {df['is_malicious'].sum():,} ({df['is_malicious'].mean():.1%})")

    print("\nDepartment distribution:")
    dept_counts = df.groupby('employee_department')['employee_id'].nunique().sort_values(ascending=False)
    for dept, count in dept_counts.items():
        print(f"  {dept}: {count} employees")

    print("\nBehavioral group distribution:")
    group_counts = df.groupby('behavioral_group')['employee_id'].nunique().sort_index()
    for group, count in group_counts.items():
        # Label the group with the first mapping key that points at it. Fall
        # back to a placeholder rather than raising IndexError when the data
        # contains a group that has no entry in the mapping.
        group_name = next(
            (k for k, v in generator.behavioral_groups.items() if v == group),
            "Unknown",
        )
        print(f"  Group {group} ({group_name}): {count} employees")

    # Data quality checks
    print("\n=== DATA QUALITY CHECKS ===")
    print("Missing values per column:")
    missing_data = df.isnull().sum()
    columns_with_gaps = missing_data[missing_data > 0]
    if columns_with_gaps.empty:
        # Make a clean result explicit instead of leaving a dangling header.
        print("  None")
    for col, missing in columns_with_gaps.items():
        print(f"  {col}: {missing} ({missing/total_records:.1%})")

    print("\nLogical consistency checks:")
    # Check if employees abroad have low building access
    abroad_data = df[df['is_abroad'] == 1]
    if len(abroad_data) > 0:
        no_access_abroad = len(abroad_data[abroad_data['num_entries'] == 0])
        print(f"  Employees abroad with no building access: {no_access_abroad}/{len(abroad_data)} ({no_access_abroad/len(abroad_data):.1%})")

    # Check if off-hours activities are reasonable (guards avoid dividing by zero)
    total_print_commands = df['num_print_commands'].sum()
    off_hours_commands = df['num_print_commands_off_hours'].sum()
    if total_print_commands > 0:
        print(f"  Off-hours printing ratio: {off_hours_commands}/{total_print_commands} ({off_hours_commands/total_print_commands:.1%})")

    total_burn_requests = df['num_burn_requests'].sum()
    off_hours_burns = df['num_burn_requests_off_hours'].sum()
    if total_burn_requests > 0:
        print(f"  Off-hours burning ratio: {off_hours_burns}/{total_burn_requests} ({off_hours_burns/total_burn_requests:.1%})")

    print("\nDataset generation completed successfully!")
    print(f"Files created: {csv_file}, {excel_file}")

    return df, generator

# Script entry point: run the full pipeline and keep the returned DataFrame
# and generator bound at module level as `df` and `generator`.
if __name__ == "__main__":
    df, generator = main()

בס"ד
=== Advanced Insider Threat Dataset Generator ===
Generating realistic behavioral patterns by job role...
Generated 1000 employees
Department distribution:
  Information Technology: 129
  Engineering Department: 212
  Marketing and Business Development: 100
  Operations and Manufacturing: 141
  Executive Management: 5
  R&D Department: 226
  Security and Information Security: 30
  Project Management: 74
  Finance: 45
  Human Resources: 29
  Legal and Regulation: 9

Generating dataset...
Generating advanced dataset with 1000 employees over 180 days...
Malicious employees: 50 (5.0%)
Dataset generated: 180000 records
Malicious records: 9000
Behavioral group distribution:
behavioral_group
A      900
B    78840
C    53640
D    18000
E     5400
F    23220
Name: count, dtype: int64

Generating behavioral analysis...

=== BEHAVIORAL GROUP ANALYSIS ===

Group A - Executive Management:
  Employees: 5
  Total records: 900
  Average entry time: 7.0:00
  Average exit time: 18.0:00
  Off-hours 