<a href="https://colab.research.google.com/github/efrat-dev/insider-threat-detector/blob/main/changes_on_data_14_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

class InsiderThreatDataGenerator:
    def __init__(self, num_employees=1000, days_range=180, malicious_ratio=0.07):
        self.num_employees = num_employees
        self.days_range = days_range
        self.malicious_ratio = malicious_ratio
        self.malicious_employees = int(num_employees * malicious_ratio)

        # Define realistic distributions and parameters (in English)
        self.departments = ['Technology', 'Finance', 'HR', 'Sales', 'R&D', 'Security', 'Legal', 'Marketing']
        self.campuses = ['A', 'B', 'C']
        self.jobs = ['Manager', 'Analyst', 'Developer', 'Accountant', 'Consultant', 'Researcher', 'Programmer', 'Project Manager']
        self.countries = ['USA', 'Germany', 'France', 'UK', 'Italy', 'Japan', 'Iran', 'Syria', 'Lebanon', 'North Korea', 'Israel', 'Russia', 'China']
        self.hostile_countries = ['Iran', 'Syria', 'Lebanon', 'North Korea']

        # Employee origin countries (for travel analysis)
        self.origin_countries = ['Iran', 'Syria', 'Lebanon', 'Russia', 'China', 'USA', 'Germany', 'France', 'UK', 'Italy', 'Japan']

        # Generate employee profiles
        self.employees = self._generate_employee_profiles()
        self.malicious_employee_ids = random.sample(list(self.employees.keys()), self.malicious_employees)

        # Track ongoing trips for each employee
        self.employee_trips = {}  # {emp_id: {'country': str, 'start_date': date, 'duration': int, 'is_official': bool}}

    def _generate_employee_profiles(self):
        """Generate realistic employee profiles"""
        employees = {}
        num_digits = len(str(self.num_employees))

        for i in range(self.num_employees):
            emp_id = str(i + 1).zfill(num_digits)  # Ensure same number of digits

            # Assign origin country
            origin_country = np.random.choice(self.origin_countries, p=[0.05, 0.03, 0.02, 0.05, 0.05, 0.2, 0.15, 0.1, 0.1, 0.1, 0.15])

            employees[emp_id] = {
                'department': np.random.choice(self.departments),
                'campus': np.random.choice(self.campuses),
                'job': np.random.choice(self.jobs),
                'seniority_years': np.random.randint(0, 51),  # 0 to 50 years
                'is_contractor': np.random.choice([0, 1], p=[0.8, 0.2]),
                'classification': np.random.choice([1, 2, 3, 4], p=[0.4, 0.3, 0.2, 0.1]),
                'foreign_citizenship': np.random.choice([0, 1], p=[0.85, 0.15]),
                'criminal_record': np.random.choice([0, 1], p=[0.95, 0.05]),
                'medical_history': np.random.choice([0, 1], p=[0.85, 0.15]),
                'origin_country': origin_country
            }
        return employees

    def _is_off_hours(self, dt):
        """Check if datetime is during off hours"""
        if dt.weekday() == 4 and dt.hour >= 16:  # Friday after 16:00
            return True
        if dt.weekday() == 5:  # Saturday
            return True
        if dt.time() < datetime.strptime("05:00", "%H:%M").time() or dt.time() > datetime.strptime("21:00", "%H:%M").time():
            return True
        return False

    def _is_weekend(self, dt):
        """Check if datetime is weekend"""
        return dt.weekday() >= 4  # Friday or Saturday

    def _is_night_hours(self, dt):
        """Check if datetime is during night hours (22:00-05:00)"""
        return dt.hour >= 22 or dt.hour <= 5

    def _generate_print_activity(self, emp_id, date, is_malicious, is_abroad):
        """Generate realistic printing activity"""
        base_date = datetime.combine(date, datetime.min.time())

        # If employee is abroad and not malicious, very low chance of printing
        if is_abroad and not is_malicious:
            if np.random.random() < 0.98:  # 98% chance of no printing when abroad
                return self._empty_print_activity()

        # If employee is abroad and malicious, moderate chance of printing (credential theft/collusion)
        if is_abroad and is_malicious:
            if np.random.random() < 0.85:  # 85% chance of no printing when abroad (they're careful)
                return self._empty_print_activity()

        # Normal employees print less
        if not is_malicious:
            if np.random.random() < 0.7:  # 70% chance of no printing
                return self._empty_print_activity()

            # Normal printing patterns
            num_commands = np.random.poisson(3)
            total_pages = sum(np.random.poisson(5) for _ in range(num_commands))
            color_ratio = np.random.beta(2, 8)  # Most prints are B&W

        else:
            # Malicious employees might print more, especially off-hours
            if np.random.random() < 0.9:  # Malicious employees print more often
                num_commands = np.random.poisson(8)
                total_pages = sum(np.random.poisson(12) for _ in range(num_commands))
                color_ratio = np.random.beta(3, 5)  # More color prints (documents, charts)
            else:
                return self._empty_print_activity()

        # Determine printing campuses
        employee_campus = self.employees[emp_id]['campus']
        print_campuses = set()

        # Decide how many campuses to print from
        if is_malicious and np.random.random() < 0.3:
            # Malicious might print from multiple campuses
            num_print_campuses = np.random.choice([1, 2, 3], p=[0.5, 0.35, 0.15])
        else:
            # Normal employees usually print from their own campus
            num_print_campuses = np.random.choice([1, 2], p=[0.9, 0.1])

        # Select campuses
        if num_print_campuses == 1:
            if np.random.random() < 0.9:  # 90% chance own campus
                print_campuses.add(employee_campus)
            else:
                print_campuses.add(np.random.choice(self.campuses))
        else:
            print_campuses.add(employee_campus)  # Always include own campus
            other_campuses = [c for c in self.campuses if c != employee_campus]
            additional_campuses = np.random.choice(other_campuses, size=min(num_print_campuses-1, len(other_campuses)), replace=False)
            print_campuses.update(additional_campuses)

        # Check if printed from other campus
        printed_from_other = 1 if any(campus != employee_campus for campus in print_campuses) else 0

        # Calculate off-hours printing
        off_hours_commands = 0
        off_hours_pages = 0

        if is_malicious and np.random.random() < 0.4:  # Malicious more likely to print off-hours
            off_hours_commands = max(0, int(num_commands * np.random.uniform(0.2, 0.8)))
            off_hours_pages = max(0, int(total_pages * np.random.uniform(0.2, 0.8)))
        elif np.random.random() < 0.1:  # Normal employees occasionally print off-hours
            off_hours_commands = max(0, int(num_commands * np.random.uniform(0.1, 0.3)))
            off_hours_pages = max(0, int(total_pages * np.random.uniform(0.1, 0.3)))

        num_color = int(total_pages * color_ratio)
        num_bw = total_pages - num_color

        return {
            'num_print_commands': num_commands,
            'total_printed_pages': total_pages,
            'num_print_commands_off_hours': off_hours_commands,
            'num_printed_pages_off_hours': off_hours_pages,
            'num_color_prints': num_color,
            'num_bw_prints': num_bw,
            'ratio_color_prints': color_ratio if total_pages > 0 else 0,
            'printed_from_other': printed_from_other,
            'print_campuses': len(print_campuses)
        }

    def _empty_print_activity(self):
        """Return empty print activity"""
        return {
            'num_print_commands': 0,
            'total_printed_pages': 0,
            'num_print_commands_off_hours': 0,
            'num_printed_pages_off_hours': 0,
            'num_color_prints': 0,
            'num_bw_prints': 0,
            'ratio_color_prints': 0,
            'printed_from_other': 0,
            'print_campuses': 0
        }

    def _generate_burn_activity(self, emp_id, date, is_malicious, is_abroad):
        """Generate CD/USB burning activity"""

        # If employee is abroad and not malicious, very low chance of burning
        if is_abroad and not is_malicious:
            if np.random.random() < 0.99:  # 99% chance of no burning when abroad
                return self._empty_burn_activity()

        # If employee is abroad and malicious, low chance of burning (they're careful)
        if is_abroad and is_malicious:
            if np.random.random() < 0.90:  # 90% chance of no burning when abroad
                return self._empty_burn_activity()

        # Most days have no burning activity
        if not is_malicious and np.random.random() < 0.95:
            return self._empty_burn_activity()

        if is_malicious and np.random.random() < 0.85:
            return self._empty_burn_activity()

        employee_campus = self.employees[emp_id]['campus']
        employee_classification = self.employees[emp_id]['classification']

        if is_malicious:
            num_requests = np.random.poisson(3) + 1
            # Malicious users try to access higher classification data
            max_classification = min(4, employee_classification + np.random.choice([0, 1, 2], p=[0.3, 0.4, 0.3]))
            classifications = [np.random.randint(1, max_classification + 1) for _ in range(num_requests)]
            volume_mb = np.random.lognormal(8, 1.5)  # Larger volumes
            num_files = np.random.poisson(50) + 10
        else:
            num_requests = 1
            max_classification = min(employee_classification, np.random.choice([1, 2, 3], p=[0.6, 0.3, 0.1]))
            classifications = [max_classification]
            volume_mb = np.random.lognormal(6, 1)  # Smaller volumes
            num_files = np.random.poisson(10) + 1

        # Determine burning campuses
        burn_campuses = set()

        # Decide how many campuses to burn from
        if is_malicious and np.random.random() < 0.25:
            num_burn_campuses = np.random.choice([1, 2, 3], p=[0.6, 0.3, 0.1])
        else:
            num_burn_campuses = 1

        # Select campuses
        if num_burn_campuses == 1:
            if np.random.random() < 0.85:  # 85% chance own campus
                burn_campuses.add(employee_campus)
            else:
                burn_campuses.add(np.random.choice(self.campuses))
        else:
            burn_campuses.add(employee_campus)
            other_campuses = [c for c in self.campuses if c != employee_campus]
            additional_campuses = np.random.choice(other_campuses, size=min(num_burn_campuses-1, len(other_campuses)), replace=False)
            burn_campuses.update(additional_campuses)

        # Check if burned from other campus
        burned_from_other = 1 if any(campus != employee_campus for campus in burn_campuses) else 0

        # Off-hours burning (more suspicious)
        off_hours_requests = 0
        if is_malicious and np.random.random() < 0.6:
            off_hours_requests = max(0, int(num_requests * np.random.uniform(0.3, 0.8)))
        elif np.random.random() < 0.05:
            off_hours_requests = max(0, int(num_requests * 0.2))

        return {
            'num_burn_requests': num_requests,
            'max_request_classification': max(classifications),
            'avg_request_classification': np.mean(classifications),
            'num_burn_requests_off_hours': off_hours_requests,
            'total_burn_volume_mb': int(volume_mb),
            'total_files_burned': num_files,
            'burned_from_other': burned_from_other,
            'burn_campuses': len(burn_campuses)
        }

    def _empty_burn_activity(self):
        """Return empty burn activity"""
        return {
            'num_burn_requests': 0,
            'max_request_classification': 0,
            'avg_request_classification': 0,
            'num_burn_requests_off_hours': 0,
            'total_burn_volume_mb': 0,
            'total_files_burned': 0,
            'burned_from_other': 0,
            'burn_campuses': 0
        }

    def _update_trip_status(self, emp_id, date):
        """Update trip status for employee"""
        # Check if employee has ongoing trip
        if emp_id in self.employee_trips:
            trip = self.employee_trips[emp_id]
            days_since_start = (date - trip['start_date']).days

            if days_since_start < trip['duration']:
                # Still on trip
                return {
                    'is_abroad': 1,
                    'trip_day_number': days_since_start + 1,
                    'country_name': trip['country'],
                    'is_hostile_country_trip': 1 if trip['country'] in self.hostile_countries else 0,
                    'is_official_trip': trip['is_official'],
                    'is_origin_country_trip': 1 if trip['country'] == self.employees[emp_id]['origin_country'] else 0
                }
            else:
                # Trip ended
                del self.employee_trips[emp_id]
                return self._no_travel_activity()

        # Check if starting new trip
        if np.random.random() < 0.005:  # 0.5% chance of starting trip each day
            return self._start_new_trip(emp_id, date)

        return self._no_travel_activity()

    def _start_new_trip(self, emp_id, date):
        """Start new trip for employee"""
        is_malicious = emp_id in self.malicious_employee_ids

        # Select destination
        if is_malicious and np.random.random() < 0.3:
            # Malicious more likely to visit hostile countries
            country = np.random.choice(self.hostile_countries)
        else:
            # Normal distribution of countries
            country = np.random.choice(self.countries)

        # Determine if official trip
        is_official = np.random.choice([0, 1], p=[0.3, 0.7])  # Most trips are official

        # Check if visiting origin country
        origin_country = self.employees[emp_id]['origin_country']
        is_origin_trip = 1 if country == origin_country else 0

        # If visiting origin country, more likely to be personal
        if is_origin_trip and np.random.random() < 0.6:
            is_official = 0

        # Trip duration
        duration = np.random.randint(1, 15)  # 1-14 days

        # Store trip info
        self.employee_trips[emp_id] = {
            'country': country,
            'start_date': date,
            'duration': duration,
            'is_official': is_official
        }

        return {
            'is_abroad': 1,
            'trip_day_number': 1,
            'country_name': country,
            'is_hostile_country_trip': 1 if country in self.hostile_countries else 0,
            'is_official_trip': is_official,
            'is_origin_country_trip': is_origin_trip
        }

    def _no_travel_activity(self):
        """Return no travel activity"""
        return {
            'is_abroad': 0,
            'trip_day_number': None,
            'country_name': None,
            'is_hostile_country_trip': 0,
            'is_official_trip': 0,
            'is_origin_country_trip': 0
        }

    def _generate_access_activity(self, emp_id, date, is_malicious, is_abroad):
        """Generate building access activity"""

        # If employee is abroad, very low chance of access (suspicious if happens)
        if is_abroad:
            if is_malicious and np.random.random() < 0.05:  # 5% chance - credential theft
                pass  # Generate suspicious access
            elif not is_malicious and np.random.random() < 0.001:  # 0.1% chance - error/family access
                pass  # Generate rare access
            else:
                return self._empty_access_activity()

        # Normal absence chance
        if np.random.random() < 0.05:  # 5% chance of no access (sick day, vacation)
            return self._empty_access_activity()

        base_date = datetime.combine(date, datetime.min.time())

        # Normal work patterns
        if not is_malicious:
            num_entries = np.random.choice([1, 2, 3], p=[0.7, 0.25, 0.05])
            first_entry = base_date + timedelta(hours=np.random.normal(8, 1))  # Around 8 AM
            last_exit = base_date + timedelta(hours=np.random.normal(17, 1.5))  # Around 5 PM
        else:
            # Malicious employees might have irregular hours
            num_entries = np.random.choice([1, 2, 3, 4], p=[0.5, 0.3, 0.15, 0.05])
            if np.random.random() < 0.3:  # Sometimes very early or very late
                first_entry = base_date + timedelta(hours=np.random.uniform(5, 7))
                last_exit = base_date + timedelta(hours=np.random.uniform(19, 23))
            else:
                first_entry = base_date + timedelta(hours=np.random.normal(8, 1.5))
                last_exit = base_date + timedelta(hours=np.random.normal(17, 2))

        # Ensure logical order
        if last_exit <= first_entry:
            last_exit = first_entry + timedelta(hours=np.random.uniform(4, 10))

        num_exits = num_entries  # Assume balanced entries/exits

        # Calculate presence time
        total_minutes = int((last_exit - first_entry).total_seconds() / 60)

        # Flags
        early_entry = 1 if first_entry.hour < 6 else 0
        late_exit = 1 if last_exit.hour > 22 else 0
        night_entry = 1 if self._is_night_hours(first_entry) else 0
        weekend_entry = 1 if self._is_weekend(first_entry) else 0

        # Multiple campus access (suspicious for malicious)
        employee_campus = self.employees[emp_id]['campus']
        if is_malicious and np.random.random() < 0.2:
            num_unique_campus = np.random.choice([2, 3])
        else:
            num_unique_campus = 1

        return {
            'num_entries': num_entries,
            'num_exits': num_exits,
            'first_entry_time': first_entry.strftime('%H:%M'),
            'last_exit_time': last_exit.strftime('%H:%M'),
            'total_presence_minutes': total_minutes,
            'entered_during_night_hours': night_entry,
            'num_unique_campus': num_unique_campus,
            'early_entry_flag': early_entry,
            'late_exit_flag': late_exit,
            'entry_during_weekend': weekend_entry
        }

    def _empty_access_activity(self):
        """Return empty access activity"""
        return {
            'num_entries': 0,
            'num_exits': 0,
            'first_entry_time': None,
            'last_exit_time': None,
            'total_presence_minutes': 0,
            'entered_during_night_hours': 0,
            'num_unique_campus': 0,
            'early_entry_flag': 0,
            'late_exit_flag': 0,
            'entry_during_weekend': 0
        }

    def generate_dataset(self):
        """Generate the complete dataset"""
        print(f"Generating dataset with {self.num_employees} employees over {self.days_range} days...")
        print(f"Malicious employees: {self.malicious_employees} ({self.malicious_ratio:.1%})")

        data = []
        start_date = datetime.now() - timedelta(days=self.days_range)

        for emp_id in list(self.employees.keys()):
            is_malicious = emp_id in self.malicious_employee_ids

            for day in range(self.days_range):
                current_date = start_date + timedelta(days=day)

                # Employee static info
                emp_info = self.employees[emp_id]

                # Update travel status
                travel_activity = self._update_trip_status(emp_id, current_date.date())
                is_abroad = travel_activity['is_abroad'] == 1

                # Generate daily activities
                print_activity = self._generate_print_activity(emp_id, current_date.date(), is_malicious, is_abroad)
                burn_activity = self._generate_burn_activity(emp_id, current_date.date(), is_malicious, is_abroad)
                access_activity = self._generate_access_activity(emp_id, current_date.date(), is_malicious, is_abroad)

                # Calculate risk travel indicator
                risk_travel_indicator = 0
                if (travel_activity['is_abroad'] == 1 and
                    travel_activity['is_official_trip'] == 0 and
                    travel_activity['is_hostile_country_trip'] == 1 and
                    (print_activity['total_printed_pages'] > 0 or burn_activity['total_files_burned'] > 0)):
                    risk_travel_indicator = 1

                # Combine all data
                row = {
                    'employee_id': emp_id,
                    'date': current_date.date(),
                    'employee_department': emp_info['department'],
                    'employee_campus': emp_info['campus'],
                    'employee_job': emp_info['job'],
                    'employee_seniority_years': emp_info['seniority_years'],
                    'is_contractor': emp_info['is_contractor'],
                    'employee_classification': emp_info['classification'],
                    'has_foreign_citizenship': emp_info['foreign_citizenship'],
                    'has_criminal_record': emp_info['criminal_record'],
                    'has_medical_history': emp_info['medical_history'],
                    'employee_origin_country': emp_info['origin_country'],
                    'is_malicious': 1 if is_malicious else 0,  # Target variable
                    'risk_travel_indicator': risk_travel_indicator
                }

                # Add all activity data
                row.update(print_activity)
                row.update(burn_activity)
                row.update(travel_activity)
                row.update(access_activity)

                data.append(row)

        df = pd.DataFrame(data)

        # Convert trip_day_number to int where not null
        df['trip_day_number'] = df['trip_day_number'].astype('Int64')  # Nullable integer

        print(f"Dataset generated: {len(df)} records")
        return df

# Generate dataset and perform EDA
if __name__ == "__main__":
    # Build the synthetic population, then simulate every employee-day
    generator = InsiderThreatDataGenerator(days_range=180, num_employees=1000)
    df = generator.generate_dataset()

Generating dataset with 1000 employees over 180 days...
Malicious employees: 70 (7.0%)
Dataset generated: 180000 records


In [2]:
    print("\n=== EDA FOR CHANGES VERIFICATION ===")

    # 1. Employee IDs must be zero-padded strings of identical width
    print("\n1. Employee ID Format:")
    id_lengths = df['employee_id'].str.len()
    print(f"Sample employee IDs: {df['employee_id'].head(10).tolist()}")
    print(f"All IDs have same length: {id_lengths.nunique() == 1}")
    print(f"ID length: {id_lengths.iloc[0]} digits")

    # 2. Seniority must stay inside 0..50 years
    print("\n2. Employee Seniority Years:")
    seniority = df['employee_seniority_years']
    min_seniority, max_seniority = seniority.min(), seniority.max()
    print(f"Min seniority: {min_seniority}")
    print(f"Max seniority: {max_seniority}")
    print(f"Range verification (0-50): {min_seniority >= 0 and max_seniority <= 50}")

    # 3. Department and job labels should be in English
    print("\n3. English Department/Job Names:")
    print(f"Departments: {df['employee_department'].unique()}")
    print(f"Jobs: {df['employee_job'].unique()[:5]}...")  # Show first 5

    # 4. trip_day_number dtype and a sample per-employee sequence
    print("\n4. Trip Day Number Analysis:")
    print(f"Data type: {df['trip_day_number'].dtype}")
    abroad_data = df.loc[df['is_abroad'] == 1].copy()
    if not abroad_data.empty:
        trip_days = abroad_data['trip_day_number']
        print(f"Trip day number range: {trip_days.min()} to {trip_days.max()}")

        # Inspect the first employee that has any abroad day
        sample_emp = abroad_data['employee_id'].iloc[0]
        emp_trips = abroad_data[abroad_data['employee_id'] == sample_emp].sort_values('date')
        print(f"Sample trip sequence for employee {sample_emp}:")
        print(emp_trips[['date', 'trip_day_number', 'country_name']].head(10))

    # 5. Check access data when abroad
    print("\n5. Access Activity When Abroad:")
    abroad_days = df[df['is_abroad'] == 1]
    abroad_with_access = abroad_days[abroad_days['total_presence_minutes'] > 0]
    if len(abroad_days) > 0:
        print(f"Records with access while abroad: {len(abroad_with_access)} ({len(abroad_with_access)/len(abroad_days)*100:.2f}% of abroad days)")
    else:
        # Small datasets may contain no trips at all; avoid dividing by zero
        print("Records with access while abroad: 0 (no abroad days in dataset)")

    # 6. Check print campus columns: the two new columns must exist and the
    #    old per-location column must be gone
    print("\n6. Print Campus Analysis:")
    print(f"printed_from_other column exists: {'printed_from_other' in df.columns}")
    print(f"print_campuses column exists: {'print_campuses' in df.columns}")
    print(f"print_location_campus column removed: {'print_location_campus' not in df.columns}")
    if 'print_campuses' in df.columns:
        print(f"Print campuses range: {df['print_campuses'].min()} to {df['print_campuses'].max()}")
        print(f"Printed from other campus distribution: {df['printed_from_other'].value_counts()}")

    # 7. Check burn campus columns (same checks as for printing)
    print("\n7. Burn Campus Analysis:")
    print(f"burned_from_other column exists: {'burned_from_other' in df.columns}")
    print(f"burn_campuses column exists: {'burn_campuses' in df.columns}")
    print(f"burn_location_campus column removed: {'burn_location_campus' not in df.columns}")
    if 'burn_campuses' in df.columns:
        print(f"Burn campuses range: {df['burn_campuses'].min()} to {df['burn_campuses'].max()}")
        print(f"Burned from other campus distribution: {df['burned_from_other'].value_counts()}")

    # 8. Check origin country trip: rows where the destination equals the
    #    employee's origin country
    print("\n8. Origin Country Trip Analysis:")
    print(f"is_origin_country_trip column exists: {'is_origin_country_trip' in df.columns}")
    if 'is_origin_country_trip' in df.columns:
        origin_trips = df[df['is_origin_country_trip'] == 1]
        print(f"Origin country trips: {len(origin_trips)} records")
        if len(origin_trips) > 0:
            print("Sample origin country trips:")
            print(origin_trips[['employee_id', 'employee_origin_country', 'country_name', 'is_origin_country_trip']].head())

    # 9. Risk travel indicator analysis: count and sample the flagged rows
    print("\n9. Risk Travel Indicator Analysis:")
    print(f"risk_travel_indicator column exists: {'risk_travel_indicator' in df.columns}")
    if 'risk_travel_indicator' in df.columns:
        risk_cases = df[df['risk_travel_indicator'] == 1]
        print(f"High-risk travel cases: {len(risk_cases)} records")
        print(f"Risk travel indicator distribution: {df['risk_travel_indicator'].value_counts()}")

        if len(risk_cases) > 0:
            print("Sample high-risk travel cases (unofficial trips to hostile countries with activity):")
            print(risk_cases[['employee_id', 'country_name', 'is_official_trip', 'is_hostile_country_trip',
                             'total_printed_pages', 'total_files_burned']].head())

    # 10. Trip day numbers must increase by one on consecutive abroad days
    print("\n10. Trip Day Sequence Integrity Check:")
    abroad_employees = df[df['is_abroad'] == 1]
    if not abroad_employees.empty:
        sequence_errors = []
        checked_ids = abroad_employees['employee_id'].unique()[:5]  # Check first 5 employees
        for emp_id in checked_ids:
            emp_abroad = abroad_employees[abroad_employees['employee_id'] == emp_id].sort_values('date')
            dates = emp_abroad['date'].tolist()
            day_numbers = emp_abroad['trip_day_number'].tolist()

            # Walk adjacent (previous, current) pairs of abroad rows
            previous_rows = zip(dates, day_numbers)
            current_rows = zip(dates[1:], day_numbers[1:])
            for (prev_date, prev_day), (cur_date, current_day) in zip(previous_rows, current_rows):
                consecutive = (cur_date - prev_date).days == 1
                if consecutive and current_day != prev_day + 1:
                    sequence_errors.append(f"Employee {emp_id}: Day {prev_day} -> {current_day}")

        print(f"Trip sequence errors found: {len(sequence_errors)}")
        if not sequence_errors:
            print("✓ Trip day sequences are correct")
        else:
            print("Sample errors:", sequence_errors[:3])

    # 11. Cross-validation: rows where an employee is abroad yet has presence time
    print("\n11. Suspicious Access While Abroad Analysis:")
    is_abroad_mask = df['is_abroad'] == 1
    has_presence_mask = df['total_presence_minutes'] > 0
    abroad_with_access = df[is_abroad_mask & has_presence_mask]
    if abroad_with_access.empty:
        print("No cases of building access while abroad found")
    else:
        print(f"Employees with building access while abroad: {len(abroad_with_access)}")
        print("Breakdown by malicious status:")
        print(abroad_with_access.groupby('is_malicious').size())

        print("\nSample suspicious cases:")
        sample_columns = ['employee_id', 'date', 'country_name', 'trip_day_number',
                          'total_presence_minutes', 'is_malicious']
        print(abroad_with_access[sample_columns].head())

    # 12. Multi-campus activity analysis: rows whose campus count exceeds one
    print("\n12. Multi-Campus Activity Analysis:")

    # Print activity across campuses
    multi_print_campus = df[df['print_campuses'] > 1]
    print(f"Records with multi-campus printing: {len(multi_print_campus)}")
    if len(multi_print_campus) > 0:
        print("Multi-campus printing by malicious status:")
        print(multi_print_campus.groupby('is_malicious')['print_campuses'].describe())

    # Burn activity across campuses
    multi_burn_campus = df[df['burn_campuses'] > 1]
    print(f"Records with multi-campus burning: {len(multi_burn_campus)}")
    if len(multi_burn_campus) > 0:
        print("Multi-campus burning by malicious status:")
        print(multi_burn_campus.groupby('is_malicious')['burn_campuses'].describe())

    # 13. Activity from other campus analysis: the mean of a 0/1 flag is the
    #     share of rows with the flag set
    print("\n13. Activity From Other Campus Analysis:")
    print("Printing from other campus:")
    print(df['printed_from_other'].value_counts())
    print(f"Percentage printing from other campus: {df['printed_from_other'].mean()*100:.2f}%")

    print("\nBurning from other campus:")
    print(df['burned_from_other'].value_counts())
    print(f"Percentage burning from other campus: {df['burned_from_other'].mean()*100:.2f}%")

    # Cross-tabulation with malicious status (margins=True adds row/column totals)
    print("\nPrint from other campus by malicious status:")
    print(pd.crosstab(df['is_malicious'], df['printed_from_other'], margins=True))

    print("\nBurn from other campus by malicious status:")
    print(pd.crosstab(df['is_malicious'], df['burned_from_other'], margins=True))

    # 14. Observed min/max of each integer column against its expected bounds
    print("\n14. Final Data Type and Range Verification:")

    # column -> (expected minimum, expected maximum)
    int_columns_ranges = {
        'employee_seniority_years': (0, 50),
        'print_campuses': (0, 3),
        'burn_campuses': (0, 3),
        'printed_from_other': (0, 1),
        'burned_from_other': (0, 1),
        'is_abroad': (0, 1),
        'is_malicious': (0, 1)
    }

    for col, expected_bounds in int_columns_ranges.items():
        if col not in df.columns:
            continue
        min_val, max_val = expected_bounds
        actual_min, actual_max = df[col].min(), df[col].max()
        verdict = '✓' if (actual_min >= min_val and actual_max <= max_val) else '✗'
        print(f"{col}: Range {actual_min}-{actual_max}, Expected {min_val}-{max_val}, Valid: {verdict}")

    # trip_day_number is nullable, so only its non-missing rows are summarised
    if 'trip_day_number' in df.columns:
        trip_data = df[df['trip_day_number'].notna()]
        if not trip_data.empty:
            observed_days = trip_data['trip_day_number']
            print(f"trip_day_number: Range {observed_days.min()}-{observed_days.max()}, Type: {df['trip_day_number'].dtype}")

    # 15. Descriptive statistics for the new/modified columns
    print("\n15. Summary Statistics for Key Columns:")

    key_columns = ['employee_seniority_years', 'print_campuses', 'burn_campuses',
                   'printed_from_other', 'burned_from_other', 'trip_day_number']

    for col in key_columns:
        if col not in df.columns:
            continue
        print(f"\n{col}:")

        if col != 'trip_day_number':
            column_values = df[col]
            print(f"  Count: {column_values.count()}")
            print(f"  Mean: {column_values.mean():.4f}")
            print(f"  Std: {column_values.std():.4f}")
            print(f"  Min: {column_values.min()}")
            print(f"  Max: {column_values.max()}")
            continue

        # Nullable integer column: summarise only the non-missing values
        trip_stats = df['trip_day_number'].dropna()
        if trip_stats.empty:
            continue
        print(f"  Count (non-null): {len(trip_stats)}")
        print(f"  Mean: {trip_stats.mean():.2f}")
        print(f"  Std: {trip_stats.std():.2f}")
        print(f"  Min: {trip_stats.min()}")
        print(f"  Max: {trip_stats.max()}")

    # Final summary: overall size, employee count, covered dates and the
    # number of distinct employees labelled malicious
    print("\n=== EDA VERIFICATION COMPLETE ===")
    print(f"Total records: {len(df)}")
    print(f"Total employees: {df['employee_id'].nunique()}")
    print(f"Date range: {df['date'].min()} to {df['date'].max()}")
    print(f"Malicious employees: {df[df['is_malicious']==1]['employee_id'].nunique()}")

    # Save dataset to the current working directory (index=False drops the
    # DataFrame row index from the file)
    df.to_csv('insider_threat_dataset.csv', index=False)
    print(f"\nDataset saved as 'insider_threat_dataset.csv'")

    # Display sample of final dataset, limited to columns that actually exist
    print("\nSample of final dataset:")
    display_cols = ['employee_id', 'date', 'employee_department', 'employee_seniority_years',
                    'is_abroad', 'trip_day_number', 'print_campuses', 'printed_from_other',
                    'burn_campuses', 'burned_from_other', 'is_malicious']
    available_cols = [col for col in display_cols if col in df.columns]
    print(df[available_cols].head(10))


=== EDA FOR CHANGES VERIFICATION ===

1. Employee ID Format:
Sample employee IDs: ['0001', '0001', '0001', '0001', '0001', '0001', '0001', '0001', '0001', '0001']
All IDs have same length: True
ID length: 4 digits

2. Employee Seniority Years:
Min seniority: 0
Max seniority: 50
Range verification (0-50): True

3. English Department/Job Names:
Departments: [np.str_('Marketing') np.str_('R&D') np.str_('Finance') np.str_('HR')
 np.str_('Technology') np.str_('Security') np.str_('Sales')
 np.str_('Legal')]
Jobs: [np.str_('Analyst') np.str_('Consultant') np.str_('Developer')
 np.str_('Project Manager') np.str_('Manager')]...

4. Trip Day Number Analysis:
Data type: Int64
Trip day number range: 1 to 14
Sample trip sequence for employee 0001:
          date  trip_day_number country_name
72  2025-02-26                1        Syria
73  2025-02-27                2        Syria
74  2025-02-28                3        Syria
75  2025-03-01                4        Syria
76  2025-03-02               