In [1]:
"""
Implementing Existing Research Methodologies on Your Survey Data
This addresses your supervisor's requirement to compare against actual research systems
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE, ADASYN
import lightgbm as lgb
import xgboost as xgb

class ExistingSystemsComparator:
    """
    Compare a proposed method against existing research methodologies,
    using the same survey data and the same train/test split for every method.

    Typical use: construct with a CSV path, then call run_complete_comparison().

    Attributes:
        data_path: path of the CSV file handed to pandas.read_csv.
        X, y: feature frame and binary target built by _load_your_survey_data.
        X_train, X_test, y_train, y_test: the shared split; None until
            prepare_data_split() has run (the implement_* methods create it
            on demand with default settings).
        results: dict mapping method name -> metrics dict with the keys
            'accuracy', 'precision', 'recall' and 'f1'.
    """

    # Column order of the comparison table; also used for the empty-table case.
    _TABLE_COLUMNS = [
        'Paper_Reference', 'Method_Name', 'Accuracy', 'Precision', 'Recall', 'F1_Score'
    ]

    def __init__(self, data_path):
        """Store the path and load the survey data immediately (prints a summary)."""
        self.data_path = data_path
        self.X, self.y = self._load_your_survey_data()
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
        self.results = {}

    def _load_your_survey_data(self):
        """
        Load the survey CSV and build the feature frame and binary target.

        Returns:
            (X, y): X holds whichever of the expected feature columns exist in
            the file, with missing values filled with 0; y is an integer Series
            named 'encountered_threat_binary'.

        Raises:
            KeyError: if the file has no 'encountered_threat' column.
            ValueError: if none of the expected feature columns are present.
        """
        df = pd.read_csv(self.data_path)

        # 'Yes' is the positive class; 'No' and 'Not sure' are both counted as 0.
        target = df['encountered_threat'].map({
            'Yes': 1, 'No': 0, 'Not sure': 0
        })

        # Any other value (or a blank) maps to NaN, which the estimators reject;
        # drop those rows here instead of failing later inside a model fit.
        unmapped = target.isna()
        if unmapped.any():
            print(f"Warning: dropping {int(unmapped.sum())} rows with a missing or "
                  f"unrecognised 'encountered_threat' value")
            df = df.loc[~unmapped]
            target = target.loc[~unmapped]

        # Survey features
        features = [
            'remote_experience', 'work_location', 'flexible_hours',
            'data_access_frequency', 'threat_concern',
            'detection_confidence', 'tools_effectiveness'
        ]

        # One-hot industry columns, used only if present in the file
        categorical_features = [
            'industry_Finance', 'industry_Healthcare', 'industry_Technology'
        ]

        # Only include features that exist in the dataset
        available_features = [
            feature for feature in features + categorical_features
            if feature in df.columns
        ]
        if not available_features:
            raise ValueError(
                f"None of the expected feature columns were found in {self.data_path!r}"
            )

        X = df[available_features].fillna(0)
        y = target.astype(int).rename('encountered_threat_binary')

        print(f"Dataset loaded: {X.shape[0]} samples, {X.shape[1]} features")
        print(f"Threat distribution: {y.value_counts().to_dict()}")

        return X, y

    def prepare_data_split(self, test_size=0.2, random_state=42):
        """
        Create the single train/test split that every method is evaluated on.

        The split is stratified on y whenever y has more than one class.

        Args:
            test_size: forwarded to train_test_split.
            random_state: forwarded to train_test_split.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=random_state,
            stratify=self.y if self.y.nunique() > 1 else None
        )

        print(f"Train set: {self.X_train.shape[0]} samples")
        print(f"Test set: {self.X_test.shape[0]} samples")
        print(f"Train threats: {self.y_train.sum()}/{len(self.y_train)}")
        print(f"Test threats: {self.y_test.sum()}/{len(self.y_test)}")

    def _ensure_split(self):
        """Build the default split if prepare_data_split() has not been called yet."""
        if self.X_train is None:
            self.prepare_data_split()

    def _scaled_split(self):
        """Return (train, test) features standardised with a scaler fitted on train only."""
        scaler = StandardScaler()
        return scaler.fit_transform(self.X_train), scaler.transform(self.X_test)

    def _fit_and_score(self, models, X_train, y_train, X_test):
        """Fit each model, score it against self.y_test, print and record the metrics."""
        scored = {}

        for model_name, model in models.items():
            print(f"Training {model_name}...")

            model.fit(X_train, y_train)
            metrics = self._calculate_metrics(self.y_test, model.predict(X_test))
            scored[model_name] = metrics

            print(f"  Results: Acc={metrics['accuracy']:.3f}, "
                  f"Prec={metrics['precision']:.3f}, "
                  f"Rec={metrics['recall']:.3f}, "
                  f"F1={metrics['f1']:.3f}")

        self.results.update(scored)
        return scored

    def implement_han_et_al_method(self):
        """
        Implementation of Han et al. (2023) methodology
        "A Study on Detection of Malicious Behavior Based on Host Process Data Using Machine Learning"

        Key aspects: KNN, NB, RF with SMOTE preprocessing and PCA

        SMOTE is applied to the scaled training data only; PCA is fitted on the
        resampled training data and then applied to the untouched test data.

        Returns:
            dict mapping model name -> metrics dict (also merged into self.results).
        """
        self._ensure_split()
        print("\n=== Implementing Han et al. (2023) Method ===")

        X_train_scaled, X_test_scaled = self._scaled_split()

        # Oversample the minority class in the training data
        smote = SMOTE(random_state=42)
        X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, self.y_train)

        print(f"After SMOTE: {X_train_smote.shape[0]} samples")
        print(f"Class distribution: {np.bincount(y_train_smote)}")

        # Keep as many components as needed to explain 95% of the variance
        pca = PCA(n_components=0.95, random_state=42)
        X_train_pca = pca.fit_transform(X_train_smote)
        X_test_pca = pca.transform(X_test_scaled)

        print(f"PCA components: {X_train_pca.shape[1]} (from {X_train_scaled.shape[1]})")

        han_models = {
            'KNN_Han_et_al': KNeighborsClassifier(n_neighbors=5),
            'NaiveBayes_Han_et_al': GaussianNB(),
            'RandomForest_Han_et_al': RandomForestClassifier(n_estimators=100, random_state=42)
        }

        return self._fit_and_score(han_models, X_train_pca, y_train_smote, X_test_pca)

    def implement_janjua_et_al_method(self):
        """
        Implementation of Janjua et al. (2020) methodology
        "Handling insider threat through supervised machine learning techniques"

        Key aspects: AdaBoost, SVM, NB, LR, KNN for behavioral classification

        Note: LR is named above, but no logistic-regression model is trained by
        this method; only AdaBoost, SVM, Naive Bayes and KNN are evaluated.

        Returns:
            dict mapping model name -> metrics dict (also merged into self.results).
        """
        self._ensure_split()
        print("\n=== Implementing Janjua et al. (2020) Method ===")

        X_train_scaled, X_test_scaled = self._scaled_split()

        janjua_models = {
            'AdaBoost_Janjua_et_al': AdaBoostClassifier(n_estimators=100, random_state=42),
            'SVM_Janjua_et_al': SVC(kernel='rbf', random_state=42),
            'NaiveBayes_Janjua_et_al': GaussianNB(),
            'KNN_Janjua_et_al': KNeighborsClassifier(n_neighbors=5)
        }

        return self._fit_and_score(janjua_models, X_train_scaled, self.y_train, X_test_scaled)

    def implement_mehmood_et_al_method(self):
        """
        Implementation of Mehmood et al. (2023) methodology
        "Privilege Escalation Attack Detection and Mitigation in Cloud using Machine Learning"

        Key aspects: Ensemble learning with RF, AdaBoost, XGBoost, LightGBM

        The models are trained on the unscaled features.

        Returns:
            dict mapping model name -> metrics dict (also merged into self.results).
        """
        self._ensure_split()
        print("\n=== Implementing Mehmood et al. (2023) Method ===")

        mehmood_models = {
            'RandomForest_Mehmood_et_al': RandomForestClassifier(n_estimators=100, random_state=42),
            'AdaBoost_Mehmood_et_al': AdaBoostClassifier(n_estimators=100, random_state=42),
            'XGBoost_Mehmood_et_al': xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
            'LightGBM_Mehmood_et_al': lgb.LGBMClassifier(random_state=42, verbose=-1)
        }

        return self._fit_and_score(mehmood_models, self.X_train, self.y_train, self.X_test)

    def implement_your_proposed_method(self, weights=(0.4, 0.35, 0.25), threshold=0.5):
        """
        Implement the proposed hybrid ensemble method.

        As written, this is a fixed-weight soft vote over three supervised
        classifiers (RandomForest, LightGBM, AdaBoost) trained on scaled features.

        Args:
            weights: (random_forest, lightgbm, adaboost) weights applied to the
                positive-class probabilities. They are fixed constants, not
                derived from measured performance, and are expected to sum to 1
                so that `threshold` keeps its meaning.
            threshold: a sample is predicted positive when the weighted
                probability is strictly greater than this value.

        Returns:
            {'Proposed_Hybrid_Model': metrics dict} (also merged into self.results).

        Raises:
            ValueError: if `weights` does not contain exactly three values.
        """
        if len(weights) != 3:
            raise ValueError("weights must contain exactly three values (RF, LightGBM, AdaBoost)")
        rf_weight, gb_weight, ada_weight = weights

        self._ensure_split()
        print("\n=== Implementing Your Proposed Method ===")

        X_train_scaled, X_test_scaled = self._scaled_split()

        # Component 1: random forest
        rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
        rf_model.fit(X_train_scaled, self.y_train)
        rf_pred_proba = rf_model.predict_proba(X_test_scaled)[:, 1]

        # Component 2: LightGBM gradient boosting
        gb_model = lgb.LGBMClassifier(random_state=42, verbose=-1)
        gb_model.fit(X_train_scaled, self.y_train)
        gb_pred_proba = gb_model.predict_proba(X_test_scaled)[:, 1]

        # Component 3: AdaBoost
        ada_model = AdaBoostClassifier(n_estimators=100, random_state=42)
        ada_model.fit(X_train_scaled, self.y_train)
        ada_pred_proba = ada_model.predict_proba(X_test_scaled)[:, 1]

        # Weighted soft vote over the three positive-class probabilities
        ensemble_proba = (rf_weight * rf_pred_proba
                          + gb_weight * gb_pred_proba
                          + ada_weight * ada_pred_proba)

        # Convert to binary predictions
        ensemble_pred = (ensemble_proba > threshold).astype(int)

        metrics = self._calculate_metrics(self.y_test, ensemble_pred)

        proposed_results = {'Proposed_Hybrid_Model': metrics}

        print(f"Proposed Model Results: Acc={metrics['accuracy']:.3f}, "
              f"Prec={metrics['precision']:.3f}, "
              f"Rec={metrics['recall']:.3f}, "
              f"F1={metrics['f1']:.3f}")

        self.results.update(proposed_results)
        return proposed_results

    def _calculate_metrics(self, y_true, y_pred):
        """
        Calculate accuracy, precision, recall and F1 for binary predictions.

        zero_division=0 makes precision/recall/F1 report 0 (without a warning)
        when a model predicts no positives.
        """
        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, zero_division=0),
            'recall': recall_score(y_true, y_pred, zero_division=0),
            'f1': f1_score(y_true, y_pred, zero_division=0)
        }

    def create_comprehensive_comparison_table(self):
        """
        Build, print and return the comparison table, sorted by F1 (best first).

        The paper reference is derived from the suffix embedded in each method
        name. The returned frame keeps its original row labels (insertion order);
        the printed rank is the 1-based position in the sorted table.

        Returns:
            pandas.DataFrame with the columns in _TABLE_COLUMNS; empty if no
            method has been run yet.
        """
        print("\n=== COMPREHENSIVE COMPARISON TABLE ===")
        print("Comparing Existing Research Methods vs Your Proposed Method")
        print("All methods tested on YOUR survey data for fair comparison\n")

        if not self.results:
            print("No results available - run the implement_* methods first.")
            return pd.DataFrame(columns=self._TABLE_COLUMNS)

        comparison_data = []

        for method_name, metrics in self.results.items():
            # Extract paper reference
            if 'Han_et_al' in method_name:
                paper_ref = 'Han et al. (2023)'
            elif 'Janjua_et_al' in method_name:
                paper_ref = 'Janjua et al. (2020)'
            elif 'Mehmood_et_al' in method_name:
                paper_ref = 'Mehmood et al. (2023)'
            elif 'Proposed' in method_name:
                paper_ref = 'Your Research (2024)'
            else:
                paper_ref = 'Other Method'

            comparison_data.append({
                'Paper_Reference': paper_ref,
                'Method_Name': method_name,
                'Accuracy': metrics['accuracy'],
                'Precision': metrics['precision'],
                'Recall': metrics['recall'],
                'F1_Score': metrics['f1']
            })

        # Stable sort so that tied methods keep their insertion order
        comparison_df = pd.DataFrame(comparison_data)
        comparison_df = comparison_df.sort_values('F1_Score', ascending=False, kind='stable')

        # Display formatted table
        print(comparison_df.to_string(index=False, float_format='%.3f'))

        # Highlight the proposed method's performance
        is_proposed = comparison_df['Method_Name'].str.contains('Proposed')
        if is_proposed.any():
            your_method = comparison_df[is_proposed]
            # Position in the sorted table, NOT the index label: sort_values keeps
            # the original labels, which only encode insertion order.
            your_rank = int(np.flatnonzero(is_proposed.to_numpy())[0]) + 1
            print(f"\nYour Proposed Method Ranking: {your_rank} out of {len(comparison_df)} methods")
            print(f"Your F1-Score: {your_method.iloc[0]['F1_Score']:.3f}")

        return comparison_df

    def run_complete_comparison(self):
        """
        Run the full pipeline: split, all existing methods, the proposed method,
        and the comparison table.

        Returns:
            (comparison_table, results): the sorted DataFrame and the raw
            self.results dict.
        """
        print("=== RUNNING COMPLETE EXISTING SYSTEMS COMPARISON ===")
        print("This implements actual research methodologies from your literature review")
        print("on YOUR survey data for fair comparison\n")

        # Prepare data
        self.prepare_data_split()

        # Run all existing methods (each one records into self.results)
        self.implement_han_et_al_method()
        self.implement_janjua_et_al_method()
        self.implement_mehmood_et_al_method()

        # Run the proposed method
        self.implement_your_proposed_method()

        # Create comprehensive comparison
        comparison_table = self.create_comprehensive_comparison_table()

        return comparison_table, self.results



In [2]:

"""
Configuration and driver cells for the complete existing systems comparison
"""
# Path of the survey CSV passed to ExistingSystemsComparator; replace with your actual data path.
# A relative path like this one is resolved against the kernel's working directory.
DATA_PATH = "preprocessed_data.csv"
    



In [3]:
    # Initialize comparator
# The constructor reads the CSV immediately, so this cell prints the dataset summary.
comparator = ExistingSystemsComparator(data_path=DATA_PATH)
    


Dataset loaded: 400 samples, 10 features
Threat distribution: {0: 331, 1: 69}


In [4]:
    # Run complete comparison
# Runs the shared split, the three existing-method families and the proposed ensemble;
# returns the F1-sorted comparison table plus the raw per-method metrics dict.
comparison_table, all_results = comparator.run_complete_comparison()
    



=== RUNNING COMPLETE EXISTING SYSTEMS COMPARISON ===
This implements actual research methodologies from your literature review
on YOUR survey data for fair comparison

Train set: 320 samples
Test set: 80 samples
Train threats: 55/320
Test threats: 14/80

=== Implementing Han et al. (2023) Method ===
After SMOTE: 530 samples
Class distribution: [265 265]
PCA components: 10 (from 10)
Training KNN_Han_et_al...
  Results: Acc=0.562, Prec=0.138, Rec=0.286, F1=0.186
Training NaiveBayes_Han_et_al...
  Results: Acc=0.525, Prec=0.071, Rec=0.143, F1=0.095
Training RandomForest_Han_et_al...
  Results: Acc=0.775, Prec=0.300, Rec=0.214, F1=0.250

=== Implementing Janjua et al. (2020) Method ===
Training AdaBoost_Janjua_et_al...
  Results: Acc=0.812, Prec=0.000, Rec=0.000, F1=0.000
Training SVM_Janjua_et_al...
  Results: Acc=0.825, Prec=0.000, Rec=0.000, F1=0.000
Training NaiveBayes_Janjua_et_al...
  Results: Acc=0.750, Prec=0.000, Rec=0.000, F1=0.000
Training KNN_Janjua_et_al...
  Results: Acc=0.80



Proposed Model Results: Acc=0.800, Prec=0.000, Rec=0.000, F1=0.000

=== COMPREHENSIVE COMPARISON TABLE ===
Comparing Existing Research Methods vs Your Proposed Method
All methods tested on YOUR survey data for fair comparison

      Paper_Reference                Method_Name  Accuracy  Precision  Recall  F1_Score
    Han et al. (2023)     RandomForest_Han_et_al     0.775      0.300   0.214     0.250
    Han et al. (2023)              KNN_Han_et_al     0.562      0.138   0.286     0.186
Mehmood et al. (2023)      XGBoost_Mehmood_et_al     0.800      0.250   0.071     0.111
Mehmood et al. (2023)     LightGBM_Mehmood_et_al     0.800      0.250   0.071     0.111
    Han et al. (2023)       NaiveBayes_Han_et_al     0.525      0.071   0.143     0.095
 Janjua et al. (2020)      AdaBoost_Janjua_et_al     0.812      0.000   0.000     0.000
 Janjua et al. (2020)           SVM_Janjua_et_al     0.825      0.000   0.000     0.000
 Janjua et al. (2020)    NaiveBayes_Janjua_et_al     0.750      0.000

In [5]:
    # Save results (index=False keeps the DataFrame's row labels out of the CSV)
comparison_table.to_csv('existing_systems_comparison.csv', index=False)
    


In [6]:
# Closing summary: where the table was written and how many methods were scored.
print(
    "\nComparison complete! Results saved to 'existing_systems_comparison.csv'",
    f"Total methods compared: {len(all_results)}",
    sep="\n",
)
    



Comparison complete! Results saved to 'existing_systems_comparison.csv'
Total methods compared: 12


In [7]:
# Rich display of the comparison table, sorted by F1_Score (best first).
comparison_table

Unnamed: 0,Paper_Reference,Method_Name,Accuracy,Precision,Recall,F1_Score
2,Han et al. (2023),RandomForest_Han_et_al,0.775,0.3,0.214286,0.25
0,Han et al. (2023),KNN_Han_et_al,0.5625,0.137931,0.285714,0.186047
9,Mehmood et al. (2023),XGBoost_Mehmood_et_al,0.8,0.25,0.071429,0.111111
10,Mehmood et al. (2023),LightGBM_Mehmood_et_al,0.8,0.25,0.071429,0.111111
1,Han et al. (2023),NaiveBayes_Han_et_al,0.525,0.071429,0.142857,0.095238
3,Janjua et al. (2020),AdaBoost_Janjua_et_al,0.8125,0.0,0.0,0.0
4,Janjua et al. (2020),SVM_Janjua_et_al,0.825,0.0,0.0,0.0
5,Janjua et al. (2020),NaiveBayes_Janjua_et_al,0.75,0.0,0.0,0.0
6,Janjua et al. (2020),KNN_Janjua_et_al,0.8,0.0,0.0,0.0
7,Mehmood et al. (2023),RandomForest_Mehmood_et_al,0.8125,0.0,0.0,0.0


In [8]:
# Raw metrics per method: {method_name: {'accuracy', 'precision', 'recall', 'f1'}}.
all_results

{'KNN_Han_et_al': {'accuracy': 0.5625,
  'precision': 0.13793103448275862,
  'recall': 0.2857142857142857,
  'f1': 0.18604651162790697},
 'NaiveBayes_Han_et_al': {'accuracy': 0.525,
  'precision': 0.07142857142857142,
  'recall': 0.14285714285714285,
  'f1': 0.09523809523809523},
 'RandomForest_Han_et_al': {'accuracy': 0.775,
  'precision': 0.3,
  'recall': 0.21428571428571427,
  'f1': 0.25},
 'AdaBoost_Janjua_et_al': {'accuracy': 0.8125,
  'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0},
 'SVM_Janjua_et_al': {'accuracy': 0.825,
  'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0},
 'NaiveBayes_Janjua_et_al': {'accuracy': 0.75,
  'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0},
 'KNN_Janjua_et_al': {'accuracy': 0.8,
  'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0},
 'RandomForest_Mehmood_et_al': {'accuracy': 0.8125,
  'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0},
 'AdaBoost_Mehmood_et_al': {'accuracy': 0.8125,
  'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0},
 'XGBoost_Mehmood_et_al'