In [1]:
import os
import pandas as pd
import numpy as np

def collect_raw_data(directory):
    """Combine every per-user ``*_data.csv`` file in ``directory`` into one DataFrame.

    Each matching file is read with ``pd.read_csv`` and tagged with a numeric
    ``user_id`` (1, 2, 3, ...). IDs are assigned in alphabetical filename order,
    so the same set of files always yields the same IDs no matter how the
    operating system happens to list the directory.

    The combined frame is also written to ``data_combined.csv`` inside
    ``directory``. That name does not end in ``_data.csv``, so it is not picked
    up as an input on a later run.

    Parameters
    ----------
    directory : str or path-like
        Folder that contains the per-user CSV files.

    Returns
    -------
    pandas.DataFrame
        All rows from all matching files, with a fresh 0..n-1 index and an
        added ``user_id`` column.

    Raises
    ------
    ValueError
        If ``directory`` contains no file ending in ``_data.csv``.
    """
    # os.listdir returns entries in arbitrary order; sort for reproducible IDs.
    user_files = sorted(f for f in os.listdir(directory) if f.endswith('_data.csv'))
    if not user_files:
        # pd.concat would fail with an unhelpful "No objects to concatenate".
        raise ValueError(f"No '*_data.csv' files found in {directory!r}")

    all_data = []
    for user_num, filename in enumerate(user_files, start=1):
        user_df = pd.read_csv(os.path.join(directory, filename))
        user_df['user_id'] = user_num  # Simple numeric ID
        all_data.append(user_df)

    combined = pd.concat(all_data, ignore_index=True)

    # Save raw combined data next to the inputs
    combined.to_csv(os.path.join(directory, 'data_combined.csv'), index=False)
    print(f"Combined {len(user_files)} user files. Total records: {len(combined)}")

    return combined

# Usage: point data_dir at the folder holding the per-user *_data.csv files.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# cannot run on another machine without editing it.
data_dir = r"C:\Users\User\OneDrive\Desktop\ME\GradProj\IDS_data"
# Combined multi-user frame; collect_raw_data also writes data_combined.csv into data_dir.
df = collect_raw_data(data_dir)

Combined 7 user files. Total records: 387512


In [3]:
# Create an idle_time column that summarizes periods of inactivity

def create_idle_time(df):
    """Collapse runs of consecutive idle rows into one row carrying the run's duration.

    A row is *idle* when ``mouse_speed``, ``click_freq``, ``keystroke_speed``,
    ``keystroke_interval_std`` and ``mouse_jerk`` are all exactly 0. Rows are
    ordered by ``user_id`` then ``timestamp``; each run of consecutive idle
    rows is replaced by its first row, whose ``idle_time`` is the number of
    seconds between the first and last timestamp of the run (0 for a run of
    one row). Active rows are kept unchanged with ``idle_time`` 0.

    A run never spans two users: a change of ``user_id`` always starts a new
    run, so the tail of one user's session is not merged with the head of the
    next user's session.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``timestamp`` (anything ``pd.to_datetime`` accepts),
        ``user_id`` and the five activity columns listed above. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        New frame with a float ``idle_time`` column, ``timestamp`` as datetime,
        ``user_id`` removed, and a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If a required column is missing (raised by pandas on column lookup).
    """
    # Columns monitored for idle detection
    idle_cols = ['mouse_speed', 'click_freq', 'keystroke_speed',
                 'keystroke_interval_std', 'mouse_jerk']

    # Work on a copy so the caller's frame is neither re-sorted nor polluted.
    work = df.copy()
    work['timestamp'] = pd.to_datetime(work['timestamp'])
    # Fresh index after sorting keeps every derived Series positionally aligned.
    work = work.sort_values(by=['user_id', 'timestamp']).reset_index(drop=True)

    is_idle = (work[idle_cols] == 0).all(axis=1)

    # A new run starts when the idle flag flips OR the user changes.
    # (diff()/shift() yield NaN on the first row, which compares as "changed".)
    flag_changed = is_idle.astype(int).diff().ne(0)
    user_changed = work['user_id'].ne(work['user_id'].shift())
    run_start = flag_changed | user_changed
    run_id = run_start.cumsum()

    # Rows are time-sorted within a user, so max - min == last - first per run.
    run_times = work.groupby(run_id)['timestamp']
    run_seconds = (run_times.transform('max') - run_times.transform('min')).dt.total_seconds()
    work['idle_time'] = np.where(is_idle, run_seconds, 0.0)

    # Keep all active rows plus only the first row of each idle run.
    keep = ~is_idle | run_start
    idle_df = work.loc[keep].drop(columns=['user_id']).reset_index(drop=True)

    return idle_df

# Usage: collapse idle runs in the combined multi-user frame.
# The returned frame no longer has a user_id column.
idle_df = create_idle_time(df)

# View sample (first five rows, plain-text rendering)
print(idle_df.head())

                   timestamp  mouse_speed  click_freq  keystroke_speed  \
0 2025-04-18 18:59:52.736587       220.00         0.0              0.0   
1 2025-04-18 18:59:53.741049         0.00         0.0              0.0   
2 2025-04-18 18:59:54.745401         0.00         0.0              0.0   
3 2025-04-18 18:59:55.749134       472.32         0.0              0.0   
4 2025-04-18 18:59:56.753622        12.15         0.0              0.0   

   keystroke_interval_std  mouse_jerk shortcuts_used  new_devices_detected  \
0                     0.0     1248.65            NaN                     3   
1                     0.0      859.42            NaN                     0   
2                     0.0      671.06            NaN                     0   
3                     0.0     3496.70            NaN                     0   
4                     0.0     3499.50            NaN                     0   

   idle_time  
0        0.0  
1        0.0  
2        0.0  
3        0.0  
4        0.

In [4]:
# Handling "shortcuts_used" Feature

def handle_shortcuts(df):
    """Replace the raw ``shortcuts_used`` text column with numeric shortcut features.

    ``shortcuts_used`` holds zero or more shortcuts separated by ``|``; missing
    values are treated as "no shortcuts". The function derives:

    * ``shortcut_count`` - number of non-empty ``|``-separated entries;
    * ``has_Ctrl``, ``has_Alt``, ``has_Shift``, ``has_Win`` - 1 if the key name
      occurs as a substring of any entry, else 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``shortcuts_used`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``shortcuts_used`` (and without any pre-existing
        ``shortcut_list`` helper column), with the five int64 feature columns
        appended.

    Raises
    ------
    KeyError
        If ``shortcuts_used`` is missing (raised by pandas on column lookup).
    """
    # Missing values mean "no shortcut"; astype(str) guards against stray non-text cells.
    shortcuts = df['shortcuts_used'].fillna('').astype(str)

    # drop() returns a new frame, so the caller's DataFrame keeps its columns.
    handled_df = df.drop(columns=['shortcuts_used', 'shortcut_list'], errors='ignore')

    # Count non-empty entries ("a|b|" -> 2, "" -> 0).
    handled_df['shortcut_count'] = (
        shortcuts.str.split('|')
        .apply(lambda parts: sum(1 for part in parts if part))
        .astype('int64')
    )

    # No key name contains '|', so "key in any entry" equals "key in the whole string".
    for key in ('Ctrl', 'Alt', 'Shift', 'Win'):
        handled_df[f'has_{key}'] = shortcuts.str.contains(key, regex=False).astype('int64')

    return handled_df

# Turn the raw shortcuts_used text into shortcut_count and has_<key> columns.
handled_df = handle_shortcuts(idle_df)
# Preview the first ten rows.
handled_df.head(10)

Unnamed: 0,timestamp,mouse_speed,click_freq,keystroke_speed,keystroke_interval_std,mouse_jerk,new_devices_detected,idle_time,shortcut_count,has_Ctrl,has_Alt,has_Shift,has_Win
0,2025-04-18 18:59:52.736587,220.0,0.0,0.0,0.0,1248.65,3,0.0,0,0,0,0,0
1,2025-04-18 18:59:53.741049,0.0,0.0,0.0,0.0,859.42,0,0.0,0,0,0,0,0
2,2025-04-18 18:59:54.745401,0.0,0.0,0.0,0.0,671.06,0,0.0,0,0,0,0,0
3,2025-04-18 18:59:55.749134,472.32,0.0,0.0,0.0,3496.7,0,0.0,0,0,0,0,0
4,2025-04-18 18:59:56.753622,12.15,0.0,0.0,0.0,3499.5,0,0.0,0,0,0,0,0
5,2025-04-18 18:59:57.757703,301.1,0.0,0.0,0.0,4488.55,0,0.0,0,0,0,0,0
6,2025-04-18 18:59:58.762302,122.54,0.0,0.0,0.0,3512.6,0,0.0,0,0,0,0,0
7,2025-04-18 18:59:59.765315,0.0,0.0,0.0,0.0,3505.82,0,0.0,0,0,0,0,0
8,2025-04-18 19:00:30.639407,36.86,0.0,0.0,0.0,461.92,3,0.0,0,0,0,0,0
9,2025-04-18 19:00:31.643333,69.13,0.0,0.0,0.0,653.67,0,0.0,0,0,0,0,0


In [5]:
# Add three engineered features: mouse_stability_index, interaction_density, shortcut_usage_rate

def extract_mousejack_features(df):
    """Add three engineered features derived from mouse, click and shortcut activity.

    Rows are ordered by ``timestamp`` and the gap in seconds to the previous
    row is used as the time base:

    * ``mouse_stability_index`` = ``mouse_speed / (mouse_jerk + 1)``
    * ``interaction_density``   = ``(click_freq + keystroke_speed) / gap``
    * ``shortcut_usage_rate``   = ``shortcut_count / gap``

    The first row has no predecessor and uses a gap of 1 second. Rows that
    share a timestamp with their predecessor (gap of 0) use the same 1-second
    default, so the two rate features never become ``inf``/``NaN`` through a
    division by zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``timestamp``, ``mouse_speed``, ``mouse_jerk``,
        ``click_freq``, ``keystroke_speed`` and ``shortcut_count``. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        New frame sorted by ``timestamp`` with a fresh 0..n-1 index and the
        three feature columns appended.
    """
    # Coerce first so the sort and the diff both operate on real datetimes.
    df = df.assign(timestamp=pd.to_datetime(df['timestamp']))

    # Stable sort keeps the incoming order of rows that share a timestamp.
    df = df.sort_values(by='timestamp', kind='mergesort').reset_index(drop=True)

    # Seconds since the previous row; NaN (first row) and 0 (duplicate
    # timestamp) both fall back to 1 second to avoid dividing by zero.
    time_diff = df['timestamp'].diff().dt.total_seconds()
    time_diff = time_diff.where(time_diff > 0, 1.0)

    # 1. Mouse Stability Index = speed / (jerk + 1)
    df['mouse_stability_index'] = df['mouse_speed'] / (df['mouse_jerk'] + 1)

    # 2. Interaction Density = (clicks + keystroke speed) / time diff
    df['interaction_density'] = (df['click_freq'] + df['keystroke_speed']) / time_diff

    # 3. Shortcut Usage Rate = shortcut_count / time diff
    df['shortcut_usage_rate'] = df['shortcut_count'] / time_diff

    return df

# Add the three engineered features; rows come back sorted by timestamp with a fresh index.
features_df = extract_mousejack_features(handled_df)

In [6]:
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Normalize and scale the data

def preprocess_data(df, preprocessor=None, return_preprocessor=False):
    """Scale the model's feature columns and return them as a new DataFrame.

    Column handling:

    * ``shortcut_count``, ``idle_time``, ``keystroke_speed``,
      ``keystroke_interval_std``, ``click_freq``, ``mouse_speed``,
      ``mouse_jerk``: ``log1p`` -> ``MinMaxScaler`` -> ``StandardScaler``
    * ``new_devices_detected``: ``MinMaxScaler`` -> ``StandardScaler``
    * ``has_Ctrl``, ``has_Alt``, ``has_Shift``, ``has_Win``: passed through

    By default a new transformer is fit on ``df`` itself, which is what a
    training frame needs. To scale evaluation data with the statistics learned
    from training data, fit once with ``return_preprocessor=True`` and pass the
    returned object back via ``preprocessor`` for every other frame; refitting
    on test data makes its scaling inconsistent with what the model was
    trained on.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain all twelve feature columns listed above. Any other
        columns are ignored.
    preprocessor : fitted ColumnTransformer, optional
        Transformer previously returned by this function. When given, it is
        applied with ``transform`` and is not refit.
    return_preprocessor : bool, default False
        When True, return ``(df_transformed, preprocessor)`` instead of just
        the frame.

    Returns
    -------
    pandas.DataFrame or (pandas.DataFrame, ColumnTransformer)
        Transformed features with a fresh 0..n-1 index, columns in the order
        listed above.
    """
    # Column groups
    log_and_scale_cols = [
        'shortcut_count', 'idle_time', 'keystroke_speed',
        'keystroke_interval_std', 'click_freq', 'mouse_speed', 'mouse_jerk'
    ]
    scale_only_cols = ['new_devices_detected']
    binary_cols = ['has_Ctrl', 'has_Alt', 'has_Shift', 'has_Win']

    # Output order of the ColumnTransformer: the two named groups first, then
    # the passthrough remainder, which here is exactly the binary columns.
    feature_cols = log_and_scale_cols + scale_only_cols + binary_cols

    if preprocessor is None:
        # Pipelines
        log_minmax_std_pipeline = Pipeline([
            ('log', FunctionTransformer(np.log1p, validate=True)),
            ('minmax', MinMaxScaler()),
            ('std', StandardScaler())
        ])

        minmax_std_pipeline = Pipeline([
            ('minmax', MinMaxScaler()),
            ('std', StandardScaler())
        ])

        # Column transformer
        preprocessor = ColumnTransformer(transformers=[
            ('log_minmax_std', log_minmax_std_pipeline, log_and_scale_cols),
            ('minmax_std', minmax_std_pipeline, scale_only_cols)
        ], remainder='passthrough')  # binary cols passed untouched

        transformed = preprocessor.fit_transform(df[feature_cols])
    else:
        # Reuse already-learned statistics; do not refit.
        transformed = preprocessor.transform(df[feature_cols])

    df_transformed = pd.DataFrame(transformed, columns=feature_cols)

    if return_preprocessor:
        return df_transformed, preprocessor
    return df_transformed

# Scale the training features; the scalers are fit on features_df itself.
df_transformed = preprocess_data(features_df)
# Preview the first ten scaled rows.
df_transformed.head(10)

Unnamed: 0,shortcut_count,idle_time,keystroke_speed,keystroke_interval_std,click_freq,mouse_speed,mouse_jerk,new_devices_detected,has_Ctrl,has_Alt,has_Shift,has_Win
0,-0.157134,-0.172516,-0.410017,-0.202595,-0.430271,1.115133,0.830103,42.292414,0.0,0.0,0.0,0.0
1,-0.157134,-0.172516,-0.410017,-0.202595,2.054642,0.370027,0.721463,-0.018449,0.0,0.0,0.0,0.0
2,-0.157134,-0.172516,-0.410017,-0.202595,-0.430271,1.076396,0.70198,-0.018449,0.0,0.0,0.0,0.0
3,-0.157134,-0.172516,-0.410017,-0.202595,-0.430271,1.421463,0.705524,-0.018449,0.0,0.0,0.0,0.0
4,-0.157134,-0.172516,-0.410017,-0.202595,2.054642,1.004629,0.731018,-0.018449,0.0,0.0,0.0,0.0
5,-0.157134,-0.172516,-0.410017,-0.202595,2.054642,1.224601,0.745065,-0.018449,0.0,0.0,0.0,0.0
6,-0.157134,-0.172516,-0.410017,-0.202595,2.054642,1.359841,0.813444,-0.018449,0.0,0.0,0.0,0.0
7,-0.157134,-0.172516,-0.410017,-0.202595,-0.430271,1.516572,0.935628,-0.018449,0.0,0.0,0.0,0.0
8,-0.157134,-0.172516,-0.410017,-0.202595,2.054642,-0.056925,0.91558,-0.018449,0.0,0.0,0.0,0.0
9,-0.157134,-0.172516,-0.410017,-0.202595,-0.430271,-1.22583,0.860024,-0.018449,0.0,0.0,0.0,0.0


In [7]:
# Exclude new_devices_detected from the model's features.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# re-runnable: a second run no longer raises KeyError for the missing column.
df_transformed = df_transformed.drop(columns=['new_devices_detected'], errors='ignore')

In [18]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from imblearn.over_sampling import SMOTE

# Train the OneClassSVM on the preprocessed training features.
# (Presumably the training data contains only normal behaviour - confirm.)
# NOTE(review): this cell repeats the pipeline of the other evaluation cell and
# differs only by the SMOTE step; a shared helper would avoid the duplication.
model = OneClassSVM(nu=0.01, gamma=0.1)
model.fit(df_transformed)

# Load the labelled test data.
# NOTE(review): absolute, machine-specific path.
df_test = pd.read_csv(r"C:\Users\User\OneDrive\Desktop\ME\GradProj\IDS_data\test_combined.csv")

# Apply the same feature engineering functions used for the training data
df_test = create_idle_time(df_test)
df_test = handle_shortcuts(df_test)
df_test = extract_mousejack_features(df_test)

# Save and remove label column so it is not fed to the scaler
label = df_test['label']
df_test = df_test.drop(columns=['label'])

# Apply preprocessing.
# NOTE(review): called without a fitted preprocessor, so the scalers are fit on
# the test data itself rather than reusing the training statistics.
df_test = preprocess_data(df_test)

# Drop the column that was removed from the training features
if 'new_devices_detected' in df_test.columns:
    df_test = df_test.drop(columns=['new_devices_detected'])

# Re-attach the labels (aligned by index), then split into features / target
df_test['label'] = label
X_test = df_test.drop(columns=['label'])
y_test = df_test['label']

# Apply SMOTE to balance the classes.
# NOTE(review): SMOTE is applied to the TEST set here, so the metrics printed
# by this cell are computed partly on synthetic samples.
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_test, y_test)

# Predict using OneClassSVM and map its +1/-1 output onto the 0/1 labels
# (assumes label uses 0 = normal, 1 = malicious - confirm against the data)
raw_preds = model.predict(X_balanced)
final_preds = np.where(raw_preds == 1, 0, 1)  # 1 -> normal -> 0, -1 -> outlier -> 1

# Give labels and predictions the same 0..n-1 index
y_balanced = y_balanced.reset_index(drop=True)
final_preds = pd.Series(final_preds).reset_index(drop=True)

# Evaluate the model
print("Accuracy:", accuracy_score(y_balanced, final_preds))
print("Confusion Matrix:\n", confusion_matrix(y_balanced, final_preds))
print("Classification Report:\n", classification_report(y_balanced, final_preds, target_names=["Normal", "Malicious"]))

Accuracy: 0.7304250559284117
Confusion Matrix:
 [[1270   71]
 [ 652  689]]
Classification Report:
               precision    recall  f1-score   support

      Normal       0.66      0.95      0.78      1341
   Malicious       0.91      0.51      0.66      1341

    accuracy                           0.73      2682
   macro avg       0.78      0.73      0.72      2682
weighted avg       0.78      0.73      0.72      2682



In [9]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

# Train the OneClassSVM on the preprocessed training features.
# NOTE(review): this cell's execution count (In [9]) is lower than that of the
# cell placed before it (In [18]); verify the notebook with Restart & Run All.
model = OneClassSVM(nu=0.01, gamma=0.1)
model.fit(df_transformed)

# Load the labelled test data.
# NOTE(review): absolute, machine-specific path.
df_test = pd.read_csv(r"C:\Users\User\OneDrive\Desktop\ME\GradProj\IDS_data\test_combined.csv")

# Apply the same feature engineering functions used for the training data
df_test = create_idle_time(df_test)
df_test = handle_shortcuts(df_test)
df_test = extract_mousejack_features(df_test)

# Save and remove label column so it is not fed to the scaler
label = df_test['label']
df_test = df_test.drop(columns=['label'])

# Apply preprocessing.
# NOTE(review): called without a fitted preprocessor, so the scalers are fit on
# the test data itself rather than reusing the training statistics.
df_test = preprocess_data(df_test)

# Drop the column that was removed from the training features
if 'new_devices_detected' in df_test.columns:
    df_test = df_test.drop(columns=['new_devices_detected'])

# Make predictions on the real (unbalanced) test rows and map the model's
# +1/-1 output onto the 0/1 labels
# (assumes label uses 0 = normal, 1 = malicious - confirm against the data)
raw_preds = model.predict(df_test)
final_preds = np.where(raw_preds == 1, 0, 1)  # Convert: 1 -> 0 (normal), -1 -> 1 (malicious)

# Give labels and predictions the same 0..n-1 index
label = label.reset_index(drop=True)
final_preds = pd.Series(final_preds).reset_index(drop=True)

# Evaluate the model
print("Accuracy:", accuracy_score(label, final_preds))
print("Confusion Matrix:\n", confusion_matrix(label, final_preds))
print("Classification Report:\n", classification_report(label, final_preds, target_names=["Normal", "Malicious"]))

Accuracy: 0.900198281559815
Confusion Matrix:
 [[1270   71]
 [  80   92]]
Classification Report:
               precision    recall  f1-score   support

      Normal       0.94      0.95      0.94      1341
   Malicious       0.56      0.53      0.55       172

    accuracy                           0.90      1513
   macro avg       0.75      0.74      0.75      1513
weighted avg       0.90      0.90      0.90      1513

