In [None]:
# Required imports
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Data loading and preprocessing
# NOTE: `data_df` is not created in this section; it must already exist before this cell runs.
# `data_df` must have the columns 'commands' (command text) and 'labels' (classification labels).
# This cell tokenizes each command as the first preprocessing step.

def tokenize_shell_command(cmd):
    """Split a shell command into whitespace-separated tokens.

    Tokens are separated on runs of any whitespace (spaces, tabs, newlines);
    punctuation and other special characters stay attached to their token.

    Parameters
    ----------
    cmd : str
        Raw command text. Non-string values (for example ``None`` or the
        float ``NaN`` used for missing cells) are treated as an empty command.

    Returns
    -------
    list of str
        The tokens in order; an empty list for empty or missing input.
    """
    # Missing cells are not strings and have no .split(); map them to "no tokens"
    # instead of raising AttributeError in the middle of a column-wide apply.
    if not isinstance(cmd, str):
        return []
    return cmd.split()

# Store the token list of every command in a new 'tokenized_commands' column
data_df['tokenized_commands'] = (
    data_df['commands']
    .apply(tokenize_shell_command)
)


In [None]:
# Step 1: Fit a Word2Vec model on the token lists (100-dimensional vectors, context window of 5).
# NOTE(review): the model is fitted on every row of data_df, so rows that are later held out
# for testing also influence the embeddings — confirm this is intended.
w2v_model = Word2Vec(
    sentences=data_df['tokenized_commands'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Step 2: Create embedding for each command (average of word vectors)
def get_command_embedding(command, model=None):
    """Return one fixed-length vector for a command by averaging its token vectors.

    Parameters
    ----------
    command : iterable of str
        Tokens of a single command.
    model : optional
        Object exposing ``.wv`` (supporting ``token in wv`` and ``wv[token]``)
        and ``.vector_size``. When omitted, the notebook-level ``w2v_model``
        is used, which keeps existing one-argument calls working.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the tokens found in ``model.wv``,
        or a zero vector of length ``model.vector_size`` when no token is found.
    """
    if model is None:
        # Fall back to the notebook global only when no model is passed explicitly.
        model = w2v_model
    keyed_vectors = model.wv
    vectors = [keyed_vectors[token] for token in command if token in keyed_vectors]
    if not vectors:
        # Nothing to average: keep the output shape consistent with a zero vector.
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

# Build the feature matrix: one averaged embedding per tokenized command
X_word2vec = np.array(
    list(map(get_command_embedding, data_df['tokenized_commands']))
)

# Step 3: Hold out 20% of the rows for evaluation (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X_word2vec,
    data_df['labels'],
    test_size=0.2,
    random_state=42,
)

# Step 4: Fit an XGBoost classifier on the training embeddings
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases — confirm against
# the installed version. 'logloss' presumably implies binary labels; verify against the data.
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
)
xgb_model.fit(X_train, y_train)

# Step 5: Predict the held-out rows and report the share of correct labels
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Classification accuracy: {accuracy:.4f}')
