# Malware Detection System - Analysis Report

Performance analysis report for the Malware Detection System.
This document includes:
1. Data Statistics.
2. Model Evaluation (Confusion Matrix).
3. Feature Importance Analysis.

In [None]:
import os
import sys
import json
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Configure inline plotting
%matplotlib inline

# Add src directory to path to import custom modules
sys.path.append(os.path.abspath("src"))

from static_analyzer import StaticAnalyzer
from ml_classifier import MLMalwareClassifier

# Filesystem layout: every artifact used by this report lives under BASE_DIR.
BASE_DIR = os.path.expanduser('~/malware_detection')
CONFIG_PATH, MODEL_PATH, DATASET_DIR = (
    os.path.join(BASE_DIR, leaf)
    for leaf in ('config.json', 'model_full.pkl', 'dataset')
)

print("‚úÖ Environment setup successful.")

In [None]:
# Check model
if not os.path.exists(MODEL_PATH):
    print(f"‚ùå Model not found at {MODEL_PATH}. Please run train_ml.py first!")
else:
    # Load Config
    # JSON is UTF-8 by specification, so the encoding is pinned here instead of
    # being left to the platform's locale default.
    with open(CONFIG_PATH, encoding="utf-8") as f:
        config = json.load(f)

    # Load Model Pipeline
    # Note: This classifier contains Vectorizer, Scaler, and Model (XGBoost/RandomForest)
    # SECURITY: joblib.load unpickles the file and can execute arbitrary code
    # while doing so. Only load model files that come from a trusted source.
    classifier = joblib.load(MODEL_PATH)
    analyzer = StaticAnalyzer(config)

    print(f"‚úÖ Model loaded: {classifier.model_name}")
    print("‚úÖ Static Analyzer configuration loaded.")

In [None]:
def load_test_data(dataset_dir, analyzer, classifier):
    """Analyze and classify every file in the labelled dataset folders.

    Files directly inside ``<dataset_dir>/malware`` are given the true label 1
    and files directly inside ``<dataset_dir>/benign`` the true label 0. Each
    file is passed through ``analyzer.analyze_file`` and the resulting analysis
    through ``classifier.predict``.

    A file that fails at any step is reported and skipped as a whole, so the
    three returned lists always have the same length.

    Args:
        dataset_dir: Directory containing the ``malware`` and ``benign``
            sub-folders. A missing sub-folder is reported and skipped.
        analyzer: Object exposing ``analyze_file(path)``.
        classifier: Object exposing ``predict(analysis)`` whose result can be
            indexed with the keys ``'prediction'`` and ``'confidence'``.

    Returns:
        A ``(y_true, y_pred, y_conf)`` tuple of equal-length lists holding the
        true labels, the predicted labels and the prediction confidences.
    """
    y_true = []
    y_pred = []
    y_conf = []

    print("üîÑ Scanning data and predicting...")

    # Iterate through folders (Malware=1, Benign=0)
    for category, label in [('malware', 1), ('benign', 0)]:
        folder = os.path.join(dataset_dir, category)
        if not os.path.isdir(folder):
            print(f"Skipping missing dataset folder: {folder}")
            continue

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order reproducible between runs.
        files = sorted(
            f for f in os.listdir(folder)
            if os.path.isfile(os.path.join(folder, f))
        )

        for fname in files:
            try:
                file_path = os.path.join(folder, fname)

                # 1. Static Analysis
                analysis = analyzer.analyze_file(file_path)

                # 2. Model Prediction
                result = classifier.predict(analysis)

                # Read both fields before recording anything, so a missing
                # key cannot leave the result lists with different lengths.
                prediction = result['prediction']
                confidence = result['confidence']
            except Exception as e:
                # Best effort: one unreadable sample must not abort the scan.
                print(f"‚ö†Ô∏è Error file {fname}: {e}")
                continue

            y_true.append(label)
            y_pred.append(prediction)
            y_conf.append(confidence)

    print(f"‚úÖ Processed {len(y_true)} samples.")
    return y_true, y_pred, y_conf

# Run the dataset scan only when a trained model file is present
if os.path.exists(MODEL_PATH):
    (y_true,
     y_pred,
     y_conf) = load_test_data(
        dataset_dir=DATASET_DIR,
        analyzer=analyzer,
        classifier=classifier,
    )

In [None]:
def plot_confusion_matrix(y_true, y_pred, model_name):
    """Draw the confusion matrix heatmap and print the classification report.

    The class order is fixed to 0 (Benign) then 1 (Malware), so the matrix is
    always 2x2 and the axis labels stay correct even when one of the classes
    does not occur in the evaluated samples.

    Args:
        y_true: Sequence of true labels (0 = benign, 1 = malware).
        y_pred: Sequence of predicted labels, aligned with ``y_true``.
        model_name: Name shown in the plot title.

    Returns:
        None. When ``y_true`` is empty, a notice is printed and nothing is
        plotted.
    """
    if len(y_true) == 0:
        print("No samples to evaluate; skipping confusion matrix.")
        return

    class_ids = [0, 1]
    class_names = ['Benign', 'Malware']

    # Without explicit labels the matrix shrinks to the classes actually seen,
    # which would no longer match the two tick labels below.
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title(f'Confusion Matrix - {model_name}', fontsize=14)
    plt.ylabel('Actual Label', fontsize=12)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.show()

    # Print detailed text report
    print("\nüìã Classification Report:\n")
    # Passing labels keeps target_names consistent with the reported classes
    # when only one class is present in the data.
    print(classification_report(y_true, y_pred, labels=class_ids,
                                target_names=class_names))

# Plot only when the model file exists: y_true, y_pred and classifier are
# defined by the earlier cells solely in that case.
if os.path.exists(MODEL_PATH):
    plot_confusion_matrix(y_true, y_pred, classifier.model_name)

In [None]:
def plot_feature_importance(pipeline_classifier, top_n=15):
    """Plot the highest-scoring feature importances of the trained model.

    Feature names are built from a fixed list of numerical feature names
    followed by the vocabulary reported by the classifier's vectorizer. If the
    number of names does not match the number of importance scores, a warning
    is printed and missing names are filled with ``feature_<index>``
    placeholders instead of failing.

    Args:
        pipeline_classifier: Object exposing ``model``, ``vectorizer`` and
            ``model_name`` attributes.
        top_n: Maximum number of features to show. Defaults to 15.

    Returns:
        None. When the model has no ``feature_importances_`` attribute, a
        notice is printed and nothing is plotted.

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    model = pipeline_classifier.model

    # Checked first so unsupported models exit before the vectorizer is used.
    if not hasattr(model, 'feature_importances_'):
        print("This model does not support direct Feature Importance extraction.")
        return

    vectorizer = pipeline_classifier.vectorizer

    # 1. Get feature names
    # Numerical features (order matches _extract_features in ml_classifier.py)
    numeric_features = ['Entropy', 'Is PE', 'Suspicious Imports', 'YARA Matches']

    # Text features (from TF-IDF Vectorizer)
    try:
        text_features = vectorizer.get_feature_names_out()
    except AttributeError:
        text_features = vectorizer.get_feature_names()  # For older scikit-learn versions

    all_feature_names = numeric_features + list(text_features)

    # 2. Get Feature Importances
    importances = np.asarray(model.feature_importances_)

    n_named = len(all_feature_names)
    n_scored = len(importances)
    if n_named != n_scored:
        print(f"Warning: {n_named} feature names for {n_scored} importance "
              "scores; labels may be misaligned.")
        # Pad so every importance index can be resolved to some label.
        all_feature_names += [f'feature_{i}' for i in range(n_named, n_scored)]

    # Sort descending and keep the top_n most important features
    top_indices = np.argsort(importances)[::-1][:top_n]
    top_importances = importances[top_indices]
    top_names = [all_feature_names[i] for i in top_indices]

    # Plot
    plt.figure(figsize=(12, 6))
    sns.barplot(x=top_importances, y=top_names, palette='viridis')
    # The title reports the number of bars actually drawn, which can be less
    # than top_n when the model has fewer features.
    plt.title(f'Top {len(top_names)} Feature Importance ({pipeline_classifier.model_name})', fontsize=14)
    plt.xlabel('Importance Score')
    plt.ylabel('Feature')
    plt.show()

# Plot only when the model file exists: classifier is defined by the
# model-loading cell solely in that case.
if os.path.exists(MODEL_PATH):
    plot_feature_importance(classifier)