In [17]:
#Import libraries for data handling, ML model training, and evaluation.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation


In [18]:
#Generate a synthetic dataset simulating PE file features for machine learning to distinguish malware from benign software.
n_samples = 5000      
n_byte_features = 256 
n_sections = 5        

#creates synthetic PE sections for experiments without needing real PE files
def random_sections(n):
    sections = []
    for _ in range(n):
        sections.append({
            'Entropy': np.random.rand(),
            'VirtualSize': np.random.randint(1000, 100000),
            'RawSize': np.random.randint(1000, 100000)
        })
    return sections
    
#synthetic features to mimic real PE files for training/testing malware detection models.
data = []
for _ in range(n_samples):
    sample = {
        'SizeOfHeaders': np.random.randint(200, 1000),
        'MajorLinkerVersion': np.random.randint(1, 10),
        'MinorLinkerVersion': np.random.randint(0, 10),
        'Characteristics': np.random.randint(0, 65535),
        'Machine': np.random.choice([332, 34404]),
        'num_imported_dlls': np.random.randint(0, 20),
        'num_imported_functions': np.random.randint(0, 200),
        'num_exported_functions': np.random.randint(0, 50),
        'sections': random_sections(np.random.randint(1, n_sections+1)),
        'byte_histogram': np.random.randint(0, 1000, size=n_byte_features).tolist()
    }
    data.append(sample)

df = pd.DataFrame(data)


In [19]:
#To generate binary labels that simulate malware vs. benign classification based on key PE file features.
threshold = 4000 # heuristic threshold to divide random data into two classes
df['label'] = (
    df['SizeOfHeaders'] +
    df['byte_histogram'].apply(lambda x: sum(x[:10])) +
    df['num_imported_functions']*10
    > threshold
).astype(int)

print(f"Label distribution:\n{df['label'].value_counts()}")


Label distribution:
label
1    4959
0      41
Name: count, dtype: int64


Transforms raw/synthetic PE file metadata into a fixed-length, machine-readable feature vector 
combining structural features (headers, sections) and statistical features (byte histogram).

In [20]:


#Copy relevant PE header fields (already generated in synthetic data) into clean feature columns for ML
def extract_features(df):
    df['header_size'] = df['SizeOfHeaders']
    df['major_linker'] = df['MajorLinkerVersion']
    df['minor_linker'] = df['MinorLinkerVersion']
    df['characteristics'] = df['Characteristics']
    df['machine'] = df['Machine']
    
    df['num_imported_functions'] = df['num_imported_functions']
    df['num_exported_functions'] = df['num_exported_functions']
    
    def section_features(sections):
        if not isinstance(sections, list) or len(sections) == 0:
            return pd.Series([0, 0, 0])
        entropies = [s.get('Entropy',0) for s in sections]
        vsize = [s.get('VirtualSize',0) for s in sections]
        rsize = [s.get('RawSize',0) for s in sections]
        return pd.Series([np.mean(entropies), np.sum(vsize), np.sum(rsize)])
    
    df[['avg_section_entropy','total_virtual_size','total_raw_size']] = df['sections'].apply(section_features)
    
    bh_df = pd.DataFrame(df['byte_histogram'].tolist(), columns=[f'bh_{i}' for i in range(256)])
    
    features = pd.concat([
        df[['header_size','major_linker','minor_linker','characteristics','machine',
            'num_imported_functions','num_exported_functions',
            'avg_section_entropy','total_virtual_size','total_raw_size']],
        bh_df
    ], axis=1).fillna(0)
    
    return features

X = extract_features(df) #Turn raw/synthetic PE file info into numeric features
y = df['label'] #target labels (0 = benign, 1 = malware)

print(f"Feature matrix shape: {X.shape}")  # ~270+ features
print(f"Labels shape: {y.shape}")


Feature matrix shape: (5000, 266)
Labels shape: (5000,)


In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
) # 20% testing and 80% training set

print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")


Train samples: 4000, Test samples: 1000


In [22]:
# This block of code is training a LightGBM binary classifier with given hyperparameters,
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'max_depth': 10,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': 42
}
#using training and validation datasets, running up to 500 boosting rounds but stopping early if validation performance doesnâ€™t improve for 20 rounds
model = lgb.train(
    params,
    train_data,
    num_boost_round=500,
    valid_sets=[train_data, valid_data],
    valid_names=['train','valid'],
    callbacks=[early_stopping(stopping_rounds=20), log_evaluation(period=50)]
)


Training until validation scores don't improve for 20 rounds
[50]	train's auc: 1	valid's auc: 0.935862
Early stopping, best iteration is:
[65]	train's auc: 1	valid's auc: 0.956023


In [23]:
y_pred_prob = model.predict(X_test, num_iteration=model.best_iteration)
y_pred = (y_pred_prob >= 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)

print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))




Accuracy: 0.9920
ROC AUC: 0.9560

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       0.99      1.00      1.00       992

    accuracy                           0.99      1000
   macro avg       0.50      0.50      0.50      1000
weighted avg       0.98      0.99      0.99      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
