In [2]:
! pip install pefile

Collecting pefile
  Downloading pefile-2024.8.26-py3-none-any.whl.metadata (1.4 kB)
Downloading pefile-2024.8.26-py3-none-any.whl (74 kB)
   ---------------------------------------- 0.0/74.8 kB ? eta -:--:--
   ----- ---------------------------------- 10.2/74.8 kB ? eta -:--:--
   ---------------------------------------- 74.8/74.8 kB 827.3 kB/s eta 0:00:00
Installing collected packages: pefile
Successfully installed pefile-2024.8.26



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import os
# Report the kernel's working directory; relative paths used later resolve against it.
working_dir = os.getcwd()
print("Current working directory:", working_dir)

Current working directory: c:\Users\Dave Sisk\Repos\vector-search-with-security-logs


In [10]:
import os
import pefile
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Function to read a text file into a list
def read_file_to_list(file_path, encoding="utf-8-sig"):
    """Read a text file and return its non-blank lines as a list of strings.

    Each line is stripped of surrounding whitespace, and lines that are empty
    after stripping are dropped, so a trailing newline or spacer line in the
    file does not turn into a bogus entry.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. The default,
            "utf-8-sig", reads plain UTF-8/ASCII and also discards a leading
            byte-order mark so it cannot end up glued to the first line.

    Returns:
        A list with one string per non-blank line, or an empty list if the
        file could not be opened or decoded (the error is printed).
    """
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            raw_lines = file.read().splitlines()
    except (OSError, UnicodeError) as e:
        # Best-effort by design: report the problem and let the caller
        # continue with an empty list.
        print(f"Error reading file: {e}")
        return []

    stripped = (line.strip() for line in raw_lines)
    return [line for line in stripped if line]

# Get the full list from a pre-created text file
# Path to the pre-created text file that lists the executables to analyse
# (replace with the path to your own list). Built with os.path.join so the
# separators are correct on any OS; on Windows it yields the same string
# as the previous hard-coded ".\\...\\..." literal.
file_path = os.path.join(".", "vector-search-with-security-logs", "exe-file-list.txt")
executables = read_file_to_list(file_path)

# Function to extract PE file features
def extract_pe_features(file_path):
    """Parse a PE file and return a dict of header features.

    Args:
        file_path: Path of the executable to parse with pefile.

    Returns:
        A dict with the keys "File" (base name of the path),
        "NumberOfSections", "EntryPoint", "ImageBase", "Subsystem",
        "DllCharacteristics" and "ImportedDLLs" (list of imported DLL names,
        empty when the file has no import directory), or None if the file
        could not be processed (the error is printed).
    """
    pe = None
    try:
        pe = pefile.PE(file_path)
        optional_header = pe.OPTIONAL_HEADER
        # The import directory attribute is absent when the file has no
        # imports; fall back to an empty list so the key is always present.
        import_entries = getattr(pe, 'DIRECTORY_ENTRY_IMPORT', [])
        features = {
            "File": os.path.basename(file_path),
            "NumberOfSections": len(pe.sections),
            "EntryPoint": optional_header.AddressOfEntryPoint,
            "ImageBase": optional_header.ImageBase,
            "Subsystem": optional_header.Subsystem,
            "DllCharacteristics": optional_header.DllCharacteristics,
            # errors='replace' keeps one undecodable DLL name from discarding
            # every feature of an otherwise valid file.
            "ImportedDLLs": [
                entry.dll.decode('utf-8', errors='replace')
                for entry in import_entries
            ],
        }
        return features
    except Exception as e:
        # Deliberately broad: this is a best-effort batch scan, so any
        # unparsable or unreadable file is reported and skipped.
        print(f"Error processing {file_path}: {e}")
        return None
    finally:
        # Release the file data held by the PE object; without this, every
        # scanned executable stays open until garbage collection.
        if pe is not None:
            pe.close()

# Extract features for all executables
# Collect one feature record per executable; extract_pe_features returns None
# for files it could not process, and those are skipped.
data = []
for exe in executables:
    features = extract_pe_features(exe)
    if features:
        data.append(features)

# Create a pandas DataFrame
df = pd.DataFrame(data)

# Fail early with a clear message instead of an opaque KeyError further down
# when nothing could be extracted (e.g. the list file was missing or empty).
if df.empty:
    raise RuntimeError(
        "No PE features were extracted; check that the executable list is "
        "not empty and that the paths in it are readable."
    )

# Turn each list of imported DLL names into one space-separated string,
# which is the text that gets embedded below.
if 'ImportedDLLs' not in df.columns:
    df['ImportedDLLs'] = ''
else:
    df['ImportedDLLs'] = df['ImportedDLLs'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')

# Load the Hugging Face sentence-transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode all rows in a single batched call rather than one call per row,
# then store one vector per row in the DataFrame.
embeddings = model.encode(df['ImportedDLLs'].tolist())
df['Embedding'] = list(embeddings)

# Example: perform a similarity search using the first executable as the query.
# .iloc[0] selects by position, so it works regardless of the index labels.
query_vector = df['Embedding'].iloc[0]

# One vectorized cosine-similarity call of the query against every embedding.
similarities = pd.Series(
    cosine_similarity([query_vector], embeddings)[0],
    index=df.index,
)

# Add similarity scores to the DataFrame
df['Similarity'] = similarities

# Display the DataFrame
print(df)



                                  File  NumberOfSections  EntryPoint  \
0    agentactivationruntimestarter.exe                 5        7056   
1                     AgentService.exe                 6      779616   
2                   AggregatorHost.exe                 6      208768   
3                        aitstatic.exe                 7      177440   
4                              alg.exe                 7       56192   
..                                 ...               ...         ...   
650                           wusa.exe                 6      122608   
651                        WWAHost.exe                 7      219088   
652                XblGameSaveTask.exe                 6       19840   
653                          xcopy.exe                 6       21104   
654                        xwizard.exe                 6       20480   

      ImageBase  Subsystem  DllCharacteristics  \
0    5368709120          2               49504   
1    5368709120          2         