In [1]:
import os
import shutil

import pandas as pd
from bs4 import BeautifulSoup

-----

### Extract and Prepare Task ID from HTML Data

Scrape the HTML data to extract the information needed to prepare the task IDs.

In [2]:
# Open and read the HTML content from a file named 'summary_page.txt'
with open('summary_page.txt') as file:
    # Join all lines of the file into a single string, with newlines preserved
    html_content = "\n".join(file.readlines())

# Parse the HTML content using BeautifulSoup
# This allows us to navigate the HTML structure and extract the required data
soup = BeautifulSoup(html_content, 'html.parser')

# Find all rows (<tr> elements) within the table body (<tbody>)
# These rows contain the task information we're interested in
rows = soup.find('tbody').find_all('tr')

# Initialize empty lists to store extracted data: task IDs, filenames, and packages
task_ids = []
filenames = []
packages = []

# Loop through each row to extract the task id, filename, and package
for row in rows:
    # Extract the task ID from the 'data-task-id' attribute of the row
    task_id = row.get('data-task-id')
    
    # Extract the filename from the third <td> element in the row
    filename = row.find_all('td')[2].text.strip()
    
    # Extract the package type from the fourth <td> element in the row
    package = row.find_all('td')[3].text.strip()
    
    # Append the extracted values to the respective lists
    task_ids.append(task_id)
    filenames.append(filename)
    packages.append(package)

# Create a DataFrame from the lists, with columns for Task ID, Filename, and Package
df = pd.DataFrame({
    'Task ID': task_ids,
    'Filename': filenames,
    'Package': packages
})

# Filter the DataFrame to exclude rows where the Package is '7z'
# This removes unwanted rows from the DataFrame
df = df[df.Package != '7z']

# Output the filtered DataFrame
df

AttributeError: 'NoneType' object has no attribute 'find_all'

Testing for validity

In [None]:
# Ensure all filenames contain '@' and are unique
assert df.Filename.apply(lambda x: int('@' in x)).sum() == df.shape[0]
assert df.Filename.duplicated().sum() == 0

In [None]:
df[df.Filename.duplicated(keep=False)].sort_values('Filename')

In [None]:
df.drop_duplicates(subset='Filename', inplace=True, keep='first')

In [None]:
# Display the task table in its current state
df

Data Cleaning

In [None]:
# Create a copy of the DataFrame for further modifications
df_clean = df.copy()

# Clean up the 'Filename' column by removing everything after and including the '@'
df_clean['Filename'] = df_clean['Filename'].apply(lambda x: x if '@' not in x else x.split('@')[1].split('.')[0].strip() + '.zip')

# Convert the 'Task ID' column from string to integer type
df_clean['Task ID'] = df_clean['Task ID'].astype(int)

df_clean

In [None]:
# Load malware index and update with Task ID
df_malware = pd.read_csv('malware.csv')

for filename in df_clean['Filename']:
    if df_malware[df_malware.file == filename].shape[0] != 1:
        print(f"Filename {filename} not found in malware.csv")
        continue
    
    df_malware.loc[df_malware['file'] == filename, 'cuckoo_id'] = df_clean.loc[df_clean['Filename'] == filename, 'Task ID'].values[0]

print("The number of empty cuckoo_id is", df_malware['cuckoo_id'].isnull().sum(), "\n\n")
df_malware

In [None]:
# Check for any extra files in the malware index
df_clean[~df_clean['Filename'].isin(df_malware['file'])]

In [None]:
# Persist the updated index: write df_malware to 'malware.csv' (the same path
# it was loaded from), leaving out the DataFrame's row index
df_malware.to_csv('malware.csv', index=False)

-----------

### Filtering the Missing APT Samples and the Reports to Download

In [None]:
# Load the malware index CSV
df = pd.read_csv('malware.csv')

# Filter records with a non-null cuckoo_id, sort by cuckoo_id, and convert it to int
filtered = df[df['cuckoo_id'].notnull()].sort_values(by='cuckoo_id')
filtered['cuckoo_id'] = filtered['cuckoo_id'].astype(int)

# Print the count of samples with cuckoo_id and remaining samples per APT group
print("The number of samples with cuckoo_id is: ", len(filtered))
print("The number of remaining samples per APT group is: ", df.shape[0] - filtered.shape[0])

# Check for and print the number of duplicates in cuckoo_id
print('Duplicates: ', filtered.duplicated(subset=['cuckoo_id']).sum(), '\n\n')

# Display the count of missing cuckoo_id values per APT group
print("------ Missing -------")
display(df[df['cuckoo_id'].isnull()].apt.value_counts().sort_index())

In [None]:
apt_group = 'Gorgon Group'
missing = df[df.apt == apt_group]
missing[missing.cuckoo_id.isnull()].file.values

In [None]:
import shutil

# remove the missing directory with file
if os.path.exists('missing'):
  shutil.rmtree('missing')

# create the missing directory
os.mkdir('missing')

In [None]:
malware_path = os.path.join(os.path.split(os.path.abspath('.'))[0], 'APTMalware', 'samples', apt_group)
# copy missing files
for file in missing[missing.cuckoo_id.isnull()].file.values:
    dest_path = os.path.join(os.path.abspath('.'), 'missing', )
    file_path = os.path.join(malware_path, file)
    os.system(f"cp '{file_path}' '{dest_path}'")

In [None]:
# Get the absolute path of the current directory
base_path = os.path.abspath('.')

# Initialize a list to store downloaded Cuckoo task IDs
downloaded_cuckoo_task = []

# Loop through files and folders in the 'cuckoo' directory
for filename in os.listdir(os.path.join(base_path, 'cuckoo')):
    # Check if the item is a folder and it contains files
    if os.path.isdir(os.path.join(base_path, 'cuckoo', filename)) and len(os.listdir(os.path.join(base_path, 'cuckoo', filename))) > 0:
        # If so, add the folder name to the list of downloaded tasks
        downloaded_cuckoo_task.append(filename)
    else:
        print(f"Folder/File {filename} is empty")

# Filter the DataFrame to find tasks that have a cuckoo_id but haven't been downloaded yet
df_to_download = filtered[~filtered.cuckoo_id.apply(lambda x: str(x)).isin(downloaded_cuckoo_task)]

# Save these task IDs to a CSV file
df_to_download.to_csv('cuckoo_task_ids_to_download.csv', index=False)

# Display the DataFrame of tasks to be downloaded
df_to_download

In [None]:
extra_files = [x for x in downloaded_cuckoo_task if x not in filtered.cuckoo_id.apply(lambda x: str(x)).values]

extra_files