#### Importing Libraries

In [1]:
# Import necessary libraries
import os
import json
import requests
import pandas as pd

#### Setting up Paths and API Keys

In [2]:
# Establish base and target directories for storing results
base_path = os.path.dirname(os.path.realpath('__file__'))
results_path = os.path.join(base_path, 'virustotal_behaviour')

# List of API keys for accessing the VirusTotal API
API_KEYS = [
  'e865dc68f3a3459c08e97aea5bf20347c757261a1346101b8116cb72b691ff65',
  'ca6dd5a9b76a2b274d7750f24942690539c957847b071d20f57033ffb5010edd'
]

API_KEY = API_KEYS[1]

#### Data Preparation

In [3]:
# Read the sample overview from 'overview.csv' and keep only the rows whose
# 'Filetype' value is present (rows with a missing 'Filetype' are dropped)
df_apt = pd.read_csv('overview.csv')
df = df_apt.loc[df_apt['Filetype'].notna()]

df

Unnamed: 0,ID,Country,APT-group,Family,Status,MD5,SHA1,SHA256,Source,Filetype
0,1,China,APT 1,,V,001dd76872d80801692ff942308c64e6,7f4d6745b9053583b55b87bb16a88840e56e5621,b6bc96ffcbdbf22f908a02b8fe2a392c8e5b8420a12093...,https://www.mandiant.com/sites/default/files/2...,Win32 EXE
1,2,China,APT 1,,V,002325a0a67fded0381b5648d7fe9b8e,ee4c025731e791fb358f5f03e9d95fc86ee0a723,3bff207897f6d8cd8f8e178a565d5efdd7d65c6bc27063...,https://www.mandiant.com/sites/default/files/2...,Win32 EXE
2,3,China,APT 1,,V,00dbb9e1c09dbdafb360f3163ba5a3de,8f096561d4021978f781cc3978a55d0f621fc837,df649cef1505653a2fc4361f1e2e34f7148b60c00e1cf6...,https://www.mandiant.com/sites/default/files/2...,Win32 EXE
6,7,China,APT 1,,V,0149b7bd7218aab4e257d28469fddb0d,f89bf65f696f27c0504882558d40c8c7e868e269,13e40ee7c6874e2f1ed58bc09738a5525f86361f1a8538...,https://www.mandiant.com/sites/default/files/2...,Win32 EXE
8,9,China,APT 1,,V,01e0dc079d4e33d8edd050c4900818da,cff0b920bb2aeda46f8635936d1a5119b681f9a5,3eaa53ec7fe5fb610bb9a0dd3d0b93480869b829aac373...,https://www.mandiant.com/sites/default/files/2...,Win32 EXE
...,...,...,...,...,...,...,...,...,...,...
4443,4445,China,Winnti,,V,f4c9bc4f045b90c496df4b75398dfa5c,4941e08d30e987104e5508f2c983054b6779ee51,0d2c333e089fd28fda0060e8c1ab910e4a91d4225823e3...,https://media.kasperskycontenthub.com/wp-conte...,Win32 DLL
4444,4446,China,Winnti,,V,04f3fbaaaf5026df29e0d7d317194043,7dccaaefddd0773fdf6b09c0853e11a0414d79f1,c3bb9d1f748d0b1b78dc525039c7a2ebab610a4f0ab7f4...,https://media.kasperskycontenthub.com/wp-conte...,Win32 EXE
4445,4447,China,Winnti,,V,07e40089cdf338e8d1423b3d97332a4d,fdb350809a06501f8bd5917560f2891fd58f9714,2c7321d8b9905d4b8671c9e16c14665cd29f88460f0c8e...,https://media.kasperskycontenthub.com/wp-conte...,Win32 EXE
4446,4448,China,Winnti,,V,0b105cd6ecdfe5724c7db52135aa47ef,871138ae639830fa75b094767e1787e14cf19eac,c465238c9da9c5ea5994fe9faf1b5835767210132db0ce...,https://media.kasperskycontenthub.com/wp-conte...,Win32 EXE


In [4]:
# Per-column count of missing (NaN) entries in the filtered DataFrame
df.isna().sum(axis=0)

ID              0
Country         0
APT-group       0
Family       1180
Status          0
MD5             0
SHA1            0
SHA256          0
Source          0
Filetype        0
dtype: int64

In [5]:
# How many samples there are of each file type
df['Filetype'].value_counts()

Filetype
Win32 EXE                      1841
Win32 DLL                      1024
Rich Text Format                402
Office Open XML Document        192
Office Open XML Spreadsheet      30
MS Excel Spreadsheet             23
PDF                              19
RAR                              13
unknown                          10
MS Word Document                 10
ZIP                               7
ELF                               4
Powershell                        3
Mach-O                            3
VBA                               2
DOS EXE                           2
Flash                             2
JavaScript                        1
Windows shortcut                  1
Outlook                           1
MS PowerPoint Presentation        1
PostScript                        1
JAR                               1
Name: count, dtype: int64

------

#### List all the reports obtained

In [6]:
# Collect the identifiers of every report already saved on disk. Reports are
# '.json' files stored in sub-folders of the results directory.
obtained_reports = set()

# On a fresh checkout the results directory does not exist yet; treat that as
# "no reports obtained" instead of letting os.listdir raise FileNotFoundError.
group_folders = os.listdir(results_path) if os.path.isdir(results_path) else []

for folder_name in group_folders:
  folder_path = os.path.join(results_path, folder_name)
  # Loose files directly under the results directory are not report folders
  if not os.path.isdir(folder_path):
    continue

  for file_name in os.listdir(folder_path):
    if file_name.endswith('.json'):
      # The report identifier is the file name up to its first dot
      obtained_reports.add(file_name.split('.')[0])

# Display the total number of unique reports obtained
len(obtained_reports)

3593

----

#### Download reports

In [7]:
# Select the samples that still lack a report: a row is kept only when neither
# its SHA256 nor its MD5 matches the name of a report already on disk
df_filtered = df[
  ~(df['SHA256'].isin(obtained_reports) | df['MD5'].isin(obtained_reports))
]

# Show the samples that remain to be downloaded
df_filtered

Unnamed: 0,ID,Country,APT-group,Family,Status,MD5,SHA1,SHA256,Source,Filetype


In [8]:

# Download the behaviour summary of every sample that has no report yet.
#
# Each sample is identified by its SHA256, or by its MD5 when the SHA256 is
# missing. A 404 answer is recorded under 'not_found' in error.json; any other
# answer whose JSON contains an 'error' key is printed and stops the loop;
# everything else is saved as '<APT-group>/<hash>.json' in the results folder.

# Seconds to wait for VirusTotal; without a timeout a stalled connection
# would block the notebook forever.
REQUEST_TIMEOUT = 60

error_log_path = os.path.join(results_path, 'error.json')

# The headers are identical for every request, so build them once
headers = {
  "accept": "application/json",
  'x-apikey': API_KEY
}

for _, row in df_filtered.iterrows():
  folder_name = row['APT-group']
  file_name = row['MD5'] if pd.isna(row['SHA256']) else row['SHA256']

  print(file_name)
  url = "https://www.virustotal.com/api/v3/files/{id}/behaviour_summary".format(id=file_name)

  # Send HTTP GET request to fetch the report
  response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)

  # Handle missing files by logging them
  if response.status_code == 404:
    # error.json does not exist before the first miss: start from an empty log
    if os.path.exists(error_log_path):
      with open(error_log_path) as f:
        data = json.load(f)
    else:
      data = {}

    # Record the sample hash (previously the APT-group name was stored, which
    # does not identify which sample was missing)
    not_found = data.setdefault('not_found', [])
    if file_name not in not_found:
      not_found.append(file_name)

    os.makedirs(results_path, exist_ok=True)
    with open(error_log_path, 'w') as f:
      json.dump(data, f, indent=4)
    continue

  # Parse the body once; a non-JSON body is reported like any other API error
  try:
    report = response.json()
  except ValueError:
    print('Non-JSON response (HTTP {}) for {}'.format(response.status_code, file_name))
    break

  # Stop at the first API error so it can be inspected before retrying
  if 'error' in report:
    print(report)
    break

  folder_path = os.path.join(results_path, folder_name)
  os.makedirs(folder_path, exist_ok=True)

  file_path = os.path.join(folder_path, file_name + '.json')
  with open(file_path, 'w') as f:
    json.dump(report, f, indent=4)

------

#### Remove empty files

In [9]:
# Remove report files that are empty or that only hold an API error, so they
# no longer count as obtained reports the next time the notebook is run.

# Nothing to clean when the results directory has not been created yet
group_folders = os.listdir(results_path) if os.path.isdir(results_path) else []

for folder_name in group_folders:
  folder_path = os.path.join(results_path, folder_name)
  if not os.path.isdir(folder_path):
    continue

  for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)

    # Only '.json' report files are inspected; other entries (stray files,
    # sub-directories) are left untouched instead of crashing json.load
    if not file_name.endswith('.json') or not os.path.isfile(file_path):
      continue

    # A zero-byte file cannot be parsed and holds no report: remove it
    if os.path.getsize(file_path) == 0:
      os.remove(file_path)
      continue

    try:
      with open(file_path) as f:
        data = json.load(f)
    except ValueError:
      # Non-empty but unparsable: report it and keep it for manual inspection
      print('Skipping unreadable report: {}'.format(file_path))
      continue

    # An error answer is a JSON object carrying an 'error' key
    if isinstance(data, dict) and 'error' in data:
      os.remove(file_path)