# VirusTotal Retrohunt Stats
*This notebook uses the VirusTotal API to lookup data on file hashes and create plotly graphs*
### Part 1 - Fetching Data from VT 🌐

In [3]:
import requests, humanize, re, os
import ipywidgets as widgets
import plotly.express as px
from tqdm.notebook import tqdm

# Set the VirusTotal API key
api_key = os.getenv('VT_API') 

# counter for API requests
api_requests = 0

max_api_requests = widgets.IntText(value=100, description='Max VT API Requests:',  style= {'description_width': 'initial'})

hash_input = widgets.Textarea(placeholder="Enter your SHA256 hashes here")
hash_count_label = widgets.Label()

def extract_hashes(text):
    sha256_regex = re.compile(r"[a-fA-F0-9]{64}")
    sha256_hashes = sha256_regex.findall(text)
    sha256_count = len(sha256_hashes)
    hash_count_label.value = f"Number of SHA256 Hashes: {sha256_count}"
    return sha256_hashes

hash_input.observe(lambda change: extract_hashes(change["new"]), names="value")

def on_start_button_clicked(b):
    global api_requests
    global hashes, json_results, detection_percentages
    api_requests = 0

    # Extract the SHA256 hashes from the text input
    extracted_hashes = extract_hashes(hash_input.value.strip())
    if not extracted_hashes:
        print("No hashes found")
        return
    # Assign the extracted hashes to the sha1_hashes variable
    hashes = extracted_hashes
    if not hashes:
        print("No hashes entered")
        return

    detection_percentages = []
    json_results = []
    for file_hash in tqdm(hashes, desc="Requesting data from VT"):
        # check if the maximum number of API requests has been reached
        if api_requests >= max_api_requests.value:
            break
        url = "https://www.virustotal.com/api/v3/files/" + file_hash
        headers = {"accept": "application/json", "x-apikey": api_key}
        response = requests.get(url, headers=headers)
        api_requests += 1

        if response.status_code == 200:

            data = response.json()
            json_results.append(data)
            malicious_detections = data["data"]["attributes"]["last_analysis_stats"][
                "malicious"
            ]
            total_detections = sum(
                data["data"]["attributes"]["last_analysis_stats"].values()
            )
            detection_percentages.append(100 * malicious_detections / total_detections)
    print(f"Number of API requests sent: {api_requests}")

# Create a button to start the API requests
start_button = widgets.Button(description="Start")
start_button.on_click(on_start_button_clicked)

# Display the widgets
display(hash_input, hash_count_label, max_api_requests, start_button)


Textarea(value='', placeholder='Enter your SHA256 hashes here')

Label(value='')

IntText(value=100, description='Max VT API Requests:', style=DescriptionStyle(description_width='initial'))

Button(description='Start', style=ButtonStyle())

HBox(children=(FloatProgress(value=0.0, description='Requesting data from VT', max=55.0, style=ProgressStyle(d…


Number of API requests sent: 50


### Part 2 - Run Data Visualization 📊
This code section creates:
- Scatter graph of detection ratio to filesizes
- Bar graph of VT upload times
- Pie chart of file types

In [4]:
import plotly.graph_objs as go
from datetime import datetime
import pandas as pd


# Get human readable file sizes
filesizes = [x["data"]["attributes"]["size"] for x in json_results]
filesizes_hr = [humanize.naturalsize(x) for x in filesizes]

detection_ratios = [x for x in detection_percentages]

# Sort the filesizes, filesizes_hr, and detection_ratios lists by filesizes
filesizes, filesizes_hr, detection_ratios = zip(*sorted(zip(filesizes, filesizes_hr, detection_ratios)))

# Create the scatter plot
fig = px.scatter(
    y=filesizes_hr,
    x=detection_ratios,
    color=detection_ratios,
    title="Distribution of Detection Ratios",
    labels={"y": "File Size (bytes)", "x": "Detection Ratio (%)"},
    color_continuous_scale="Temps",
    template="plotly_dark"
)

fig.show()

# Extract the upload dates from the json_results list
upload_dates = [x["data"]["attributes"]["first_submission_date"] for x in json_results]

import datetime

# Convert UNIX timestamps to datetime objects
datetime_objects = [datetime.datetime.fromtimestamp(x["data"]["attributes"]["first_submission_date"]) for x in json_results]

# create the histogram
fig2 = go.Figure()
fig2.add_trace(go.Histogram(x=datetime_objects))
fig2.update_layout(xaxis_type="date",template="plotly_dark",title="File Upload Dates on VirusTotal")
fig2.show()

# create a new DataFrame with the relevant data
df = pd.DataFrame(json_results)
df = df['data'].apply(pd.Series)
df = df['attributes'].apply(pd.Series)
df = df.loc[:,['type_description']]

# calculate the count of files by type_description
counts = df['type_description'].value_counts()

# create the graph
data = [go.Pie(labels=counts.index, values=counts.values)]
layout = go.Layout(title='Distribution of file types',  template="plotly_dark")
fig3 = go.Figure(data=data, layout=layout)

fig3.show()



### Part 3 - Other Stats & Data 🧪
This code section creates:
- A table of matched sigma rules from VirusTotal

In [5]:
rule_counts = {}
for item in json_results:
    if "sigma_analysis_results" in item["data"]["attributes"]:
        rule_title = item["data"]["attributes"]["sigma_analysis_results"][0]["rule_title"]
        if rule_title in rule_counts:
            rule_counts[rule_title] += 1
        else:
            rule_counts[rule_title] = 1
    else:
        pass

if len(rule_counts) == 0:
    print("No sigma rule matches!")
else:
    rule_counts_df = pd.DataFrame(rule_counts.items(), columns={'Rule Title','Count'})

    display(rule_counts_df)

Unnamed: 0,Count,Rule Title
0,Creation of an Executable by an Executable,19
1,Windows Cmd Delete File,4
