## Ransomware family labelling using AVClass

In this script, we outline the procedure used in labelling the samples in Maraudermap into their respective ransomware families. 

In [9]:
# importing libraries
import numpy as np
import pandas as pd
import os
import time
from tqdm import tqdm
import hashlib
import json
import avclass
import subprocess
import csv
import re

### Converting VirusTotal (VT) JSON files to JSONL suitable for AVClass

In [2]:
def convert_json_to_jsonl_with_json_extension(input_file, output_file):
    # Read the JSON file
    with open(input_file, 'r') as f:
        data = json.load(f)

    # Handle different structures
    if isinstance(data, list):
        # Case 1: JSON Array of Objects
        json_objects = data
    elif isinstance(data, dict):
        # Case 2: Single JSON Object
        # If you want to convert a single object to a JSONL format
        json_objects = [data]
    else:
        raise ValueError("Unsupported JSON structure")

    # Write to JSONL file with .json extension
    with open(output_file, 'w') as f:
        for item in json_objects:
            json_line = json.dumps(item)
            f.write(json_line + '\n')

In [3]:
# Define input and output folders
input_folder = 'vt_reports_ms'
output_folder = 'vt_reports_ms_avclass'

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Iterate through each file in the input folder
for filename in os.listdir(input_folder):
    if filename.endswith('.json'):
        input_file_path = os.path.join(input_folder, filename)
        output_file_path = os.path.join(output_folder, filename)
        convert_json_to_jsonl_with_json_extension(input_file_path, output_file_path)
        print(f'Converted {filename}')

Converted 54d760b47de0204e3af3305993ddeb964a99fe24901799c05bb0b43d0835727f.json
Converted bc60a3ff601aa5c429c18ceeacd9b8a5bbdfb7b15513564f140326232851bdda.json
Converted 3b9a58b649dbfccfc04fe77b8b1da0b48e493dde87f7b5943b706cb9e6af7825.json
Converted 73144d0cb1a5d491356aafd30a757a4f33ef9e0f52e093828f37a07ea11976a5.json
Converted 95ce618a4a3cb57c8b935ca7fc0dac03251651aed97e5ff7dd8dd0efea4b786b.json
Converted fc31282f8ff4cf240ad9ee4b2ccaf91c34c7a17ed4dc39384f52c6d57e36aaeb.json
Converted 57f40bcce0cf935db322a72140d5dbf9bd0c520679d8377a89e2c76fe2aa8664.json
Converted 7b7c08182e5fa474d2e6c3391550fb38140d5454cbb4af84ede0846a9b9ffba5.json
Converted f8b2e41981e74cbaccbbba088d24093c8cc92d3f1fb602b5066155f3af762fe6.json
Converted 71456644f84dbb3d0a3747736b5b989c414386a9083bb0331aeaf13158a05b29.json
Converted 024f65ec84ab16ec7ee1eb93c65617a3d56b7af65de009d0c2a9df55431f73b6.json
Converted 43e007fcfc1fb402c7dfb5f4db2bf99ef9d040445582af618cf31b0b11f61039.json
Converted 257095a8dab1423e98003cc877403f

### AVClass for labelling

In [5]:
!avclass -f vt_reports_ms_avclass/5aca67a44a4092bbba99da7c4c16e471397067cc5c615335f3079236752b7be7.json

[-] Using tagging rules in /Users/faithfulonwuegbuche/anaconda3/lib/python3.11/site-packages/avclass/data/default.tagging
[-] Using taxonomy in /Users/faithfulonwuegbuche/anaconda3/lib/python3.11/site-packages/avclass/data/default.taxonomy
[-] Using expansion tags in /Users/faithfulonwuegbuche/anaconda3/lib/python3.11/site-packages/avclass/data/default.expansion
[-] Processing input file vt_reports_ms_avclass/5aca67a44a4092bbba99da7c4c16e471397067cc5c615335f3079236752b7be7.json (vt3)
a46e1a26d754908d6b701050fd2d78f3	berbew
[-] 1 reports read
[-] Samples: 1 NoScans: 0 NoTags: 0 GroundTruth: 0


In [46]:
def extract_info(input_folder):
    data = {
        'sha256': [],
        'md5': [],
        'ransomware_family': []
    }

    for filename in os.listdir(input_folder):
        if filename.endswith('.json'):
            file_path = os.path.join(input_folder, filename)
            # Run avclass command and capture its output
            result = subprocess.run(['avclass', '-f', file_path], capture_output=True, text=True).stdout.split('\n')
            
            # Extracting necessary information from the result list
            sha256 = os.path.splitext(filename)[0]
            md5, ransomware_family = result[-2].split('\t')  # Extract md5 and ransomware_family from the second last line
            
            # Append extracted information to data dictionary
            data['sha256'].append(sha256)
            data['md5'].append(md5)
            data['ransomware_family'].append(ransomware_family.strip())  # Strip whitespace from ransomware_family

    # Create a DataFrame with the collected data
    df = pd.DataFrame(data)
    return df

In [47]:
# Specify the input folder
input_folder = 'test'

# Extract information and create the DataFrame
df = extract_info(input_folder)

# Display the DataFrame
display(df)

Unnamed: 0,sha256,md5,ransomware_family
0,0a8bb770091b55f1b6c2afe412750a438c6f862870d44c...,420de2fc89d1e50138d55d2a5bb7f167,berbew
1,0a4d2758cae81d17f060e6d577557c4b2444cc38460f8c...,76898616082cdfc8f0f21a78cb2acde6,virlock
2,0a2d7f5355a328691dd2cbcbf58a4a1e67e9af8bcb4f26...,d05201e097960f87a79d177b655de183,lamer
3,0a0b6c1073ab6b8759d40a364b8552dd565c20a74f1250...,1a68c94f6b2d747e70c9a8624a5cb482,berbew
4,0a1f0111f4516001a002dba72405a30fc230a41b273b7c...,502226980cc43ae58921bb1b875bf2ca,virlock
5,0a4a295bec51a8fffe9335e5c7600dd63c359453ab17d9...,057017c7e9ce5af40f2812336546710b,berbew
6,0a6b929fbd2cdfe4ddfb16ced290b7a2b6d0b397c1cd97...,69ccca3b86f7782390fa3756e9adae3c,sivis


In [48]:
# Specify the input folder
input_folder = 'vt_reports_ms_avclass'

# Extract information and create the DataFrame
df = extract_info(input_folder)

# Display the DataFrame
display(df)

Unnamed: 0,sha256,md5,ransomware_family
0,54d760b47de0204e3af3305993ddeb964a99fe24901799...,715eb01e47b70ef98578723afe4c054d,lamer
1,bc60a3ff601aa5c429c18ceeacd9b8a5bbdfb7b1551356...,eda558767dd85d98461c23407cc04bf6,lamer
2,3b9a58b649dbfccfc04fe77b8b1da0b48e493dde87f7b5...,e68f08e8f8e274562d4e939a002524b9,sivis
3,73144d0cb1a5d491356aafd30a757a4f33ef9e0f52e093...,ed75ebbfd4b4bf6530486b95fed0468c,agentb
4,95ce618a4a3cb57c8b935ca7fc0dac03251651aed97e5f...,1ca838e77eee893841e3287ad226c7dd,lamer
...,...,...,...
7718,0145f04a8356780d52774ce5f7dd0a02f6d5b321694ed8...,eafe645b56c3f5cb746fb5f8504f6035,dalexis
7719,d389d062e1c314c050f99c49b00ba93e2679430bc38616...,b7187d53e4cf5165648e4d7e17585057,berbew
7720,fe4fe12f1edd8efcc0d0075caeacb5d1f7d16a00a2eb62...,9439f84ac4653086131b051c667649be,lamer
7721,faa706545212465c95b5f5e3b8b86be591d00d55601c0c...,18283ebe76fc046e236ee55e21e4f8c2,agenttesla


In [49]:
#df.to_csv("avclass_marauder.csv")

In [62]:
df_avclas_ma = pd.read_csv("avclass_marauder.csv")
df_avclas_ma.head()

Unnamed: 0.1,Unnamed: 0,sha256,md5,ransomware_family
0,0,54d760b47de0204e3af3305993ddeb964a99fe24901799...,715eb01e47b70ef98578723afe4c054d,lamer
1,1,bc60a3ff601aa5c429c18ceeacd9b8a5bbdfb7b1551356...,eda558767dd85d98461c23407cc04bf6,lamer
2,2,3b9a58b649dbfccfc04fe77b8b1da0b48e493dde87f7b5...,e68f08e8f8e274562d4e939a002524b9,sivis
3,3,73144d0cb1a5d491356aafd30a757a4f33ef9e0f52e093...,ed75ebbfd4b4bf6530486b95fed0468c,agentb
4,4,95ce618a4a3cb57c8b935ca7fc0dac03251651aed97e5f...,1ca838e77eee893841e3287ad226c7dd,lamer


In [63]:
df_avclas_ma.shape

(7723, 4)

In [65]:
df_fam = df.ransomware_family.value_counts()
df_fam

ransomware_family
berbew                                        2240
sivis                                          963
pajetbin                                       934
lamer                                          916
virlock                                        309
sfone                                          228
lmir                                           139
memery                                         133
gator                                          111
vobfus                                         106
ctsinf                                          90
shodi                                           72
mbrlock                                         66
wacatac                                         62
viking                                          56
crypmodadv                                      48
lebreat                                         46
salgorea                                        41
fsysna                                          40
stihat       

In [52]:
#df_fam.to_csv("avclass_marauder_family_value_counts.csv")

In [55]:
df_ma_ha = pd.read_csv("maurander_ha_full_info.csv")
df_ma_ha.head()

Unnamed: 0.1,Unnamed: 0,md5,sha1,sha256,av_detect,vx_family,classification_tags,tags,filename,verdict,threat_level,type,type_short
0,0,326946e1006d74c6fc55a64f5d33d458,97c61c01551a3908ab095e54464fe1549bb733d6,cb2d9eb6446cc3a16411044eecd5e2a18b0577dfa8e019...,87.0,Trojan.Generic,[],[],file,malicious,2.0,"PE32 executable (GUI) Intel 80386, for MS Windows","['peexe', 'executable']"
1,1,331ba167011b13c59fa38e694a8e0d6b,bed95ea1f1857a4c92a2e050291bb80b371a61a6,dc2e4666b4c27d4666be4f5f67edf89d34c90285ce8212...,77.0,Trojan.Generic,[],[],file,malicious,2.0,PE32 executable (GUI) Intel 80386 (stripped to...,"['peexe', 'executable']"
2,2,9fd0f55aa5246a42e8139df81fd17c80,8b19a64ded6e8445360184eba728c164353455d7,24f62ada2fe3cf7060a3c4527dd625a71e8deb4a2166a1...,91.0,Trojan.Generic,[],[],file,malicious,2.0,"PE32 executable (GUI) Intel 80386, for MS Windows","['peexe', 'executable']"
3,3,02c84765b7ea27115f0593001fcc9b66,affc5640f34e59ff1ac7ca4f88dcdc7d9baafde6,0800b479846c066f3e8aeba3349160b216be91a54107f3...,86.0,Adware.Gator,[],[],file,malicious,2.0,"PE32 executable (GUI) Intel 80386, for MS Windows","['peexe', 'executable']"
4,4,9b98aa208a7988ed87b49856705bcc77,9a8d087ce744a3ff9f838c2d3faf24ff37f6b0c3,393b4b291b9337859e7e172901004991bc8bbfc1f8cd27...,100.0,Win/malicious_confidence_100%,[],[],bounty-66098750164683636,malicious,2.0,"PE32 executable (GUI) Intel 80386, for MS Wind...","['peexe', 'executable']"


In [61]:
df_ma_ha.shape

(4677, 13)

In [57]:
# Set the display option to show all rows
pd.set_option('display.max_rows', None)


df_ma_ha.vx_family.value_counts()

vx_family
Trojan.Generic                               2602
Worm.AutoRun                                  349
Virlock.Generic                               177
Trojan.Agent                                  172
Trojan.Heur.Generic                           148
Win/malicious_confidence_100%                  90
worm.Generic                                   70
Trojan.FileInfector.Generic                    61
Virus.Syphilis                                 59
Win/grayware_confidence_100%                   58
Malware.Generic                                58
Trojan.Small                                   49
Agent.CP worm                                  45
Malware                                        39
Symmi.Generic                                  38
Fugrafa.Generic                                37
HLLP.Shodi                                     34
Memery.A virus                                 30
Zusy.Generic                                   27
Worm.Picsys                             

In [58]:
df_ma_ha.filename.value_counts()

filename
file                                                                                                                                4494
locker.exe                                                                                                                             2
Project1.exe                                                                                                                           2
bounty-92302687923477047                                                                                                               1
FreePass.exe                                                                                                                           1
bounty-97198928958824959                                                                                                               1
bounty-7287706093586034                                                                                                                1
0e66029132a885143b87b1e49e32663a

In [59]:
df_ma_ha.classification_tags.value_counts()

classification_tags
[]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            4592
['ransomware']                                                                                                                                                                                                                                                                                                                                                   

In [60]:
df_ma_ha.tags.value_counts()

tags
[]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     4588
['ransomware']                                                                                                                                                                                                                                                                                                                         