In [1]:
import pandas as pd
import zipfile
from pathlib import Path
import json
# Notebook-wide pandas display settings, applied in a single call:
#   - show at most 50 rows when a frame is rendered
#   - render floats with a thousands separator and two decimals
pd.set_option(
    "display.max_rows", 50,
    "display.float_format", "{:,.2f}".format,
)

In [2]:
# Input files. All three are relative paths, so they are resolved against the
# notebook's working directory.

# Zip archive that is extracted before the CSV below is read.
DF_INDEX_ZIP_FILE_PATH = Path("index.zip")
# CSV that is loaded into `index_df`; its parent directory is also the
# extraction target for the archive. Presumably this file comes out of the
# archive above - TODO confirm.
DF_INDEX_CSV_FILE_PATH = Path("index.csv")
# JSON file loaded into `mitre_labels` (label values keyed by index column name).
MITRE_LABELS_JSON_PATH = Path("MITRE_labels.json")

In [3]:
# Unpack every member of the archive into the directory where the CSV is
# expected, so the next cell can read it from DF_INDEX_CSV_FILE_PATH.
with zipfile.ZipFile(DF_INDEX_ZIP_FILE_PATH, mode='r') as index_archive:
    index_archive.extractall(path=DF_INDEX_CSV_FILE_PATH.parent)


In [4]:
# Read the index CSV into memory; the bare name on the last line makes the
# notebook render the (truncated) frame as the cell output.
index_df = pd.read_csv(filepath_or_buffer=DF_INDEX_CSV_FILE_PATH)
index_df

Unnamed: 0.1,Unnamed: 0,git_repo,year,filename,group_name,software_name,technique_name,tactic
0,0,APT_CyberCriminal_Campagin_Collections,2018,MuddyWater_Middle_East_and_Central_Asia.pdf.txt,[],[],[],[]
1,1,APT_CyberCriminal_Campagin_Collections,2018,MuddyWater_Middle_East_and_Central_Asia.pdf.txt,[],[],[],[]
2,2,APT_CyberCriminal_Campagin_Collections,2018,MuddyWater_Middle_East_and_Central_Asia.pdf.txt,[],[],[],[]
3,3,APT_CyberCriminal_Campagin_Collections,2018,MuddyWater_Middle_East_and_Central_Asia.pdf.txt,[],[],[],[]
4,4,APT_CyberCriminal_Campagin_Collections,2018,MuddyWater_Middle_East_and_Central_Asia.pdf.txt,[],[],[],[]
...,...,...,...,...,...,...,...,...
657719,657719,APT_CyberCriminal_Campagin_Collections,2008,army-bans-usb-d.pdf.txt,[],[],[],[]
657720,657720,APT_CyberCriminal_Campagin_Collections,2008,army-bans-usb-d.pdf.txt,[],[],[],[]
657721,657721,APT_CyberCriminal_Campagin_Collections,2008,army-bans-usb-d.pdf.txt,[],[],[],[]
657722,657722,APT_CyberCriminal_Campagin_Collections,2008,army-bans-usb-d.pdf.txt,[],[],[],[]


In [5]:
# Columns of index_df that are inspected for annotations.
labels = ["group_name", "software_name", "technique_name", "tactic"]

print(f"Total lines {index_df.shape[0]}")
for label in labels:
    # A cell equal to str([]) (i.e. the text "[]") is an empty annotation list;
    # count the rows whose cell is anything else.
    has_label = index_df[label].ne(str([]))
    print(f"lines with {label}: {int(has_label.sum())}")

Total lines 657724
lines with group_name: 11132
lines with software_name: 16492
lines with technique_name: 8132
lines with tactic: 8474


In [6]:
# Number of index lines per distinct value of the `year` column, ordered by year.
# The counts are computed once and reused (the frame is large, and the previous
# version ran value_counts() + sort_index() three separate times).
year_counts = index_df["year"].value_counts().sort_index()

# Build the summary in a single constructor call instead of growing an empty
# frame column by column.
year_df = pd.DataFrame(
    {
        "year": year_counts.index,
        "count": year_counts.values,
        # NOTE: despite the column name this is a fraction in [0, 1], not a
        # percentage; it is the same value value_counts(normalize=True) returns
        # (each count divided by the total count).
        "%": (year_counts / year_counts.sum()).values,
    }
)

year_df

Unnamed: 0,year,count,%
0,2006,251,0.0
1,2008,4725,0.01
2,2009,5916,0.01
3,2010,10706,0.02
4,2011,7948,0.01
5,2012,17644,0.03
6,2013,58956,0.09
7,2014,72915,0.11
8,2015,74365,0.11
9,2016,94384,0.14


In [7]:
# Load the label values to look for. The JSON is indexed by the name of the
# index_df column that each list of label values is searched in.
with Path(MITRE_LABELS_JSON_PATH).open("r", encoding="utf8") as f:
    mitre_labels = json.load(f)


def check_if_label_exists(label, line):
    """Return True if `label` occurs in `line` (plain `in` test), else False.

    Args:
        label: value to look for.
        line: container or string that is searched.

    Returns:
        bool: result of `label in line`.
    """
    return label in line


# Build one boolean flag column per label value.
#  - regex=False: label names are literal text. With the default regex=True a
#    name containing characters such as '.', '+', '(' or '[' would be
#    interpreted as a pattern and could mis-match or raise.
#  - na=False: a missing cell yields False rather than NaN, so every flag
#    column is a plain bool column.
# This is still a substring test on the cell text, so a short label also
# matches inside a longer name that contains it.
values_list = []
label_flags = {}
for key in mitre_labels:
    for label_value in mitre_labels[key]:
        values_list.append(label_value)
        label_flags[label_value] = index_df[key].str.contains(
            label_value, regex=False, na=False
        )

# Attach all flag columns in one concat: inserting hundreds of columns one at a
# time fragments the frame and makes pandas emit a PerformanceWarning.
# Assumes no label value equals the name of an existing index_df column
# (a clash would produce a duplicate column name) - TODO confirm.
verbose_index_df = pd.concat(
    [index_df, pd.DataFrame(label_flags, index=index_df.index)],
    axis=1,
)
del label_flags  # the per-label Series now live in verbose_index_df

verbose_index_df

Unnamed: 0.1,Unnamed: 0,git_repo,year,filename,group_name,software_name,technique_name,tactic,Energetic Bear,Chinastrats,...,Defense Evasion,Collection,Credential Access,Execution,Lateral Movement,Discovery,Command and Control,Persistence,Exfiltration,Privilege Escalation
0,0,APT_CyberCriminal_Campagin_Collections,2018,MuddyWater_Middle_East_and_Central_Asia.pdf.txt,[],[],[],[],False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,APT_CyberCriminal_Campagin_Collections,2018,MuddyWater_Middle_East_and_Central_Asia.pdf.txt,[],[],[],[],False,False,...,False,False,False,False,False,False,False,False,False,False
2,2,APT_CyberCriminal_Campagin_Collections,2018,MuddyWater_Middle_East_and_Central_Asia.pdf.txt,[],[],[],[],False,False,...,False,False,False,False,False,False,False,False,False,False
3,3,APT_CyberCriminal_Campagin_Collections,2018,MuddyWater_Middle_East_and_Central_Asia.pdf.txt,[],[],[],[],False,False,...,False,False,False,False,False,False,False,False,False,False
4,4,APT_CyberCriminal_Campagin_Collections,2018,MuddyWater_Middle_East_and_Central_Asia.pdf.txt,[],[],[],[],False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
657719,657719,APT_CyberCriminal_Campagin_Collections,2008,army-bans-usb-d.pdf.txt,[],[],[],[],False,False,...,False,False,False,False,False,False,False,False,False,False
657720,657720,APT_CyberCriminal_Campagin_Collections,2008,army-bans-usb-d.pdf.txt,[],[],[],[],False,False,...,False,False,False,False,False,False,False,False,False,False
657721,657721,APT_CyberCriminal_Campagin_Collections,2008,army-bans-usb-d.pdf.txt,[],[],[],[],False,False,...,False,False,False,False,False,False,False,False,False,False
657722,657722,APT_CyberCriminal_Campagin_Collections,2008,army-bans-usb-d.pdf.txt,[],[],[],[],False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
print("verbose_index_df.shape", verbose_index_df.shape)

# For every label value, the number of lines whose flag column is True.
# Each total is kept in a one-element list so the dict converts directly into
# a one-row frame (labels as columns) that is then transposed for display.
value_counter = {
    label_value: [verbose_index_df[label_value].sum()]
    for label_value in values_list
}

counts_table = pd.DataFrame(value_counter).T
# Temporarily raise the row limit so the whole per-label table is rendered.
with pd.option_context('display.max_rows', 500, 'display.max_columns', 10):
    display(counts_table)


verbose_index_df.shape (657724, 476)


Unnamed: 0,0
Energetic Bear,49
Chinastrats,4
Gamaredon Group,76
HIDDEN COBRA,34
MONSOON,79
Fancy Bear,57
Stone Panda,21
Buckeye,70
Threat Group-1314,0
Poseidon Group,20
