In [43]:
import os
import re
from langdetect import detect
import shutil
import pandas as pd

In [None]:
def read_report(filepath, filename):
    """Return the whole text of ``filename`` located in directory ``filepath``.

    The file is opened in text mode and decoded as UTF-8.
    """
    full_path = os.path.join(filepath, filename)
    with open(full_path, mode="r", encoding="utf-8") as handle:
        return handle.read()

def write_report(filepath, filename, data):
    """Write ``data`` as UTF-8 text to ``filename`` in directory ``filepath``.

    An existing file with the same name is overwritten.
    """
    target = os.path.join(filepath, filename)
    with open(target, mode="w", encoding="utf-8") as handle:
        handle.write(data)

def find_failed_reports(data_path, failure_message):
    """Collect every reference file whose text contains ``failure_message``.

    ``data_path`` is scanned two levels deep: each entry is treated as a
    folder and each entry inside it as a reference file that is read as text.

    Args:
        data_path: Directory containing one sub-folder per report.
        failure_message: Substring that marks a reference as failed.

    Returns:
        A tuple ``(failed, failed_urls)`` of two parallel lists. ``failed``
        holds ``"<folder>-<reference file name>"`` identifiers; ``failed_urls``
        holds the first ``http(s)://`` URL found in the failed reference, or
        ``None`` when the reference contains no URL.
    """
    failed = []
    failed_urls = []
    for folder in os.listdir(data_path):
        folderpath = os.path.join(data_path, folder)
        for ref in os.listdir(folderpath):
            report = read_report(folderpath, ref)
            if failure_message not in report:
                continue
            # Search for the URL only in failed references and do not assume
            # one exists: indexing an empty findall() result used to abort the
            # whole scan with an IndexError.
            url_match = re.search(r'https?://\S+', report)
            failed.append(folder + "-" + str(ref))
            failed_urls.append(url_match.group(0) if url_match else None)
    return failed, failed_urls

def _contains_language(sentences, language):
    """Return True as soon as one sentence is detected as ``language``."""
    for sentence in sentences:
        try:
            if detect(sentence) == language:
                return True
        except Exception:
            # detect() can raise on fragments it cannot classify (for example
            # text without letters). Skip that fragment only; stopping here
            # would hide every later sentence of the report.
            continue
    return False


def seperate_non_english_reports(raw_data_path, language):
    """Copy report folders that contain text in ``language`` into a language folder.

    Every reference file of every report folder under ``raw_data_path`` is
    split into sentences and each sentence is passed to ``detect``. A folder is
    selected when at least one sentence is detected as ``language``. Selected
    folders are copied (the originals stay in place) to
    ``<raw_data_path>/<language folder>/<folder>``.

    Args:
        raw_data_path: Directory containing one sub-folder per report.
        language: Language code compared with the result of ``detect``.
            ``"ko"`` and ``"ru"`` are stored under ``"korean"`` and
            ``"russian"``; any other code is used as the folder name as is.
    """
    destination_name = {"ko": "korean", "ru": "russian"}.get(language, language)
    # Folders holding per-language copies are not reports and are not scanned.
    language_folders = {"korean", "russian", destination_name}

    folders_to_be_copied = []
    for folder in os.listdir(raw_data_path):
        if folder in language_folders:
            continue
        folderpath = os.path.join(raw_data_path, folder)
        if not os.path.isdir(folderpath):
            continue
        for ref in os.listdir(folderpath):
            if not os.path.isfile(os.path.join(folderpath, ref)):
                continue
            report = read_report(folderpath, ref)
            # Split text into sentences on whitespace that follows . ! or ?
            sentences = re.split(r'(?<=[.!?]) +', report)
            if _contains_language(sentences, language):
                folders_to_be_copied.append(folder)
                break  # one matching reference is enough for this folder

    if not folders_to_be_copied:
        return

    destination_root = os.path.join(raw_data_path, destination_name)
    # The destination may already exist from a previous run.
    os.makedirs(destination_root, exist_ok=True)

    for folder in folders_to_be_copied:
        destination = os.path.join(destination_root, folder)
        if os.path.exists(destination):
            # copytree() refuses to write into an existing directory.
            continue
        shutil.copytree(os.path.join(raw_data_path, folder), destination)
        
def move_to_failed(data_path, report_type, failure_message):
    """Move report folders without any valid reference to ``failed/<report_type>``.

    A reference is valid when its text does not contain ``failure_message``.
    A folder is moved when none of its references is valid, which includes a
    folder with no references at all. Moving is done as copy-then-delete.

    Args:
        data_path: Directory containing the ``<report_type>`` sub-directory;
            ``failed/<report_type>`` is created inside it when missing.
        report_type: Name of the sub-directory to scan.
        failure_message: Substring that marks a reference as failed.

    Raises:
        Exception: If ``report_type`` is empty or ``None``.
    """
    # Validate before touching the filesystem. Previously the directories were
    # created first, and a None report_type failed inside os.path.join() before
    # this check could run.
    if not report_type:
        raise Exception("Provide a correct report_type ('raw' or 'alienvault')")

    data_source_path = os.path.join(data_path, report_type)
    failed_dest_path = os.path.join(data_path, "failed", report_type)
    # Creates both "failed" and "failed/<report_type>" when they are missing.
    os.makedirs(failed_dest_path, exist_ok=True)

    for report_name in os.listdir(data_source_path):
        folderpath = os.path.join(data_source_path, report_name)
        # Keep only valid reports: one reference without the failure message
        # is enough, so reading stops at the first valid one.
        has_valid_ref = any(
            failure_message not in read_report(folderpath, ref)
            for ref in os.listdir(folderpath)
        )
        if has_valid_ref:
            continue

        shutil.copytree(src=folderpath,
                        dst=os.path.join(failed_dest_path, report_name))
        shutil.rmtree(folderpath)

def combine_results(formatted_data_path, failure_message,
                    non_english_languages_in_reports=("korean", "russian")):
    """Merge the valid references of each report folder into one ``report`` file.

    For every folder under ``formatted_data_path`` the reference files are read
    in sorted file-name order. References containing ``failure_message`` are
    dropped, and everything from ``"# Image URLs #"`` onwards is cut from the
    remaining ones. The concatenation is written to ``<folder>/report``.

    Args:
        formatted_data_path: Directory containing one sub-folder per report.
        failure_message: Substring that marks a reference as failed.
        non_english_languages_in_reports: Folder names to skip. Defaults to
            ``("korean", "russian")`` so the function can be called with two
            arguments.
    """
    # Iterate over raw reports
    for report_name in os.listdir(formatted_data_path):
        # Compare the folder *name*; the path variable is not assigned yet at
        # this point and reading it raised UnboundLocalError.
        if report_name in non_english_languages_in_reports:
            continue
        folderpath = os.path.join(formatted_data_path, report_name)
        sections = []
        # Iterate over refs of reports
        for filename in sorted(os.listdir(folderpath)):
            if filename == "report":
                # Output of a previous run; reading it back would duplicate
                # the content on every re-run.
                continue
            ref = read_report(folderpath, filename)
            if failure_message in ref:
                continue
            # split() returns the whole text when the marker is absent.
            sections.append(ref.split("# Image URLs #")[0])
        write_report(folderpath, "report", "".join(sections))

def split_by_language(path):
    """Copy every report folder that has no korean/russian copy into ``<path>/english``.

    A folder counts as non-English when a folder with the same name exists
    under ``<path>/korean`` or ``<path>/russian``. Either of those directories
    may be missing. Folders already present under ``english`` are left
    untouched, so the function can be re-run.

    Args:
        path: Directory containing the report folders and, optionally, the
            ``english``, ``korean`` and ``russian`` folders.
    """
    english_path = os.path.join(path, "english")
    os.makedirs(english_path, exist_ok=True)

    # List the non-English folders once instead of on every iteration.
    non_english = set()
    for language in ("korean", "russian"):
        language_path = os.path.join(path, language)
        if os.path.isdir(language_path):
            non_english.update(os.listdir(language_path))

    for folder in os.listdir(path):
        if folder in ("english", "korean", "russian") or folder in non_english:
            continue
        source = os.path.join(path, folder)
        destination = os.path.join(english_path, folder)
        # copytree() needs a directory source and a destination that does not
        # exist yet.
        if not os.path.isdir(source) or os.path.exists(destination):
            continue
        shutil.copytree(source, destination)

In [45]:
# Root of the formatted reports, its two per-source sub-directories, the text
# used to recognise failed references, and the language folder names.
data_path = "C:/Users/AIAS/Documents/cti-model-training/data/reports_formatted"
raw_data_path, alienvault_data_path = (
    os.path.join(data_path, source) for source in ("raw", "alienvault")
)
failure_message = "REPORT IS NOT EXTRACTED! Reason caused the failure:"
non_english_languages_in_reports = ["korean", "russian"]

In [None]:
# failed_av_list, failed_av_urls = find_failed_reports(alienvault_data_path, failure_message)
# failed_raw_list, failed_raw_urls = find_failed_reports(raw_data_path, failure_message)

In [84]:
# print(f"Failed alienvault references: {len(failed_av_list)}")
# print(f"Failed raw references: {len(failed_raw_list)}")

In [46]:
# Save failed in a new folder
# Failed raw reports are those with no reference at all
# (one-off step, disabled; switch the guard to True to run it)

if False:
    for report_type in ("raw", "alienvault"):
        move_to_failed(data_path=data_path,
                       report_type=report_type,
                       failure_message=failure_message)

In [47]:
# Merge the valid references of every raw report into one "report" file.
# One-off step, disabled; switch the guard to True to run it.
if False:
    # combine_results() is declared with three parameters; the language
    # folders to skip were missing from this call.
    combine_results(raw_data_path, failure_message, non_english_languages_in_reports)

In [48]:
# Delete failed alienvault references
# Needs failed_av_list from find_failed_reports(alienvault_data_path, failure_message).
if False:
    # Entries look like "<folder>-<ref file>", so one folder can appear several
    # times; delete each folder once, otherwise the second rmtree() fails on
    # an already removed path.
    for folder in sorted({f.split("-ref")[0] for f in failed_av_list}):
        shutil.rmtree(os.path.join(alienvault_data_path, folder))

In [49]:
# Delete failed raw references
# Needs failed_raw_list from find_failed_reports(raw_data_path, failure_message).
if False:
    # Entries look like "<folder>-<ref file>"; delete each folder only once.
    for folder in sorted({f.split("-ref")[0] for f in failed_raw_list}):
        try:
            shutil.rmtree(os.path.join(raw_data_path, folder))
        except OSError as err:
            # Still best-effort, but report what could not be removed instead
            # of swallowing every exception silently.
            print(f"Could not delete {folder}: {err}")

In [None]:
# Separate reports by language: copy folders with Korean or Russian text into
# their language folder (one-off step, disabled).
if False:
    for language_code in ("ko", "ru"):
        seperate_non_english_reports(raw_data_path, language=language_code)

In [None]:
# Copy every raw report folder that has no korean/russian copy into "english".
# One-off step, disabled; switch the guard to True to run it.
if False:
    split_by_language(raw_data_path)

In [56]:
# Scratch arithmetic; the notebook does not say what the three numbers are.
# NOTE(review): presumably per-language report folder counts - confirm or delete this cell.
48+17+2275

2340

In [57]:
def load_reports_in_dict(path):
    """Read every file in ``path`` into a dict.

    Keys are the file names cut at their first ``".txt"``; values are the
    file contents.
    """
    return {
        name.split(".txt")[0]: read_report(path, name)
        for name in os.listdir(path)
    }

# Reports from the data/reports directory, keyed by file name cut at ".txt".
data = load_reports_in_dict("C:/Users/AIAS/Documents/cti-model-training/data/reports")

In [58]:
def load_formatted_reports_in_dict(path):
    """Read the ``report`` file of every folder in ``path`` into a dict.

    Keys are the folder names; values are the contents of ``<folder>/report``.
    """
    return {
        foldername: read_report(os.path.join(path, foldername), "report")
        for foldername in os.listdir(path)
    }

# Combined "report" file of every folder under reports_formatted/raw/english, keyed by folder name.
formatted_data = load_formatted_reports_in_dict("C:/Users/AIAS/Documents/cti-model-training/data/reports_formatted/raw/english")


In [59]:
# Analyze images
# For every report ID present in both dicts, count the image placeholders of
# the formatted text and the <image>...</image> blocks of the other version.
report_names = pd.read_csv("C:/Users/AIAS/Documents/cti-model-training/data/opencti_reports_external_references.csv")["ID"].tolist()
img_analysis_dict = {"report": [], "cnt_formatted_imgs": [], "cnt_imgs": []}
for rn in report_names:
    if rn not in formatted_data or rn not in data:
        continue

    formatted_report, report = formatted_data[rn], data[rn]

    formatted_placeholders = re.findall(r'<!-- image -->', formatted_report, re.DOTALL)
    image_blocks = re.findall(r'<image>(.*?)</image>', report, re.DOTALL)
    cnt_formatted_imgs = len(formatted_placeholders)
    cnt_images = len(image_blocks)

    img_analysis_dict["report"].append(rn)
    img_analysis_dict["cnt_formatted_imgs"].append(cnt_formatted_imgs)
    img_analysis_dict["cnt_imgs"].append(cnt_images)


In [60]:
# Show the per-report image counts as a table (one row per matched report).
pd.DataFrame(img_analysis_dict)

Unnamed: 0,report,cnt_formatted_imgs,cnt_imgs
0,ab7e9b32-96c3-4d99-9143-c0393e4a5f4e,8,0
1,71d86e1e-48bf-43c2-9b8d-d15edfc8105b,16,4
2,4c16f35a-7646-4f41-b962-c96916d3d3da,66,31
3,adb19ece-7fef-4845-925e-68e00f2cb6d0,26,0
4,7768c570-7502-42f8-9fe3-58789cd398f5,14,1
...,...,...,...
1247,2e8d943c-1670-4fd0-a4db-66af84013f09,18,14
1248,c3fe424f-6484-421a-b8f8-0fc3a9fa1a70,46,12
1249,8c5ed2c6-780e-4ae7-be0e-908a9921d9e4,26,5
1250,a1e7a45b-6bcc-464b-ac9a-53056054ba31,90,9


In [137]:
# Flag reports whose article body cannot be found in the formatted text.
# The first 100 characters after "Article Body:" are cut into four
# 25-character probes; a report is unmatched when none of them occurs in the
# formatted version.
unmatched = []
for rn in report_names:
    if rn not in formatted_data or rn not in data:
        continue
    # Split once, and tolerate a report without the marker instead of raising
    # IndexError.
    body_parts = data[rn].split("Article Body:")
    body_start = body_parts[1][:100] if len(body_parts) > 1 else ""
    probes = [body_start[start:start + 25].strip() for start in range(0, 100, 25)]
    # An empty probe is a substring of every string, so a body shorter than
    # 100 characters would always count as matched; drop empty probes.
    probes = [probe for probe in probes if probe]
    if not any(probe in formatted_data[rn] for probe in probes):
        unmatched.append(rn)

print(len(unmatched))

58


In [88]:
# Match title
# Print and collect the reports whose title (first 40 characters of the
# "Article Title:" line) does not occur in the formatted text.
unmatched_titles = []
for rn in report_names:
    if rn not in formatted_data or rn not in data:
        continue
    title_line = data[rn].split("Article Title:")[1].split("\n")[0]
    match_case = title_line[:40].strip()
    if match_case in formatted_data[rn]:
        continue
    print(f"Report: => {rn}")
    print(f"Match case: =>  {match_case}\n")
    unmatched_titles.append(rn)

print(len(unmatched_titles))

Report: => 35ee06de-1ccd-462a-b6a8-81c254bd4b66
Match case: =>  A closer look at Eternity Malware

Report: => 02797bd3-3a72-4e84-ae6d-fd4fe573bcc9
Match case: =>  No Title Found

Report: => 000e110f-3b22-46e0-b7db-9f121d818236
Match case: =>  ERMAC Back In Action

Report: => f4f0da41-ebd3-430c-904f-c6dffb3d488f
Match case: =>  No Title Found

Report: => 14409b02-10a3-4f4a-98a1-13e463567dba
Match case: =>  Flying in the clouds: APT31 renews its

Report: => 20fc13be-ad67-4a8b-a9be-0aa4d7d9a675
Match case: =>  No Title Found

Report: => 461a9680-4c73-426a-b9bc-46cf1494146c
Match case: =>  Raspberry Robin and Dridex: Two birds o

Report: => b3f70f96-ba9c-46d6-a658-520a96f76f2f
Match case: =>  Monster Libra (TA551/Shathak) --> IcedI

Report: => 33a6e370-598a-4ba6-84ee-352cdfd3c8a7
Match case: =>  No Title Found

Report: => 0a181c9b-9e61-4ec8-b2c7-c9a9e38aa798
Match case: =>  No Title Found

Report: => 23261488-9043-4a85-bf3f-fd2c285163c6
Match case: =>  No Title Found

Report: => b0c01168-f

In [130]:
# Match images before title
# For reports whose "# <title>" heading occurs in the formatted text, count
# the image placeholders that appear before that heading.
imgs_before_title = []
for rn in report_names:
    if rn not in formatted_data or rn not in data:
        continue
    match_case = "# " + data[rn].split("Article Title:")[1].split("\n")[0][:40].strip()
    if match_case not in formatted_data[rn]:
        continue
    text_before_title = formatted_data[rn].split(match_case)[0]
    before_title = len(re.findall(r'<!-- image -->', text_before_title))
    print(f"Report: => {rn}")
    print(f"Match case: =>  {match_case}")
    print(f"Number of images: => {before_title}\n\n")
    imgs_before_title.append(before_title)

print(pd.Series(imgs_before_title).describe())

Report: => ab7e9b32-96c3-4d99-9143-c0393e4a5f4e
Match case: =>  # Spring Vulnerability Update - Exploitat
Number of images: => 1


Report: => 71d86e1e-48bf-43c2-9b8d-d15edfc8105b
Match case: =>  # Moshen Dragon’s Triad-and-Error Approac
Number of images: => 0


Report: => 4c16f35a-7646-4f41-b962-c96916d3d3da
Match case: =>  # AvosLocker Ransomware Variant Abuses Dr
Number of images: => 0


Report: => adb19ece-7fef-4845-925e-68e00f2cb6d0
Match case: =>  # Fresh TOTOLINK Vulnerabilities Picked U
Number of images: => 0


Report: => 7768c570-7502-42f8-9fe3-58789cd398f5
Match case: =>  # Cicada: Chinese APT Group Widens Target
Number of images: => 2


Report: => a5c62472-8a18-4f17-be94-0481ee1f3558
Match case: =>  # Password-protected Excel spreadsheet pu
Number of images: => 1


Report: => d4848f81-4d06-4106-941c-6d1991c12baa
Match case: =>  # Windows MetaStealer Malware
Number of images: => 1


Report: => 02e3d6c6-7766-402a-a6a8-dce946b80bf7
Match case: =>  # FFDroider Stealer Targeting S

In [131]:
# Clean formatted
# Collect the formatted reports that contain a "Main menu" or an
# "About the Author" section heading.
mm, atu = [], []
for k, v in formatted_data.items():
    if "## Main menu" in v:
        mm.append(k)
    if "## About the Author" in v:
        atu.append(k)

print(f"Main menu: {len(mm)}")
print(f"About the Author: {len(atu)}")

Main menu: 53
About the Author: 66


In [89]:
#print(data[rn])

In [91]:
# print(formatted_data[rn])

In [92]:
# # Compare failed raw reports with initial reports
# for f in failed_raw_list:
#     fname = f.split("-ref")[0] + ".txt"
#     if fname in os.listdir("C:/Users/AIAS/Documents/cti-model-training/data/reports"):
#         print(fname)