In [None]:
import os
import re
import json
from env import *
from whitelist import *

In [None]:

# Directory that is scanned for input .txt files. `text_path` is not defined in this
# notebook; presumably it is provided by one of the star-imports above (env / whitelist) - verify.
directory_path = text_path
# Directory that receives one .json file per input file. `json_path` is likewise
# presumably provided by the star-imports - verify.
output_path = json_path

In [None]:
def extract_references(content):
    """Return every CVE identifier found in ``content``, upper-cased.

    Matching is case-insensitive (``cve-2021-44228`` is accepted) and the
    results are returned in order of appearance, duplicates included.

    The sequence number is matched as "four or more digits": CVE IDs are not
    limited to five digits, so a fixed upper bound would silently truncate
    longer identifiers such as ``CVE-2022-1234567``.

    Args:
        content: Text to scan.

    Returns:
        list[str]: Upper-cased CVE identifiers.
    """
    reference_name_pattern = re.compile(r'CVE-\d{4}-\d{4,}', re.IGNORECASE)
    return [match.upper() for match in reference_name_pattern.findall(content)]

# Custom rules for non-English filename
def extract_filenames(content):
    """Collect filename-like tokens (``<word>.<2-4 letter extension>``) from ``content``.

    When the part before the dot ends in ASCII letters, only that trailing
    ASCII run is kept as the name (so a prefix of non-ASCII word characters
    or digits is dropped). If the name does not end in an ASCII letter it is
    kept unchanged.

    Args:
        content: Text to scan.

    Returns:
        list[str]: Cleaned ``name.ext`` strings in order of appearance,
        duplicates included.
    """
    candidate_regex = re.compile(r'\b\w+\.[a-zA-Z]{2,4}\b')

    def _keep_ascii_tail(candidate):
        # Every match contains exactly one dot, so splitting on the last dot
        # separates the stem from the extension.
        stem, _, extension = candidate.rpartition('.')
        ascii_tail = re.search('[a-zA-Z]+$', stem)
        if ascii_tail is not None:
            stem = ascii_tail.group()
        return f"{stem}.{extension}"

    return [_keep_ascii_tail(candidate) for candidate in candidate_regex.findall(content)]

def extract_md5(content):
    """Return all standalone 32-character hexadecimal tokens in ``content``.

    Word boundaries on both sides mean a longer hex run (e.g. a 40-character
    one) does not yield a match. Results keep order of appearance and
    original letter case.
    """
    hex32_regex = re.compile(r'\b[0-9a-fA-F]{32}\b')
    return hex32_regex.findall(content)

def extract_sha1(content):
    """Return all standalone 40-character hexadecimal tokens in ``content``.

    Word boundaries on both sides mean shorter or longer hex runs do not
    match. Results keep order of appearance and original letter case.
    """
    hex40_regex = re.compile(r'\b[0-9a-fA-F]{40}\b')
    return hex40_regex.findall(content)

def extract_sha256(content):
    """Return all standalone 64-character hexadecimal tokens in ``content``.

    Word boundaries on both sides mean shorter or longer hex runs do not
    match. Results keep order of appearance and original letter case.
    """
    hex64_regex = re.compile(r'\b[0-9a-fA-F]{64}\b')
    return hex64_regex.findall(content)

def extract_registry_entries(content):
    """Return registry-path-like strings found in ``content``.

    A match starts at ``HK`` followed by upper-case letters/underscores and a
    backslash, and then runs until the next double quote or newline (so any
    other trailing text on the same line is part of the match).
    """
    registry_regex = re.compile(r'HK[A-Z_]+\\[^"\n]+')
    return registry_regex.findall(content)

def extract_urls(content):
    """Return the http/https URLs found in ``content`` with square brackets removed.

    The character class ``[$-_@.&+]`` contains the range ``$`` to ``_``, which
    includes ``[`` and ``]``; matched URLs may therefore contain brackets
    (presumably from "defanged" notation such as ``evil[.]com`` - confirm),
    and these are stripped from each result.

    Args:
        content: Text to scan.

    Returns:
        list[str]: Cleaned URLs in order of appearance, duplicates included.
    """
    url_regex = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    cleaned_urls = []
    for raw_url in url_regex.findall(content):
        # Drop both bracket characters wherever they occur in the match.
        without_open = raw_url.replace("[", "")
        cleaned_urls.append(without_open.replace("]", ""))
    return cleaned_urls


# NOTE: 'weight' is a placeholder value; it is always set to 0.0
def categorize_content(content, rule_id, filename):
    """Build the record for one input file by running every extractor on its text.

    Each extractor runs exactly once. The record name is derived with
    ``os.path.splitext`` (the same approach used when naming the output JSON
    file) instead of slicing off a fixed four characters, so it no longer
    depends on the extension being exactly three letters long.

    Args:
        content: Full text of the input file.
        rule_id: Identifier stored under ``'rule_id'``.
        filename: Base name of the input file; its extension is removed to
            form ``'name'``.

    Returns:
        dict: Keys ``rule_id``, ``name``, ``description`` (always ``"-"``),
        ``references``, ``samples``, ``MD5``, ``SHA1``, ``SHA256``,
        ``Registry_Entries``, ``URLs`` and ``weight`` (always ``0.0``).
    """
    return {
        'rule_id': rule_id,
        'name': os.path.splitext(filename)[0],
        'description': "-",
        'references': extract_references(content),
        'samples': extract_filenames(content),
        'MD5' : extract_md5(content),
        'SHA1' : extract_sha1(content),
        'SHA256' : extract_sha256(content),
        'Registry_Entries': extract_registry_entries(content),
        'URLs': extract_urls(content),
        'weight': 0.0
    }

In [None]:
# List the .txt files in a directory
def get_text_files_from_directory(directory_path):
    """Return the paths of the ``.txt`` files directly inside ``directory_path``.

    Entries are sorted by name because ``os.listdir`` returns them in
    arbitrary order; a stable order keeps anything derived from the position
    of a file in this list reproducible between runs. Only regular files are
    returned, so a sub-directory whose name happens to end in ``.txt`` is
    skipped rather than failing later when it is opened.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        list[str]: ``directory_path`` joined with each matching file name,
        sorted by file name.

    Raises:
        OSError: If ``directory_path`` cannot be listed.
    """
    text_files = []
    for entry in sorted(os.listdir(directory_path)):
        full_path = os.path.join(directory_path, entry)
        if entry.endswith('.txt') and os.path.isfile(full_path):
            text_files.append(full_path)
    return text_files

# Save json files
def write_to_json(output_path, content):
    """Serialize ``content`` as JSON to ``output_path``, overwriting any existing file.

    Non-ASCII characters are written as-is (``ensure_ascii=False``), so the
    file is opened explicitly as UTF-8. Without an explicit encoding the
    platform's locale encoding would be used, which can raise
    ``UnicodeEncodeError`` or produce an unreadable file on non-UTF-8 systems.

    Args:
        output_path: Destination file path.
        content: JSON-serializable object.
    """
    with open(output_path, "w", encoding="utf-8") as outfile:
        json.dump(content, outfile, ensure_ascii=False)

In [None]:
# Keep only the content that has not already appeared in a previously processed file
def process_text_file(file_path, unique_sets, rule_id):
    """Extract the indicators from one text file, keeping only values not seen before.

    The file is read as UTF-8 (undecodable bytes are ignored) and passed to
    ``categorize_content``. For each list field the result keeps a value only
    if it is not already in ``unique_sets[field]`` and, for ``'samples'`` and
    ``'URLs'``, if it does not contain any whitelisted substring. Kept values
    are added to ``unique_sets`` so later files will not report them again.

    Kept values are returned in order of first appearance (rather than in set
    iteration order), so the output is reproducible between runs.

    Args:
        file_path: Path of the text file to process.
        unique_sets: Dict mapping each of ``'references'``, ``'samples'``,
            ``'Registry_Entries'``, ``'MD5'``, ``'SHA1'``, ``'SHA256'`` and
            ``'URLs'`` to the set of values already emitted. Mutated in place.
        rule_id: Identifier forwarded to ``categorize_content``.

    Returns:
        dict: The categorized record with de-duplicated fields plus a
        ``'signatures'`` list concatenating MD5, SHA1, SHA256,
        Registry_Entries and URLs (in that order).
    """
    # Read everything up front so the file handle is released before processing.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()

    filename = os.path.basename(file_path)
    categorized_content = categorize_content(content, rule_id, filename)

    # Fields that additionally drop values containing a whitelisted substring.
    whitelists = {
        'samples': whitelist_file_names,
        'URLs': whitelist_urls,
    }
    deduplicated_fields = ['references', 'samples', 'Registry_Entries', 'MD5', 'SHA1', 'SHA256', 'URLs']

    for field in deduplicated_fields:
        seen = unique_sets[field]
        blocked_substrings = whitelists.get(field, ())
        fresh_values = []
        for value in categorized_content[field]:
            if value in seen:
                continue
            if any(whitelisted in value for whitelisted in blocked_substrings):
                continue
            # Recording the value immediately also removes repeats within this file.
            seen.add(value)
            fresh_values.append(value)
        categorized_content[field] = fresh_values

    # merge into signatures category
    keys_to_combine = ['MD5', 'SHA1', 'SHA256', 'Registry_Entries', 'URLs']
    categorized_content['signatures'] = [item for key in keys_to_combine for item in categorized_content.get(key, [])]

    return categorized_content

In [None]:
# rule_id auto-increments: 1 for the first file, 2 for the second, and so on
def process_all_text_files(directory_path, output_path):
    """Process every text file in ``directory_path`` and write one JSON file per input.

    Files are handled in the order returned by
    ``get_text_files_from_directory`` and receive ``rule_id`` values 1, 2, 3, ...
    in that order. A single ``unique_sets`` dict is shared across all files so
    a value reported for one file is not reported again for a later one.

    The output directory is created when it does not exist yet, instead of
    failing on the first write.

    Args:
        directory_path: Directory containing the input ``.txt`` files.
        output_path: Directory that receives ``<input name>.json`` files.
    """
    text_files = get_text_files_from_directory(directory_path)

    tracked_fields = ('references', 'samples', 'Registry_Entries', 'MD5', 'SHA1', 'SHA256', 'URLs')
    unique_sets = {field: set() for field in tracked_fields}

    # An empty output_path means "current directory" for os.path.join below;
    # os.makedirs('') would raise, so only create a directory when one is named.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    for rule_id, file_path in enumerate(text_files, start=1):
        categorized_content = process_text_file(file_path, unique_sets, rule_id)
        # Toggle remove fields
        # for field in ['MD5', 'SHA1', 'SHA256', 'Registry_Entries', 'URLs']:
        #     categorized_content.pop(field, None)
        filename_without_extension = os.path.splitext(os.path.basename(file_path))[0]
        output_json_path = os.path.join(output_path, filename_without_extension + ".json")
        write_to_json(output_json_path, categorized_content)

In [None]:
# Execute: run the extraction over every text file in `directory_path` and
# write the resulting JSON files into `output_path`
process_all_text_files(directory_path, output_path)
