# In [2]:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup

# URL of the index page that is scraped for links to case files.
base_url = "http://dfoh.uclouvain.be/database/cases/"
# Local directory that receives the downloaded case files and the CSV files
# derived from them.
dfoh_cases = "/home/savymik/Pictures/dfoh_cases"
# Side effect at import time: create the directory (and any missing parents);
# exist_ok=True makes this a no-op when it is already there.
os.makedirs(dfoh_cases, exist_ok=True)

def get_filelink(timeout=30):
    """Return the download URLs of the case files listed at ``base_url``.

    The index page is fetched and every ``<a href>`` on it is examined. A
    link is kept when it contains ``/cases/`` and does not end in ``.tmp``;
    the part after the last ``/cases/`` is then appended to ``base_url``.

    Links that cannot name a downloadable file are dropped: an href that
    ends right after ``/cases/`` (the listing's own URL), one whose
    remainder ends in ``/`` (a sub-directory) and one whose remainder starts
    with ``?`` (a query link on the listing itself).

    Args:
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds),
            so an unresponsive server cannot block the script forever.

    Returns:
        A list of absolute URLs, in the order they appear on the page.

    Raises:
        Exception: If the index page does not answer with HTTP 200.
        requests.RequestException: Propagated from ``requests`` on network
            errors or when the timeout expires.
    """
    response = requests.get(base_url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed!, status code {response.status_code}")
    soup = BeautifulSoup(response.text, 'html.parser')

    valid_cases = []
    for anchor in soup.find_all('a', href=True):
        link = anchor['href']
        if not link or link.endswith('.tmp') or '/cases/' not in link:
            continue
        fname = link.split("/cases/")[-1]
        # An empty remainder is the listing itself; "dir/" and "?query"
        # remainders are navigation links, not files.
        if not fname or fname.endswith('/') or fname.startswith('?'):
            continue
        valid_cases.append(base_url + fname)
    return valid_cases
  
def cases_downloads(timeout=30):
    """Download every case file returned by ``get_filelink`` into ``dfoh_cases``.

    A file whose target path already exists is skipped. Each download is
    streamed into ``<name>.part`` and renamed to its final name only after
    the whole body has been written, so an interrupted transfer can never
    leave a truncated file that a later run would skip as "already
    downloaded".

    Args:
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).

    Raises:
        requests.RequestException: Propagated from ``requests`` on network
            errors or timeouts; the partial file is removed first.
    """
    cases_links = get_filelink()
    for case_url in cases_links:
        fname = case_url.split("/")[-1]
        fpath = os.path.join(dfoh_cases, fname)
        if os.path.exists(fpath):
            print(f"File exists! Skipping: {fname}")
            continue

        tmp_path = fpath + ".part"
        try:
            # The context manager closes the streamed response (and releases
            # its connection) on every path, including the non-200 one.
            with requests.get(case_url, stream=True, timeout=timeout) as response:
                if response.status_code != 200:
                    print(f"Failed to download {case_url}, status code {response.status_code}")
                    continue
                with open(tmp_path, 'wb') as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
            # Publish the file under its final name only once it is complete.
            os.replace(tmp_path, fpath)
        finally:
            # After a successful rename this is a no-op; after a failure it
            # removes the incomplete download.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

def parse_dfoh_cases():
    """Parse every case file in ``dfoh_cases`` and save it as ``<name>.csv``.

    Each regular file in the directory is passed to ``parse_dfoh_file``; a
    non-empty result is written next to the source file, without the index
    column. Files that yield no rows are reported and not saved.

    Because the CSV output lands in the directory being scanned, a
    ``<name>.csv`` that sits beside its source ``<name>`` is recognised as
    output of an earlier run and is not parsed again.
    """
    csv_suffix = ".csv"
    for fname in os.listdir(dfoh_cases):
        fpath = os.path.join(dfoh_cases, fname)
        if not os.path.isfile(fpath):
            continue
        # Skip our own output: only a ".csv" whose source file still exists
        # is treated as derived, so a genuine input named "*.csv" is kept.
        if fname.endswith(csv_suffix) and os.path.isfile(fpath[:-len(csv_suffix)]):
            continue
        df = parse_dfoh_file(fpath)
        if df.empty:
            print(f"No valid data extracted for: {fname}, skip saving!")
            continue
        csv_fname = os.path.join(dfoh_cases, fname + csv_suffix)
        df.to_csv(csv_fname, index=False)

def _parse_case_attributes(line):
    """Return the ``key:value`` attributes that follow ``attackers:`` in *line*.

    The text after the last ``attackers:`` marker is split on ``;``. Items
    without a ``:`` are ignored; the others are split on their first ``:``
    only, so a value may itself contain colons. Keys and values are stripped
    of surrounding whitespace.
    """
    attributes = {}
    for item in line.split("attackers:")[-1].strip().split(";"):
        key, sep, value = item.partition(":")
        if sep:
            attributes[key.strip()] = value.strip()
    return attributes


def parse_dfoh_file(fpath):
    """Parse one case file into a DataFrame with one row per inference line.

    A line starting with ``!leg`` or ``!sus`` is a case header: its tokens
    1 and 2 are stored as attacker and victim, tokens 3-5 as three integer
    counts, and the ``key:value`` items after ``attackers:`` as extra
    attributes. Every other non-blank line is an inference line whose tokens
    2-4 must be integers; it produces one row that combines those three
    values with the fields of the most recent header.

    A header that is too short or has non-integer counts is reported and
    clears the header fields (only ``status`` is kept), so the rows that
    follow are not attributed to the previous case. Rows that appear before
    any header carry ``None`` in the header fields.

    Args:
        fpath: Path of the file to parse. The first 10 characters of its
            base name are stored in the ``date`` column.

    Returns:
        A ``pandas.DataFrame`` of the parsed rows; an empty DataFrame when
        the file does not exist or contains no valid inference line.
    """
    try:
        # Undecodable bytes are replaced rather than aborting the whole file.
        with open(fpath, 'r', encoding="utf-8", errors="replace") as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f"File does not exist: {fpath}, skip parsing!")
        return pd.DataFrame()

    date = os.path.basename(fpath)[:10]
    cases = []
    status = attacker = victim = None
    leg_count = sus_count = path_count = None
    additional_attributes = {}

    for raw_line in lines:
        line = raw_line.strip()
        parts = line.split()
        if not parts:
            continue

        if line.startswith(("!leg", "!sus")):
            status = "leg" if line.startswith("!leg") else "sus"
            # Forget the previous case first, so a malformed header cannot
            # leak its attacker/victim/counts into the rows below it.
            attacker = victim = None
            leg_count = sus_count = path_count = None
            additional_attributes = {}
            if len(parts) < 6:
                print(f"Incomplete line: {line}, skipping!")
                continue
            try:
                counts = [int(token) for token in parts[3:6]]
            except ValueError:
                print(f"Non-int header line: {line}, skipping!")
                continue
            attacker, victim = parts[1], parts[2]
            leg_count, sus_count, path_count = counts
            additional_attributes = _parse_case_attributes(line)
            continue

        if len(parts) < 5:
            print(f"Incomplete inference line: {line}, skipping!")
            continue
        try:
            inference_id, leg_flag, sus_flag = map(int, parts[2:5])
        except ValueError:
            print(f"Non-int inference line: {line}, skipping!")
            continue
        # Column names (including their spelling) are the CSV schema that
        # downstream consumers read, so they are kept exactly as they were.
        cases.append({
            "date": date,
            "status": status,
            "attacker": attacker,
            "victim": victim,
            "leg_infrence_cnts": leg_count,
            "sus_infrence_cnts": sus_count,
            "path_cnts": path_count,
            "type": additional_attributes.get("type", "unknown"),
            "valid_origin": additional_attributes.get("valid_origin", "unknown"),
            "recurrent": additional_attributes.get("recurrent", "unknown"),
            "local": additional_attributes.get("local", "unknown"),
            "inference_id": inference_id,
            "leg_flag": leg_flag,
            "sus_flag": sus_flag,
        })
    return pd.DataFrame(cases)

def main():
    """Run the pipeline: download the case files, then convert them to CSV."""
    for step in (cases_downloads, parse_dfoh_cases):
        step()

# Run the pipeline only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
