<a href="https://colab.research.google.com/github/darpanrewani05/import_file/blob/main/practical_2_importing_file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Synthetic malware-style CSV generator (harmless, for research/testing only)
# Paste into Google Colab and run.

import pandas as pd
import random
import hashlib
from datetime import datetime, timedelta
import os

# Seed the module-level RNG so every run draws the same sequence of values.
random.seed(42)

# Number of synthetic sample rows to generate.
N = 5000

# Pools of categorical values that the generator picks from.
families = [
    "Emotet",
    "TrickBot",
    "Ryuk",
    "Zeus",
    "AgentTesla",
    "Dridex",
    "Cerber",
    "AZORult",
    "Mirai",
    "Unknown",
]
file_types = [
    "PE32",
    "PE32+",
    "Script",
    "PDF",
    "DOC",
    "EXE",
    "DLL",
]
av_sample_names = [
    "ESET",
    "Kaspersky",
    "Norton",
    "McAfee",
    "Bitdefender",
    "Windows Defender",
    "Avast",
    "AVG",
    "Sophos",
]

# Candidate values for the network-indicator columns.
# NOTE(review): only "example.com" is a reserved documentation domain; the
# other names could belong to real sites -- confirm they are safe to publish.
domains = [
    "example.com",
    "update-service.net",
    "cdn-files.org",
    "malicious-downloads.ru",
    "tracker-site.info",
]
# Hosts .1 through .254 of 192.0.2.0/24 (presumably chosen because that is
# the TEST-NET-1 documentation range, RFC 5737).
ips = ["192.0.2." + str(host) for host in range(1, 255)]


def rand_date(start, end):
    """Return a random day between ``start`` and ``end`` as ``YYYY-MM-DD``.

    The result is ``start`` shifted forward by a random whole number of days,
    drawn from ``0`` up to and including ``(end - start).days``.

    Args:
        start: ``datetime`` marking the earliest selectable day.
        end: ``datetime`` marking the latest selectable day.

    Returns:
        The selected day formatted as a ``"%Y-%m-%d"`` string.
    """
    span_days = (end - start).days
    offset = timedelta(days=random.randint(0, span_days))
    picked = start + offset
    return picked.strftime("%Y-%m-%d")


# Build the synthetic dataset: one dict per sample row, collected into a
# DataFrame and written to a CSV file named after the row count.
#
# The sequence of calls into ``random`` inside the loop is deliberately kept in
# a fixed order, so a given RNG seed always yields the same rows.

# Inclusive window from which first_seen / last_seen dates are drawn.
start_date = datetime(2018, 1, 1)
end_date = datetime(2025, 11, 30)

# Value pools used by the loop; defined once instead of being rebuilt per row.
extensions = ["exe", "dll", "bin", "pdf", "doc", "js"]
labels = ["malicious", "benign", "suspicious"]
tag_pool = [
    "ransomware",
    "trojan",
    "downloader",
    "info-stealer",
    "botnet",
    "dropper",
    "credential-stealer",
    "persistence",
    "obfuscated",
]
behavior_pool = [
    "Creates persistence and communicates with C2.",
    "Downloads additional payloads and injects into explorer.exe.",
    "Exfiltrates credential files from user directories.",
    "Spawns multiple threads and launches DDoS traffic.",
    "Drops encrypted payload and deletes original file.",
    "Launches PowerShell to fetch remote script.",
    "Modifies registry keys for autostart.",
    "No suspicious behavior detected during sandboxing.",
]
import_pool = [
    "CreateFile",
    "WriteFile",
    "ReadFile",
    "InternetOpen",
    "WinExec",
    "LoadLibrary",
    "GetProcAddress",
    "VirtualAlloc",
    "VirtualProtect",
    "URLDownloadToFile",
]

# Denominator of the "hits/total" detection ratio; the same for every row.
detections_total = 60


def _sample_joined(pool, low, high):
    """Join between ``low`` and ``high`` distinct items of ``pool`` with ';'.

    The item count is drawn first and the items second. A count of zero
    produces an empty string.
    """
    return ";".join(random.sample(pool, k=random.randint(low, high)))


rows = []

for i in range(1, N + 1):
    # NOTE(review): the extension here and ``file_type`` below are drawn
    # independently, so a row can pair e.g. an ".exe" name with type "DOC".
    # Confirm whether consumers of the dataset expect the two to agree.
    fname = f"sample_{i:05d}." + random.choice(extensions)

    # The hashes are digests of a per-row string, not of any real file; the
    # random component keeps them from being derivable from name and id alone.
    seed = f"{fname}-{random.random()}-{i}".encode("utf-8")
    md5 = hashlib.md5(seed).hexdigest()
    sha1 = hashlib.sha1(seed).hexdigest()
    sha256 = hashlib.sha256(seed).hexdigest()

    fsize = random.randint(1024, 50_000_000)
    ftype = random.choice(file_types)

    # last_seen is drawn from [first_seen, end_date], so it never precedes
    # first_seen.
    first_seen = rand_date(start_date, end_date)
    last_seen = rand_date(datetime.strptime(first_seen, "%Y-%m-%d"), end_date)

    family = random.choice(families)
    label = random.choice(labels)

    detections_hit = random.randint(0, detections_total)
    detection_ratio = f"{detections_hit}/{detections_total}"

    avs = _sample_joined(av_sample_names, 1, 4)

    # Network indicators may be empty (zero items drawn).
    ips_list = _sample_joined(ips, 0, 2)
    doms = _sample_joined(domains, 0, 2)

    tags = _sample_joined(tag_pool, 1, 3)

    behavior = random.choice(behavior_pool)

    entropy = round(random.uniform(3.0, 8.0), 3)

    imported_funcs = _sample_joined(import_pool, 1, 5)

    pe_sections = random.randint(3, 10)
    packed = random.choice([True, False])

    rows.append(
        {
            "id": i,
            "file_name": fname,
            "md5": md5,
            "sha1": sha1,
            "sha256": sha256,
            "file_size_bytes": fsize,
            "file_type": ftype,
            "first_seen": first_seen,
            "last_seen": last_seen,
            "family": family,
            "label": label,
            "detection_ratio": detection_ratio,
            "av_detections": avs,
            "c2_ips": ips_list,
            "domains": doms,
            "tags": tags,
            "behavior_summary": behavior,
            "entropy": entropy,
            "imported_functions": imported_funcs,
            "pe_sections_count": pe_sections,
            "packed": packed,
        }
    )


df = pd.DataFrame(rows)

# Save file. The name tracks N so it always matches the number of rows written
# (for N = 5000 this is "malware_dataset_synthetic_5000.csv").
outpath = f"malware_dataset_synthetic_{N}.csv"
df.to_csv(outpath, index=False)

print("CSV created at:", outpath)
# Trailing expression: shows the first ten rows when run as a notebook cell.
df.head(10)


CSV created at: malware_dataset_synthetic_5000.csv


Unnamed: 0,id,file_name,md5,sha1,sha256,file_size_bytes,file_type,first_seen,last_seen,family,...,detection_ratio,av_detections,c2_ips,domains,tags,behavior_summary,entropy,imported_functions,pe_sections_count,packed
0,1,sample_00001.js,23f719c21b5696b0620c0a6646b7bd7b,cb1a03f29706dbb4b5a2f4f98eb401cdfcaad574,c1770fb3872f7563abe6dee8ad0ddba692c8e5082aaabe...,49765635,Script,2020-09-30,2021-12-31,Ryuk,...,6/60,Avast,,,info-stealer,Spawns multiple threads and launches DDoS traf...,5.527,VirtualProtect,6,False
1,2,sample_00002.dll,f94f347a6f5c410cd02255e68874c3a2,113c0cd1ffd4f1dd8688d4f5559c59885ac2189d,cc10db3b542934b1a58b51b1659b34a17367b6d84ead8d...,18670086,DLL,2018-01-27,2019-11-11,Cerber,...,17/60,McAfee;Windows Defender,,,trojan;dropper,Launches PowerShell to fetch remote script.,6.019,VirtualAlloc,4,False
2,3,sample_00003.exe,6158fe94cb88082aba31c9951dfab8d3,3e37cd7e8d27aaa1a77ac38cc474758c1ee1a162,bdf413c36f6969db8273c2f65744a4f159b506eac51325...,42188326,DOC,2022-01-21,2025-04-17,Zeus,...,4/60,McAfee,192.0.2.253,,trojan,Modifies registry keys for autostart.,4.39,ReadFile;LoadLibrary;VirtualProtect,6,False
3,4,sample_00004.js,bc1507a509bc7da38b931c82100c66cb,71fd4f636bd292b634a963508718c31624350fdb,d14e07685e29da613270464da0d0b1973679fb4b8bb925...,43489942,PE32,2024-10-31,2025-09-21,Ryuk,...,46/60,Norton;AVG,192.0.2.70,tracker-site.info;update-service.net,dropper;ransomware;trojan,Creates persistence and communicates with C2.,7.025,WinExec;WriteFile;InternetOpen;URLDownloadToFile,8,True
4,5,sample_00005.js,f909f8d4e94810683c139bc673f496e1,dcc1156659a6fdd71dfbdcc42c5399e78fec26a7,ddbb526d075817e1ccf0acc1c70bbd26076c0506383c58...,43142082,PDF,2019-08-09,2022-07-28,Ryuk,...,47/60,Avast;Sophos;Norton,,,persistence;trojan;credential-stealer,Creates persistence and communicates with C2.,7.306,ReadFile;GetProcAddress,4,False
5,6,sample_00006.pdf,501177028e7b053876bbe2b5881972d7,6c89597f57aa1d2af4238901cd9b6a0bb1688f14,8d30c8fd5a3da3f026751a950a21b3497d07529e332b4e...,31411320,DOC,2020-10-26,2023-12-03,Emotet,...,46/60,Sophos,192.0.2.197,cdn-files.org;example.com,credential-stealer;downloader,No suspicious behavior detected during sandbox...,3.016,VirtualProtect;ReadFile;WriteFile,7,True
6,7,sample_00007.dll,9fe15e6dfb74e19b3fb87639ea79eac8,caa1df94330b48fe07dc116b9a14c5392c563fce,267553923f21334161c03ef3a74d94d3fa15ae3c196d26...,10842396,DOC,2023-12-13,2023-12-13,Unknown,...,31/60,Kaspersky,192.0.2.225,update-service.net,info-stealer,Downloads additional payloads and injects into...,3.428,WriteFile;VirtualProtect;ReadFile;URLDownloadT...,10,True
7,8,sample_00008.bin,3ee957ff0fc14e7372d03af45e226397,6a30f5a49113c0bccf115ba89d614b611353fa5c,5e68030edba98c790e4dd240318e2a4ce21580575cba10...,40708852,PDF,2020-05-17,2025-08-01,Mirai,...,44/60,Bitdefender;Avast,192.0.2.167;192.0.2.96,tracker-site.info,trojan;info-stealer,Spawns multiple threads and launches DDoS traf...,3.32,URLDownloadToFile,6,True
8,9,sample_00009.exe,a6bcc7ec475444a33b343f6f7f0fc00b,fb7ef10029e0f54d876e5e4d6287ce76ee863fae,9103eaae992d0fbd3eb07aa1864acdf91b66d912138e46...,42353626,PE32,2020-07-26,2020-12-11,Emotet,...,4/60,Bitdefender;AVG,,update-service.net;malicious-downloads.ru,persistence,Modifies registry keys for autostart.,3.952,GetProcAddress,8,False
9,10,sample_00010.pdf,f48d0f88d10c0b4e6e14127f7bfc3cc0,2956b428b8d9c791ac6d1a4c31aef011e0aa3395,197cd773239c5c1ce32ddb0ed71d7a5a3edc090a44d963...,48928476,PE32,2025-07-21,2025-08-15,Emotet,...,46/60,Kaspersky;McAfee;Sophos,,malicious-downloads.ru;update-service.net,downloader;botnet,No suspicious behavior detected during sandbox...,4.249,VirtualAlloc,4,True
