In [3]:
import requests
import pandas as pd
import time
import yara
import os
import json
import os

In [5]:
# API credentials come from the environment so they are never stored in the notebook.
VT_API_KEY = os.getenv("VT_API_KEY")
OTX_API_KEY = os.getenv("OTX_API_KEY")
HA_API_KEY = os.getenv("HA_API_KEY")

# os.getenv returns None for an unset variable. Report that here, because the
# consulta_* helpers return {} on any non-200 response and would otherwise
# hide a credentials problem behind empty rows.
_faltantes = [
    nombre
    for nombre, valor in (
        ("VT_API_KEY", VT_API_KEY),
        ("OTX_API_KEY", OTX_API_KEY),
        ("HA_API_KEY", HA_API_KEY),
    )
    if not valor
]
if _faltantes:
    print(f"AVISO: faltan variables de entorno: {', '.join(_faltantes)}")

# Per-service request headers carrying the corresponding API key.
HEADERS_VT = {'x-apikey': VT_API_KEY}
HEADERS_OTX = {'X-OTX-API-KEY': OTX_API_KEY}
HEADERS_HA = {
    'User-Agent': 'Falcon Sandbox',
    'api-key': HA_API_KEY
}

In [5]:
def consulta_vt(hash_val, timeout=30):
    """Query the VirusTotal v3 file endpoint for a hash and flatten selected fields.

    Parameters
    ----------
    hash_val : str
        File hash interpolated into the ``/api/v3/files/{hash}`` URL.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot
        block the collection loop indefinitely. Defaults to 30.

    Returns
    -------
    dict
        ``{}`` when the request fails, the status code is not 200, or the body
        is not valid JSON. Otherwise a dict with the keys ``threat_label``,
        ``creation_date``, ``size``, ``tags_vt``, ``meaningful_name``,
        ``malicious_ratio``, ``sandbox_malicious_count``,
        ``sandbox_confidence_avg``, ``sandbox_classifications``, ``is_signed``,
        ``signature_entity``, ``pe_compile_date`` and ``pe_entropy_avg``.
    """
    url = f'https://www.virustotal.com/api/v3/files/{hash_val}'

    try:
        r = requests.get(url, headers=HEADERS_VT, timeout=timeout)
    except requests.RequestException as exc:
        # Treat a transport failure like a non-200 answer: report it and move on,
        # so one bad request does not abort a long batch.
        print(f"[VT] Error de red consultando {hash_val}: {exc}")
        r = None
    # Fixed pause after every attempt, kept from the original to throttle calls.
    time.sleep(5)

    if r is None or r.status_code != 200:
        return {}

    try:
        payload = r.json()
    except ValueError:
        # Body was not JSON; nothing to extract.
        return {}

    data = payload.get('data', {}).get('attributes', {})

    # Signature: any non-empty signature_info counts as "signed".
    signature_data = data.get('signature_info', {})
    is_signed = bool(signature_data)
    signer = signature_data.get('signer', signature_data.get('publisher', 'Desconocido')) if is_signed else None

    # PE metadata: timestamp and mean entropy across the listed sections.
    pe_data = data.get('pe_info', {})
    ts = pe_data.get('timestamp')
    sections = pe_data.get('sections', [])
    entropy_avg = round(sum(s.get('entropy', 0) for s in sections) / len(sections), 2) if sections else None

    # Sandbox verdicts: only entries whose category is 'malicious' contribute
    # to the count, the confidence average and the classification set.
    sandbox_data = data.get('sandbox_verdicts', {})
    malicious_count = 0
    confidences = []
    classifications = set()

    for verdict in sandbox_data.values():
        if verdict.get('category') == 'malicious':
            malicious_count += 1
            if 'confidence' in verdict:
                confidences.append(verdict['confidence'])
            classifications.update(verdict.get('malware_classification', []))

    confidence_avg = round(sum(confidences) / len(confidences), 2) if confidences else None

    # Share of engines in last_analysis_results that report 'malicious';
    # max(..., 1) avoids dividing by zero when there are no results.
    analysis_results = data.get('last_analysis_results', {})
    malicious_engines = sum(
        1 for res in analysis_results.values()
        if res.get('category') == 'malicious'
    )

    return {
        'threat_label': data.get('popular_threat_classification', {}).get('suggested_threat_label', ''),
        'creation_date': data.get('creation_date'),
        'size': data.get('size'),
        'tags_vt': data.get('tags', []),
        'meaningful_name': data.get('meaningful_name'),
        'malicious_ratio': malicious_engines / max(len(analysis_results), 1),

        # Newer fields
        'sandbox_malicious_count': malicious_count,
        'sandbox_confidence_avg': confidence_avg,
        'sandbox_classifications': list(classifications),
        'is_signed': is_signed,
        'signature_entity': signer,
        'pe_compile_date': ts,
        'pe_entropy_avg': entropy_avg
    }


def consulta_otx(hash_val, timeout=30):
    """Query the AlienVault OTX "general" file indicator endpoint for a hash.

    Parameters
    ----------
    hash_val : str
        File hash interpolated into the indicator URL.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot
        block the collection loop indefinitely. Defaults to 30.

    Returns
    -------
    dict
        ``{}`` when the request fails, the status code is not 200, or the body
        is not valid JSON. Otherwise:

        - ``pulse_names``: sorted list of distinct pulse names (a missing or
          null name is recorded as ``''``). Returned as a list, like the other
          collection fields, so it serialises to CSV in a stable, parseable form.
        - ``pulse_count``: ``pulse_info.count`` (0 if absent).
        - ``adversaries``: list of distinct non-empty adversary values.
        - ``attack_ids``: list of distinct ``(id, name)`` tuples where both are set.
    """
    url = f"https://otx.alienvault.com/api/v1/indicators/file/{hash_val}/general"

    try:
        r = requests.get(url, headers=HEADERS_OTX, timeout=timeout)
    except requests.RequestException as exc:
        # Treat a transport failure like a non-200 answer so the batch continues.
        print(f"[OTX] Error de red consultando {hash_val}: {exc}")
        r = None
    # Fixed pause after every attempt, kept from the original to throttle calls.
    time.sleep(1.5)

    if r is None or r.status_code != 200:
        return {}

    try:
        data = r.json()
    except ValueError:
        # Body was not JSON; nothing to extract.
        return {}

    pulse_info = data.get('pulse_info', {})
    pulses = pulse_info.get('pulses', [])

    return {
        'pulse_names': sorted({p.get('name') or '' for p in pulses}),
        'pulse_count': pulse_info.get('count', 0),
        'adversaries': list({p.get('adversary') for p in pulses if p.get('adversary')}),
        'attack_ids': list({(a.get('id'), a.get('name')) for p in pulses for a in p.get('attack_ids', []) if a.get('id') and a.get('name')})
    }


def consulta_ha_overview(hash_val, timeout=30):
    """Query the Hybrid Analysis v2 overview summary endpoint for a hash.

    Parameters
    ----------
    hash_val : str
        File hash interpolated into the ``/api/v2/overview/{hash}/summary`` URL.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot
        block the collection loop indefinitely. Defaults to 30.

    Returns
    -------
    dict
        ``{}`` when the request fails, the status code is not 200, or the body
        is not valid JSON. Otherwise a dict with the response's ``verdict`` and
        ``multiscan_result`` values (``None`` for any that are absent).
    """
    url = f"https://www.hybrid-analysis.com/api/v2/overview/{hash_val}/summary"

    try:
        r = requests.get(url, headers=HEADERS_HA, timeout=timeout)
    except requests.RequestException as exc:
        # Treat a transport failure like a non-200 answer so the batch continues.
        print(f"[HA] Error de red consultando {hash_val}: {exc}")
        r = None
    # Fixed pause after every attempt, kept from the original to throttle calls.
    time.sleep(15)

    if r is None or r.status_code != 200:
        return {}

    try:
        data = r.json()
    except ValueError:
        # Body was not JSON; nothing to extract.
        return {}

    return {
        'verdict': data.get('verdict'),
        'multiscan_result': data.get('multiscan_result')
    }


## Creación del dataset

In [28]:
# Draw a reproducible sample of 50 hashes per "Type" from the merged dataset,
# keeping only the hash and label columns, and persist it for later cells.
df = (
    pd.read_csv("merged_df.csv")
    .loc[:, ["SHA256", "Type"]]
    .groupby("Type")
    .sample(n=50, random_state=42)
)
df.to_csv("selection.csv", index=False)

In [29]:
# Re-read the saved sample from disk and preview its first rows.
df = pd.read_csv("selection.csv")
df.head()

Unnamed: 0,SHA256,Type
0,cefe56eca3f6f351bc5aa1f986bc79f7491569ebc0258e...,0
1,f8a9fc780819a979d769d45c11d35d3baacb01c821158d...,0
2,eaa81490aeb4d86cd59e44af07ec6fe63a9028eb4a6fe7...,0
3,a7e446ed3a6cb2a171792baa90a66a068826957e7f3b32...,0
4,d4493087abe2a048f24d87ae232ac2ce90329662348555...,0


In [31]:
# Count rows per "Type" value in the loaded selection.
df.value_counts("Type")

Type
0    50
1    50
2    50
3    50
4    50
5    50
6    50
Name: count, dtype: int64

In [47]:
# Show the rows whose "Type" label is 0.
df.loc[df["Type"].eq(0)]

Unnamed: 0,SHA256,Type
0,cefe56eca3f6f351bc5aa1f986bc79f7491569ebc0258e...,0
1,f8a9fc780819a979d769d45c11d35d3baacb01c821158d...,0
2,eaa81490aeb4d86cd59e44af07ec6fe63a9028eb4a6fe7...,0
3,a7e446ed3a6cb2a171792baa90a66a068826957e7f3b32...,0
4,d4493087abe2a048f24d87ae232ac2ce90329662348555...,0
5,af535d2577e5b48270fa8af03579b8fba2176d4e3e777c...,0
6,b25d1854c0b95f892fa7f6de9658764b26943fa12cfa7e...,0
7,ee1ae8300352e58ef980329cf6599a9d3b4754665c7097...,0
8,1f49ba3bc8f2f78a2d683c02b34023b61b3bf3464fc15b...,0
9,e8abe70d2294a902ee96299255538b85d6803fa2a1563c...,0


In [67]:
# Enrich every hash of one malware type with VirusTotal, OTX and Hybrid Analysis
# data and write one CSV per type.
# NOTE(review): this loop is repeated almost verbatim later in the notebook for
# the second batch; a shared helper taking (df, tipo, salida_csv) would avoid that.
tipo_objetivo = 6
subset = df[df["Type"] == tipo_objetivo]

resultados = []

for hash_val in subset["SHA256"]:
    print(f"Procesando: {hash_val}...")

    # Each consulta_* helper already pauses after its own request (5 s, 1.5 s
    # and 15 s respectively), so no additional sleep is needed between calls.
    vt = consulta_vt(hash_val)
    otx = consulta_otx(hash_val)
    ha = consulta_ha_overview(hash_val)

    # A helper that failed returns {}, which simply contributes no columns
    # for this row.
    resultados.append({
        "SHA256": hash_val,
        "Type": tipo_objetivo,
        **vt,
        **otx,
        **ha
    })

salida_csv = f"inteligencia_{tipo_objetivo}.csv"
pd.DataFrame(resultados).to_csv(salida_csv, index=False)
print(f"Guardado: {salida_csv}")


Procesando: 390cba6e7af5c2487bb1d3ace4edc3c255a45fc9a3aff277cfd16b85e73dff00...
Procesando: c7bf4126fec26b76a237fee7d8426dc80cf46f261e699c7be6d512ffaacf4de2...
Procesando: ac6c658a35572e520bfb84ee03ceda62c3f3b43155ffabf13b53b3dca27e0e00...
Procesando: 7dab6265884f4bd65eb1e781201e0f3711c2e39188129df4bf3921ac0042e59c...
Procesando: 81cae546ba8f6dd7e3273f9ac9ef35e37c953e745a1d66d8aaf5a69a89555524...
Procesando: 58ce2309dea30b153cda70566249d5781f60da3658383e6a743fca38fe524d62...
Procesando: 782df41da291585e83635dff168dd75451ca5e103a922a27c174c01830efbb20...
Procesando: a1a8e11f5b032b4f125d02935278564ee15aff0f457b0b5e984180f5bd5036c3...
Procesando: 0883932effebc38003004a83a80fbdbf2caab3a8467d59237791087872920468...
Procesando: ba6eaf83dd24b3d773bad4917c8bb6bdd1a8466e98451dbd930fdd393446f55d...
Procesando: 3101f92f4de424e1f9f0445afdbc99e36f4889b42d00e161e4cd578ab3c9550d...
Procesando: c4c59e5531e9151055fd7ca74809d859a615398b3d39b62dcbd34112c4643b9a...
Procesando: 53b707f4c2eb23ee162105c7c39b

Segunda remesa de datos para guardar.

In [13]:
# Load the first-batch selection and the full hash/label table it was drawn from.
selection_df = pd.read_csv("selection.csv")
merged_df = pd.read_csv("merged_df.csv").loc[:, ["SHA256", "Type"]]

In [15]:
# Build a second sample of up to 50 hashes per type, excluding every hash
# already present in the first selection.
used_hashes = set(selection_df["SHA256"].dropna().unique())
disponibles_df = merged_df[~merged_df["SHA256"].isin(used_hashes)]

new_selection_list = []
for tipo in disponibles_df["Type"].unique():
    subset = disponibles_df[disponibles_df["Type"] == tipo]
    # DataFrame.sample raises ValueError when n exceeds the number of rows
    # (without replacement), so cap n at what is left for this type.
    n_muestras = min(50, len(subset))
    if n_muestras < 50:
        print(f"Tipo {tipo}: solo quedan {n_muestras} muestras disponibles.")
    muestra_tipo = subset.sample(n=n_muestras, random_state=42)
    new_selection_list.append(muestra_tipo)

new_selection = pd.concat(new_selection_list, ignore_index=True)
new_selection.to_csv("new_selection.csv", index=False)

print(f"Seleccionadas {len(new_selection)} muestras.")


Seleccionadas 350 muestras.


In [15]:
# Load the second-batch selection; from here on `df` holds these rows.
df = pd.read_csv("new_selection.csv")

In [17]:
# Enrich every hash of one malware type from the second-batch selection with
# VirusTotal, OTX and Hybrid Analysis data and write one CSV per type.
# NOTE(review): this loop duplicates the first-batch collection cell except for
# the output file name; a shared helper would remove the copy.
tipo_objetivo = 6
subset = df[df["Type"] == tipo_objetivo]

resultados = []

for hash_val in subset["SHA256"]:
    print(f"Procesando: {hash_val}...")

    # Each consulta_* helper already pauses after its own request (5 s, 1.5 s
    # and 15 s respectively), so no additional sleep is needed between calls.
    vt = consulta_vt(hash_val)
    otx = consulta_otx(hash_val)
    ha = consulta_ha_overview(hash_val)

    # A helper that failed returns {}, which simply contributes no columns
    # for this row.
    resultados.append({
        "SHA256": hash_val,
        "Type": tipo_objetivo,
        **vt,
        **otx,
        **ha
    })

salida_csv = f"inteligencia_{tipo_objetivo}_1.csv"
pd.DataFrame(resultados).to_csv(salida_csv, index=False)
print(f"Guardado: {salida_csv}")


Procesando: 5e8bb41fd9e9a6cd124c882fa939949e25475bccec5d033f26c25e27021eba06...
Procesando: cf9821c4c08a6d62cbe17ebc6c0d6ea40336c145e8e9369fe76505e1d3dc8674...
Procesando: d1fea52507fc97ff419f8dd2ea8ecf689fb7c066cf8f18453378e95bf399c3d6...
Procesando: bc8dfe6ea7e6b90398e6fe7f7c433cb2d70c9ac37f15798b6d344bc5bc5c5ac8...
Procesando: b592b0cc63b45b9587244e101336b1d24bff029abc90e6fe8a87c47610b59bd3...
Procesando: 67b9e43619deb0b1a6825a1a3282cf57f2312f49b7a3d24a54c57160f1885d13...
Procesando: 674c31406f1b7623041521778bae1e86373bd88616d619733a4c6c9e499e46de...
Procesando: c8cd4eef93aa0fa040a566ff4d9b877cfc4b42a423caa2d2b76a7bdc89d6db18...
Procesando: 64430368874f0d720a2bb31e091b1986fc2ff4dfb581d812a05fa6dda0083230...
Procesando: 7d6d2955cb741cef4fc04a58dcade355450ebcc27003afcd93f17a9df1ea810d...
Procesando: aeb9da3417456ca0869aceca99bda1678bb39eb084f15867ea6cfc6de68bb912...
Procesando: 9d19c6e678c823c2c0e75fb23280ae1308a01f52e36e7e327014502e5f907f1a...
Procesando: c3b2da904aaa469f386afda53067