This repository has been archived by the owner on Feb 14, 2024. It is now read-only.

Add alert_content and connect to the specific asset mentioned
aloftus23 committed May 1, 2022
1 parent 3b9f55b commit 63ee6e2
Showing 4 changed files with 210 additions and 68 deletions.
111 changes: 83 additions & 28 deletions src/pe_source/cybersixgill.py
@@ -4,6 +4,7 @@
from datetime import date, datetime, timedelta
import logging
import sys
import traceback

from .data.pe_db.db_query import (
get_breaches,
@@ -19,8 +20,10 @@
from .data.sixgill.source import (
alerts,
alias_organization,
all_assets_list,
creds,
cve_summary,
get_alerts_content,
mentions,
root_domains,
top_cves,
@@ -29,9 +32,10 @@
# Set today's date formatted YYYY-MM-DD and the start_date 30 days prior
TODAY = date.today()
DAYS_BACK = timedelta(days=30)
START_DATE = str(TODAY - DAYS_BACK)
MENTIONS_DAYS_BACK = timedelta(days=16)
MENTIONS_START_DATE = str(TODAY - MENTIONS_DAYS_BACK)
END_DATE = str(TODAY)
DATE_SPAN = f"[{START_DATE} TO {END_DATE}]"
DATE_SPAN = f"[{MENTIONS_START_DATE} TO {END_DATE}]"

# Set dates to YYYY-MM-DD H:M:S format
NOW = datetime.now()
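
The mentions window is now 16 days rather than 30, and DATE_SPAN is built from it. As a quick worked check of the bracketed range format, using this commit's date as a hypothetical "today":

    from datetime import date, timedelta

    today = date(2022, 5, 1)  # the commit date, purely for illustration
    mentions_start = str(today - timedelta(days=16))
    date_span = f"[{mentions_start} TO {today}]"
    print(date_span)  # prints: [2022-04-15 TO 2022-05-01]
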
@@ -127,6 +131,39 @@ def get_alerts(self, org_id, sixgill_org_id, pe_org_uid, source_uid):
except Exception as e:
logging.error("Failed fetching alert data for %s", org_id)
logging.error(e)
print(traceback.format_exc())
return 1

# Get Alert content
try:
logging.info("Fetching alert content data for %s.", org_id)
# Fetch organization assets
org_assets_dict = all_assets_list(sixgill_org_id)
print(org_assets_dict)
for i, row in alerts_df.iterrows():
try:
alert_id = row["sixgill_id"]
content_snip, asset_mentioned, asset_type = get_alerts_content(
sixgill_org_id, alert_id, org_assets_dict
)
alerts_df.at[i, "content_snip"] = content_snip
alerts_df.at[i, "asset_mentioned"] = asset_mentioned
alerts_df.at[i, "asset_type"] = asset_type
except Exception as e:
logging.error(
"Failed fetching a specific alert content for %s", org_id
)
logging.error(e)
print(traceback.format_exc())
alerts_df.at[i, "content_snip"] = ""
alerts_df.at[i, "asset_mentioned"] = ""
alerts_df.at[i, "asset_type"] = ""
print(alerts_df["asset_mentioned"])

except Exception as e:
logging.error("Failed fetching alert content for %s", org_id)
logging.error(e)
print(traceback.format_exc())
return 1
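
The loop above is the heart of this change: one asset dictionary per organization, then a per-alert lookup that annotates the DataFrame in place. A condensed sketch of the same flow, assuming the package import path implied by the repo layout and a minimal hypothetical DataFrame:

    import pandas as pd

    from pe_source.data.sixgill.source import all_assets_list, get_alerts_content

    sixgill_org_id = "hypothetical-org-id"
    alerts_df = pd.DataFrame({"sixgill_id": ["alert-1", "alert-2"]})  # made-up IDs
    org_assets_dict = all_assets_list(sixgill_org_id)  # {asset: "alias" | "domain" | "ip"}

    for i, row in alerts_df.iterrows():
        snip, asset, asset_type = get_alerts_content(
            sixgill_org_id, row["sixgill_id"], org_assets_dict
        )
        # .at writes a single cell; the new columns are created on first assignment
        alerts_df.at[i, "content_snip"] = snip
        alerts_df.at[i, "asset_mentioned"] = asset
        alerts_df.at[i, "asset_type"] = asset_type
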

# Insert alert data into the PE database
@@ -159,6 +196,7 @@ def get_mentions(self, org_id, sixgill_org_id, pe_org_uid, source_uid):
mentions_df["data_source_uid"] = source_uid
except Exception as e:
logging.error("Failed fetching mentions for %s", org_id)
print(traceback.format_exc())
logging.error(e)
return 1

@@ -195,33 +233,50 @@ def get_credentials(self, org_id, sixgill_org_id, pe_org_uid, source_uid):
return 1

        # Change empty and ambiguous breach names
        creds_df.loc[
            creds_df["breach_name"] == "", "breach_name"
        ] = "Cybersixgill_" + creds_df["breach_id"].astype(str)

        creds_df.loc[
            creds_df["breach_name"] == "Automatic leaked credentials detection",
            "breach_name",
        ] = "Cybersixgill_" + creds_df["breach_id"].astype(str)
        creds_breach_df = creds_df[
            ["breach_name", "description", "breach_date", "password", "data_source_uid"]
        ].reset_index()

        # Create password_included column
        creds_breach_df["password_included"] = creds_breach_df["password"] != ""

        # Group breaches and count the number of credentials
        count_creds = creds_breach_df.groupby(
            [
                "breach_name",
                "description",
                "breach_date",
                "password_included",
                "data_source_uid",
            ]
        ).size()
        creds_breach_df = count_creds.to_frame(name="exposed_cred_count").reset_index()
        creds_breach_df["modified_date"] = creds_breach_df["breach_date"]
        try:
            creds_df.loc[
                creds_df["breach_name"] == "", "breach_name"
            ] = "Cybersixgill_" + creds_df["breach_id"].astype(str)

            creds_df.loc[
                creds_df["breach_name"] == "Automatic leaked credentials detection",
                "breach_name",
            ] = "Cybersixgill_" + creds_df["breach_id"].astype(str)
            creds_breach_df = creds_df[
                [
                    "breach_name",
                    "description",
                    "breach_date",
                    "password",
                    "data_source_uid",
                ]
            ].reset_index()

            # Create password_included column
            creds_breach_df["password_included"] = creds_breach_df["password"] != ""

            # Group breaches and count the number of credentials
            count_creds = creds_breach_df.groupby(
                [
                    "breach_name",
                    "description",
                    "breach_date",
                    "password_included",
                    "data_source_uid",
                ]
            ).size()
            creds_breach_df = count_creds.to_frame(
                name="exposed_cred_count"
            ).reset_index()
            creds_breach_df["modified_date"] = creds_breach_df["breach_date"]
            creds_breach_df.drop_duplicates(
                subset=["breach_name"], keep="first", inplace=True
            )
        except Exception as e:
            logging.error("Probably no credential breaches for %s", org_id)
            print(creds_df)
            logging.error(e)
            return 1

# Insert breach data into the PE database
try:
68 changes: 49 additions & 19 deletions src/pe_source/data/pe_db/db_query.py
@@ -97,33 +97,63 @@ def get_data_source_uid(source)
def insert_sixgill_alerts(df):
"""Insert sixgill alert data."""
conn = connect()
    df = df[
        [
            "alert_name",
            "content",
            "date",
            "sixgill_id",
            "read",
            "severity",
            "site",
            "threat_level",
            "threats",
            "title",
            "user_id",
            "category",
            "lang",
            "organizations_uid",
            "data_source_uid",
        ]
    ]
    try:
        df = df[
            [
                "alert_name",
                "content",
                "date",
                "sixgill_id",
                "read",
                "severity",
                "site",
                "threat_level",
                "threats",
                "title",
                "user_id",
                "category",
                "lang",
                "organizations_uid",
                "data_source_uid",
                "content_snip",
                "asset_mentioned",
                "asset_type",
            ]
        ]
    except Exception as e:
        logging.error(e)
        df = df[
            [
                "alert_name",
                "content",
                "date",
                "sixgill_id",
                "read",
                "severity",
                "site",
                "threat_level",
                "threats",
                "title",
                "user_id",
                "organizations_uid",
                "data_source_uid",
                "content_snip",
                "asset_mentioned",
                "asset_type",
            ]
        ]
table = "alerts"
# Create a list of tuples from the dataframe values
tuples = [tuple(x) for x in df.to_numpy()]
# Comma-separated dataframe columns
cols = ",".join(list(df.columns))
# SQL query to execute
query = """INSERT INTO {}({}) VALUES %s
ON CONFLICT (sixgill_id) DO NOTHING;"""
ON CONFLICT (sixgill_id) DO UPDATE SET
content = EXCLUDED.content,
content_snip = EXCLUDED.content_snip,
asset_mentioned = EXCLUDED.asset_mentioned,
asset_type = EXCLUDED.asset_type;"""
cursor = conn.cursor()
try:
extras.execute_values(
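
Switching the conflict clause from DO NOTHING to DO UPDATE turns the load into an upsert, so re-running a scan refreshes content, content_snip, asset_mentioned, and asset_type on alerts that already exist. A self-contained sketch of the same pattern with psycopg2's execute_values, assuming an open connection and a unique constraint on sixgill_id:

    from psycopg2 import extras

    def upsert_alerts(conn, df):
        """Illustrative upsert of alert rows keyed on sixgill_id."""
        tuples = [tuple(x) for x in df.to_numpy()]
        cols = ",".join(list(df.columns))
        # execute_values expands the single %s into one row per tuple
        query = f"""INSERT INTO alerts({cols}) VALUES %s
            ON CONFLICT (sixgill_id) DO UPDATE SET
            content = EXCLUDED.content,
            content_snip = EXCLUDED.content_snip,
            asset_mentioned = EXCLUDED.asset_mentioned,
            asset_type = EXCLUDED.asset_type;"""
        with conn.cursor() as cursor:
            extras.execute_values(cursor, query, tuples)
        conn.commit()
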
29 changes: 29 additions & 0 deletions src/pe_source/data/sixgill/api.py
@@ -1,4 +1,7 @@
"""Cybersixgill API calls."""
# Standard Python Libraries
import logging

# Third-Party Libraries
import pandas as pd
import requests
@@ -93,6 +96,32 @@ def alerts_count(organization_id):
return resp


def alerts_content(organization_id, alert_id):
"""Get total alert content."""
url = f"https://api.cybersixgill.com/alerts/actionable_alert_content/{alert_id}"
auth = cybersix_token()
headers = {
"Content-Type": "application/json",
"Cache-Control": "no-cache",
"Authorization": "Bearer " + auth,
}
payload = {"organization_id": organization_id, "limit": 10000}
content = requests.get(url, headers=headers, params=payload).json()
try:
content = content["content"]["items"][0]
if "_source" in content:
content = content["_source"]["content"]
elif "description" in content:
content = content["description"]
print(content)
else:
content = ""
except Exception as e:
logging.error("Failed getting content snip: %s", e)
content = ""
return content
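
The extraction above copes with two response shapes: an item carrying an Elasticsearch-style _source document, or an item with only a description. A sketch of that branch logic against made-up payloads:

    es_style = {"content": {"items": [{"_source": {"content": "full post text ..."}}]}}
    plain_style = {"content": {"items": [{"description": "short summary ..."}]}}  # hypothetical shapes

    def extract_content(resp):
        """Mirror of the branch logic in alerts_content (illustrative only)."""
        item = resp["content"]["items"][0]
        if "_source" in item:
            return item["_source"]["content"]
        if "description" in item:
            return item["description"]
        return ""
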


def dve_top_cves(size):
"""Get data about a specific CVE."""
url = "https://api.cybersixgill.com/dve_enrich/top_cves"
70 changes: 49 additions & 21 deletions src/pe_source/data/sixgill/source.py
@@ -8,6 +8,7 @@
import requests

from .api import (
alerts_content,
alerts_count,
alerts_list,
credential_auth,
@@ -25,6 +26,20 @@ def alias_organization(org_id):
return aliases


def all_assets_list(org_id):
"""List an organization's aliases."""
assets = org_assets(org_id)
df_assets = pd.DataFrame(assets)
aliases = df_assets["organization_aliases"].loc["explicit":].tolist()[0]
alias_dict = dict.fromkeys(aliases, "alias")
domain_names = df_assets["domain_names"].loc["explicit":].tolist()[0]
domain_dict = dict.fromkeys(domain_names, "domain")
ips = df_assets["ip_addresses"].loc["explicit":].tolist()[0]
ip_dict = dict.fromkeys(ips, "ip")
assets_dict = {**alias_dict, **domain_dict, **ip_dict}
return assets_dict
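
The returned dictionary flattens three asset classes into one lookup table keyed by the asset string itself, which is what get_alerts_content later scans alert content against. With hypothetical assets it would look like:

    {
        "Example Agency": "alias",
        "example.gov": "domain",
        "192.0.2.1": "ip",
    }
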


def root_domains(org_id):
"""Get root domains."""
assets = org_assets(org_id)
@@ -39,7 +54,7 @@ def mentions(date, aliases):
for mention in aliases:
mentions += '"' + mention + '"' + ","
mentions = mentions[:-1]
query = "site:forum_* AND date:" + date + " AND " + "(" + str(mentions) + ")"
query = "date:" + date + " AND " + "(" + str(mentions) + ")"
logging.info("Query:")
logging.info(query)
count = 1
@@ -57,26 +72,15 @@

i = 0
all_mentions = []
if count_total < 10000:
while i < count_total:
# Recommended "from" and "result_size" is 50. The maximum is 400.
resp = intel_post(query, frm=i, scroll=False, result_size=200)
i += 200
logging.info("Getting %s of %s....", i, count_total)
intel_items = resp["intel_items"]
df_mentions = pd.DataFrame.from_dict(intel_items)
all_mentions.append(df_mentions)
df_all_mentions = pd.concat(all_mentions).reset_index(drop=True)
else:
while i < count_total:
# Recommended "from" and "result_size" is 50. The maximum is 400.
resp = intel_post(query, frm=i, scroll=True, result_size=400)
i += 400
logging.info("Getting %s of %s....", i, count_total)
intel_items = resp["intel_items"]
df_mentions = pd.DataFrame.from_dict(intel_items)
all_mentions.append(df_mentions)
df_all_mentions = pd.concat(all_mentions).reset_index(drop=True)
while i < count_total:
# Recommended "from" and "result_size" is 50. The maximum is 400.
resp = intel_post(query, frm=i, scroll=False, result_size=50)
i += 50
logging.info("Getting %s of %s....", i, count_total)
intel_items = resp["intel_items"]
df_mentions = pd.DataFrame.from_dict(intel_items)
all_mentions.append(df_mentions)
df_all_mentions = pd.concat(all_mentions).reset_index(drop=True)

return df_all_mentions
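
One edge case in the simplified loop: when count_total is 0 the while body never runs, all_mentions stays empty, and pd.concat raises ValueError ("No objects to concatenate"). A small guard, as a sketch:

    if all_mentions:
        df_all_mentions = pd.concat(all_mentions).reset_index(drop=True)
    else:
        df_all_mentions = pd.DataFrame()  # no mentions found in the window
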

@@ -97,9 +101,33 @@ def alerts(org_id):
all_alerts.append(df_alerts)
df_all_alerts = pd.concat(all_alerts).reset_index(drop=True)

# Fetch the full content of each alert
# for i, r in df_all_alerts.iterrows():
# print(r["id"])
# content = alerts_content(org_id, r["id"])
# df_all_alerts.at[i, "content"] = content

return df_all_alerts


def get_alerts_content(organization_id, alert_id, org_assets_dict):
"""Get alert content snippet."""
asset_mentioned = ""
snip = ""
asset_type = ""
content = alerts_content(organization_id, alert_id)
if content:
for asset, type in org_assets_dict.items():
if asset in content:
index = content.index(asset)
snip = content[(index - 100) : (index + len(asset) + 100)]
snip = "..." + snip + "..."
asset_mentioned = asset
asset_type = type
logging.info("Asset mentioned: %s", asset_mentioned)
return snip, asset_mentioned, asset_type
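
The snippet slice assumes the matched asset sits at least 100 characters into the content; when index < 100, index - 100 goes negative and Python slices from the end of the string, so the snippet comes out empty or wrong. A clamped variant (which also avoids shadowing the built-in type):

    for asset, asset_kind in org_assets_dict.items():
        if asset in content:
            index = content.index(asset)
            start = max(index - 100, 0)  # clamp so matches near the start still snip
            end = index + len(asset) + 100  # slicing past the end is safe
            snip = "..." + content[start:end] + "..."
            asset_mentioned = asset
            asset_type = asset_kind
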


def top_cves(size):
"""Top 10 CVEs mentioned in the dark web."""
resp = dve_top_cves(size)
