This repository has been archived by the owner on Feb 14, 2024. It is now read-only.

Add alert_content and connect to the specific asset mentioned
aloftus23 committed May 1, 2022
1 parent 3b9f55b commit 63ee6e2
Showing 4 changed files with 210 additions and 68 deletions.
111 changes: 83 additions & 28 deletions src/pe_source/cybersixgill.py
@@ -4,6 +4,7 @@
from datetime import date, datetime, timedelta
import logging
import sys
import traceback

from .data.pe_db.db_query import (
get_breaches,
@@ -19,8 +20,10 @@
from .data.sixgill.source import (
alerts,
alias_organization,
all_assets_list,
creds,
cve_summary,
get_alerts_content,
mentions,
root_domains,
top_cves,
@@ -29,9 +32,10 @@
# Set today's date formatted YYYY-MM-DD and the start_date 30 days prior
TODAY = date.today()
DAYS_BACK = timedelta(days=30)
START_DATE = str(TODAY - DAYS_BACK)
MENTIONS_DAYS_BACK = timedelta(days=16)
MENTIONS_START_DATE = str(TODAY - MENTIONS_DAYS_BACK)
END_DATE = str(TODAY)
DATE_SPAN = f"[{START_DATE} TO {END_DATE}]"
DATE_SPAN = f"[{MENTIONS_START_DATE} TO {END_DATE}]"

# Set dates to YYYY-MM-DD H:M:S format
NOW = datetime.now()
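
The mentions window is now 16 days rather than 30, and DATE_SPAN is built from it. As a quick worked check of the bracketed range format, using this commit's date as a hypothetical "today":

    from datetime import date, timedelta

    today = date(2022, 5, 1)  # the commit date, purely for illustration
    mentions_start = str(today - timedelta(days=16))
    date_span = f"[{mentions_start} TO {today}]"
    print(date_span)  # prints: [2022-04-15 TO 2022-05-01]
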
@@ -127,6 +131,39 @@ def get_alerts(self, org_id, sixgill_org_id, pe_org_uid, source_uid):
except Exception as e:
logging.error("Failed fetching alert data for %s", org_id)
logging.error(e)
print(traceback.format_exc())
return 1

# Get Alert content
try:
logging.info("Fetching alert content data for %s.", org_id)
# Fetch organization assets
org_assets_dict = all_assets_list(sixgill_org_id)
print(org_assets_dict)
for i, row in alerts_df.iterrows():
try:
alert_id = row["sixgill_id"]
content_snip, asset_mentioned, asset_type = get_alerts_content(
sixgill_org_id, alert_id, org_assets_dict
)
alerts_df.at[i, "content_snip"] = content_snip
alerts_df.at[i, "asset_mentioned"] = asset_mentioned
alerts_df.at[i, "asset_type"] = asset_type
except Exception as e:
logging.error(
"Failed fetching a specific alert content for %s", org_id
)
logging.error(e)
print(traceback.format_exc())
alerts_df.at[i, "content_snip"] = ""
alerts_df.at[i, "asset_mentioned"] = ""
alerts_df.at[i, "asset_type"] = ""
print(alerts_df["asset_mentioned"])

except Exception as e:
logging.error("Failed fetching alert content for %s", org_id)
logging.error(e)
print(traceback.format_exc())
return 1
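
The loop above is the heart of this change: one asset dictionary per organization, then a per-alert lookup that annotates the DataFrame in place. A condensed sketch of the same flow, assuming the package import path implied by the repo layout and a minimal hypothetical DataFrame:

    import pandas as pd

    from pe_source.data.sixgill.source import all_assets_list, get_alerts_content

    sixgill_org_id = "hypothetical-org-id"
    alerts_df = pd.DataFrame({"sixgill_id": ["alert-1", "alert-2"]})  # made-up IDs
    org_assets_dict = all_assets_list(sixgill_org_id)  # {asset: "alias" | "domain" | "ip"}

    for i, row in alerts_df.iterrows():
        snip, asset, asset_type = get_alerts_content(
            sixgill_org_id, row["sixgill_id"], org_assets_dict
        )
        # .at writes a single cell; the new columns are created on first assignment
        alerts_df.at[i, "content_snip"] = snip
        alerts_df.at[i, "asset_mentioned"] = asset
        alerts_df.at[i, "asset_type"] = asset_type
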

# Insert alert data into the PE database
@@ -159,6 +196,7 @@ def get_mentions(self, org_id, sixgill_org_id, pe_org_uid, source_uid):
mentions_df["data_source_uid"] = source_uid
except Exception as e:
logging.error("Failed fetching mentions for %s", org_id)
print(traceback.format_exc())
logging.error(e)
return 1

@@ -195,33 +233,50 @@ def get_credentials(self, org_id, sixgill_org_id, pe_org_uid, source_uid):
return 1

        # Change empty and ambiguous breach names
        creds_df.loc[
            creds_df["breach_name"] == "", "breach_name"
        ] = "Cybersixgill_" + creds_df["breach_id"].astype(str)

        creds_df.loc[
            creds_df["breach_name"] == "Automatic leaked credentials detection",
            "breach_name",
        ] = "Cybersixgill_" + creds_df["breach_id"].astype(str)
        creds_breach_df = creds_df[
            ["breach_name", "description", "breach_date", "password", "data_source_uid"]
        ].reset_index()

        # Create password_included column
        creds_breach_df["password_included"] = creds_breach_df["password"] != ""

        # Group breaches and count the number of credentials
        count_creds = creds_breach_df.groupby(
            [
                "breach_name",
                "description",
                "breach_date",
                "password_included",
                "data_source_uid",
            ]
        ).size()
        creds_breach_df = count_creds.to_frame(name="exposed_cred_count").reset_index()
        creds_breach_df["modified_date"] = creds_breach_df["breach_date"]
        try:
            creds_df.loc[
                creds_df["breach_name"] == "", "breach_name"
            ] = "Cybersixgill_" + creds_df["breach_id"].astype(str)

            creds_df.loc[
                creds_df["breach_name"] == "Automatic leaked credentials detection",
                "breach_name",
            ] = "Cybersixgill_" + creds_df["breach_id"].astype(str)
            creds_breach_df = creds_df[
                [
                    "breach_name",
                    "description",
                    "breach_date",
                    "password",
                    "data_source_uid",
                ]
            ].reset_index()

            # Create password_included column
            creds_breach_df["password_included"] = creds_breach_df["password"] != ""

            # Group breaches and count the number of credentials
            count_creds = creds_breach_df.groupby(
                [
                    "breach_name",
                    "description",
                    "breach_date",
                    "password_included",
                    "data_source_uid",
                ]
            ).size()
            creds_breach_df = count_creds.to_frame(
                name="exposed_cred_count"
            ).reset_index()
            creds_breach_df["modified_date"] = creds_breach_df["breach_date"]
            creds_breach_df.drop_duplicates(
                subset=["breach_name"], keep="first", inplace=True
            )
        except Exception as e:
            logging.error("Probably no credential breaches for %s", org_id)
            print(creds_df)
            logging.error(e)
            return 1

# Insert breach data into the PE database
try:
68 changes: 49 additions & 19 deletions src/pe_source/data/pe_db/db_query.py
@@ -97,33 +97,63 @@ def get_data_source_uid(source)
def insert_sixgill_alerts(df):
"""Insert sixgill alert data."""
conn = connect()
    df = df[
        [
            "alert_name",
            "content",
            "date",
            "sixgill_id",
            "read",
            "severity",
            "site",
            "threat_level",
            "threats",
            "title",
            "user_id",
            "category",
            "lang",
            "organizations_uid",
            "data_source_uid",
        ]
    ]
    try:
        df = df[
            [
                "alert_name",
                "content",
                "date",
                "sixgill_id",
                "read",
                "severity",
                "site",
                "threat_level",
                "threats",
                "title",
                "user_id",
                "category",
                "lang",
                "organizations_uid",
                "data_source_uid",
                "content_snip",
                "asset_mentioned",
                "asset_type",
            ]
        ]
    except Exception as e:
        logging.error(e)
        df = df[
            [
                "alert_name",
                "content",
                "date",
                "sixgill_id",
                "read",
                "severity",
                "site",
                "threat_level",
                "threats",
                "title",
                "user_id",
                "organizations_uid",
                "data_source_uid",
                "content_snip",
                "asset_mentioned",
                "asset_type",
            ]
        ]
table = "alerts"
# Create a list of tuples from the dataframe values
tuples = [tuple(x) for x in df.to_numpy()]
# Comma-separated dataframe columns
cols = ",".join(list(df.columns))
# SQL query to execute
query = """INSERT INTO {}({}) VALUES %s
ON CONFLICT (sixgill_id) DO NOTHING;"""
ON CONFLICT (sixgill_id) DO UPDATE SET
content = EXCLUDED.content,
content_snip = EXCLUDED.content_snip,
asset_mentioned = EXCLUDED.asset_mentioned,
asset_type = EXCLUDED.asset_type;"""
cursor = conn.cursor()
try:
extras.execute_values(
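
Switching the conflict clause from DO NOTHING to DO UPDATE turns the load into an upsert, so re-running a scan refreshes content, content_snip, asset_mentioned, and asset_type on alerts that already exist. A self-contained sketch of the same pattern with psycopg2's execute_values, assuming an open connection and a unique constraint on sixgill_id:

    from psycopg2 import extras

    def upsert_alerts(conn, df):
        """Illustrative upsert of alert rows keyed on sixgill_id."""
        tuples = [tuple(x) for x in df.to_numpy()]
        cols = ",".join(list(df.columns))
        # execute_values expands the single %s into one row per tuple
        query = f"""INSERT INTO alerts({cols}) VALUES %s
            ON CONFLICT (sixgill_id) DO UPDATE SET
            content = EXCLUDED.content,
            content_snip = EXCLUDED.content_snip,
            asset_mentioned = EXCLUDED.asset_mentioned,
            asset_type = EXCLUDED.asset_type;"""
        with conn.cursor() as cursor:
            extras.execute_values(cursor, query, tuples)
        conn.commit()
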
29 changes: 29 additions & 0 deletions src/pe_source/data/sixgill/api.py
@@ -1,4 +1,7 @@
"""Cybersixgill API calls."""
# Standard Python Libraries
import logging

# Third-Party Libraries
import pandas as pd
import requests
@@ -93,6 +96,32 @@ def alerts_count(organization_id):
return resp


def alerts_content(organization_id, alert_id):
"""Get total alert content."""
url = f"https://api.cybersixgill.com/alerts/actionable_alert_content/{alert_id}"
auth = cybersix_token()
headers = {
"Content-Type": "application/json",
"Cache-Control": "no-cache",
"Authorization": "Bearer " + auth,
}
payload = {"organization_id": organization_id, "limit": 10000}
content = requests.get(url, headers=headers, params=payload).json()
try:
content = content["content"]["items"][0]
if "_source" in content:
content = content["_source"]["content"]
elif "description" in content:
content = content["description"]
print(content)
else:
content = ""
except Exception as e:
logging.error("Failed getting content snip: %s", e)
content = ""
return content
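
The extraction above copes with two response shapes: an item carrying an Elasticsearch-style _source document, or an item with only a description. A sketch of that branch logic against made-up payloads:

    es_style = {"content": {"items": [{"_source": {"content": "full post text ..."}}]}}
    plain_style = {"content": {"items": [{"description": "short summary ..."}]}}  # hypothetical shapes

    def extract_content(resp):
        """Mirror of the branch logic in alerts_content (illustrative only)."""
        item = resp["content"]["items"][0]
        if "_source" in item:
            return item["_source"]["content"]
        if "description" in item:
            return item["description"]
        return ""
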


def dve_top_cves(size):
"""Get data about a specific CVE."""
url = "https://api.cybersixgill.com/dve_enrich/top_cves"
70 changes: 49 additions & 21 deletions src/pe_source/data/sixgill/source.py
@@ -8,6 +8,7 @@
import requests

from .api import (
alerts_content,
alerts_count,
alerts_list,
credential_auth,
@@ -25,6 +26,20 @@ def alias_organization(org_id):
return aliases


def all_assets_list(org_id):
"""List an organization's aliases."""
assets = org_assets(org_id)
df_assets = pd.DataFrame(assets)
aliases = df_assets["organization_aliases"].loc["explicit":].tolist()[0]
alias_dict = dict.fromkeys(aliases, "alias")
domain_names = df_assets["domain_names"].loc["explicit":].tolist()[0]
domain_dict = dict.fromkeys(domain_names, "domain")
ips = df_assets["ip_addresses"].loc["explicit":].tolist()[0]
ip_dict = dict.fromkeys(ips, "ip")
assets_dict = {**alias_dict, **domain_dict, **ip_dict}
return assets_dict
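
The returned dictionary flattens three asset classes into one lookup table keyed by the asset string itself, which is what get_alerts_content later scans alert content against. With hypothetical assets it would look like:

    {
        "Example Agency": "alias",
        "example.gov": "domain",
        "192.0.2.1": "ip",
    }
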


def root_domains(org_id):
"""Get root domains."""
assets = org_assets(org_id)
@@ -39,7 +54,7 @@ def mentions(date, aliases):
for mention in aliases:
mentions += '"' + mention + '"' + ","
mentions = mentions[:-1]
query = "site:forum_* AND date:" + date + " AND " + "(" + str(mentions) + ")"
query = "date:" + date + " AND " + "(" + str(mentions) + ")"
logging.info("Query:")
logging.info(query)
count = 1
@@ -57,26 +72,15 @@

i = 0
all_mentions = []
if count_total < 10000:
while i < count_total:
# Recommended "from" and "result_size" is 50. The maximum is 400.
resp = intel_post(query, frm=i, scroll=False, result_size=200)
i += 200
logging.info("Getting %s of %s....", i, count_total)
intel_items = resp["intel_items"]
df_mentions = pd.DataFrame.from_dict(intel_items)
all_mentions.append(df_mentions)
df_all_mentions = pd.concat(all_mentions).reset_index(drop=True)
else:
while i < count_total:
# Recommended "from" and "result_size" is 50. The maximum is 400.
resp = intel_post(query, frm=i, scroll=True, result_size=400)
i += 400
logging.info("Getting %s of %s....", i, count_total)
intel_items = resp["intel_items"]
df_mentions = pd.DataFrame.from_dict(intel_items)
all_mentions.append(df_mentions)
df_all_mentions = pd.concat(all_mentions).reset_index(drop=True)
while i < count_total:
# Recommended "from" and "result_size" is 50. The maximum is 400.
resp = intel_post(query, frm=i, scroll=False, result_size=50)
i += 50
logging.info("Getting %s of %s....", i, count_total)
intel_items = resp["intel_items"]
df_mentions = pd.DataFrame.from_dict(intel_items)
all_mentions.append(df_mentions)
df_all_mentions = pd.concat(all_mentions).reset_index(drop=True)

return df_all_mentions
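
One edge case in the simplified loop: when count_total is 0 the while body never runs, all_mentions stays empty, and pd.concat raises ValueError ("No objects to concatenate"). A small guard, as a sketch:

    if all_mentions:
        df_all_mentions = pd.concat(all_mentions).reset_index(drop=True)
    else:
        df_all_mentions = pd.DataFrame()  # no mentions found in the window
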

@@ -97,9 +101,33 @@ def alerts(org_id):
all_alerts.append(df_alerts)
df_all_alerts = pd.concat(all_alerts).reset_index(drop=True)

# Fetch the full content of each alert
# for i, r in df_all_alerts.iterrows():
# print(r["id"])
# content = alerts_content(org_id, r["id"])
# df_all_alerts.at[i, "content"] = content

return df_all_alerts


def get_alerts_content(organization_id, alert_id, org_assets_dict):
"""Get alert content snippet."""
asset_mentioned = ""
snip = ""
asset_type = ""
content = alerts_content(organization_id, alert_id)
if content:
for asset, type in org_assets_dict.items():
if asset in content:
index = content.index(asset)
snip = content[(index - 100) : (index + len(asset) + 100)]
snip = "..." + snip + "..."
asset_mentioned = asset
asset_type = type
logging.info("Asset mentioned: %s", asset_mentioned)
return snip, asset_mentioned, asset_type
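
The snippet slice assumes the matched asset sits at least 100 characters into the content; when index < 100, index - 100 goes negative and Python slices from the end of the string, so the snippet comes out empty or wrong. A clamped variant (which also avoids shadowing the built-in type):

    for asset, asset_kind in org_assets_dict.items():
        if asset in content:
            index = content.index(asset)
            start = max(index - 100, 0)  # clamp so matches near the start still snip
            end = index + len(asset) + 100  # slicing past the end is safe
            snip = "..." + content[start:end] + "..."
            asset_mentioned = asset
            asset_type = asset_kind
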


def top_cves(size):
"""Top 10 CVEs mentioned in the dark web."""
resp = dve_top_cves(size)
