VirusTotal | Shodan | GreyNoise API Scripting

In [1]:
from pyspark import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, concat, col
import requests
import shodan

Creating Spark Object:

In [2]:
# Obtain the Spark session through the builder, which is the documented entry
# point and hands back the already-running session when this cell is re-run.
spark = SparkSession.builder.getOrCreate()
# Keep `sc` defined for any code that works with the underlying SparkContext.
sc = spark.sparkContext

Creating a DataFrame for each of the Bro log CSVs:

In [3]:
# Load every log CSV with the same reader settings: the first row supplies the
# column names and Spark infers each column's type from the data.
df_conn, df_dhcp, df_dns, df_files, df_http = (
    spark.read.csv(csv_path, inferSchema=True, header=True)
    for csv_path in (
        "conn.log.csv",
        "dhcp.log.csv",
        "dns.log.csv",
        "files.log.csv",
        "http.log.csv",
    )
)

The following is my attempt at "caching": from each DataFrame I extract only the pieces of information that are most relevant to pass on to the APIs. In addition, I save every queried value together with its API response to a text file.

Instead of a dynamic-programming-like (memoization) approach that checks whether a value has already been sent to an API, I build a set of unique values up front. Each value therefore only has to be looked up once, which saves both space and time.

My approach:
Create individual dataframes of unique values in column --> Convert to lists to iterate --> pass on values to apis

In [4]:
# Build one single-column DataFrame of distinct values for every field that
# will later be handed to the APIs.

def _distinct_column(frame, column):
    """Return a one-column DataFrame with the distinct values of `column`.

    `column` may be a column name or a Column expression.
    """
    return frame.select(column).distinct()


# important http columns
http_ip_response = _distinct_column(df_http, "id_resp_h")
http_domain = _distinct_column(df_http, "host")
# Full_URL is the host immediately followed by the uri.
http_url = _distinct_column(
    df_http,
    concat(col("host"), col("uri")).alias("Full_URL"),
)

# important connection columns
conn_ip_origin = _distinct_column(df_conn, "id_orig_h")
conn_ip_response = _distinct_column(df_conn, "id_resp_h")

# important dns columns
dns_ip_origin = _distinct_column(df_dns, "id_orig_h")
dns_ip_response = _distinct_column(df_dns, "id_resp_h")
dns_domain = _distinct_column(df_dns, "query")

# important files columns
files_ip = _distinct_column(df_files, "tx_hosts")
files_md5 = _distinct_column(df_files, "md5")
files_sha1 = _distinct_column(df_files, "sha1")


Convert each column to a Python list so that it can be iterated over

In [5]:
print("Total number of unique items to pass to api\n")


def _collect_non_local_ips(frame, column):
    """Collect `column` from `frame`, dropping nulls and 192.168.x.x addresses.

    The previous filter was a substring test (`"192.168" not in value`). That
    also discarded unrelated addresses such as 10.192.168.7, and it raised
    TypeError when a cell was null. Anchoring on the "192.168." prefix removes
    only addresses that actually start with that range.
    """
    values = (row[column] for row in frame.collect())
    return [
        ip for ip in values
        if ip is not None and not ip.startswith("192.168.")
    ]


#IPs
http_ip_response = _collect_non_local_ips(http_ip_response, "id_resp_h")
conn_ip_origin = _collect_non_local_ips(conn_ip_origin, "id_orig_h")
conn_ip_response = _collect_non_local_ips(conn_ip_response, "id_resp_h")
dns_ip_origin = _collect_non_local_ips(dns_ip_origin, "id_orig_h")
dns_ip_response = _collect_non_local_ips(dns_ip_response, "id_resp_h")
# NOTE(review): tx_hosts was never filtered for 192.168 addresses; that is kept
# as-is. Only nulls are dropped, because the "." / ":" membership tests below
# would raise TypeError on None.
files_ip = [row["tx_hosts"] for row in files_ip.collect() if row["tx_hosts"] is not None]
#final combination and filtering of duplicate IPs and splitting into ipv4 and ipv6
ip_list = list(set(http_ip_response + conn_ip_origin + conn_ip_response + dns_ip_origin + dns_ip_response + files_ip))
ip4_list = [ip for ip in ip_list if "." in ip]
ip6_list = [ip for ip in ip_list if ":" in ip]
print("IPs: {}\nIP4s: {}\nIP6s: {}".format(len(ip_list), len(ip4_list), len(ip6_list)))

#File Hashes
files_md5 = [row["md5"] for row in files_md5.collect()]
files_sha1 = [row["sha1"] for row in files_sha1.collect()]
print("MD5: {}\nSHA1: {}".format(len(files_md5), len(files_sha1)))

#URLs
http_url = [row["Full_URL"] for row in http_url.collect()]
print("URLs: {}".format(len(http_url)))

#Domains
http_domain = [row["host"] for row in http_domain.collect()]
dns_domain = [row["query"] for row in dns_domain.collect()]
#final combination and filtering of duplicate domains
domain_list = list(set(http_domain + dns_domain))
print("Domains: {}".format(str(len(domain_list))))


Total number of unique items to pass to api

IPs: 51
IP4s: 13
IP6s: 38
MD5: 4
SHA1: 4
URLs: 7
Domains: 1603


VirusTotal API Calls

In [6]:
import os
import time

# Base URL shared by every VirusTotal v2 lookup helper in this notebook.
endpoint_prefix = "https://www.virustotal.com/vtapi/v2/"

# SECURITY: the API key must not live in the notebook, where it leaks through
# version control and shared copies. It is read from the VT_API_KEY environment
# variable instead. The key that used to be embedded here should be treated as
# compromised and revoked.
apikey = os.environ.get("VT_API_KEY", "")
if not apikey:
    print("WARNING: VT_API_KEY is not set; VirusTotal requests will not be authorised.")

def ip_check(apikey, ip, timeout=30):
    """Fetch the VirusTotal v2 IP-address report for one address.

    Args:
        apikey: API key sent as the ``apikey`` query parameter.
        ip: Address sent as the ``ip`` query parameter.
        timeout: Seconds to wait on the server. Without it a stalled
            connection would block the notebook indefinitely.

    Returns:
        A string of the form ``"<ip>: <decoded JSON response>"``.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
        ValueError: if the response body is not valid JSON.
    """
    url = endpoint_prefix + "ip-address/report"
    params = {'apikey': apikey, 'ip': ip}
    response = requests.get(url, params=params, timeout=timeout)
    return "{}: {}".format(ip, str(response.json()))

def sha1_check(apikey, sha1, timeout=30):
    """Fetch the VirusTotal v2 file report for a SHA-1 hash.

    Args:
        apikey: API key sent as the ``apikey`` query parameter.
        sha1: Hash sent as the ``resource`` query parameter.
        timeout: Seconds to wait on the server. Without it a stalled
            connection would block the notebook indefinitely.

    Returns:
        A string of the form ``"<sha1>: <decoded JSON response>"``.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
        ValueError: if the response body is not valid JSON.
    """
    url = endpoint_prefix + "file/report"
    params = {'apikey': apikey, 'resource': sha1}
    response = requests.get(url, params=params, timeout=timeout)
    return "{}: {}".format(sha1, str(response.json()))
    
def md5_check(apikey, md5, timeout=30):
    """Fetch the VirusTotal v2 file report for an MD5 hash.

    Args:
        apikey: API key sent as the ``apikey`` query parameter.
        md5: Hash sent as the ``resource`` query parameter.
        timeout: Seconds to wait on the server. Without it a stalled
            connection would block the notebook indefinitely.

    Returns:
        A string of the form ``"<md5>: <decoded JSON response>"``.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
        ValueError: if the response body is not valid JSON.
    """
    url = endpoint_prefix + "file/report"
    params = {'apikey': apikey, 'resource': md5}
    response = requests.get(url, params=params, timeout=timeout)
    return "{}: {}".format(md5, str(response.json()))

def url_check(apikey, passed_url, timeout=30):
    """Fetch the VirusTotal v2 URL report for one URL.

    The request sets ``allinfo`` and ``scan=1`` alongside the URL.

    Args:
        apikey: API key sent as the ``apikey`` query parameter.
        passed_url: URL sent as the ``resource`` query parameter.
        timeout: Seconds to wait on the server. Without it a stalled
            connection would block the notebook indefinitely.

    Returns:
        A string of the form ``"<passed_url>: <decoded JSON response>"``.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
        ValueError: if the response body is not valid JSON.
    """
    url = endpoint_prefix + "url/report"
    params = {'apikey': apikey, 'resource': passed_url, 'allinfo': True, 'scan': 1}
    response = requests.get(url, params=params, timeout=timeout)
    return "{}: {}".format(passed_url, str(response.json()))

def domain_check(apikey, domain, timeout=30):
    """Fetch the VirusTotal v2 domain report for one domain name.

    Args:
        apikey: API key sent as the ``apikey`` query parameter.
        domain: Domain name sent as the ``domain`` query parameter.
        timeout: Seconds to wait on the server. Without it a stalled
            connection would block the notebook indefinitely.

    Returns:
        A string of the form ``"<domain>: <decoded JSON response>"``.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
        ValueError: if the response body is not valid JSON.
    """
    url = endpoint_prefix + "domain/report"
    # The v2 domain/report endpoint takes the name under `domain`; the previous
    # `resource` key is what file/report and url/report use, not this endpoint.
    params = {'apikey': apikey, 'domain': domain}
    response = requests.get(url, params=params, timeout=timeout)
    return "{}: {}".format(domain, str(response.json()))

# Each entry pairs a lookup helper with the values to send through it; the
# pairs are processed in the listed order and every result is appended as one
# line to "Saved Info.txt".
# NOTE(review): the domain list is sent through url_check, not domain_check,
# exactly as before -- confirm that this is intended.
lookups = [
    (url_check, domain_list),
    (ip_check, ip4_list),
    (md5_check, files_md5),
    (sha1_check, files_sha1),
    (url_check, http_url),
]

with open("Saved Info.txt", 'a') as out_file:
    for check, indicators in lookups:
        for indicator in indicators:
            try:
                out_file.write("{}\n".format(check(apikey, indicator)))
            except Exception as e:
                # `except Error` referenced an undefined name, so any failure
                # surfaced as a NameError and aborted the whole run. Report
                # the problem and carry on with the next value instead.
                print(e)
            # Pause between requests whether or not the last one succeeded,
            # so a failed call does not shorten the gap before the next one.
            time.sleep(15)

Finding the IOCs

In [6]:
import ast
import json
import re

# Re-read the saved "<indicator>: <response dict>" lines and keep only those
# whose response mentions a non-zero 'positives' count.
text = {}
with open("Saved Info.txt", 'r') as file1:
    for line in file1:
        if not re.search("'positives': [1-9].", line):
            continue
        end = line.find('{')
        if end == -1:
            # No dict payload on this line, so there is nothing to parse.
            continue
        indicator = line[0:end-2]  # drop the ": " separator before the dict
        try:
            # SECURITY: this used to be eval(), which executes whatever the
            # file contains. literal_eval only accepts Python literals, which
            # is all a str()-formatted JSON response consists of.
            text[indicator] = ast.literal_eval(line[end:].strip())
        except (ValueError, SyntaxError) as err:
            print("Skipping unparsable line for {}: {}".format(indicator, err))

sha256_IOCs = set()
url_IOCs = set()

# Both sample lists carry entries with 'positives' and 'sha256' fields and are
# handled identically.
sample_keys = ('detected_referrer_samples', 'detected_downloaded_samples')

for dictionary in text.values():
    for sample_key in sample_keys:
        for sample_info in dictionary.get(sample_key, []):
            # Entries are already parsed dicts; no eval(str(...)) round-trip.
            if sample_info['positives'] > 0:
                sha256_IOCs.add(sample_info['sha256'])
    for url_info in dictionary.get('detected_urls', []):
        if url_info['positives'] > 0:
            url_IOCs.add(url_info['url'])


print("Number of malicious samples found at IPs or referred by VirusTotal: {}".format(len(sha256_IOCs)))
print("Number of latest URLs seen under the domain or IP address being studied: {}".format(len(url_IOCs)))

print(url_IOCs)
print(sha256_IOCs)

Number of malicious samples found at IPs or referred by VirusTotal: 597
Number of latest URLs seen under the domain or IP address being studied: 488
{'http://mafund.cn/News/62263.html', 'http://07.super5566.com/report/', 'http://blissfullyshare.biz/', 'https://nortonovl.ns01.us/', 'http://smic-school.cn/123/20160228802.shtml', 'http://www.icanhazip.com/', 'http://05.microsoftcloudserver.com/86.exe', 'http://servizio-clinete-bacnoposta.otzo.com/', 'http://ethdigitalcampus.com/2iC3sFF', 'http://freedown.xyz/', 'http://www.customercares.de/novus/', 'https://07.super5566.com/', 'http://ftp.ukraine.https443.org/', 'http://dyru.ajisainyc.com/pagjfut54.php', 'http://netfli-x.digital/', 'https://mcfp.felk.cvut.cz/publicDatasets/CTU-Malware-Capture-Botnet-151-1/tools/gh0st_decode.py', 'http://vsetut.biz/', 'https://appleorderconfirm1.servehttp.com/', 'https://34d050488e89441e.top/', 'http://sexyjapan.ddns.info/', 'http://mafund.cn/News/37337.html', 'http://zgrshy06.zyns.com/', 'http://crabbs.so