In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one; later cells rely on this to show e.g. both a length and a head() preview.
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
import os
import re
import time
import requests
import datetime
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Method to scrape the infection data from the HTML

In [3]:
def get_infections(source_html):
    """Parse the infections listed in a page snapshot into a DataFrame.

    Parameters
    ----------
    source_html : str
        Page source taken from the selenium driver.

    Returns
    -------
    pandas.DataFrame
        One row per parsed ``li`` element whose class is ``clearfix``, with the
        columns 'Datetime' (time of parsing), 'Infection Time', 'Botnet Name',
        'Originating ISP', 'OrgID', 'Country' and 'City'. When the snapshot
        contains no parsable entry, an empty frame with those same columns is
        returned instead of raising.
    """
    column_names = ['Datetime', 'Infection Time', 'Botnet Name', 'Originating ISP', 'OrgID',
                    'Country', 'City']
    cell_classes = ['col-md-1', 'col-md-2', 'col-md-3']

    soup = BeautifulSoup(source_html, 'html5lib')
    list_processed_infections = []

    for val in soup.select('li[class="clearfix"]'):
        # Entries containing a non-breaking space are skipped
        # (presumably placeholder / not-yet-filled rows -- TODO confirm).
        if u'\xa0' in val.text:
            continue

        # First field is the scrape timestamp, followed by the text of each data cell.
        series_infection = [datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")]
        series_infection.extend(
            soup_infection.text
            for soup_infection in val.find_all('div', {'class': cell_classes})
        )
        list_processed_infections.append(series_infection)

    # An empty list would build a zero-column frame, and assigning the seven
    # column names to it raises ValueError; return a typed empty frame instead.
    if not list_processed_infections:
        return pd.DataFrame(columns=column_names)

    df_infections = pd.DataFrame(list_processed_infections)
    df_infections.columns = column_names
    return df_infections

- Create the driver that will be used to extract the data from the 'Looking Glass Cyber' site.

In [4]:
def load_driver_Firefox():
    """Start the Firefox WebDriver used to scrape the Looking Glass Cyber site.

    Opens https://map.lookingglasscyber.com/ in a maximized window, clicks the
    button located by ``toggle_button_xpath``, clicks the ``display_checkbox``
    element and clicks the button again, so that the layer holding the data
    to scrape is enabled.

    Returns
    -------
    selenium.webdriver.Firefox
        The driver, left on the map page after the clicks above.

    Raises
    ------
    Exception
        Any error raised while loading or configuring the page is re-raised
        after the browser has been closed, so a failed setup does not leave
        an orphaned Firefox window behind.
    """
    toggle_button_xpath = '/html/body/div[2]/div[1]/div[1]/div[3]/button/span'

    driver = webdriver.Firefox()
    try:
        time.sleep(0.5)
        driver.maximize_window()

        driver.get('https://map.lookingglasscyber.com/')
        time.sleep(5)

        # Process to enable the layer from which the data is to be scraped.
        # Wait until the button can actually be clicked and click the element
        # the wait returns, instead of looking it up a second time.
        toggle_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, toggle_button_xpath))
        )
        toggle_button.click()
        time.sleep(1)

        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
        # was removed in Selenium 4.3.
        driver.find_element(By.XPATH, '//*[@id="display_checkbox"]').click()
        time.sleep(1.5)

        driver.find_element(By.XPATH, toggle_button_xpath).click()
        time.sleep(1)
    except Exception:
        driver.quit()
        raise

    return driver

In [5]:
def download_infection_data(web_driver=None, max_bytes=10000000, output_dir=os.path.join('.', 'data')):
    """Poll the page source for infections and save the collected rows to a CSV.

    The page source is parsed repeatedly with ``get_infections`` (pausing 0.1 s
    between polls) until the collected rows reach ``max_bytes`` of column
    memory; the rows are then written, including the row index, to
    ``<output_dir>/LookingGlass_Data_<timestamp>.csv``.

    Parameters
    ----------
    web_driver : selenium WebDriver, optional
        Driver whose ``page_source`` is polled. Defaults to the notebook-level
        ``driver`` variable, which keeps ``download_infection_data()`` working.
    max_bytes : int, optional
        Size threshold (sum of ``memory_usage`` over the collected snapshots)
        at which polling stops. Defaults to 10 million bytes.
    output_dir : str, optional
        Directory receiving the CSV; it is created when missing.

    Returns
    -------
    None
    """
    column_names = ['Datetime', 'Infection Time', 'Botnet Name', 'Originating ISP', 'OrgID', 'Country', 'City']
    report_every_rows = 2200
    active_driver = driver if web_driver is None else web_driver

    # Snapshots are gathered in a list and concatenated once at the end:
    # DataFrame.append was removed in pandas 2.0 and re-copying the whole
    # frame on every poll is quadratic.
    frames = []
    total_rows = 0
    total_bytes = 0
    next_report_rows = report_every_rows

    print('Initiating download of threat data, please wait...')
    while total_bytes < max_bytes:
        df_download = get_infections(active_driver.page_source)
        if len(df_download):
            frames.append(df_download)
            total_rows += len(df_download)
            total_bytes += int(df_download.memory_usage(index=False).sum())
        time.sleep(.1)
        # Report each time another `report_every_rows` rows have been gathered
        # (an exact-multiple check on the row count would almost never fire).
        if total_rows >= next_report_rows:
            print(f'Current size in memory: {total_bytes}')
            next_report_rows = (total_rows // report_every_rows + 1) * report_every_rows

    if not frames:
        # Only reachable with a non-positive max_bytes.
        print('No infection data collected; nothing written.')
        return

    #TODO: Cleanup possible duplicates
    df_data = pd.concat(frames, ignore_index=True)
    df_data.columns = column_names

    # Build the path with os.path so it is valid on every platform, and make
    # sure the target directory exists before writing.
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.datetime.today().strftime("%Y-%m-%d_%H%M%S")
    df_data.to_csv(os.path.join(output_dir, f'LookingGlass_Data_{timestamp}.csv'))
    print(f'Created file with {df_data.memory_usage(index=True).sum()} bytes.')

In [6]:
def parse_CSV(data_dir=os.path.join('.', 'data')):
    """Load every CSV written to ``data_dir`` into a single DataFrame.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding the downloaded CSV files. Defaults to ``./data``.

    Returns
    -------
    pandas.DataFrame
        All rows of all CSV files (read in file-name order) with the columns
        'Datetime', 'Infection Time', 'Botnet Name', 'Originating ISP',
        'OrgID', 'Country' and 'City' and a fresh 0..n-1 index. The saved
        row-index column of the files is discarded. An empty frame with the
        same columns is returned when the directory contains no CSV file.
    """
    column_names = ['Datetime', 'Infection Time', 'Botnet Name', 'Originating ISP', 'OrgID', 'Country', 'City']

    # Only CSV files are read: the directory may also hold other entries
    # (e.g. a Jupyter checkpoint folder) that read_csv cannot parse.
    csv_names = sorted(name for name in os.listdir(data_dir) if name.lower().endswith('.csv'))
    frames = [pd.read_csv(os.path.join(data_dir, name)) for name in csv_names]

    if not frames:
        return pd.DataFrame(columns=column_names)

    # Single concat instead of DataFrame.append (removed in pandas 2.0).
    df_threats = pd.concat(frames, ignore_index=True)

    # The first column of each file is the index saved by to_csv; label it so
    # it can be dropped. read_csv already consumed the header line, so every
    # row is data and none is sliced away.
    df_threats.columns = ['tempId'] + column_names
    df_threats = df_threats[column_names].reset_index(drop=True)

    return df_threats

In [None]:
# Number of CSV batches to download before the browser is closed.
N_DOWNLOAD_BATCHES = 6

driver = load_driver_Firefox()

try:
    for batch_number in range(N_DOWNLOAD_BATCHES):
        download_infection_data()
finally:
    # Always close the browser, even when a download fails part-way through.
    driver.quit()

df_threats = parse_CSV()
len(df_threats)
df_threats.head(10)

Initiating download of threat data, please wait...


# Methods to enrich the threat data with information gathered from other sites.

In [None]:
# Split 'OrgID' on its first space into two new columns, 'ASN' and 'ASN_Name'.
# `n` must be passed by keyword: positional use is rejected by pandas >= 2.0.
df_threats[['ASN', 'ASN_Name']] = df_threats['OrgID'].str.split(' ', n=1, expand=True)

df_threats.head()

In [None]:
# Distinct ASN values found in the threats, as a one-column frame sorted ascending.
asn_values = df_threats['ASN'].unique()
df_asn = pd.DataFrame(asn_values, columns=['ASN']).sort_values(by='ASN')
len(df_asn)
df_asn.head()

In [None]:
# Print the ipinfo.io URL of every ASN, skipping the '-' placeholder value.
for asn in df_asn.iloc[:, 0]:
    if asn == '-':
        continue
    print(f'https://ipinfo.io/{asn}')