In [1]:
%load_ext pycodestyle_magic

In [2]:
%flake8_on

In [1]:
"""Phishtank scraper.

Visits phishtank to collect phishing links
based on id number in reverse order,
then visits phishing links to
collect screenshots and html source code.
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
from datetime import date
from pathlib import Path
import requests
import os
import time
import zlib
import json

def check_if_phish(ID, is_valid=False, is_online=False, skip_submitters=()):
    """Retrieve phishing link from phishtank.

    Loads the phishtank detail page for ``ID`` in the module-level
    selenium ``driver`` and reads the submitted URL from that page.
    The link is returned even when one of the optional checks fails,
    so the caller can still log it.

    Parameters
    ----------
    ID : int
        Phish's id on phishtank
    is_valid : bool
        Optional check for phishtank's 'valid' classification
    is_online : bool
        Optional check for phishtank's 'online' classification
    skip_submitters : collection of str
        Optional check on the submitting user: entries submitted by
        any of these user names are not caught. Pass e.g. ``("Micha",)``
        when that user uploads hundreds of fake 404 links.
        Empty by default, which disables the check.

    Returns
    -------
    link : string
        Phishing webpage URL
    to_catch : bool
        Result of optional checks

    """
    # Catch all phishes by default
    to_catch = True

    # Go to the phishtank entry using ID
    driver.get("https://phishtank.org/phish_detail.php?phish_id=" + str(ID))

    # Check if phishtank lists phish as valid
    if is_valid:
        # Kept as one clean expression: a backslash continuation inside
        # the literal would embed the source indentation in the XPath.
        valid_xpath = '//div[@id="history"]/table/tbody/tr/td/h3'
        valid = driver.find_element(By.XPATH, valid_xpath).text
        # Filter for only valid phishes
        if valid != 'Verified: Is a phish':
            to_catch = False

    # Check if phishtank lists phish as online
    if is_online:
        online_xpath = '//div[@id="widecol"]/div/h2/span'
        online = driver.find_element(By.XPATH, online_xpath).text
        # Filter for only online phishes
        if online != 'is currently ONLINE':
            to_catch = False

    # Check who submitted the phish, only when a skip list was given
    if skip_submitters:
        submitter_xpath = '//div[@class="url"]/span/b/a'
        submitter = driver.find_element(By.XPATH, submitter_xpath).text
        # Filter out unwanted submitters
        if submitter in skip_submitters:
            to_catch = False

    # Retrieve phishing link
    link = driver.find_element(
        By.XPATH, '//span[@style="word-wrap:break-word;"]/b'
    ).text

    return link, to_catch


def check_status_code(link, timeout=30):
    """Check for live URL.

    Parameters
    ----------
    link : string
        URL of webpage
    timeout : float
        Seconds to wait for the server before giving up. Without a
        timeout a single unresponsive host would block the whole
        collection run indefinitely.

    Returns
    -------
    is_live : bool
        True if link's status code == 200, False for any other
        status code or if the request could not be completed

    """
    try:
        response = requests.get(link, timeout=timeout)

    # Deliberately broad: non-existent domains, malformed URLs,
    # refused connections and timeouts all simply mean "not live".
    except Exception:
        return False

    # If the request went through, only a 200 counts as live
    return response.status_code == 200


def collect_data(link, screenshot=True, source_code=True, way_back=False):
    """Collect data from the page currently loaded in the driver.

    Collected screenshot and source code are stored in a folder
    named after the link with 'https:' and '/' removed.
    'http:' left in to differentiate from 'https:'.
    The folder and URL are also appended to 'collected_phishes.csv'.

    Uses the module-level selenium ``driver``; apart from the optional
    Wayback Machine upload this function does not navigate, so the
    page for ``link`` is expected to be loaded already.

    Parameters
    ----------
    link : string
        URL to visit
    screenshot : bool
        Webpage screenshot
    source_code : bool
        Webpage html source code
    way_back : bool
        Attempt to store webpage in way back machine

    Returns
    -------
    html : string
        html source code (returned even when ``source_code`` is False;
        that flag only controls whether it is written to disk)
    save_loc : string
        data storage directory, with a trailing '/'

    """
    # Resolve the date once so the folder, the file paths and the CSV
    # row all agree even if the call straddles midnight.
    today = str(date.today())

    # Folder name derived from the link
    directory = link.replace("/", "").replace("https:", "")
    # Make sure folder name is not too long (at most 128 characters)
    directory = directory[:128]

    # Create folder to store entry in collection
    save_dir = Path.cwd() / "collection" / today / directory
    save_dir.mkdir(parents=True, exist_ok=True)

    # Directory to save data
    save_loc = f"{save_dir}/"

    # Webpage screenshot
    if screenshot:
        driver.get_screenshot_as_file(f"{save_loc}{directory}.png")

    # Always read the source so the return value is defined
    html = driver.page_source

    # Stores html source code
    if source_code:
        # Explicit encoding: pages routinely contain non-ASCII text
        with open(f"{save_loc}{directory}.html", "w", encoding="utf-8") as f:
            f.write(html)

    # Uploads page to Wayback machine
    if way_back:
        driver.get("https://archive.org/web/")
        driver.find_element(
            By.XPATH,
            '//form[@name="wwwform_save"]/input'
        ).send_keys(link)
        # Best effort: both submit buttons are tried in turn and a
        # missing or unclickable one is ignored (presumably they cover
        # alternative page layouts -- TODO confirm).
        for button_xpath in (
            '//button[@class="web-save-button web_button web_text"]',
            '//form/input[@value="SAVE PAGE"]',
        ):
            try:
                driver.find_element(By.XPATH, button_xpath).click()
            except Exception:
                continue

    # Stores directory and URL in text file; embedded double quotes
    # are doubled so the quoted CSV fields stay parseable.
    row = ",".join(
        '"' + field.replace('"', '""') + '"'
        for field in (f"{today}/{directory}", link)
    )
    with open("collected_phishes.csv", "a", encoding="utf-8") as f:
        f.write("\n" + row)

    return html, save_loc

def log_entry(link, collected, checksum="0"):
    """Record collection attempt in a text file.

    Appends one row to 'collection_attempts.csv'. The phishtank id and
    the elapsed time are taken from the module-level variables ``i``
    (current id) and ``start`` (time the current id was started), so
    both must be set before calling.

    Parameters
    ----------
    link : string
        Phishing URL
    collected : bin
        1 for successful collection, 0 otherwise
    checksum : string
        checksum for successful collection, 0 otherwise

    """
    time_elapsed = round(time.time() - start, 4)

    # Double any embedded quotes so the quoted URL field stays valid CSV
    quoted_link = '"' + link.replace('"', '""') + '"'

    row = f"\n{i},{quoted_link},{collected},{time_elapsed},{checksum}"

    # Context manager closes the file even if the write fails
    with open("collection_attempts.csv", "a", encoding="utf-8") as f:
        f.write(row)

def compare_source(link, checksum_dict):
    """Load a page and check whether its html was seen before.

    Navigates the module-level selenium ``driver`` to ``link``,
    computes the adler32 checksum of the page source and bumps the
    matching counter in ``checksum_dict`` (checksum -> count),
    which is modified in place.

    Parameters
    ----------
    link : string
        URL to visit
    checksum_dict : dict
        Dictionary of checksums

    Returns
    -------
    checksum : string
        adler32 checksum of html source code
    is_new : bool
        True if checksum did not exist in checksum_dict

    """
    # Load the page in the browser
    driver.get(link)

    # Fingerprint the rendered html
    page_bytes = driver.page_source.encode()
    checksum = str(zlib.adler32(page_bytes))

    # Decide novelty before touching the counter, then count this visit
    is_new = checksum not in checksum_dict
    checksum_dict[checksum] = 1 if is_new else checksum_dict[checksum] + 1

    return checksum, is_new

def iframe_source(html, parent, _visited=None):
    """Get html and screenshots from possible iframe sources.

    Every ``<iframe>`` in ``html`` that has a ``src`` is opened in the
    module-level selenium ``driver``; its source code and a screenshot
    are saved in ``parent``, and the saved html is searched for nested
    iframes in turn. A frame that fails to load or save is skipped.

    Parameters
    ----------
    html : string
        web page source code
    parent : string
        parent directory to store html from iframe
        (used as a plain prefix, so it should end with '/')
    _visited : set of str, optional
        Internal: iframe sources already handled in this traversal.
        Prevents pages that frame each other from recursing until
        the interpreter's recursion limit is hit.

    """
    if _visited is None:
        _visited = set()

    soup = BeautifulSoup(html, "html.parser")
    for tag in soup.find_all("iframe"):
        src = tag.get("src")
        # Nothing to visit without a source; never visit one twice
        if not src or src in _visited:
            continue
        _visited.add(src)

        # Handle blobs
        target = src[5:] if src.startswith("blob:") else src

        # Best effort per frame: one broken frame must not stop the
        # collection of its siblings.
        try:
            # Visit iframe source
            driver.get(target)
            # Specify file name to store html
            idirectory = src.replace("/", "").replace("https:", "")
            # Get html source code and save it
            frame_html = driver.page_source
            with open(
                f"{parent}{idirectory}.html", "w", encoding="utf-8"
            ) as f:
                f.write(frame_html)
            # Get screenshot
            driver.get_screenshot_as_file(f"{parent}{idirectory}.png")
            # Look for nested iframes and repeat
            iframe_source(frame_html, parent, _visited)
        except Exception:
            continue

# Actual stuff starts here

In [2]:
# Configure Firefox: load the dedicated selenium profile and accept
# insecure certificates so phishing webpages can be opened.
profile_dir = (
    "/home/defaultuser/snap/firefox/common"
    "/.mozilla/firefox/selenium_profile.default"
)
options = Options()
for argument in ("-profile", profile_dir):
    options.add_argument(argument)
options.set_capability("acceptInsecureCerts", True)

In [14]:
# Start selenium: launch Firefox with the options configured above.
# `driver` is the module-level browser handle that the helper
# functions in this file read directly instead of taking it as a
# parameter.
driver = webdriver.Firefox(options=options)

In [4]:
# Make sure the bookkeeping files exist. A missing file is seeded with
# its CSV header row, or with an empty JSON object for the checksum
# store; existing files are left untouched.
initial_contents = {
    'collected_phishes.csv': 'directory,url',
    'collection_attempts.csv': 'id,url,collected?,time_taken(s),checksum',
    'checksum_dict.json': "{}",
}
for filename, content in initial_contents.items():
    if os.path.exists(filename):
        continue
    with open(filename, 'w') as f:
        f.write(content)

In [16]:
start_total = time.time()

# Read existing html checksums
with open('checksum_dict.json', 'r') as f:
    checksum_dict = json.load(f)

# Insert start ID (collected first) and end ID (not collected)
start_id = 7657473
end_id = 7657373

# Walk the IDs in reverse order. `i` and `start` stay module-level
# names because log_entry() reads them directly.
for i in range(start_id, end_id, -1):

    # Start time for single id
    start = time.time()

    # Create new date directory if running overnight
    Path(f'collection/{date.today()}').mkdir(parents=True, exist_ok=True)

    # Check if phish
    try:
        link, to_catch = check_if_phish(i)
    except Exception:
        # No link could be read for this ID. Record the attempt with an
        # empty URL and move on to the next ID; retrying the same ID
        # would loop forever if its page never loads.
        log_entry("", 0)
        continue

    # Filter for 'valid' or 'online' if they are set.
    if not to_catch:
        log_entry(link, 0)
        continue

    # Check if link is dead
    if not check_status_code(link):
        log_entry(link, 0)
        continue

    # Check if html has been collected
    try:
        checksum, is_new = compare_source(link, checksum_dict)
    except Exception:
        # Without a checksum there is nothing to decide on; do not fall
        # through to values left over from a previous ID.
        log_entry(link, 0)
        continue

    if not is_new:
        log_entry(link, 0, checksum=checksum)
        continue

    # Collect the phish if html is new
    try:
        html, save_loc = collect_data(link)
    except Exception:
        # Nothing was collected, so there is no html to scan for iframes
        log_entry(link, 0, checksum=checksum)
        continue
    log_entry(link, 1, checksum=checksum)

    # Collect from possible iframe sources
    iframe_source(html, save_loc)

# Calculate total time
end_total = time.time()
total_time = end_total - start_total
total_time

743.2525608539581

In [8]:
# Replace the stored dictionary with the updated one. The new content
# is written to a temporary file and then swapped in, so the previous
# checksums survive if the dump fails, and a missing old file is not
# an error.
tmp_path = 'checksum_dict.json.tmp'
with open(tmp_path, 'w') as f:
    json.dump(checksum_dict, f)
os.replace(tmp_path, 'checksum_dict.json')

In [None]:
# Started 2022-08-16 at ID 7657582