# HTTP2 List Crawler

This script reads a CSV of URLs, renders them one at a time, and records the HTTP/2 status of each. As it runs, it incrementally writes the results to an output CSV that you define. You only need to define your input and output CSV paths and your Chromedriver location. You can optionally change the user agent where it is defined below. All needed documentation is in the comments.

### Configuration
This first cell is the only cell you should need to make changes to.
Be sure to read all commented notes in this first cell.

In [None]:
# Batch size. As a fail safe, results are saved to the output file after every batch of this many URLs.
## A somewhat larger value is reasonable for CSVs over 10k URLs, but the script is untested beyond that size.
rows_per_run = 10

## Path of the input CSV. The URLs to crawl must be in a column named 'url'.
## !IMPORTANT! Processed rows are removed from this file batch by batch as part of the fail safe above,
## so keep a backup copy of the file before running the script against it.
url_source = "/Users/jsciortino/Desktop/url-input.csv"

## Path of the results CSV. The script creates this file itself.
url_output = "/Users/jsciortino/Desktop/output.csv"

## Local path of the Chromedriver binary. Download, if needed: https://chromedriver.chromium.org/downloads
## On Mac the path may have no file extension; on Windows it will most likely end in .exe.
chrome_path = "/Users/jsciortino/py/chromedriver-85"

In [None]:
## Imports the necessary libraries
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup as soup

# HTTP2 Function libs
import socket
import ssl
import csv
import argparse
from urllib.parse import urlparse

In [None]:
## Writes a header-only CSV ('url', 'http2') to the url_output path from the configuration cell.
## Nothing to edit here.
df_output = pd.DataFrame(
    columns=['url', 'http2'],
)
df_output.to_csv(path_or_buf=url_output, index=False)

### Program
The program loops through each URL, loads it in Chrome, and then checks whether the host negotiates HTTP/2 (`h2`) via ALPN during a TLS handshake.<br>
It renders in full Chrome, including JavaScript, at a rate of about 1-5 seconds per URL depending on host server speed and your internet connection.<br>
It has not been tested on a list larger than 10k URLs.
##### No changes are needed below.

In [None]:
## Enables browser logging & sets options
## No further changes are required

## Chrome capabilities with browser console logging turned on at the 'ALL' level.
## NOTE: DesiredCapabilities.CHROME is used directly (not copied), so the key is set on that shared dict.
d = DesiredCapabilities.CHROME
d['loggingPrefs'] = { 'browser':'ALL' }

## Chrome options with the 'w3c' experimental option switched off.
opt = webdriver.ChromeOptions()
opt.add_experimental_option('w3c', False)

## HTTP2 config, including User Agent if you choose to modify
## Every socket created after this line without an explicit timeout gets a 5 second timeout.
socket.setdefaulttimeout(5)
## NOTE(review): 'W.X.Y.Z‡' in the string below looks like a version placeholder plus a footnote marker
## copied from documentation - confirm and replace with a real Chrome version before sending this header.
## NOTE(review): 'headers' is not referenced by any of the cells visible here - confirm whether it is still needed.
headers = {"user-agent" : "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/W.X.Y.Z‡ Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}

In [None]:
## Processes the source CSV in batches of rows_per_run URLs.
## For every URL: render it in a fresh Chrome instance, then (if it rendered) probe the host's
## TLS endpoint to see whether it negotiates HTTP/2. After each batch the results are added to the
## output CSV and the processed rows are removed from the source CSV, so an interrupted run can be
## restarted from where it stopped.
df_source = pd.read_csv(url_source)

## Browser capabilities and options are the same for every URL, so they are built once.
d = DesiredCapabilities.CHROME
d['loggingPrefs'] = { 'browser':'ALL' }
opt = webdriver.ChromeOptions()
opt.add_experimental_option('w3c', False)

## A single TLS context, advertising h2 first, is reused for every ALPN probe.
alpn_ctx = ssl.create_default_context()
alpn_ctx.set_alpn_protocols(['h2', 'spdy/3', 'http/1.1'])


def _supports_http2(url):
    """Report whether the host of *url* selects HTTP/2 during the TLS handshake.

    Returns True when the server picks 'h2' via ALPN, False when it picks anything
    else (or nothing), and None when the probe could not be completed (bad URL,
    DNS failure, timeout, TLS error). Failures are printed, never raised.
    """
    try:
        parsed = urlparse(url)
        # .hostname (unlike .netloc) drops any ':port' or 'user:pass@' part, which
        # would otherwise break both the DNS lookup and the certificate check.
        host = parsed.hostname
        if not host:
            print('No hostname found in URL: ' + str(url))
            return None
        # Honour an explicit port only on https URLs; everything else is probed on 443.
        port = parsed.port if (parsed.scheme == 'https' and parsed.port) else 443

        # Both sockets are context-managed so they are closed even when the handshake fails.
        # No explicit timeout is passed, so the process-wide socket default applies.
        with socket.create_connection((host, port)) as raw_sock:
            with alpn_ctx.wrap_socket(raw_sock, server_hostname=host) as conn:
                return conn.selected_alpn_protocol() == "h2"
    except Exception as e:
        print(e)
        return None


while len(df_source) > 0:
    new_rows = df_source.iloc[ 0: rows_per_run, : ]
    print(str(len(new_rows)) + ' rows to process...')
    url_list = new_rows['url'].tolist()

    # One dict per URL; turned into a DataFrame once the batch is complete.
    batch_rows = []

    for url in url_list:
        # A fresh browser per URL, always quit afterwards, to avoid zombie Chrome processes.
        driver = webdriver.Chrome(chrome_path, options=opt,desired_capabilities=d)
        try:
            driver.get(url)
            loaded = True
        ## A failsafe to prevent URLs that won't load from blocking script from continuing
        except Exception as e:
            loaded = False
            print(e)
            print("Skipping 1 URL that failed to render.")
        finally:
            driver.quit()

        # The ALPN probe only runs for pages that rendered; otherwise http2 is recorded
        # as unknown (None) instead of reusing the previous URL's value.
        http2 = _supports_http2(url) if loaded else None
        batch_rows.append({'http2': http2, 'loaded': loaded, 'url': url})

    page_output_df = pd.DataFrame(batch_rows, columns=['http2', 'loaded', 'url'])

    # Read the output CSV, add the new rows, then write the output back again.
    # pd.concat is used because DataFrame.append no longer exists in pandas 2.x.
    df_output = pd.read_csv(url_output)
    df_output = pd.concat([df_output, page_output_df], ignore_index=True, sort=False)
    df_output.to_csv(url_output, index=False)

    # Drop exactly the rows that were just processed and persist the remainder.
    # (Slicing from rows_per_run+1 would silently discard one unprocessed URL per batch.)
    df_source = df_source.iloc[ rows_per_run: , : ]
    df_source.to_csv(url_source, index=False)

# No driver.quit() here: every browser is already quit inside the loop, and no driver
# exists at all when the source file is empty.

print("Finished!")