# FCC Application Scraping

## Set up selenium webdriver with Firefox
* Download Firefox driver [geckodriver](https://github.com/mozilla/geckodriver/releases) for selenium
* Move the driver executable into the same directory as this notebook (or update the `executable_path` passed to `webdriver.Firefox` in `scrape_tables_for_call_sign`)
* `pip install selenium`

In [1]:
from bs4 import BeautifulSoup
import time
import random
import csv

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options  
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException        

In [3]:
import os
import pandas as pd

## Define functions for scraping table for an individual call sign

In [51]:
def element_exists(driver, xpath):
    """
    Report whether an element matching `xpath` is present on the page that
    `driver` is currently showing.

    The scraper uses this to detect a "Next" link, so that it knows when
    to keep walking through the FCC's paginated result tables.

    Returns True when the lookup succeeds, False when selenium raises
    NoSuchElementException.
    """
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        found = False
    else:
        found = True
    return found


In [115]:
def extract_callsign(innerHTML):
    '''
    Helper function that pulls the callsign out of an Application search
    results page.

    The value is the text of the first <b> tag inside the first element
    matching the header-cell attributes below. It is attached to every row
    when the table data is scraped.

    Raises IndexError if the page has no matching element or no <b> tag.
    '''
    # attributes of the element that carries the text "Call Sign = "
    header_cell_attrs = {"class":"cell-pri-light", "valign":"bottom", "align":"left"}

    parsed = BeautifulSoup(innerHTML, "lxml")
    header_cell = parsed.find_all(attrs=header_cell_attrs, limit=1)[0]
    bold_tags = header_cell.find_all("b")

    return bold_tags[0].get_text()

In [116]:
def count_results(innerHTML):
    '''
    Helper function that reads the total number of results (across all
    pages) from a results page.

    The scraper compares this number with the number of rows it actually
    collected.

    Raises IndexError if the summary element or its second <b> tag is
    missing, and ValueError if that tag's text is not an integer.
    '''
    # attributes of the element that carries text like "Matches 1-47 (of 47)"
    summary_cell_attrs = {"class":"cell-pri-dark", "width":"35%", "valign":"middle", "align":"left"}

    parsed = BeautifulSoup(innerHTML, "lxml")
    summary_cell = parsed.find_all(attrs=summary_cell_attrs, limit=1)[0]

    # the second <b> holds the total, e.g. the 47 in "(of 47)"
    total_text = summary_cell.find_all("b")[1].get_text()

    return int(total_text)

In [117]:
def scrape_table_page(innerHTML):
    """
    Turns one call sign results page into a list of rows.

    Each row is a list that starts with the callsign taken from the page
    header (see extract_callsign), followed by the text of every cell
    except the first (index) cell:
        * File Number
        * Call Sign/Lease ID
        * Applicant Name
        * FRN
        * Purpose - split into Purpose_Main and Purpose_Note
        * Radio Service
        * Receipt Date
        * Status
    Returns a list of lists.
    """

    parsed = BeautifulSoup(innerHTML, "lxml")

    page_callsign = extract_callsign(innerHTML)
    results_table = parsed.find("table", {"summary": "Application search results"})

    table_data = []

    # the first <tr> is the header and the last one holds the pagination links
    for tr in results_table.find_all("tr")[1:-1]:

        record = [page_callsign]

        for position, cell in enumerate(tr.find_all("td")):

            # the leading cell is only a running index
            if position == 0:
                continue

            text = cell.get_text().strip()

            # every cell but "Purpose" (position 5) is stored unchanged
            if position != 5:
                record.append(text)
                continue

            # "Purpose" becomes two columns: the main purpose and an
            # optional note (empty when the cell has a single line)
            pieces = text.split("\n")
            note = pieces[-1].strip() if len(pieces) >= 2 else ""
            record.extend([pieces[0].strip(), note])

        table_data.append(record)

    return table_data


In [123]:
def cache_to_csv(lst, path):
    """
    Write the rows in `lst` to a CSV file at `path`, then read the file
    back to confirm that every row landed on disk.

    lst  -- list of rows, each row a list of cell values
    path -- destination file; it is overwritten if it already exists

    The file is written and re-read as UTF-8 so that non-ASCII text does
    not depend on the platform's default encoding.

    Raises AssertionError when the number of rows read back differs from
    len(lst). Returns None.
    """
    no_rows = len(lst)

    with open(path, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(lst)

    with open(path, 'r', newline='', encoding='utf-8') as f:
        rows_on_disk = sum(1 for _ in csv.reader(f))

    # Raised explicitly instead of using `assert` so the check still runs
    # under `python -O`; the exception type is unchanged for callers.
    if rows_on_disk != no_rows:
        raise AssertionError(
            'Expected %s rows in %s but read back %s' % (no_rows, path, rows_on_disk)
        )

    print('LOG[INFO]: Wrote %s rows to %s' % (no_rows, path))

In [134]:
def make_html_path(parent_dir, call_sign, page_no):
    '''
    Helper function that builds the file path used to cache one html page.

    The result has the form "<parent_dir>/<call_sign>-<page>.html", where
    the page number is left-padded with zeros to three characters.
    '''
    padded_page = str(page_no).zfill(3)
    file_name = "%s-%s.html" % (call_sign, padded_page)
    return "%s/%s" % (parent_dir, file_name)

In [133]:
# Quick check of make_html_path: the page number is zero-padded to three digits
make_html_path('cake', 'is', '4')

'cake/is-004.html'

In [157]:
def _wait_for_document_ready(driver, timeout=45):
    """Block until the page in `driver` reports document.readyState == "complete"."""
    WebDriverWait(driver, timeout).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )


def _cache_html_page(html_cache, call_sign, page_no, html):
    """Write one results page (as UTF-8) to the path given by make_html_path."""
    with open(make_html_path(html_cache, call_sign, page_no), 'w', encoding='utf-8') as f:
        f.write(html)


def scrape_tables_for_call_sign(call_sign, html_cache, csv_cache, without_browser=False):
    """
    Scrapes the FCC Application Search database for a given call sign.

    Arguments:
      call_sign       -- the call sign typed into the search form
      html_cache      -- directory in which each results page is cached as
                         "<call_sign>-<page>.html" (see make_html_path)
      csv_cache       -- prefix for the CSV output; the file is written to
                         csv_cache + call_sign + ".csv", so a directory must
                         be given with its trailing separator
      without_browser -- when True, Firefox is started with "--headless"

    Every results page is cached as HTML, and the combined rows of all
    pages are cached as one CSV file (see cache_to_csv).

    Returns the rows as a list of lists. The header row of the results
    table is not included; each row starts with the call sign read from
    the page (see scrape_table_page), for example:
    [
      ['KNKN555', '0000214527', 'KNKN555', 'AT&T', '', 'Amendment', 'Transfer Control', 'AL', '09/05/2000', 'Granted'],
      ...,
      ...
    ]

    The browser is always shut down, even when scraping fails part-way,
    so repeated retries do not leave Firefox processes behind.

    NOTE: this function uses the Selenium 3 style API (find_element_by_*,
    executable_path=...).
    """
    search_url = "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/searchAdvanced.jsp"
    next_page_xpath = "//*[@title='Next page of results']"

    tick = time.time()
    print("LOG[INFO]: START %s " % call_sign)

    # driver config - start in private
    firefox_profile = webdriver.FirefoxProfile()
    firefox_profile.set_preference("browser.privatebrowsing.autostart", True)

    # (optional) don't open up browser
    firefox_options = Options()
    if without_browser:
        firefox_options.add_argument("--headless")

    # spin up the webdriver; the profile has to be handed over explicitly,
    # otherwise the private-browsing preference above is never applied
    firefox = webdriver.Firefox(
        firefox_profile=firefox_profile,
        options=firefox_options,
        executable_path="./geckodriver",
    )

    try:
        firefox.get(search_url)

        # specify the call sign that we want to scrape for
        firefox.find_element_by_id("ulsCallSign").send_keys(call_sign)

        # set pagination to 100 (maximum)
        pagination = Select(firefox.find_element_by_xpath("//select[@name='pageSize']"))
        pagination.select_by_visible_text("100")

        # submit the search form, wait for load
        # NOTE: find form submission by form element name, not search button image
        firefox.find_element_by_name("search").submit()
        WebDriverWait(firefox, 45).until(EC.url_changes(search_url))
        _wait_for_document_ready(firefox)

        # extract and cache html from first page
        page = 1
        innerHTML = firefox.execute_script("return document.body.innerHTML")
        _cache_html_page(html_cache, call_sign, page, innerHTML)

        # store count of search results for later check
        no_results = count_results(innerHTML)

        # scrape results data from first page
        search_data = []
        search_data.extend(scrape_table_page(innerHTML))

        # as long as there is a link to "next page"...
        while element_exists(firefox, next_page_xpath):
            # ... go to next page
            firefox.find_element_by_xpath(next_page_xpath).click()
            # ... wait for load
            # NOTE(review): readyState can still read "complete" for the old
            # page right after the click -- consider also waiting for the
            # clicked element to go stale; confirm against the live site.
            _wait_for_document_ready(firefox)
            # ... increment page counter, extract and cache html
            page += 1
            innerHTML = firefox.execute_script("return document.body.innerHTML")
            _cache_html_page(html_cache, call_sign, page, innerHTML)
            # ... scrape and add page results to all results
            search_data.extend(scrape_table_page(innerHTML))
    finally:
        # always close the webdriver, including when a step above raised
        firefox.quit()

    # Report how many of the advertised results were scraped; a mismatch is
    # logged rather than raised so that the partial data is still cached
    no_rows = len(search_data)
    print("LOG[INFO]: Scraped %s rows out of %s results" % (no_rows, no_results))
    if no_rows != no_results:
        print("LOG[WARN]: Row count for %s does not match the reported result count" % call_sign)

    # Cache search results table to csv (all results from all pages)
    csv_path = ''.join([csv_cache, call_sign, ".csv"])
    cache_to_csv(search_data, csv_path)

    tock = time.time()
    print("LOG[INFO]: END %s (%.2f seconds)" % (call_sign, tock - tick))

    return search_data
     

## Scrape the tables for a list of call signs (loop)

In [151]:
# Load the call sign lookup table and keep its 'callsign_leaseID' column
# as a list sorted in ascending order

callsign_path = 'data-cache/cma_callsign_lookup.csv'

callsigns = (
    pd.read_csv(callsign_path)['callsign_leaseID']
    .sort_values()
    .tolist()
)



['B',
 'KNKA200',
 'KNKA201',
 'KNKA202',
 'KNKA203',
 'KNKA204',
 'KNKA205',
 'KNKA206',
 'KNKA207',
 'KNKA208',
 'KNKA209',
 'KNKA210',
 'KNKA211',
 'KNKA212',
 'KNKA214',
 'KNKA215',
 'KNKA216',
 'KNKA217',
 'KNKA218',
 'KNKA219',
 'KNKA220',
 'KNKA221',
 'KNKA222',
 'KNKA223',
 'KNKA224',
 'KNKA225',
 'KNKA226',
 'KNKA227',
 'KNKA228',
 'KNKA229',
 'KNKA230',
 'KNKA231',
 'KNKA232',
 'KNKA233',
 'KNKA234',
 'KNKA235',
 'KNKA236',
 'KNKA237',
 'KNKA238',
 'KNKA239',
 'KNKA240',
 'KNKA241',
 'KNKA242',
 'KNKA243',
 'KNKA244',
 'KNKA245',
 'KNKA246',
 'KNKA247',
 'KNKA248',
 'KNKA249',
 'KNKA250',
 'KNKA251',
 'KNKA252',
 'KNKA253',
 'KNKA254',
 'KNKA255',
 'KNKA256',
 'KNKA257',
 'KNKA258',
 'KNKA259',
 'KNKA260',
 'KNKA261',
 'KNKA262',
 'KNKA263',
 'KNKA264',
 'KNKA265',
 'KNKA266',
 'KNKA267',
 'KNKA268',
 'KNKA269',
 'KNKA270',
 'KNKA271',
 'KNKA272',
 'KNKA273',
 'KNKA274',
 'KNKA275',
 'KNKA276',
 'KNKA277',
 'KNKA278',
 'KNKA279',
 'KNKA280',
 'KNKA281',
 'KNKA282',
 'KNKA283'

In [159]:
# Scrape a slice of `callsigns`, pausing a random interval between requests
# and backing off for longer after each failure. The run stops once
# `max_tries` failures have accumulated.

import math  # needed for the back-off below; not imported elsewhere in the notebook

html_cache = 'search-cache/apps-by-callsign/html'
csv_cache = 'search-cache/apps-by-callsign/csv/'

tick = time.time()
print("LOG[INFO]: START TIME is %s\n" % (time.strftime("%I:%M %p")))

tries = 0
max_tries = 5

# slice of `callsigns` to process in this run: callsigns[start_index:end_index]
start_index = 1
end_index = 2

# enumerate from start_index so the reported position is the real index in
# `callsigns`, even if a call sign appears more than once
for index, cs in enumerate(callsigns[start_index:end_index], start=start_index):
    if tries >= max_tries:
        break

    # defined up front so the `finally` clause can always sleep, even when
    # something other than an Exception (e.g. KeyboardInterrupt) escapes
    pause = 0
    try:
        scrape_tables_for_call_sign(cs, html_cache, csv_cache, without_browser=False)
        pause = random.randint(1, 30)
    except Exception as e:
        tries += 1
        print("Error at %s, located at callsigns[%s]" % (cs, index))
        print("LOG[ERROR]: %r" % (e,))
        # wait longer after every additional failure
        pause = random.randint(200, 300) * math.sqrt(tries)
    finally:
        time.sleep(pause)

tock = time.time()
print("LOG[INFO]: END")

LOG[INFO]: START TIME is 03:34 PM

LOG[INFO]: START KNKA200 
LOG[INFO]: Scraped 266 rows out of 266 results
Wrote 266 rows to path: search-cache/apps-by-callsign/csv/KNKA200.csv
LOG[INFO]: END KNKA200 (15.721977 seconds)
LOG[INFO]: END
