# FCC Application Scraping

## Set up selenium webdriver with Firefox
* Download Firefox driver [geckodriver](https://github.com/mozilla/geckodriver/releases) for selenium
* Move the driver executable into the same directory as this notebook (or update the path to the executable in `scrape_table_for_call_sign`)
* `pip install selenium`

In [1]:
from bs4 import BeautifulSoup
import time

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options  
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException        

## Define functions for scraping table for an individual call sign

In [3]:
def element_exists(driver, xpath):
    """
    Helper function that checks if the element at the given xpath exists on
    driver's current page. This is used to check if there is a "Next" button
    -- to scrape through multiple pages of the FCC's paginated tables.
    """
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return True


In [4]:
def scrape_table_page(innerHTML):
    """
    Extracts the following details from a call sign table page:
        * File Number
        * Call Sign/Lease ID
        * Applicant Name
        * FRN
        * Purpose - split into Purpose_Main and Purpose_Note
        * Radio
        * Service
        * Receipt
        * Date
        * Status
    Returns as a list of lists.
    """
    soup = BeautifulSoup(innerHTML, "lxml")
    table_page = soup.find("table", {"summary": "Application search results"})
    
    # run through each row
    rows = table_page.find_all("tr")[1:-1]
    table_data = []
    for row in rows:  # note: skip the first (header) and last (pagination) rows
        
        # run through each cell of each row
        cols = row.find_all("td")
        row_data = []
        for i in range(1, len(cols)): # note: skip the first (index) cell
            
            # extract cell data, trim leading/trailing whitespace
            cell_data = cols[i].get_text().strip()
            
            # fifth-index cells "Purpose" has to get split into two cols
            if i == 5:
                
                purpose = cell_data.split("\n")
                purpose_main = purpose[0].strip()
                purpose_note = ""
                
                # sometimes the "Purpose" column has an optional note
                if len(purpose) >= 2:
                    purpose_note = purpose[-1].strip()
                    
                row_data.append(purpose_main)
                row_data.append(purpose_note)
            
            # other cells can be inserted as normal
            else:
                row_data.append(cell_data)
                
        table_data.append(row_data)
        
    return(table_data)


In [5]:
def scrape_table_for_call_sign(call_sign, without_browser=False):
    """
    Scrapes the FCC Application Search database for a given call sign.
    
    Returns the table as a list of lists, for example:
    [
      ['File Number', 'Call Sign/Lease ID', 'Applicant Name', 'FRN', 'Purpose_Main', 'Purpose_Note', 'Radio Service', 'Receipt Date', 'Status'],
      ['0000214527',  'KNKN555',            'AT&T',           '',    'Amendment',    'Transfer Control', 'AL',        '09/05/2000',   'Granted'],
      ...,
      ...
    ]
    
    """
    tick = time.time()
    print("LOG[INFO]: START %s " % call_sign)
    
    # driver config - start in private
    firefox_profile = webdriver.FirefoxProfile()
    firefox_profile.set_preference("browser.privatebrowsing.autostart", True)
    
    # (optional) don't open up browser
    firefox_options = Options()
    if without_browser:
        firefox_options.add_argument("--headless")
    
    # spin up the webdriver
    firefox = webdriver.Firefox(options=firefox_options, executable_path="./geckodriver")
    firefox.get("https://wireless2.fcc.gov/UlsApp/ApplicationSearch/searchAdvanced.jsp")
    
    # specify the call sign that we want to scrape for
    callsign = firefox.find_element_by_id("ulsCallSign")
    callsign.send_keys(call_sign)
    
    # set pagination to 100 (maximum)
    pagination = Select(firefox.find_element_by_xpath("//select[@name='pageSize']"))
    pagination.select_by_visible_text("100")
    
    # submit the search form, wait for load
    # NOTE: find form submission by form element name, not search button image
    firefox.find_element_by_name("search").submit()
    WebDriverWait(firefox, 15).until(EC.url_changes("https://wireless2.fcc.gov/UlsApp/ApplicationSearch/searchAdvanced.jsp"))
    WebDriverWait(firefox, 15).until(lambda _: firefox.execute_script("return document.readyState") == "complete")
    
    table_data = [[]]
    
    # scrape the table on the first page
    innerHTML = firefox.execute_script("return document.body.innerHTML")
    table_data += scrape_table_page(innerHTML)
    
    # as long there is a link to "next page"...
    next_page_xpath = "//*[@title='Next page of results']"
    while element_exists(firefox, next_page_xpath):
        # ... go to next page
        firefox.find_element_by_xpath(next_page_xpath).click()
        # ... wait for load
        WebDriverWait(firefox, 15).until(lambda _: firefox.execute_script("return document.readyState") == "complete")
        # .. and scrape 
        innerHTML = firefox.execute_script("return document.body.innerHTML")
        table_data += scrape_table_page(innerHTML)

    # gracefully close the webdriver
    firefox.quit()
    
    tock = time.time()
    print("LOG[INFO]: END %s (%2f seconds)" % (call_sign, tock - tick))
    
    return table_data
     

## Scrape the tables for a list of call signs (loop)

In [6]:
# read in the list of FCC callsigns
call_signs = ["KNKN555", "KNKN556"]

In [7]:
# collects the scraped data for *all* the call signs into a python list of lists
scraped_data = []
for cs in call_signs:
    scraped_data.append(scrape_table_for_call_sign(cs, without_browser=False))

LOG[INFO]: START KNKN555 
LOG[INFO]: END KNKN555 (21.644037 seconds)
LOG[INFO]: START KNKN556 
LOG[INFO]: END KNKN556 (23.465402 seconds)


In [36]:
# sanity check over the data for size of scraped data, and dimensions of callsign tables
assert(len(scraped_data) == len(call_signs))
for table in scraped_data:
    for row in table:
        assert(len(row) == 9)

In [8]:
# last row of first scraped call-sign
for row in scraped_data[0][-1:]:
    print(row)

['60923CLMP99', 'KNKN555', 'WIRELESS ONE HOLDING COMPANY, L.P. CELLULAR ONE OF SOUTHWEST FLORIDA', '0001733369', 'New', '', 'CL', '01/01/1980', 'Granted']


In [9]:
# print first 10 rows of second scraped callsign
for row in scraped_data[1][:10]:
    print(row)

[]
['0000386266', '', 'AT&T Wireless Services, Inc.', '0004122032', 'Transfer of Control', '', 'AL', '03/05/2001', 'Granted']
['0000388274', 'KNKN556', 'Pennsylvania Cellular Telephone Corp.', '0001944032', 'Administrative Update', '', 'CL', '03/06/2001', 'Granted']
['0000547695', '', 'AT&T Wireless Services, Inc.', '0004122032', 'Transfer of Control', '', 'AL', '08/02/2001', 'Granted']
['0000591187', 'KNKN556', 'Pennsylvania Cellular Telephone Corp.', '0001944032', 'Renewal/Modification', '', 'CL', '09/13/2001', 'Granted']
['0000788563', 'KNKN556', 'Pennsylvania Cellular Telephone Corp.', '0001944032', 'Administrative Update', '', 'CL', '02/27/2002', 'Granted']
['0001093516', 'KNKN556', 'Pennsylvania Cellular Telephone Corp.', '0001944032', 'Administrative Update', '', 'CL', '11/18/2002', 'Granted']
['0001178597', '', 'Winston, LLC', '0003297520', 'Transfer of Control', '', 'AL', '01/29/2003', 'Granted']
['0001179154', '', 'Vanguard Cellular Pennsylvania, LLC', '0001944032', 'Assignme

# [TODO]Output scrapped data
* include readme in folder for column names
* save each table as callsign.csv
* note: empty value for call-sign --> part of larger transaction; see "Licenses" tab for list of all callsigns involved in transaction (only for newer applications)

# - Ignore below this line -

This is code that's probably worth keeping as as reference for now -- in case it gives us insight into solving any bugs we run into. But you can ignore it when you run the scraper.

### Read Me
Notes:
* find submit button using form element name, not submit image button
Reference links:
* http://jonathansoma.com/lede/foundations/classes/friday%20sessions/advanced-scraping-form-submissions-completed/

In [None]:
def scrape_table_for_call_sign(call_sign):
    
    # driver config - start in private
    firefox_profile = webdriver.FirefoxProfile()
    firefox_profile.set_preference("browser.privatebrowsing.autostart", True)
    
    firefox = webdriver.Firefox(executable_path="./geckodriver")
    firefox.get("https://wireless2.fcc.gov/UlsApp/ApplicationSearch/searchAdvanced.jsp")
    
    # fill in the text input
    callsign = firefox.find_element_by_id("ulsCallSign")
    callsign.send_keys(call_sign)
    
    # set pagination to 100 (maximum)
    pagination = Select(firefox.find_element_by_xpath("//select[@name='pageSize']"))
    pagination.select_by_visible_text("10")
    
    # submit the search form, wait for load
    firefox.find_element_by_name("search").submit()
    WebDriverWait(firefox, 15).until(EC.url_changes("https://wireless2.fcc.gov/UlsApp/ApplicationSearch/searchAdvanced.jsp"))
    WebDriverWait(firefox, 15).until(lambda _: firefox.execute_script("return document.readyState") == "complete")
    
    #print(firefox.page_source.encode("utf-8"))
    
    #innerHTML = firefox.execute_script("return document.body.innerHTML")
    #print(innerHTML)
    
    # find the link the next page if it exists
    firefox.find_element_by_xpath("//*[@title='Next page of results']").click()

    #next_link = firefox.find_element_by_xpath("//*[@title=\"Next page of results\"][0]")
    #next_link = firefox.find_element_by_xpath("//a[@title='Next page of results']")
    #print(next_link)
    
    #page_links = firefox.find_element_by_xpath("//a[@href]")
    #print(page_links)
    
    # count the number of pages we need to iterate through
    #pages = firefox.find_element_by_xpath("//*[contains(@title, 'Page')]")
    #pages = firefox.find_element_by_css_selector("//[title*='Page']")
    #print(pages)
    
    # gracefully close the webdriver
    time.sleep(10)
    firefox.quit()

In [None]:
scrape_table_for_call_sign("KNKN555")

In [None]:
firefox = webdriver.Firefox(executable_path="./geckodriver")
firefox.get("https://wireless2.fcc.gov/UlsApp/ApplicationSearch/searchAdvanced.jsp")

# fill in the text input
callsign = firefox.find_element_by_id("ulsCallSign")
callsign.send_keys("KNKN555")

# fill in the dropdown
#dropdown= firefox.find_element_by_id("radioServiceCode")
#dropdown.send_keys("CL - Cellular")

# pagination to max
display = firefox.find_element_by_id("ulsRowsPerPage")
display.send_keys(100)

# submit form
firefox.find_element_by_name("search").submit()
WebDriverWait(firefox, 15).until(EC.url_changes("https://wireless2.fcc.gov/UlsApp/ApplicationSearch/searchAdvanced.jsp"))
#test = WebDriverWait(firefox, 3).until(EC.staleness_of((By.ID, 'ulsCallSign')))
#test = firefox.implicitly_wait(10);

print(firefox.execute_script("return document.readyState"))

WebDriverWait(firefox, 15).until( lambda _: firefox.execute_script("return document.readyState") == "complete")
print(firefox.current_url)

# scrape out the table
#print(firefox.page_source.encode("utf-8"))
innerHTML = firefox.execute_script("return document.body.innerHTML")
print(innerHTML)

time.sleep(30)
firefox.quit()

In [None]:
ffx = Firefox(executable_path="./geckodriver")
request_data = {
    "basicSearchType": "basicSearchCallSign",
    "basicSearchTerm": "KNKN555",
    "x": "24",
    "y": "4",
    "exactMatchInd": "Y",
    "jsValidated" : "true",
    "currentPage": "searchAppl.jsp",
    "actionElement": "actionApplicationSearch"
}
ffx.request("POST",
    "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/ApplSearchController",
    data=request_data)

In [None]:
options = Options()  
#options.add_argument("--headless")  
chrome = webdriver.Chrome("./chromedriver")
chrome.get("www.google.com.au")
#chrome.request("GET", "https://www.google.com.au")
#chrome.request("GET", "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/searchAppl.jsp")

In [None]:
referrer = "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/searchAppl.jsp"
headers = { "referrer" : referrer }
payload = {
    "basicSearchType": "basicSearchCallSign",
    "basicSearchTerm": "KNKN555",
    "x": 24,
    "y": 4,
    "exactMatchInd": "Y",
    "jsValidated" : "true",
    "currentPage": "searchAppl.jsp",
    "actionElement": "actionApplicationSearch"
}

res = requests.post(
    "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/ApplSearchController",
    data=payload,
    headers=headers,
)

In [None]:
res.history[1].url

In [None]:
options = Options()  
#options.add_argument("--headless")  
chrome = Chrome("./chromedriver", options=options)
chrome.request("GET", "https://www.fcc.gov/")

In [None]:
chrome.request("GET", "https://www.fcc.gov/")

In [None]:
request_data = {
    "basicSearchType": "basicSearchCallSign",
    "basicSearchTerm": "KNKN555",
    "x": 24,
    "y": 4,
    "exactMatchInd": "Y",
    "jsValidated" : "true",
    "currentPage": "searchAppl.jsp",
    "actionElement": "actionApplicationSearch"
}

res = chrome.request(
    "POST",
    "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/ApplSearchController",
    data=request_data
)

In [None]:
chrome = Chrome("./chromedriver")
response = chrome.request(
    "POST",
    "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/ApplSearchController",
    data={
        "basicSearchType": "basicSearchCallSign",
        "basicSearchTerm": "KNKN555",
        "x": 24,
        "y": 4,
        "exactMatchInd": "Y",
        "jsValidated" : "true",
        "currentPage": "searchAppl.jsp",
        "actionElement": "actionApplicationSearch"
    }
)

In [None]:
response = chrome.request(
    "GET",
    "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/results.jsp?applSearchKey=applSearchKey2019321143892"
)

In [None]:
res = requests.post(
    "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/ApplSearchController",
    data={
        "basicSearchType": "basicSearchCallSign",
        "basicSearchTerm": "KNKN555",
        "x": "24",
        "y": "4",
        "exactMatchInd": "Y",
        "jsValidated" : "true",
        "currentPage": "searchAppl.jsp",
        "actionElement": "actionApplicationSearch"
    }
)