# FCC Application Scraping

## Set up selenium webdriver with Firefox
* Download Firefox driver [geckodriver](https://github.com/mozilla/geckodriver/releases) for selenium
* Move the driver executable into the same directory as this notebook (or update the `executable_path` argument in `scrape_tables_for_call_sign`)
* `pip install selenium`

In [1]:
from bs4 import BeautifulSoup
import time
import random
import csv

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options  
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException        

In [3]:
import os
import pandas as pd

## Define functions for scraping table for an individual call sign

In [51]:
def element_exists(driver, xpath):
    """
    Report whether an element matching `xpath` is present on the page that
    `driver` currently has loaded.

    The scraper uses this to detect a "Next" link, i.e. whether the FCC's
    paginated results table has another page to visit.

    Returns True when the lookup succeeds and False when selenium raises
    NoSuchElementException.
    """
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        found = False
    else:
        found = True
    return found


In [115]:
def extract_callsign(innerHTML):
    '''
    Pull the call sign text out of an Application search results page.

    The scraper prepends this value (CALLSIGN) to every row it collects from
    the results table.
    '''
    # attributes identifying the element that holds the text "Call Sign = "
    label_attrs = {"class":"cell-pri-light", "valign":"bottom", "align":"left"}

    page = BeautifulSoup(innerHTML, "lxml")
    label_cell = page.find_all(attrs=label_attrs, limit=1)[0]

    # the call sign is the text of the first bold tag inside that element
    first_bold = label_cell.find_all("b")[0]
    return first_bold.get_text()

In [116]:
def count_results(innerHTML):
    '''
    Read the total number of search results (across all pages) from a
    results page.

    The scraper compares this figure with the number of rows it collected.
    '''
    # attributes identifying the element holding text like "Matches 1-47 (of 47)"
    summary_attrs = {"class":"cell-pri-dark", "width":"35%", "valign":"middle", "align":"left"}

    page = BeautifulSoup(innerHTML, "lxml")
    summary_cell = page.find_all(attrs=summary_attrs, limit=1)[0]

    # the second bold tag carries the total, e.g. the 47 inside "(of 47)"
    total_text = summary_cell.find_all("b")[1].get_text()
    return int(total_text)

In [117]:
def scrape_table_page(innerHTML):
    """
    Convert one page of call sign search results into rows of cell text.

    Every returned row starts with the call sign read from the page itself,
    followed by the text of each cell except the first (index) cell:
        * File Number
        * Call Sign/Lease ID
        * Applicant Name
        * FRN
        * Purpose - split into Purpose_Main and Purpose_Note
        * Radio Service
        * Receipt Date
        * Status
    Returns as a list of lists.
    """
    soup = BeautifulSoup(innerHTML, "lxml")
    page_callsign = extract_callsign(innerHTML)
    results_table = soup.find("table", {"summary": "Application search results"})

    table_data = []

    # the first <tr> is the header and the last one holds the pagination links
    for tr in results_table.find_all("tr")[1:-1]:
        record = [page_callsign]
        cells = tr.find_all("td")

        # position 0 is the index cell, which is not part of the output
        for position, cell in enumerate(cells[1:], start=1):
            text = cell.get_text().strip()

            # every cell except "Purpose" (position 5) is stored as-is
            if position != 5:
                record.append(text)
                continue

            # "Purpose" becomes two columns: the first line, plus the last
            # line as an optional note when the cell spans several lines
            pieces = text.split("\n")
            note = pieces[-1].strip() if len(pieces) >= 2 else ""
            record.extend([pieces[0].strip(), note])

        table_data.append(record)

    return table_data


In [123]:
def cache_to_csv(lst, path):
    """
    Write the rows in `lst` to a CSV file at `path`, then read the file back
    and assert that it holds as many rows as were passed in.

    Prints a log line with the row count and the path on success.
    """
    expected_count = len(lst)

    with open(path, 'w', newline='') as out_file:
        csv.writer(out_file).writerows(lst)

    # read the file back to confirm every row landed on disk
    with open(path, 'r', newline='') as in_file:
        rows_on_disk = [record for record in csv.reader(in_file)]

    assert len(rows_on_disk) == len(lst)

    print('LOG[INFO]: Wrote %s rows to %s' % (expected_count, path))

In [134]:
def make_html_path(parent_dir, call_sign, page_no):
    '''
    Build the path under which one results page is cached as HTML.

    The page number is left-padded with zeros to at least three digits,
    giving "<parent_dir>/<call_sign>-<page_no>.html".
    '''
    padded_page = str(page_no).zfill(3)
    file_name = "%s-%s.html" % (call_sign, padded_page)
    return "%s/%s" % (parent_dir, file_name)

In [133]:
# spot check: the page number gets zero-padded to three digits
make_html_path('cake', 'is', '4')

'cake/is-004.html'

In [157]:
def scrape_tables_for_call_sign(call_sign, html_cache, csv_cache, without_browser=False):
    """
    Scrapes the FCC Application Search database for a given call sign.

    Opens the advanced search form in Firefox, searches for `call_sign` with
    100 results per page and follows the "Next page of results" link until
    no such link is left. Every results page is cached as HTML under
    `html_cache`, and the combined rows are cached as a CSV file named
    `<call_sign>.csv` inside `csv_cache`.

    Parameters:
        call_sign: call sign typed into the search form; also used to name
            the cached files.
        html_cache: existing directory for the per-page HTML files.
        csv_cache: existing directory for the CSV file (a trailing slash is
            optional).
        without_browser: when True, Firefox is started with "--headless".

    Returns the scraped rows as a list of lists. There is no header row;
    each row is what scrape_table_page produces, i.e. the call sign found on
    the results page followed by the table cells, for example:
    [
      ['KNKA456', '60245CLMP99', 'KNKA456', 'DAVENPORT CELLULAR TELEPHONE COMPANY', '0001702190', 'New', '', 'CL', '01/01/1980', 'Granted'],
      ...,
      ...
    ]
    The columns are: call sign, File Number, Call Sign/Lease ID,
    Applicant Name, FRN, Purpose_Main, Purpose_Note, Radio Service,
    Receipt Date, Status.

    Any exception raised while driving the browser propagates to the caller;
    the browser is closed first.
    """
    tick = time.time()
    print("LOG[INFO]: START %s " % call_sign)

    search_url = "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/searchAdvanced.jsp"
    next_page_xpath = "//*[@title='Next page of results']"

    def page_loaded(driver):
        return driver.execute_script("return document.readyState") == "complete"

    # driver config - start in private
    # (the preference is set on the options object so that it actually
    # reaches the driver that is started below)
    firefox_options = Options()
    firefox_options.set_preference("browser.privatebrowsing.autostart", True)

    # (optional) don't open up browser
    if without_browser:
        firefox_options.add_argument("--headless")

    # spin up the webdriver
    firefox = webdriver.Firefox(options=firefox_options, executable_path="./geckodriver")

    search_data = []
    no_results = None
    page = 0

    # from here on the browser must be closed even when a step fails,
    # otherwise every failed attempt leaves a Firefox process behind
    try:
        firefox.get(search_url)

        # specify the call sign that we want to scrape for
        firefox.find_element_by_id("ulsCallSign").send_keys(call_sign)

        # set pagination to 100 (maximum)
        pagination = Select(firefox.find_element_by_xpath("//select[@name='pageSize']"))
        pagination.select_by_visible_text("100")

        # submit the search form, wait for load
        # NOTE: find form submission by form element name, not search button image
        firefox.find_element_by_name("search").submit()
        WebDriverWait(firefox, 45).until(EC.url_changes(search_url))
        WebDriverWait(firefox, 45).until(page_loaded)

        while True:
            # extract and cache html of the current page
            page += 1
            innerHTML = firefox.execute_script("return document.body.innerHTML")
            with open(make_html_path(html_cache, call_sign, page), 'w', encoding='utf-8') as f:
                f.write(innerHTML)

            # store count of search results (read from the first page) for later check
            if no_results is None:
                no_results = count_results(innerHTML)

            # scrape and add page results to all results
            search_data += scrape_table_page(innerHTML)

            # as long as there is a link to "next page", go to it
            if not element_exists(firefox, next_page_xpath):
                break
            next_link = firefox.find_element_by_xpath(next_page_xpath)
            next_link.click()

            # readyState can still read "complete" for the old page right
            # after the click, so first wait until the clicked link has been
            # detached from the DOM, then wait for the new page to load
            WebDriverWait(firefox, 45).until(EC.staleness_of(next_link))
            WebDriverWait(firefox, 45).until(page_loaded)
    finally:
        # gracefully close the webdriver
        firefox.quit()

    # Check that all results have been scraped
    no_rows = len(search_data)
    print("LOG[INFO]: Scraped %s rows out of %s results" % (no_rows, no_results))
    if no_rows != no_results:
        print("LOG[WARN]: row count for %s does not match the reported number of results" % call_sign)

    # Cache search results table to csv (all results from all pages)
    csv_path = os.path.join(csv_cache, call_sign + ".csv")
    cache_to_csv(search_data, csv_path)

    tock = time.time()
    print("LOG[INFO]: END %s (%.2f seconds)" % (call_sign, tock - tick))

    return search_data
     

## Scrape the tables for a list of call signs (loop)

In [151]:
# load the call sign lookup table and keep its 'callsign_leaseID' column
# as a sorted python list

callsign_path = 'data-cache/cma_callsign_lookup.csv'

callsigns = (
    pd.read_csv(callsign_path)['callsign_leaseID']
    .sort_values()
    .tolist()
)



['B',
 'KNKA200',
 'KNKA201',
 'KNKA202',
 'KNKA203',
 'KNKA204',
 'KNKA205',
 'KNKA206',
 'KNKA207',
 'KNKA208',
 'KNKA209',
 'KNKA210',
 'KNKA211',
 'KNKA212',
 'KNKA214',
 'KNKA215',
 'KNKA216',
 'KNKA217',
 'KNKA218',
 'KNKA219',
 'KNKA220',
 'KNKA221',
 'KNKA222',
 'KNKA223',
 'KNKA224',
 'KNKA225',
 'KNKA226',
 'KNKA227',
 'KNKA228',
 'KNKA229',
 'KNKA230',
 'KNKA231',
 'KNKA232',
 'KNKA233',
 'KNKA234',
 'KNKA235',
 'KNKA236',
 'KNKA237',
 'KNKA238',
 'KNKA239',
 'KNKA240',
 'KNKA241',
 'KNKA242',
 'KNKA243',
 'KNKA244',
 'KNKA245',
 'KNKA246',
 'KNKA247',
 'KNKA248',
 'KNKA249',
 'KNKA250',
 'KNKA251',
 'KNKA252',
 'KNKA253',
 'KNKA254',
 'KNKA255',
 'KNKA256',
 'KNKA257',
 'KNKA258',
 'KNKA259',
 'KNKA260',
 'KNKA261',
 'KNKA262',
 'KNKA263',
 'KNKA264',
 'KNKA265',
 'KNKA266',
 'KNKA267',
 'KNKA268',
 'KNKA269',
 'KNKA270',
 'KNKA271',
 'KNKA272',
 'KNKA273',
 'KNKA274',
 'KNKA275',
 'KNKA276',
 'KNKA277',
 'KNKA278',
 'KNKA279',
 'KNKA280',
 'KNKA281',
 'KNKA282',
 'KNKA283'

In [159]:
# `math` is not among the notebook's import cells; the back-off below needs it
import math

tick = time.time()
print("LOG[INFO]: START TIME is %s\n" % (time.strftime("%I:%M %p")))

# `tries` counts failed scrapes; the run stops once it exceeds `max_tries`
tries = 0
max_tries = 5

# first position in `callsigns` to process in this run
start_index = 1

# NOTE: html_cache and csv_cache are assigned in a later cell of this
# notebook; that cell has to be run before this one
for position, cs in enumerate(callsigns[start_index:2], start=start_index):
    if tries > max_tries:
        break

    try:
        scrape_tables_for_call_sign(cs, html_cache, csv_cache, without_browser=False)
    except Exception as e:
        tries += 1
        print("Error at %s, located at callsigns[%s]" % (cs, position))
        print("LOG[ERROR]: %s: %s" % (type(e).__name__, e))
        # back off for several minutes, longer after each further failure
        pause = random.randint(200, 300) * math.sqrt(tries)
    else:
        # short random pause between successful requests
        pause = random.randint(1, 30)

    time.sleep(pause)

tock = time.time()
print("LOG[INFO]: END")

LOG[INFO]: START TIME is 03:34 PM

LOG[INFO]: START KNKA200 
LOG[INFO]: Scraped 266 rows out of 266 results
Wrote 266 rows to path: search-cache/apps-by-callsign/csv/KNKA200.csv
LOG[INFO]: END KNKA200 (15.721977 seconds)
LOG[INFO]: END


In [138]:
# read in the list of FCC callsigns
# NOTE: each assignment replaces the previous one, so only the last list
# (KNKN776, KNKN775) is in effect when this cell finishes; the earlier lists
# are presumably left over from previous test runs
call_signs = ["KNKN555", "KNKN556"]
call_signs = ["KNKA456"]
call_signs = ["KNKA455", "KNKA454"]
call_signs = ["KNKN776", "KNKN775"]
# directories passed to scrape_tables_for_call_sign for the cached
# results pages (HTML) and result tables (CSV)
html_cache = 'search-cache/apps-by-callsign/html'
csv_cache = 'search-cache/apps-by-callsign/csv/'

In [139]:
# collects the scraped data for *all* the call signs into a python list of lists
# (one entry per call sign, in the order of call_signs; each entry is the
# list of rows returned for that call sign)
scraped_data = []
for cs in call_signs:
    scraped_data.append(
        scrape_tables_for_call_sign(cs, html_cache, csv_cache, without_browser=False)
    )

LOG[INFO]: START KNKN776 
LOG[INFO]: Scraped 42 rows out of 42 results
Wrote 42 rows to path: search-cache/apps-by-callsign/csv/KNKN776.csv
LOG[INFO]: END KNKN776 (15.651379 seconds)
LOG[INFO]: START KNKN775 
LOG[INFO]: Scraped 21 rows out of 21 results
Wrote 21 rows to path: search-cache/apps-by-callsign/csv/KNKN775.csv
LOG[INFO]: END KNKN775 (16.539492 seconds)


In [26]:
# sanity check over the data for size of scraped data, and dimensions of callsign tables
# exactly one result table is expected per requested call sign
assert(len(scraped_data) == len(call_signs))
# show the rows of the first call sign as a DataFrame
pd.DataFrame(scraped_data[0])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,,,,,,,,,,
1,KNKA456,0000004285,KNKA456,DAVENPORT CELLULAR TELEPHONE COMPANY,1702190.0,Modification,,CL,12/21/1998,Granted
2,KNKA456,0000011015,KNKA456,DAVENPORT CELLULAR TELEPHONE COMPANY,1702190.0,Modification,,CL,11/12/1998,Granted
3,KNKA456,0000015805,KNKA456,DAVENPORT CELLULAR TELEPHONE COMPANY,1702190.0,Modification,,CL,04/21/1999,Granted
4,KNKA456,0000015852,KNKA456,DAVENPORT CELLULAR TELEPHONE COMPANY,1702190.0,Modification,,CL,05/02/1999,Granted
5,KNKA456,0000018591,KNKA456,DAVENPORT CELLULAR TELEPHONE COMPANY,1702190.0,Modification,,CL,08/18/1999,Granted
6,KNKA456,0000025338,KNKA456,DAVENPORT CELLULAR TELEPHONE COMPANY,1702190.0,Modification,,CL,09/16/1999,Granted
7,KNKA456,0000077568,KNKA456,DAVENPORT CELLULAR TELEPHONE COMPANY,1702190.0,Modification,,CL,02/04/2000,Granted
8,KNKA456,00001CLMP88,KNKA456,DAVENPORT CELLULAR TELEPHONE COMPANY,1702190.0,New,,CL,10/01/1987,Granted
9,KNKA456,0000244304,KNKA456,DAVENPORT CELLULAR TELEPHONE COMPANY,1702190.0,Modification,,CL,10/23/2000,Dismissed


In [21]:
# last row of first scraped call-sign
# (the [-1:] slice prints nothing, instead of raising, when the table is empty)
for row in scraped_data[0][-1:]:
    print(row)

['KNKA456', '60245CLMP99', 'KNKA456', 'DAVENPORT CELLULAR TELEPHONE COMPANY', '0001702190', 'New', '', 'CL', '01/01/1980', 'Granted']


In [9]:
# print first 10 rows of second scraped callsign
# (fewer if that table has fewer than 10 rows)
for row in scraped_data[1][:10]:
    print(row)

[]
['0000386266', '', 'AT&T Wireless Services, Inc.', '0004122032', 'Transfer of Control', '', 'AL', '03/05/2001', 'Granted']
['0000388274', 'KNKN556', 'Pennsylvania Cellular Telephone Corp.', '0001944032', 'Administrative Update', '', 'CL', '03/06/2001', 'Granted']
['0000547695', '', 'AT&T Wireless Services, Inc.', '0004122032', 'Transfer of Control', '', 'AL', '08/02/2001', 'Granted']
['0000591187', 'KNKN556', 'Pennsylvania Cellular Telephone Corp.', '0001944032', 'Renewal/Modification', '', 'CL', '09/13/2001', 'Granted']
['0000788563', 'KNKN556', 'Pennsylvania Cellular Telephone Corp.', '0001944032', 'Administrative Update', '', 'CL', '02/27/2002', 'Granted']
['0001093516', 'KNKN556', 'Pennsylvania Cellular Telephone Corp.', '0001944032', 'Administrative Update', '', 'CL', '11/18/2002', 'Granted']
['0001178597', '', 'Winston, LLC', '0003297520', 'Transfer of Control', '', 'AL', '01/29/2003', 'Granted']
['0001179154', '', 'Vanguard Cellular Pennsylvania, LLC', '0001944032', 'Assignme

# [TODO] Output scraped data
* include readme in folder for column names
* save each table as callsign.csv
* note: empty value for call-sign --> part of larger transaction; see "Licenses" tab for list of all callsigns involved in transaction (only for newer applications)

# - Ignore below this line -

This is code that's probably worth keeping as a reference for now -- in case it gives us insight into solving any bugs we run into. But you can ignore it when you run the scraper.

### Read Me
Notes:
* find submit button using form element name, not submit image button
Reference links:
* http://jonathansoma.com/lede/foundations/classes/friday%20sessions/advanced-scraping-form-submissions-completed/

In [None]:
def scrape_table_for_call_sign(call_sign):
    """
    Prototype: search for a call sign and click through to the second page
    of results. Nothing is scraped and nothing is returned.

    Kept for reference only; scrape_tables_for_call_sign is the complete
    scraper.

    Raises NoSuchElementException when the results page has no
    "Next page of results" link. The browser is closed either way.
    """
    search_url = "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/searchAdvanced.jsp"

    def page_loaded(driver):
        return driver.execute_script("return document.readyState") == "complete"

    # driver config - start in private
    # (set on the options object so the preference actually reaches the driver)
    firefox_options = Options()
    firefox_options.set_preference("browser.privatebrowsing.autostart", True)

    firefox = webdriver.Firefox(options=firefox_options, executable_path="./geckodriver")

    # make sure the browser is closed even if one of the steps below fails
    try:
        firefox.get(search_url)

        # fill in the text input
        callsign = firefox.find_element_by_id("ulsCallSign")
        callsign.send_keys(call_sign)

        # show 10 results per page (presumably so that even short result
        # lists get a "Next" link to experiment with -- confirm)
        pagination = Select(firefox.find_element_by_xpath("//select[@name='pageSize']"))
        pagination.select_by_visible_text("10")

        # submit the search form, wait for load
        firefox.find_element_by_name("search").submit()
        WebDriverWait(firefox, 15).until(EC.url_changes(search_url))
        WebDriverWait(firefox, 15).until(page_loaded)

        #print(firefox.page_source.encode("utf-8"))

        #innerHTML = firefox.execute_script("return document.body.innerHTML")
        #print(innerHTML)

        # follow the link to the next page (raises if there is none)
        firefox.find_element_by_xpath("//*[@title='Next page of results']").click()

        # alternative locators that were tried:
        #next_link = firefox.find_element_by_xpath("//*[@title=\"Next page of results\"][0]")
        #next_link = firefox.find_element_by_xpath("//a[@title='Next page of results']")
        #print(next_link)

        #page_links = firefox.find_element_by_xpath("//a[@href]")
        #print(page_links)

        # count the number of pages we need to iterate through
        #pages = firefox.find_element_by_xpath("//*[contains(@title, 'Page')]")
        #pages = firefox.find_element_by_css_selector("//[title*='Page']")
        #print(pages)

        # leave the browser open for a moment (presumably to inspect the page by eye)
        time.sleep(10)
    finally:
        # gracefully close the webdriver
        firefox.quit()

In [None]:
# try the prototype on a single call sign
scrape_table_for_call_sign("KNKN555")

In [None]:
# Scratch version of the search flow: open the advanced search form, search
# for one call sign and print the HTML of the results page.
# NOTE(review): there is no try/finally here, so the browser stays open if
# any step raises before firefox.quit().
firefox = webdriver.Firefox(executable_path="./geckodriver")
firefox.get("https://wireless2.fcc.gov/UlsApp/ApplicationSearch/searchAdvanced.jsp")

# fill in the text input
callsign = firefox.find_element_by_id("ulsCallSign")
callsign.send_keys("KNKN555")

# fill in the dropdown
#dropdown= firefox.find_element_by_id("radioServiceCode")
#dropdown.send_keys("CL - Cellular")

# pagination to max
# NOTE(review): this looks the control up by id "ulsRowsPerPage", while the
# functions above select it with //select[@name='pageSize'] -- confirm which
# locator matches the form
display = firefox.find_element_by_id("ulsRowsPerPage")
display.send_keys(100)

# submit form
firefox.find_element_by_name("search").submit()
# wait until the browser has left the search form URL
WebDriverWait(firefox, 15).until(EC.url_changes("https://wireless2.fcc.gov/UlsApp/ApplicationSearch/searchAdvanced.jsp"))
#test = WebDriverWait(firefox, 3).until(EC.staleness_of((By.ID, 'ulsCallSign')))
#test = firefox.implicitly_wait(10);

# show the load state right after the URL change
print(firefox.execute_script("return document.readyState"))

# then wait until the document reports that it has finished loading
WebDriverWait(firefox, 15).until( lambda _: firefox.execute_script("return document.readyState") == "complete")
print(firefox.current_url)

# scrape out the table
#print(firefox.page_source.encode("utf-8"))
innerHTML = firefox.execute_script("return document.body.innerHTML")
print(innerHTML)

# keep the browser open for 30 seconds before closing it
time.sleep(30)
firefox.quit()

In [None]:
# Experiment: POST the basic call sign search form through a browser session.
# NOTE(review): neither a bare `Firefox` name nor a `.request()` method is
# provided by the imports visible in this notebook; presumably this relied
# on the selenium-requests package -- confirm before running.
ffx = Firefox(executable_path="./geckodriver")
# form fields sent with the POST (all values as strings)
request_data = {
    "basicSearchType": "basicSearchCallSign",
    "basicSearchTerm": "KNKN555",
    "x": "24",
    "y": "4",
    "exactMatchInd": "Y",
    "jsValidated" : "true",
    "currentPage": "searchAppl.jsp",
    "actionElement": "actionApplicationSearch"
}
ffx.request("POST",
    "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/ApplSearchController",
    data=request_data)

In [None]:
# Experiment: open a page in Chrome through chromedriver.
# NOTE(review): `options` is created but never handed to the driver, and the
# only `Options` import visible in this notebook is the Firefox one --
# confirm which class is intended before wiring it in.
options = Options()  
#options.add_argument("--headless")  
chrome = webdriver.Chrome("./chromedriver")
# WebDriver.get needs an absolute URL including the scheme
chrome.get("https://www.google.com.au")
#chrome.request("GET", "https://www.google.com.au")
#chrome.request("GET", "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/searchAppl.jsp")

In [None]:
# Experiment: submit the basic call sign search directly with `requests`
# instead of driving a browser.
# NOTE(review): `requests` is not imported in any import cell visible in
# this notebook; `import requests` has to run before this cell.
referrer = "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/searchAppl.jsp"
# the HTTP request header is spelled "Referer"; a "referrer" key would be
# sent as an unrelated custom header
headers = { "Referer" : referrer }
# form fields sent with the POST
payload = {
    "basicSearchType": "basicSearchCallSign",
    "basicSearchTerm": "KNKN555",
    "x": 24,
    "y": 4,
    "exactMatchInd": "Y",
    "jsValidated" : "true",
    "currentPage": "searchAppl.jsp",
    "actionElement": "actionApplicationSearch"
}

res = requests.post(
    "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/ApplSearchController",
    data=payload,
    headers=headers,
)

In [None]:
# URL of the second response in the redirect chain recorded for `res`
# (raises IndexError when fewer than two redirects were followed)
res.history[1].url

In [None]:
# Experiment: issue a GET through a Chrome session that exposes `.request()`.
# NOTE(review): a bare `Chrome` name is not provided by the imports visible
# in this notebook (presumably selenium-requests), and the only `Options`
# import visible is the Firefox one -- confirm both before running.
options = Options()  
#options.add_argument("--headless")  
chrome = Chrome("./chromedriver", options=options)
chrome.request("GET", "https://www.fcc.gov/")

In [None]:
# repeat the GET using the `chrome` session created in an earlier cell
chrome.request("GET", "https://www.fcc.gov/")

In [None]:
# Experiment: POST the basic call sign search form through the `chrome`
# session created in an earlier cell.
request_data = {
    "basicSearchType": "basicSearchCallSign",
    "basicSearchTerm": "KNKN555",
    "x": 24,
    "y": 4,
    "exactMatchInd": "Y",
    "jsValidated" : "true",
    "currentPage": "searchAppl.jsp",
    "actionElement": "actionApplicationSearch"
}

# keep the response in `res` for inspection
res = chrome.request(
    "POST",
    "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/ApplSearchController",
    data=request_data
)

In [None]:
# Same POST experiment with a fresh Chrome session and the form fields
# written inline.
# NOTE(review): a bare `Chrome` name is not provided by the imports visible
# in this notebook -- presumably selenium-requests; confirm.
chrome = Chrome("./chromedriver")
response = chrome.request(
    "POST",
    "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/ApplSearchController",
    data={
        "basicSearchType": "basicSearchCallSign",
        "basicSearchTerm": "KNKN555",
        "x": 24,
        "y": 4,
        "exactMatchInd": "Y",
        "jsValidated" : "true",
        "currentPage": "searchAppl.jsp",
        "actionElement": "actionApplicationSearch"
    }
)

In [None]:
# Experiment: fetch a results page directly by URL, using a hard-coded
# applSearchKey value.
# NOTE(review): the key presumably belongs to one past search session --
# confirm whether it can be reused.
response = chrome.request(
    "GET",
    "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/results.jsp?applSearchKey=applSearchKey2019321143892"
)

In [None]:
# Experiment: the same search POST sent with `requests`, without any extra
# headers and with the x/y form fields given as strings.
# NOTE(review): `requests` is not imported in any import cell visible in
# this notebook.
res = requests.post(
    "https://wireless2.fcc.gov/UlsApp/ApplicationSearch/ApplSearchController",
    data={
        "basicSearchType": "basicSearchCallSign",
        "basicSearchTerm": "KNKN555",
        "x": "24",
        "y": "4",
        "exactMatchInd": "Y",
        "jsValidated" : "true",
        "currentPage": "searchAppl.jsp",
        "actionElement": "actionApplicationSearch"
    }
)