# FCC Application Scraping

## Set up selenium webdriver with Firefox
* Download Firefox driver [geckodriver](https://github.com/mozilla/geckodriver/releases) for selenium
* Move the driver executable into the same directory as this notebook (or update the path to the executable in `scrape_table_for_call_sign`)
* `pip install selenium`

-- see scrape-apps_by_callsign.ipynb for more details

In [1]:
from bs4 import BeautifulSoup
import time

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options  
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException        

In [157]:
import os

## Test Search (CMA001)

* https://stackabuse.com/python-check-if-a-file-or-directory-exists/
* template for `scrape_table_for_CMA()`

In [162]:
if os.path.isfile("./test-search.html"):
    print("test-search.html exists")

else:
    print("LOG[INFO]: Running Test Search")

    # Scrape Test Search results page for test market
    mkt_num = "CMA001"

    # driver config - start in private
    firefox_profile = webdriver.FirefoxProfile()
    firefox_profile.set_preference("browser.privatebrowsing.autostart", True)

    # spin up the webdriver
    firefox = webdriver.Firefox(executable_path="./geckodriver")
    firefox.get("https://wireless2.fcc.gov/UlsApp/UlsSearch/searchMarket.jsp")

    # specify the market type we want to search for
    in_markettype = Select(firefox.find_element_by_id("marketType"))

    in_markettype.select_by_value("CMA")

    # specify the market we want to search for
    in_mkt = Select(firefox.find_element_by_id("ulsMarket"))

    in_mkt.deselect_by_visible_text("All")
    in_mkt.select_by_value(mkt_num)

    # submit search, wait for load
    firefox.find_element_by_name("marketSearch").submit()
    WebDriverWait(firefox, 15).until(
        EC.url_changes("https://wireless2.fcc.gov/UlsApp/UlsSearch/searchMarket.jsp"))
    WebDriverWait(firefox, 15).until(
        lambda _: firefox.execute_script("return document.readyState") == "complete")

    # scrape testhtml soup
    testHTML = firefox.execute_script("return document.body.innerHTML")

    with open("test-search.html", "w") as f:
        f.write(testHTML)
    
    firefox.quit()

test-search.html exists


In [150]:
# making the soup, finding the results table
with open("test-search.html", "r") as f:
    testHTML = f.read()

soup = BeautifulSoup(testHTML, "lxml")
table_page = soup.find("table", {"summary": "License search results"})

In [168]:
# finding search col names
first_row_cols = table_page.find_all("tr")[0].find_all("th")
search_cols = []
for i in range(0, len(first_row_cols)):
    cell_data = first_row_cols[i].get_text().strip()
    search_cols.append(cell_data)
print(search_cols)

['Call Sign/Lease ID', 'Name', 'FRN', 'Radio Service', 'Status', 'Expiration Date']


In [169]:
# defining extracted table column names
col_names = ['Market Number', 'Call Sign/Lease ID', 'FRN', 'Radio Service', 'Status', 'Expiration Date']

## Define functions for scraping individual CMA

* https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html#arg-attrs
* https://stackoverflow.com/questions/4768941/how-to-break-a-line-of-chained-methods-in-python

In [171]:
def extract_table_data(innerHTML):
    '''
    Extracts the following details by row from innerHTML of results page:
        * Market Code (CMA)
        * Call Sign
        * FRN
        * Radio Service
        * Status
        * Expiration Date
    Returns as list of rows (as list) without col_names
    '''
    
    
    # Extract search data from innerHTML soup
    soup = BeautifulSoup(innerHTML, "lxml")
    CMA = extract_mkt_code(soup)
    
    # extract search results as list of rows
    rows = (
        soup.find("table", {"summary": "License search results"})
        .find_all("tr")[1:-1] # exclude header/footer name rows
        )

    
    # extract results by row into table, attaching cma number
    table_data = [] 
    
    for row in rows:
        
        row_data = [CMA, ]          
        cols = row.find_all("td")
        
        # extract data from each cell in a row
        for i in range(1, len(cols)):  # skip first index num column
            cell_data = cols[i].get_text().strip()
            row_data.append(cell_data)
        
        table_data.append(row_data)
    
    return(table_data)

In [170]:
def extract_mkt_code(soup):
    '''
    Extracts the market code (CMA) from market search results page
    '''
    
    # tag attributes for element containing "Market Code = " 
    mkt_code_attrs = {"align" : "left", "class" : "cell-pri-light", "valign" : "bottom"}
    
    # extract mkt code as text
    mkt_code = (
        soup.find_all(attrs=mkt_code_attrs, limit=1)[0]
        .find_all("b")[0]
        .get_text()
    )
    
    return mkt_code

In [172]:
extract_table_data(innerHTML)

[['CMA001',
  'KNKA206',
  'New York SMSA Limited Partnership',
  '0003473220',
  'CL',
  'Active',
  '10/01/2024'],
 ['CMA001',
  'KNKA310',
  'NEW CINGULAR WIRELESS PCS, LLC',
  '0003291192',
  'CL',
  'Active',
  '10/01/2025'],
 ['CMA001',
  'L000009970',
  'T-Mobile License LLC',
  '0001565449',
  'AW',
  'Canceled',
  '02/28/2013'],
 ['CMA001',
  'WPWU948',
  'New Cingular Wireless PCS, LLC',
  '0003291192',
  'WZ',
  'Active',
  '06/13/2019'],
 ['CMA001',
  'WQGB263',
  'Cellco Partnership',
  '0003290673',
  'AW',
  'Active',
  '11/29/2021'],
 ['CMA001',
  'WQJU424',
  'New Cingular Wireless PCS, LLC',
  '0003291192',
  'WY',
  'Active',
  '06/13/2019'],
 ['CMA001',
  'WQWQ745',
  'Northstar Wireless, LLC',
  '0023917453',
  'AT',
  'Active',
  '10/27/2027']]

In [173]:
def scrape_table_for_cma(mkt_code, without_browser=False):
    '''
    Scrapes the FCC License Search database for a given Cellular Market Area (mkt_code)
    Returns list of rows (as list) without col_names:
        * Market Code (CMA)
        * Call Sign
        * FRN
        * Radio Service
        * Status
        * Expiration Date
    '''

    tick = time.time()
    print("LOG[INFO]: START %s " % mkt_code)

          
    # driver config - start in private
    firefox_profile = webdriver.FirefoxProfile()
    firefox_profile.set_preference("browser.privatebrowsing.autostart", True)
    
    # (optional) don't open up browser
    firefox_options = Options()
    if without_browser:
        firefox_options.add_argument("--headless")

    # direct webdriver to search page
    search_url = "https://wireless2.fcc.gov/UlsApp/UlsSearch/searchMarket.jsp"
    
    firefox = webdriver.Firefox(options=firefox_options, executable_path="./geckodriver")
    firefox.get(search_url)

    # specify the market type we want to search for
    in_markettype = Select(firefox.find_element_by_id("marketType"))
    in_markettype.select_by_value("CMA")

    # specify the market we want to search for
    in_mkt = Select(firefox.find_element_by_id("ulsMarket"))
    in_mkt.deselect_by_visible_text("All")
    in_mkt.select_by_value(mkt_code)
        
    # submit search, wait for load
    firefox.find_element_by_name("marketSearch").submit()

    WebDriverWait(firefox, 15).until(EC.url_changes(search_url))
    WebDriverWait(firefox, 15).until(
        lambda _: firefox.execute_script("return document.readyState") == "complete"
        )
    
    # scrape innerHTML
    innerHTML = firefox.execute_script("return document.body.innerHTML")
    firefox.quit()
    
    # extract table data 
    table_data = []

    table_data += extract_table_data(innerHTML)
        
    tock = time.time()
    print("LOG[INFO]: END %s (%2f seconds)" % (mkt_code, tock - tick))
    
    return table_data

In [166]:
scrape_table_for_cma("CMA001", False)

LOG[INFO]: START CMA001 
LOG[INFO]: END CMA001 (30.416177 seconds)


[['CMA001',
  'KNKA206',
  'New York SMSA Limited Partnership',
  '0003473220',
  'CL',
  'Active',
  '10/01/2024'],
 ['CMA001',
  'KNKA310',
  'NEW CINGULAR WIRELESS PCS, LLC',
  '0003291192',
  'CL',
  'Active',
  '10/01/2025'],
 ['CMA001',
  'L000009970',
  'T-Mobile License LLC',
  '0001565449',
  'AW',
  'Canceled',
  '02/28/2013'],
 ['CMA001',
  'WPWU948',
  'New Cingular Wireless PCS, LLC',
  '0003291192',
  'WZ',
  'Active',
  '06/13/2019'],
 ['CMA001',
  'WQGB263',
  'Cellco Partnership',
  '0003290673',
  'AW',
  'Active',
  '11/29/2021'],
 ['CMA001',
  'WQJU424',
  'New Cingular Wireless PCS, LLC',
  '0003291192',
  'WY',
  'Active',
  '06/13/2019'],
 ['CMA001',
  'WQWQ745',
  'Northstar Wireless, LLC',
  '0023917453',
  'AT',
  'Active',
  '10/27/2027']]

## Scraping results tables for all CMA's

* https://stackoverflow.com/questions/2050637/appending-the-same-string-to-a-list-of-strings-in-python
* https://www.csestack.org/python-padding-number-string/
* https://stackoverflow.com/questions/18265935/python-create-list-with-numbers-between-2-values

In [69]:
cma_count = 734
cma_range = list(range(1,cma_count + 1))    # 734 CMA's in total
cma_search_list = ['CMA' + str(s).zfill(3) for s in CMA_range]
assert(len(cma_search_list) == 734)

In [None]:
def get_mktsearch_results(mkt_num)
    # driver config - start in private
    firefox_profile = webdriver.FirefoxProfile()
    firefox_profile.set_preference("browser.privatebrowsing.autostart", True)
    
    # (optional) don't open up browser
    firefox_options = Options()
    if without_browser:
        firefox_options.add_argument("--headless")

    # spin up the webdriver
    firefox = webdriver.Firefox(options=firefox_options, executable_path="./geckodriver")
    firefox.get("https://wireless2.fcc.gov/UlsApp/UlsSearch/searchMarket.jsp")

    # specify the market type we want to search for
    in_markettype = Select(firefox.find_element_by_id("marketType"))
    in_markettype.select_by_value("CMA")

    # specify the market we want to search for
    in_mkt = Select(firefox.find_element_by_id("ulsMarket"))
    in_mkt.deselect_by_visible_text("All")
    in_mkt.select_by_value(mkt_num)

    # submit search, wait for load
    firefox.find_element_by_name("marketSearch").submit()

In [4]:
scrape_table_for_cma("CMA001")
firefox.quit()

NameError: name 'firefox' is not defined

## Scrape the FCC tables for a list of call signs

In [12]:
# read in the list of FCC callsigns
call_signs = ["KNKN555", "KNKN556"]

In [13]:
# collects the scraped data for *all* the call signs into a single python list
scraped_data = []
for cs in call_signs:
    scraped_data.append(scrape_table_for_call_sign(cs, without_browser=False))

NameError: name 'scrape_table_for_call_sign' is not defined

In [18]:
# sanity check over data

In [54]:
# print first 10 rows
for row in scraped_data[:10]:
    print(row)

['0000083337', '', 'AT&T Wireless Services of Florida, Inc.', '', 'Assignment of Authorization', '', 'AL', '02/18/2000', 'Consummated']
['0000214527', 'KNKN555', 'AT&T Wireless Services of Florida, Inc.', '', 'Renewal/Modification', '', 'CL', '09/05/2000', 'Granted']
['0000287045', 'KNKN555', 'AT&T Wireless Services of Florida, Inc.', '', 'Duplicate License', '', 'CL', '12/08/2000', 'Granted']
['0000326671', 'KNKN555', 'AT&T Wireless Services of Florida, Inc.', '', 'Duplicate License', '', 'CL', '01/16/2001', 'Granted']
['0000446751', 'KNKN555', 'AT&T Wireless Services of Florida, Inc.', '', 'Administrative Update', '', 'CL', '05/02/2001', 'Granted']
['0000545823', '', 'AT&T Wireless Services, Inc.', '0004122032', 'Transfer of Control', '', 'AL', '08/01/2001', 'Inactive']
['0000545823', '', 'AT&T Wireless Services, Inc.', '0004122032', 'Amendment', '(Transfer of Control)', 'AL', '12/20/2001', 'Granted']
['0000949139', 'KNKN555', 'AT&T Wireless Services of Florida, Inc.', '', 'Modificat