# REFERENCE CODE for Scraping ULS License Database

Used to run market-based searches for 734 Cellular Market Areas (CMAs).
Returns all call signs/lease IDs under a given CMA, along with the most recent licensee's details and the status of that license. For further clarification see: [Screenshots](https://www.dropbox.com/s/rps2ctf3d7qbsq3/Screen_Capture_cma_search_results.pdf?dl=0)

Key parameters
```python
search_url = "https://wireless2.fcc.gov/UlsApp/UlsSearch/searchMarket.jsp"
firefox.find_element_by_name("marketSearch").submit() # THIS IS THE FORM NAME not the SUBMIT BUTTON
soup.find("table", {"summary": "License search results"})
```

## Set up Selenium webdriver with Firefox
* Download Firefox driver [geckodriver](https://github.com/mozilla/geckodriver/releases) for selenium
* Move the driver executable into the same directory as this notebook, or update the path to the executable in:

```python
firefox = webdriver.Firefox(options=firefox_options, executable_path="./geckodriver")
```

## Import Packages

In [1]:
from bs4 import BeautifulSoup
import time
import random
import csv
import pandas as pd

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options  
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException        

## Define functions for scraping individual CMA

In [None]:
def extract_mkt_code(soup):
    '''
    Helper function: pulls the market code (CMA) text out of a parsed
    market search results page.

    `soup` is the parsed results page. The text of the first <b> tag inside
    the first element matching the attribute filter below is returned.
    An IndexError is raised if no such element or no <b> tag is found.
    '''

    # the element holding "Market Code = " is identified by these tag attributes
    cell_attrs = {"align": "left", "class": "cell-pri-light", "valign": "bottom"}

    # first matching element -> its first <b> tag -> text content
    matching_cells = soup.find_all(attrs=cell_attrs, limit=1)
    bold_tags = matching_cells[0].find_all("b")

    return bold_tags[0].get_text()

In [5]:
def extract_table_data(innerHTML):
    '''
    Parses the innerHTML of a results page and pulls out one record per
    result row. Going by the results-page layout, each record holds:
        * Market Code (CMA)
        * Call Sign/Lease ID
        * FRN Name
        * FRN Number
        * Radio Service
        * Status
        * Expiration Date
    Returns a list of rows (each row a list of strings); no column names
    are included.
    '''

    # parse the page and read the market code once; it is prepended to every row
    parsed = BeautifulSoup(innerHTML, "lxml")
    market = extract_mkt_code(parsed)

    # locate the results table and drop its first and last <tr> (header/footer rows)
    results_table = parsed.find("table", {"summary": "License search results"})
    body_rows = results_table.find_all("tr")[1:-1]

    # per row: market code, then the stripped text of every cell except the
    # leading index-number column
    return [
        [market] + [cell.get_text().strip() for cell in tr.find_all("td")[1:]]
        for tr in body_rows
    ]

In [14]:
def cache_innerHTML(mkt_code, cache_to, without_browser=False):
    '''
    Runs a market search on the FCC License Search site for a given
    Cellular Market Area (mkt_code) and caches the results page.

    Parameters:
        * mkt_code: value of the option to select in the "ulsMarket" dropdown
        * cache_to: directory the page is cached in; created if it is missing
        * without_browser: if True, Firefox is started with "--headless"

    Writes the page's innerHTML to '[cache_to]/[mkt_code].html' (UTF-8).
    Returns the innerHTML of the results page as a string.

    Any Selenium error (missing element, wait timeout, ...) propagates to the
    caller; the browser is always shut down first.
    '''
    import os  # local import: only needed to make sure the cache directory exists

    tick = time.time()
    print("LOG[INFO]: START SCRAPE %s " % mkt_code)

    # driver config - start in private mode.
    # The preference is set on the options object that is actually handed to
    # the driver; a separate FirefoxProfile that is never passed on has no effect.
    firefox_options = Options()
    firefox_options.set_preference("browser.privatebrowsing.autostart", True)

    # (optional) don't open up browser
    if without_browser:
        firefox_options.add_argument("--headless")

    search_url = "https://wireless2.fcc.gov/UlsApp/UlsSearch/searchMarket.jsp"

    firefox = webdriver.Firefox(options=firefox_options, executable_path="./geckodriver")
    try:
        # direct webdriver to search page
        firefox.get(search_url)

        # specify the market type we want to search for
        in_markettype = Select(firefox.find_element_by_id("marketType"))
        in_markettype.select_by_value("CMA")

        # specify the market we want to search for
        in_mkt = Select(firefox.find_element_by_id("ulsMarket"))
        in_mkt.deselect_by_visible_text("All")
        in_mkt.select_by_value(mkt_code)

        # set pagination to 100 (maximum)
        in_pagination = Select(firefox.find_element_by_name("fiRowsPerPage"))
        in_pagination.select_by_value("100")

        # submit search -- THIS IS THE FORM NAME not the SUBMIT BUTTON
        firefox.find_element_by_name("marketSearch").submit()

        # wait until we have left the search page and the new page finished loading
        WebDriverWait(firefox, 15).until(EC.url_changes(search_url))
        WebDriverWait(firefox, 15).until(
            lambda driver: driver.execute_script("return document.readyState") == "complete"
        )

        # scrape innerHTML
        innerHTML = firefox.execute_script("return document.body.innerHTML")
    finally:
        # always quit the webdriver, otherwise every failed attempt leaves a
        # Firefox/geckodriver process behind
        firefox.quit()

    # cache the page; str() mirrors the "%s" formatting used for the file path
    cache_dir = str(cache_to)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    # explicit encoding so non-ASCII page content does not depend on the OS default
    with open("%s/%s.html" % (cache_to, mkt_code), 'w', encoding="utf-8") as f:
        f.write(innerHTML)

    tock = time.time()
    print("LOG[INFO]: END SCRAPE %s (%.2f seconds)" % (mkt_code, tock - tick))

    return innerHTML

In [7]:
def scrape_table_for_cma(mkt_code, cache_to='search-cache', without_browser=False):
    '''
    Queries the FCC License Search database for one Cellular Market Area
    (mkt_code) and returns the parsed result rows.

    The page fetch is delegated to cache_innerHTML (which also caches the
    page under `cache_to`), and the parsing to extract_table_data.
    Returns a new list of rows (each row a list) without column names.
    '''

    # spin up webdriver, run the search and cache the page
    page_html = cache_innerHTML(mkt_code, cache_to, without_browser)

    # parse the cached page into rows, copied into a fresh list
    return list(extract_table_data(page_html))

## Define functions for searching for list of CMAs

For (messy) production runs see archive-code/scrape-cma_xx.ipynb


In [8]:
def loop_scrape_table_for_cma(list_cma, max_tries=3, wait_query=True, wait_retry=True,
                              cache_to='search-cache', without_browser=True):
    '''
    Scrapes the cma codes in `list_cma` in order, caching the .html of each search.

    Parameters:
        * list_cma: sequence of market codes to search for
        * max_tries: total number of attempts allowed for the whole loop; every
          failed query uses one up (the counter is not reset after a success)
        * wait_query: if truthy, wait a random 1-60 secs between queries
        * wait_retry: if truthy, wait a random 200-500 secs after a failed query
          before trying the same cma again
        * cache_to: directory passed on to scrape_table_for_cma for the cached pages
        * without_browser: passed on to scrape_table_for_cma (True = headless)

    Returns all rows collected so far as one list. This is also what is returned
    when the attempts run out or the loop is interrupted from the keyboard,
    so partial results are never lost.
    '''
    # bound before anything can fail, so there is always something to return
    loop_data = []

    # len() rather than truthiness: list_cma may be a sequence type whose
    # truth value is ambiguous
    if len(list_cma) == 0:
        print("LOG[INFO]: NOTHING TO SCRAPE (list_cma is empty)")
        return loop_data

    # search & tries counters
    i = 0
    tries = 1

    # log start of loop
    tick = time.time()
    print(
        "LOG[INFO]: START TIME is %s\n"
        "LOG[INFO]: START LOOP from %s"
        % (time.strftime("%I:%M %p"), list_cma[i])
    )

    try:
        # attempt to scrape all cma's in list_cma, allowing up to max_tries attempts
        while i < len(list_cma) and tries <= max_tries:
            print("LOG[INFO]: Attempt", tries)

            try:
                # search for i-th CMA; keyword arguments so the flags cannot
                # land in the wrong positional slot
                mkt_code = list_cma[i]
                result = scrape_table_for_cma(
                    mkt_code, cache_to=cache_to, without_browser=without_browser
                )

            # Interrupted iteration logs time & error, and waits for retry
            except Exception as e:
                print(
                    "LOG[ERROR]: %s\n"
                    "LOG[INFO]: LOOP INTERRUPTED at %s\n"
                    % (e, time.strftime("%I:%M %p"))
                )
                tries += 1
                if tries <= max_tries:
                    waitTime_retry = random.randint(200, 500) if wait_retry else 0
                    print("LOG[INFO]: WAIT %.2f minutes to retry" % (waitTime_retry / 60))
                    time.sleep(waitTime_retry)
                continue

            # Successful iteration
            loop_data += result
            i += 1

            # wait random time before next query (no point waiting after the last one)
            if i < len(list_cma):
                waitTime_query = random.randint(1, 60) if wait_query else 0
                print("LOG[INFO]: WAIT %.2f seconds before next query" % waitTime_query)
                time.sleep(waitTime_query)

    except KeyboardInterrupt:
        # deliberate best-effort: a manual interrupt still hands back what was scraped
        print("LOG[INFO]: LOOP STOPPED BY USER at %s" % time.strftime("%I:%M %p"))

    # log end of loop; None means no cma was scraped successfully
    tock = time.time()
    last_done = list_cma[i - 1] if i > 0 else None
    print(
        "LOG[INFO]: END LOOP at %s with %s ATTEMPTS\n"
        "LOG[INFO]: END TIME is %s (TOTAL: %.2f minutes)"
        % (last_done, min(tries, max_tries), time.strftime("%I:%M %p"), ((tock - tick) / 60))
    )

    return loop_data
