# FCC CMA Scraping (AMEND)

## NOTE: This is a re-run of scraping functions for CMA's with results equal to 10
* see scrape-callsigns_by_cma.ipynb for initial searchs

## Set up selenium webdriver with Firefox
* Download Firefox driver [geckodriver](https://github.com/mozilla/geckodriver/releases) for selenium
* Move the driver executable into the same directory as this notebook (or update the path to the executable in `scrape_table_for_call_sign`)
* `pip install selenium`

-- see scrape-apps_by_callsign.ipynb for more details

In [1]:
from bs4 import BeautifulSoup
import time
import csv

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options  
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException        

## Define functions for scraping individual CMA

In [3]:
def extract_mkt_code(soup):
    '''
    Extracts the market code (CMA) from market search results page
    '''
    
    # tag attributes for element containing "Market Code = " 
    mkt_code_attrs = {"align" : "left", "class" : "cell-pri-light", "valign" : "bottom"}
    
    # extract mkt code as text
    mkt_code = (
        soup.find_all(attrs=mkt_code_attrs, limit=1)[0]
        .find_all("b")[0]
        .get_text()
    )
    
    return mkt_code

In [4]:
def extract_table_data(innerHTML):
    '''
    Extracts the following details by row from innerHTML of results page:
        * Market Code (CMA)
        * Call Sign/Lease ID
        * FRN Name
        * FRN Number
        * Radio Service
        * Status
        * Expiration Date
    Returns as list of rows (as list) without col_names
    '''
    
    
    # Extract search data from innerHTML soup
    soup = BeautifulSoup(innerHTML, "lxml")
    CMA = extract_mkt_code(soup)
    
    # extract search results as list of rows
    rows = (
        soup.find("table", {"summary": "License search results"})
        .find_all("tr")[1:-1] # exclude header/footer name rows
        )

    
    # extract results by row into table, attaching cma number
    table_data = [] 
    
    for row in rows:
        
        row_data = [CMA, ]          
        cols = row.find_all("td")
        
        # extract data from each cell in a row
        for i in range(1, len(cols)):  # skip first index num column
            cell_data = cols[i].get_text().strip()
            row_data.append(cell_data)
        
        table_data.append(row_data)
    
    return(table_data)

In [5]:
def scrape_table_for_cma(mkt_code, without_browser=False, cache_to='search-cache'):
    '''
    Scrapes the FCC License Search database for a given Cellular Market Area (mkt_code)
    Caches innerHTML to 'search-cache/mkt_code.html'.
    Returns list of rows (as list) without col_names:
        * Market Code (CMA)
        * Call Sign/Lease ID
        * FRN Name
        * FRN Number
        * Radio Service
        * Status
        * Expiration Date
    '''

    tick = time.time()
    print("LOG[INFO]: START SCRAPE %s " % mkt_code)

          
    # driver config - start in private
    firefox_profile = webdriver.FirefoxProfile()
    firefox_profile.set_preference("browser.privatebrowsing.autostart", True)
    
    # (optional) don't open up browser
    firefox_options = Options()
    if without_browser:
        firefox_options.add_argument("--headless")

    # direct webdriver to search page
    search_url = "https://wireless2.fcc.gov/UlsApp/UlsSearch/searchMarket.jsp"
    
    firefox = webdriver.Firefox(options=firefox_options, executable_path="./geckodriver")
    firefox.get(search_url)

    # specify the market type we want to search for
    in_markettype = Select(firefox.find_element_by_id("marketType"))
    in_markettype.select_by_value("CMA")

    # specify the market we want to search for
    in_mkt = Select(firefox.find_element_by_id("ulsMarket"))
    in_mkt.deselect_by_visible_text("All")
    in_mkt.select_by_value(mkt_code)
    
    # set pagination to 100 (maximum)
    in_pagination = Select(firefox.find_element_by_name("fiRowsPerPage"))
    in_pagination.select_by_value("100")
        
    # submit search, wait for load
    firefox.find_element_by_name("marketSearch").submit()

    WebDriverWait(firefox, 15).until(EC.url_changes(search_url))
    WebDriverWait(firefox, 15).until(
        lambda _: firefox.execute_script("return document.readyState") == "complete"
        )
    
    # scrape innerHTML & cache
    innerHTML = firefox.execute_script("return document.body.innerHTML")
    
    with ("%s/%s.html" (cache_to,mkt_code)) as f:
        f.write(innerHTML)
        f.close()
    #firefox.quit()
    
    # extract table data 
    table_data = []

    table_data += extract_table_data(innerHTML)
        
    tock = time.time()
    print("LOG[INFO]: END SCRAPE %s (%2f seconds)" % (mkt_code, tock - tick))
    
    return table_data

In [6]:
! ls search-cache
scrape_table_for_cma("CMA002", False)

LOG[INFO]: START SCRAPE CMA002 


WebDriverException: Message: Reached error page: about:neterror?e=dnsNotFound&u=https%3A//wireless2.fcc.gov/UlsApp/UlsSearch/searchMarket.jsp&c=UTF-8&f=regular&d=We%20can%E2%80%99t%20connect%20to%20the%20server%20at%20wireless2.fcc.gov.


## Define functions for searching for list of CMAs

In [14]:
import random

In [60]:
def loop_scrape_table_for_cma(list_cma, max_tries=3, wait_query=True, wait_retry=True):
    '''
    Attempts to scrape for cma codes in `list_cma` up to `max_tries`
    '''
    
    # start search & tries counter
    i = 0
    tries = 1
    
    # log start of loop
    tick = time.time()
    print(
        "LOG[INFO]: START TIME is %s\n"
        "LOG[INFO]: START LOOP from %s"
        % (time.strftime("%I:%M %p"), list_cma[i])
    )
    
    # begin search & scrape loop over mkt_codes
    loop_data = []
    
    # Attempt to scrape all cma's in list_cma up to 3 times
    while (i < len(list_cma) and tries <= max_tries):
        print("LOG[INFO]: Attempt", tries)
        # Succcessful iteration
        try:
            # search for i-th CMA
            mkt_code = list_cma[i]
            loop_data.append(
                scrape_table_for_cma(mkt_code, True)
            )
            i += 1
            # wait random time before next query
            waitTime_query = (
                (random.randint(1,60)) 
                if wait_query == True 
                else 0
            )
            print("LOG[INFO]: WAIT %2f seconds before next query" % waitTime_query)
            time.sleep(waitTime_query)
    
        # Interrupted iteration logs time & error, and waits for retry
        except Exception as e:
            print(
                "LOG[ERROR]: %s\n"
                "LOG[INFO]: LOOP INTERRUPTED at %s\n"
                % (e, time.strftime("%I:%M %p"))
            )
            # cache this attempt
            with open('data-cache/attempt-%s.csv' % tries, newline='') as f:
                writer = csv.writer(f)
                writer.writelines(loop_data)
            # wait for next try
            tries += 1
            waitTime_retry = (
                (random.randint(200,500)) 
                if wait_retry == True 
                else 0
            )
            if tries <= max_tries:
                print("LOG[INFO]: WAIT %2f minutes to retry" % (waitTime_retry/60))
                time.sleep(waitTime_retry)
            else:
                pass
       
    # log end of while loop
    tock = time.time()
    print(
        "LOG[INFO]: END LOOP at %s with %s ATTEMPTS\n"
        "LOG[INFO]: END TIME is %s (TOTAL: %2f minutes)"
        % (list_cma[(i)], (tries-1), time.strftime("%I:%M %p"), ((tock - tick)/60))
    )
    
    return (loop_data)


## Scraping results tables for list of CMA's

### Creating search lists

In [16]:
import pandas as pd

In [19]:
# load previous search results
cma_df = pd.read_csv("data-cache/cma_df.csv", dtype='object')

# select for CMA with count >= 10, and return as list
rows_per_cma = cma_df.groupby(['cma_no']).count()
IS_gt_10 = (rows_per_cma['callsign_leaseID'] == 10)
cma_gt_10 = rows_per_cma[IS_gt_10].reset_index()['cma_no']
cma_gt_10_search = cma_gt_10.tolist()
cma_gt_10_search

['CMA002',
 'CMA014',
 'CMA015',
 'CMA024',
 'CMA026',
 'CMA028',
 'CMA029',
 'CMA039',
 'CMA041',
 'CMA045',
 'CMA053',
 'CMA071',
 'CMA074',
 'CMA077',
 'CMA081',
 'CMA086',
 'CMA089',
 'CMA090',
 'CMA097',
 'CMA098',
 'CMA101',
 'CMA106',
 'CMA113',
 'CMA127',
 'CMA128',
 'CMA141',
 'CMA152',
 'CMA161',
 'CMA162',
 'CMA165',
 'CMA166',
 'CMA168',
 'CMA171',
 'CMA174',
 'CMA176',
 'CMA181',
 'CMA185',
 'CMA187',
 'CMA188',
 'CMA217',
 'CMA221',
 'CMA227',
 'CMA234',
 'CMA245',
 'CMA246',
 'CMA255',
 'CMA264',
 'CMA267',
 'CMA268',
 'CMA273',
 'CMA274',
 'CMA281',
 'CMA283',
 'CMA285',
 'CMA286',
 'CMA289',
 'CMA290',
 'CMA295',
 'CMA297',
 'CMA298',
 'CMA299',
 'CMA306',
 'CMA307',
 'CMA311',
 'CMA313',
 'CMA315',
 'CMA316',
 'CMA317',
 'CMA318',
 'CMA319',
 'CMA320',
 'CMA322',
 'CMA323',
 'CMA336',
 'CMA337',
 'CMA341',
 'CMA342',
 'CMA344',
 'CMA348',
 'CMA351',
 'CMA352',
 'CMA353',
 'CMA354',
 'CMA355',
 'CMA356',
 'CMA361',
 'CMA362',
 'CMA363',
 'CMA367',
 'CMA369',
 'CMA374',

In [22]:
# should have ran amend = loop_scrape_table_for_cma(cma_gt_10_search)
amend = []
amend.append(loop_scrape_table_for_cma(cma_gt_10_search))

LOG[INFO]: START TIME is 12:50 PM
LOG[INFO]: START LOOP from CMA002
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA002 
LOG[INFO]: END SCRAPE CMA002 (19.217074 seconds)
LOG[INFO]: WAIT 18.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA014 
LOG[INFO]: END SCRAPE CMA014 (18.019069 seconds)
LOG[INFO]: WAIT 29.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA015 
LOG[INFO]: END SCRAPE CMA015 (22.973442 seconds)
LOG[INFO]: WAIT 25.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA024 
LOG[INFO]: END SCRAPE CMA024 (14.250369 seconds)
LOG[INFO]: WAIT 50.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA026 
LOG[INFO]: END SCRAPE CMA026 (17.270095 seconds)
LOG[INFO]: WAIT 57.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA028 
LOG[INFO]: END SCRAPE CMA028 (22.470814 seconds)
LOG[INFO]: WAIT 49.000000 seconds before next query
LOG[INFO

LOG[INFO]: Attempt 3
LOG[INFO]: START SCRAPE CMA281 
LOG[INFO]: END SCRAPE CMA281 (19.724655 seconds)
LOG[INFO]: WAIT 24.000000 seconds before next query
LOG[INFO]: Attempt 3
LOG[INFO]: START SCRAPE CMA283 
LOG[INFO]: END SCRAPE CMA283 (19.444132 seconds)
LOG[INFO]: WAIT 49.000000 seconds before next query
LOG[INFO]: Attempt 3
LOG[INFO]: START SCRAPE CMA285 
LOG[INFO]: END SCRAPE CMA285 (18.142432 seconds)
LOG[INFO]: WAIT 27.000000 seconds before next query
LOG[INFO]: Attempt 3
LOG[INFO]: START SCRAPE CMA286 
LOG[INFO]: END SCRAPE CMA286 (21.506659 seconds)
LOG[INFO]: WAIT 37.000000 seconds before next query
LOG[INFO]: Attempt 3
LOG[INFO]: START SCRAPE CMA289 
LOG[INFO]: END SCRAPE CMA289 (21.352690 seconds)
LOG[INFO]: WAIT 57.000000 seconds before next query
LOG[INFO]: Attempt 3
LOG[INFO]: START SCRAPE CMA290 
LOG[INFO]: END SCRAPE CMA290 (23.166645 seconds)
LOG[INFO]: WAIT 29.000000 seconds before next query
LOG[INFO]: Attempt 3
LOG[INFO]: START SCRAPE CMA295 
LOG[INFO]: END SCRAPE C

In [38]:
# unpack unnecessary list level amend[0] is 1 element/

amend1 = amend[0]
len(amend1)

68

In [42]:
# cache search to csv

import csv

with open("data-cache/amend-01.csv", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(amend1)

with open("data-cache/amend-01.csv", newline='') as f:
    reader = csv.reader(f)
    check = list(reader)

In [43]:
check

[["['CMA002', 'KNKA209', 'Los Angeles SMSA Limited Partnership', '0002963817', 'CL', 'Active', '10/01/2024']",
  "['CMA002', 'KNKA351', 'AT&T Mobility Spectrum LLC', '0014980726', 'CL', 'Active', '10/01/2027']",
  "['CMA002', 'KNKR294', 'FOLDEN, GENE A', '', 'CL', 'Terminated', '']",
  "['CMA002', 'KNKR313', 'LOS ANGELES SMSA LIMITED PARTNERSHIP', '0002963817', 'CL', 'Canceled', '09/24/2007']",
  "['CMA002', 'L000009285', 'AirTouch Cellular', '0006146468', 'WY', 'Canceled', '06/13/2019']",
  "['CMA002', 'L000010911', 'Cellco Partnership', '0003290673', 'AW', 'Canceled', '01/25/2014']",
  "['CMA002', 'L000010913', 'New Cingular Wireless PCS, LLC', '0003291192', 'WY', 'Canceled', '01/25/2014']",
  "['CMA002', 'L000015067', 'Screened Images, Inc.', '0021044003', 'CL', 'Active', '10/01/2027']",
  "['CMA002', 'L000023743', 'Screened Images, Inc.', '0021044003', 'AW', 'Active', '11/29/2021']",
  "['CMA002', 'WPOI449', 'Los Angeles SMSA Limited Partnership-AirTouch Cellular', '0002963817', 'C

### amend search 2

In [1]:
amend_search2 = cma_gt_10_search[68:100]

NameError: name 'cma_gt_10_search' is not defined

In [None]:
amend2 = loop_scrape_table_for_cma(amend_search2)

LOG[INFO]: START TIME is 03:56 PM
LOG[INFO]: START LOOP from CMA318
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA318 
LOG[INFO]: END SCRAPE CMA318 (27.119785 seconds)
LOG[INFO]: WAIT 51.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA319 
LOG[INFO]: END SCRAPE CMA319 (23.987354 seconds)
LOG[INFO]: WAIT 60.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA320 
LOG[INFO]: END SCRAPE CMA320 (18.816243 seconds)
LOG[INFO]: WAIT 15.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA322 
LOG[INFO]: END SCRAPE CMA322 (19.085514 seconds)
LOG[INFO]: WAIT 38.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA323 
LOG[INFO]: END SCRAPE CMA323 (35.311683 seconds)
LOG[INFO]: WAIT 55.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA336 
LOG[INFO]: END SCRAPE CMA336 (28.157582 seconds)
LOG[INFO]: WAIT 23.000000 seconds before next query
LOG[INFO

LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA426 
LOG[INFO]: END SCRAPE CMA426 (19.479171 seconds)
LOG[INFO]: WAIT 5.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA427 
LOG[INFO]: END SCRAPE CMA427 (20.711828 seconds)
LOG[INFO]: WAIT 35.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA428 
LOG[INFO]: END SCRAPE CMA428 (20.724603 seconds)
LOG[INFO]: WAIT 25.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA429 
LOG[INFO]: END SCRAPE CMA429 (22.813053 seconds)
LOG[INFO]: WAIT 7.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA430 
LOG[INFO]: END SCRAPE CMA430 (24.021682 seconds)
LOG[INFO]: WAIT 30.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA431 
LOG[INFO]: END SCRAPE CMA431 (27.255892 seconds)
LOG[INFO]: WAIT 54.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA432 
LOG[INFO]: END SCRAPE CMA

LOG[INFO]: END SCRAPE CMA569 (23.907539 seconds)
LOG[INFO]: WAIT 33.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA571 
LOG[INFO]: END SCRAPE CMA571 (22.288681 seconds)
LOG[INFO]: WAIT 51.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA572 
LOG[INFO]: END SCRAPE CMA572 (26.438697 seconds)
LOG[INFO]: WAIT 57.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA573 
LOG[INFO]: END SCRAPE CMA573 (24.760632 seconds)
LOG[INFO]: WAIT 19.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA575 
LOG[INFO]: END SCRAPE CMA575 (26.498888 seconds)
LOG[INFO]: WAIT 16.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA576 
LOG[INFO]: END SCRAPE CMA576 (20.404939 seconds)
LOG[INFO]: WAIT 6.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA577 
LOG[INFO]: END SCRAPE CMA577 (25.862363 seconds)
LOG[INFO]: WAIT 9.000000 sec

#### --- ignore test code ---