# FCC CMA Scraping (AMEND)

## NOTE: This is a re-run of scraping functions for CMA's with results equal to 10
* see scrape-callsigns_by_cma.ipynb for initial searchs

## Set up selenium webdriver with Firefox
* Download Firefox driver [geckodriver](https://github.com/mozilla/geckodriver/releases) for selenium
* Move the driver executable into the same directory as this notebook (or update the path to the executable in `scrape_table_for_call_sign`)
* `pip install selenium`

-- see scrape-apps_by_callsign.ipynb for more details

In [2]:
from bs4 import BeautifulSoup
import time
import csv

In [3]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options  
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException        

## Define functions for scraping individual CMA

In [4]:
def extract_mkt_code(soup):
    '''
    Extracts the market code (CMA) from market search results page
    '''
    
    # tag attributes for element containing "Market Code = " 
    mkt_code_attrs = {"align" : "left", "class" : "cell-pri-light", "valign" : "bottom"}
    
    # extract mkt code as text
    mkt_code = (
        soup.find_all(attrs=mkt_code_attrs, limit=1)[0]
        .find_all("b")[0]
        .get_text()
    )
    
    return mkt_code

In [5]:
def extract_table_data(innerHTML):
    '''
    Extracts the following details by row from innerHTML of results page:
        * Market Code (CMA)
        * Call Sign/Lease ID
        * FRN Name
        * FRN Number
        * Radio Service
        * Status
        * Expiration Date
    Returns as list of rows (as list) without col_names
    '''
    
    
    # Extract search data from innerHTML soup
    soup = BeautifulSoup(innerHTML, "lxml")
    CMA = extract_mkt_code(soup)
    
    # extract search results as list of rows
    rows = (
        soup.find("table", {"summary": "License search results"})
        .find_all("tr")[1:-1] # exclude header/footer name rows
        )

    
    # extract results by row into table, attaching cma number
    table_data = [] 
    
    for row in rows:
        
        row_data = [CMA, ]          
        cols = row.find_all("td")
        
        # extract data from each cell in a row
        for i in range(1, len(cols)):  # skip first index num column
            cell_data = cols[i].get_text().strip()
            row_data.append(cell_data)
        
        table_data.append(row_data)
    
    return(table_data)

In [14]:
def cache_innerHTML(mkt_code, without_browser=False, cache_to='search-cache'):
    '''
    Scrapes the FCC License Search database for a given Cellular Market Area (mkt_code)
    Caches innerHTML to 'search-cache/mkt_code.html'.
    Returns list of rows (as list) without col_names:
        * Market Code (CMA)
        * Call Sign/Lease ID
        * FRN Name
        * FRN Number
        * Radio Service
        * Status
        * Expiration Date
    '''

    tick = time.time()
    print("LOG[INFO]: START SCRAPE %s " % mkt_code)

          
    # driver config - start in private
    firefox_profile = webdriver.FirefoxProfile()
    firefox_profile.set_preference("browser.privatebrowsing.autostart", True)
    
    # (optional) don't open up browser
    firefox_options = Options()
    if without_browser:
        firefox_options.add_argument("--headless")

    # direct webdriver to search page
    search_url = "https://wireless2.fcc.gov/UlsApp/UlsSearch/searchMarket.jsp"
    
    firefox = webdriver.Firefox(options=firefox_options, executable_path="./geckodriver")
    firefox.get(search_url)

    # specify the market type we want to search for
    in_markettype = Select(firefox.find_element_by_id("marketType"))
    in_markettype.select_by_value("CMA")

    # specify the market we want to search for
    in_mkt = Select(firefox.find_element_by_id("ulsMarket"))
    in_mkt.deselect_by_visible_text("All")
    in_mkt.select_by_value(mkt_code)
    
    # set pagination to 100 (maximum)
    in_pagination = Select(firefox.find_element_by_name("fiRowsPerPage"))
    in_pagination.select_by_value("100")
        
    # submit search, wait for load
    firefox.find_element_by_name("marketSearch").submit()

    WebDriverWait(firefox, 15).until(EC.url_changes(search_url))
    WebDriverWait(firefox, 15).until(
        lambda _: firefox.execute_script("return document.readyState") == "complete"
        )
    
    # scrape innerHTML & cache
    innerHTML = firefox.execute_script("return document.body.innerHTML")
    
    with open("%s/%s.html" % (cache_to, mkt_code), 'w') as f:
        f.write(innerHTML)
    
    firefox.quit()
        
    tock = time.time()
    print("LOG[INFO]: END SCRAPE %s (%2f seconds)" % (mkt_code, tock - tick))
    
    return innerHTML

In [15]:
def scrape_table_for_cma(mkt_code, without_browser=False, cache_to='search-cache'):
    '''
    Scrapes the FCC License Search database for a given Cellular Market Area (mkt_code)
    Caches innerHTML to 'search-cache/mkt_code.html'.
    Returns list of rows (as list) without col_names:
        * Market Code (CMA)
        * Call Sign/Lease ID
        * FRN Name
        * FRN Number
        * Radio Service
        * Status
        * Expiration Date
    '''
    
    # spin up webdriver & query database
    innerHTML = cache_innerHTML(mkt_code, without_browser, cache_to)
    
    # extract table data 
    table_data = []

    table_data += extract_table_data(innerHTML)
    
    return table_data

In [16]:
scrape_table_for_cma("CMA002", False)

LOG[INFO]: START SCRAPE CMA002 


FileNotFoundError: [Errno 2] No such file or directory: 'search-cache/CMA002.html'

## Define functions for searching for list of CMAs

In [8]:
import random

In [30]:
def loop_scrape_table_for_cma(list_cma, max_tries=3, wait_query=True, wait_retry=True):
    '''
    Attempts to scrape for cma codes in `list_cma` up to `max_tries`
    '''
    try:
        # start search & tries counter
        i = 0
        tries = 1

        # log start of loop
        tick = time.time()
        print(
            "LOG[INFO]: START TIME is %s\n"
            "LOG[INFO]: START LOOP from %s"
            % (time.strftime("%I:%M %p"), list_cma[i])
        )

        # begin search & scrape loop over mkt_codes
        loop_data = []

        # Attempt to scrape all cma's in list_cma up to 3 times
        while (i < len(list_cma) and tries <= max_tries):
            print("LOG[INFO]: Attempt", tries)
            # Succcessful iteration
            try:
                # search for i-th CMA
                mkt_code = list_cma[i]
                result = scrape_table_for_cma(mkt_code, True)

                # cache result so far
                loop_data.append(result)
                with open('search-cache/amend_%s.csv' % list_cma[0], 'a',  newline='') as f:
                    cache_writer = csv.writer(f)
                    cache_writer.writerow(result)
                # wait random time before next query
                i += 1
                waitTime_query = (
                    (random.randint(1,60)) 
                    if wait_query == True 
                    else 0
                )
                print("LOG[INFO]: WAIT %2f seconds before next query" % waitTime_query)
                time.sleep(waitTime_query)

            # Interrupted iteration logs time & error, and waits for retry
            except Exception as e:
                print(
                    "LOG[ERROR]: %s\n"
                    "LOG[INFO]: LOOP INTERRUPTED at %s\n"
                    % (e, time.strftime("%I:%M %p"))
                )
                # wait for next try
                tries += 1
                waitTime_retry = (
                    (random.randint(200,500)) 
                    if wait_retry == True 
                    else 0
                )
                if tries <= max_tries:
                    print("LOG[INFO]: WAIT %2f minutes to retry" % (waitTime_retry/60))
                    time.sleep(waitTime_retry)
                else:
                    pass

        # log end of while loop
        tock = time.time()
        print(
            "LOG[INFO]: END LOOP at %s with %s ATTEMPTS\n"
            "LOG[INFO]: END TIME is %s (TOTAL: %2f minutes)"
            % (list_cma[(i-1)], (tries), time.strftime("%I:%M %p"), ((tock - tick)/60))
        )
    
    finally:
        return (loop_data)


## Scraping results tables for list of CMA's

### Creating search lists

In [31]:
import pandas as pd

In [11]:
# load previous search results
cma_df = pd.read_csv("data-cache/cma_df.csv", dtype='object')

# select for CMA with count >= 10, and return as list
rows_per_cma = cma_df.groupby(['cma_no']).count()
IS_gt_10 = (rows_per_cma['callsign_leaseID'] == 10)
cma_gt_10 = rows_per_cma[IS_gt_10].reset_index()['cma_no']
cma_gt_10_search = cma_gt_10.tolist()
cma_gt_10_search

['CMA002',
 'CMA014',
 'CMA015',
 'CMA024',
 'CMA026',
 'CMA028',
 'CMA029',
 'CMA039',
 'CMA041',
 'CMA045',
 'CMA053',
 'CMA071',
 'CMA074',
 'CMA077',
 'CMA081',
 'CMA086',
 'CMA089',
 'CMA090',
 'CMA097',
 'CMA098',
 'CMA101',
 'CMA106',
 'CMA113',
 'CMA127',
 'CMA128',
 'CMA141',
 'CMA152',
 'CMA161',
 'CMA162',
 'CMA165',
 'CMA166',
 'CMA168',
 'CMA171',
 'CMA174',
 'CMA176',
 'CMA181',
 'CMA185',
 'CMA187',
 'CMA188',
 'CMA217',
 'CMA221',
 'CMA227',
 'CMA234',
 'CMA245',
 'CMA246',
 'CMA255',
 'CMA264',
 'CMA267',
 'CMA268',
 'CMA273',
 'CMA274',
 'CMA281',
 'CMA283',
 'CMA285',
 'CMA286',
 'CMA289',
 'CMA290',
 'CMA295',
 'CMA297',
 'CMA298',
 'CMA299',
 'CMA306',
 'CMA307',
 'CMA311',
 'CMA313',
 'CMA315',
 'CMA316',
 'CMA317',
 'CMA318',
 'CMA319',
 'CMA320',
 'CMA322',
 'CMA323',
 'CMA336',
 'CMA337',
 'CMA341',
 'CMA342',
 'CMA344',
 'CMA348',
 'CMA351',
 'CMA352',
 'CMA353',
 'CMA354',
 'CMA355',
 'CMA356',
 'CMA361',
 'CMA362',
 'CMA363',
 'CMA367',
 'CMA369',
 'CMA374',

In [None]:
break

In [22]:
# should have ran amend = loop_scrape_table_for_cma(cma_gt_10_search)
amend = []
amend.append(loop_scrape_table_for_cma(cma_gt_10_search))

LOG[INFO]: START TIME is 12:50 PM
LOG[INFO]: START LOOP from CMA002
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA002 
LOG[INFO]: END SCRAPE CMA002 (19.217074 seconds)
LOG[INFO]: WAIT 18.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA014 
LOG[INFO]: END SCRAPE CMA014 (18.019069 seconds)
LOG[INFO]: WAIT 29.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA015 
LOG[INFO]: END SCRAPE CMA015 (22.973442 seconds)
LOG[INFO]: WAIT 25.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA024 
LOG[INFO]: END SCRAPE CMA024 (14.250369 seconds)
LOG[INFO]: WAIT 50.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA026 
LOG[INFO]: END SCRAPE CMA026 (17.270095 seconds)
LOG[INFO]: WAIT 57.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA028 
LOG[INFO]: END SCRAPE CMA028 (22.470814 seconds)
LOG[INFO]: WAIT 49.000000 seconds before next query
LOG[INFO

LOG[INFO]: Attempt 3
LOG[INFO]: START SCRAPE CMA281 
LOG[INFO]: END SCRAPE CMA281 (19.724655 seconds)
LOG[INFO]: WAIT 24.000000 seconds before next query
LOG[INFO]: Attempt 3
LOG[INFO]: START SCRAPE CMA283 
LOG[INFO]: END SCRAPE CMA283 (19.444132 seconds)
LOG[INFO]: WAIT 49.000000 seconds before next query
LOG[INFO]: Attempt 3
LOG[INFO]: START SCRAPE CMA285 
LOG[INFO]: END SCRAPE CMA285 (18.142432 seconds)
LOG[INFO]: WAIT 27.000000 seconds before next query
LOG[INFO]: Attempt 3
LOG[INFO]: START SCRAPE CMA286 
LOG[INFO]: END SCRAPE CMA286 (21.506659 seconds)
LOG[INFO]: WAIT 37.000000 seconds before next query
LOG[INFO]: Attempt 3
LOG[INFO]: START SCRAPE CMA289 
LOG[INFO]: END SCRAPE CMA289 (21.352690 seconds)
LOG[INFO]: WAIT 57.000000 seconds before next query
LOG[INFO]: Attempt 3
LOG[INFO]: START SCRAPE CMA290 
LOG[INFO]: END SCRAPE CMA290 (23.166645 seconds)
LOG[INFO]: WAIT 29.000000 seconds before next query
LOG[INFO]: Attempt 3
LOG[INFO]: START SCRAPE CMA295 
LOG[INFO]: END SCRAPE C

In [38]:
# unpack unnecessary list level amend[0] is 1 element/

amend1 = amend[0]
len(amend1)

68

In [42]:
# cache search to csv

import csv

with open("data-cache/amend-01.csv", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(amend1)

with open("data-cache/amend-01.csv", newline='') as f:
    reader = csv.reader(f)
    check = list(reader)

### amend search 2

In [16]:
amend2 = loop_scrape_table_for_cma(cma_gt_10_search[68:100])

LOG[INFO]: START TIME is 09:19 PM
LOG[INFO]: START LOOP from CMA318
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA318 
LOG[INFO]: END SCRAPE CMA318 (19.929443 seconds)
LOG[INFO]: WAIT 18.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA319 
LOG[INFO]: END SCRAPE CMA319 (15.647868 seconds)
LOG[INFO]: WAIT 23.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA320 
LOG[INFO]: END SCRAPE CMA320 (16.253046 seconds)
LOG[INFO]: WAIT 31.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA322 
LOG[INFO]: END SCRAPE CMA322 (17.963480 seconds)
LOG[INFO]: WAIT 3.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA323 
LOG[INFO]: END SCRAPE CMA323 (20.034600 seconds)
LOG[INFO]: WAIT 46.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA336 
LOG[INFO]: END SCRAPE CMA336 (18.386420 seconds)
LOG[INFO]: WAIT 2.000000 seconds before next query
LOG[INFO]:

IndexError: list index out of range

#### --- ignore test code ---

In [26]:
with open("search-cache/amend_CMA318_380.csv", newline='') as f:
    reader = csv.reader(f)
    check = list(reader)

In [22]:
check

[["['CMA318', 'KNKN268', 'AT&T Mobility Spectrum LLC', '0014980726', 'CL', 'Active', '10/01/2020']",
  "['CMA318', 'KNKQ365', 'Verizon Wireless (VAW) LLC', '0003800307', 'CL', 'Active', '10/01/2022']",
  "['CMA318', 'KNKR314', 'ALLTEL Corporation', '0002942159', 'CL', 'Active', '01/19/2029']",
  "['CMA318', 'L000022707', 'AT&T Mobility Spectrum LLC', '0014980726', 'WY', 'Expired', '09/22/2016']",
  "['CMA318', 'L000022708', 'AT&T Mobility Spectrum LLC', '0014980726', 'WY', 'Canceled', '09/22/2017']",
  "['CMA318', 'WPQL769', 'Dobson Cellular Systems, Inc.', '', 'CL', 'Expired', '11/23/2000']",
  "['CMA318', 'WPWV241', 'AT&T Mobility Spectrum LLC', '0014980726', 'WZ', 'Active', '06/13/2019']",
  "['CMA318', 'WQGD712', 'AT&T Mobility Spectrum LLC', '0014980726', 'AW', 'Active', '12/18/2021']",
  "['CMA318', 'WQJQ674', 'Data-Max Wireless, LLC', '0017166422', 'WY', 'Active', '06/13/2019']",
  "['CMA318', 'WQKQ560', 'Tisdale Telephone Company, LLC', '0018115550', 'CL', 'Terminated', '08/12/

### amend search 3

In [44]:
# didn't cache table from CMA381 - CMA389
# can scrape html later

print(cma_gt_10_search.index('CMA381'))
print(cma_gt_10_search.index('CMA389'))

amend4 = loop_scrape_table_for_cma(cma_gt_10_search[95:100])

95
99
LOG[INFO]: START TIME is 11:59 PM
LOG[INFO]: START LOOP from CMA381
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA381 
LOG[INFO]: END SCRAPE CMA381 (18.024175 seconds)
LOG[INFO]: WAIT 9.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA383 
LOG[INFO]: END SCRAPE CMA383 (14.747115 seconds)
LOG[INFO]: WAIT 54.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA386 
LOG[INFO]: END SCRAPE CMA386 (14.927607 seconds)
LOG[INFO]: WAIT 40.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA388 
LOG[INFO]: END SCRAPE CMA388 (14.128808 seconds)
LOG[INFO]: WAIT 33.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA389 
LOG[INFO]: END SCRAPE CMA389 (13.332331 seconds)
LOG[INFO]: WAIT 19.000000 seconds before next query
LOG[INFO]: END LOOP at CMA389 with 0 ATTEMPTS
LOG[INFO]: END TIME is 12:03 AM (TOTAL: 3.838417 minutes)


In [47]:
with open('search-cache/amend_CMA381_389.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(amend4)

with open('search-cache/amend_CMA381_389.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    check4 = list(reader)

In [50]:
check4

[["['CMA381', 'KNKN706', 'New Cingular Wireless PCS, LLC', '0003291192', 'CL', 'Active', '10/01/2021']",
  "['CMA381', 'KNKN962', 'ALLTEL Corporation', '0002942159', 'CL', 'Active', '10/01/2020']",
  "['CMA381', 'L000018138', 'Cellblox Acquisitions, LLC', '0024202053', 'WZ', 'Canceled', '12/18/2016']",
  "['CMA381', 'L000018140', 'Cellblox Acquisitions, LLC', '0024202053', 'WY', 'Canceled', '12/18/2016']",
  "['CMA381', 'L000020830', 'Cellblox Acquisitions, LLC', '0024202053', 'WZ', 'Active', '06/13/2019']",
  "['CMA381', 'L000020833', 'Cellblox Acquisitions, LLC', '0024202053', 'WY', 'Active', '06/13/2019']",
  "['CMA381', 'L000030954', 'CellBlox Acquisitions, LLC', '0024202053', 'AW', 'Active', '12/18/2021']",
  "['CMA381', 'WPYZ966', 'New Cingular Wireless PCS, LLC', '0003291192', 'WZ', 'Active', '06/13/2019']",
  "['CMA381', 'WQGD469', 'Cellco Partnership', '0003290673', 'AW', 'Active', '12/18/2021']",
  "['CMA381', 'WQIZ418', 'New Cingular Wireless PCS, LLC', '0003291192', 'WY', '

In [38]:
with open("search-cache/amend_CMA390-722.csv", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(amend3_1)



FileNotFoundError: [Errno 2] No such file or directory: 'search-cache/amend_CMA390_722.csv'

In [42]:
with open("search-cache/amend_CMA390-722.csv", 'r', newline='') as f:
    reader = csv.reader(f)
    check3 = list(reader)

In [43]:
check3

[['CMA390',
  'KNKN338',
  'DANBURY CELLULAR TELEPHONE COMPANY, INC.',
  '',
  'CL',
  'Terminated',
  ''],
 ['CMA390',
  'KNKN400',
  'CommNet Cellular Inc.',
  '0017329715',
  'CL',
  'Active',
  '10/01/2020'],
 ['CMA390', 'KNKP976', 'WWC HOLDING CO., INC.', '', 'CL', 'Terminated', ''],
 ['CMA390', 'KNKR254', 'LANTY L. SMITH', '', 'CL', 'Canceled', '10/01/2005'],
 ['CMA390',
  'KNKR296',
  'AT&T Mobility Spectrum LLC',
  '0014980726',
  'CL',
  'Active',
  '10/27/2027'],
 ['CMA390',
  'KNKR322',
  'AT&T Mobility Spectrum LLC',
  '0014980726',
  'CL',
  'Active',
  '01/14/2028'],
 ['CMA390',
  'L000004274',
  'W. Stephen Cannon, Management Trustee',
  '0017119025',
  'CL',
  'Expired',
  '12/30/2009'],
 ['CMA390',
  'L000004962',
  'W. Stephen Cannon, Management Trustee',
  '0017119025',
  'CL',
  'Expired',
  '12/30/2009'],
 ['CMA390',
  'L000006323',
  'W. Stephen Cannon, Management Trustee',
  '0017119025',
  'CL',
  'Canceled',
  '12/31/2010'],
 ['CMA390',
  'L000006605',
  'W. St