# FCC CMA Scraping

## Set up selenium webdriver with Firefox
* Download Firefox driver [geckodriver](https://github.com/mozilla/geckodriver/releases) for selenium
* Move the driver executable into the same directory as this notebook (or update the path to the executable in `scrape_table_for_call_sign`)
* `pip install selenium`

-- see scrape-apps_by_callsign.ipynb for more details

In [4]:
from bs4 import BeautifulSoup
import time

In [5]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options  
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException        

In [6]:
import os

## Test Search (CMA001)

* https://stackabuse.com/python-check-if-a-file-or-directory-exists/
* template for `scrape_table_for_CMA()`

In [162]:
if os.path.isfile("./test-search.html"):
    print("test-search.html exists")

else:
    print("LOG[INFO]: Running Test Search")

    # Scrape Test Search results page for test market
    mkt_num = "CMA001"

    # driver config - start in private
    firefox_profile = webdriver.FirefoxProfile()
    firefox_profile.set_preference("browser.privatebrowsing.autostart", True)

    # spin up the webdriver
    firefox = webdriver.Firefox(executable_path="./geckodriver")
    firefox.get("https://wireless2.fcc.gov/UlsApp/UlsSearch/searchMarket.jsp")

    # specify the market type we want to search for
    in_markettype = Select(firefox.find_element_by_id("marketType"))

    in_markettype.select_by_value("CMA")

    # specify the market we want to search for
    in_mkt = Select(firefox.find_element_by_id("ulsMarket"))

    in_mkt.deselect_by_visible_text("All")
    in_mkt.select_by_value(mkt_num)

    # submit search, wait for load
    firefox.find_element_by_name("marketSearch").submit()
    WebDriverWait(firefox, 15).until(
        EC.url_changes("https://wireless2.fcc.gov/UlsApp/UlsSearch/searchMarket.jsp"))
    WebDriverWait(firefox, 15).until(
        lambda _: firefox.execute_script("return document.readyState") == "complete")

    # scrape testhtml soup
    testHTML = firefox.execute_script("return document.body.innerHTML")

    with open("test-search.html", "w") as f:
        f.write(testHTML)
    
    firefox.quit()

test-search.html exists


In [150]:
# making the soup, finding the results table
with open("test-search.html", "r") as f:
    testHTML = f.read()

soup = BeautifulSoup(testHTML, "lxml")
table_page = soup.find("table", {"summary": "License search results"})

In [168]:
# finding search col names
first_row_cols = table_page.find_all("tr")[0].find_all("th")
search_cols = []
for i in range(0, len(first_row_cols)):
    cell_data = first_row_cols[i].get_text().strip()
    search_cols.append(cell_data)
print(search_cols)

['Call Sign/Lease ID', 'Name', 'FRN', 'Radio Service', 'Status', 'Expiration Date']


In [169]:
# defining extracted table column names
col_names = ['Market Number', 'Call Sign/Lease ID', 'FRN', 'Radio Service', 'Status', 'Expiration Date']

## Define functions for scraping individual CMA

* https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html#arg-attrs
* https://stackoverflow.com/questions/4768941/how-to-break-a-line-of-chained-methods-in-python

In [7]:
def extract_table_data(innerHTML):
    '''
    Extracts the following details by row from innerHTML of results page:
        * Market Code (CMA)
        * Call Sign/Lease ID
        * FRN Name
        * FRN Number
        * Radio Service
        * Status
        * Expiration Date
    Returns as list of rows (as list) without col_names
    '''
    
    
    # Extract search data from innerHTML soup
    soup = BeautifulSoup(innerHTML, "lxml")
    CMA = extract_mkt_code(soup)
    
    # extract search results as list of rows
    rows = (
        soup.find("table", {"summary": "License search results"})
        .find_all("tr")[1:-1] # exclude header/footer name rows
        )

    
    # extract results by row into table, attaching cma number
    table_data = [] 
    
    for row in rows:
        
        row_data = [CMA, ]          
        cols = row.find_all("td")
        
        # extract data from each cell in a row
        for i in range(1, len(cols)):  # skip first index num column
            cell_data = cols[i].get_text().strip()
            row_data.append(cell_data)
        
        table_data.append(row_data)
    
    return(table_data)

In [8]:
def extract_mkt_code(soup):
    '''
    Extracts the market code (CMA) from market search results page
    '''
    
    # tag attributes for element containing "Market Code = " 
    mkt_code_attrs = {"align" : "left", "class" : "cell-pri-light", "valign" : "bottom"}
    
    # extract mkt code as text
    mkt_code = (
        soup.find_all(attrs=mkt_code_attrs, limit=1)[0]
        .find_all("b")[0]
        .get_text()
    )
    
    return mkt_code

In [320]:
def scrape_table_for_cma(mkt_code, without_browser=False):
    '''
    Scrapes the FCC License Search database for a given Cellular Market Area (mkt_code)
    Returns list of rows (as list) without col_names:
        * Market Code (CMA)
        * Call Sign/Lease ID
        * FRN Name
        * FRN Number
        * Radio Service
        * Status
        * Expiration Date
    '''

    tick = time.time()
    print("LOG[INFO]: START %s " % mkt_code)

          
    # driver config - start in private
    firefox_profile = webdriver.FirefoxProfile()
    firefox_profile.set_preference("browser.privatebrowsing.autostart", True)
    
    # (optional) don't open up browser
    firefox_options = Options()
    if without_browser:
        firefox_options.add_argument("--headless")

    # direct webdriver to search page
    search_url = "https://wireless2.fcc.gov/UlsApp/UlsSearch/searchMarket.jsp"
    
    firefox = webdriver.Firefox(options=firefox_options, executable_path="./geckodriver")
    firefox.get(search_url)

    # specify the market type we want to search for
    in_markettype = Select(firefox.find_element_by_id("marketType"))
    in_markettype.select_by_value("CMA")

    # specify the market we want to search for
    in_mkt = Select(firefox.find_element_by_id("ulsMarket"))
    in_mkt.deselect_by_visible_text("All")
    in_mkt.select_by_value(mkt_code)
    
    # set pagination to 100 (maximum)
    in_pagination = Select(firefox.find_element_by_name("fiRowsPerPage"))
    in_pagination.select_by_value("100")
        
    # submit search, wait for load
    firefox.find_element_by_name("marketSearch").submit()

    WebDriverWait(firefox, 15).until(EC.url_changes(search_url))
    WebDriverWait(firefox, 15).until(
        lambda _: firefox.execute_script("return document.readyState") == "complete"
        )
    
    # scrape innerHTML
    innerHTML = firefox.execute_script("return document.body.innerHTML")
    #firefox.quit()
    
    # extract table data 
    table_data = []

    table_data += extract_table_data(innerHTML)
        
    tock = time.time()
    print("LOG[INFO]: END %s (%2f seconds)" % (mkt_code, tock - tick))
    
    return table_data

In [10]:
extract_table_data(innerHTML)

NameError: name 'innerHTML' is not defined

In [337]:
scrape_table_for_cma("CMA002", True)

LOG[INFO]: START CMA002 


AttributeError: 'NoneType' object has no attribute 'find_all'

## Scraping results tables for all CMA's

* https://stackoverflow.com/questions/2050637/appending-the-same-string-to-a-list-of-strings-in-python
* https://www.csestack.org/python-padding-number-string/
* https://stackoverflow.com/questions/18265935/python-create-list-with-numbers-between-2-values

NOTE: loops timed out, resulting in multiple search attempts:
* Attempt 1: entire 734 CMAs; timed out at CMA325
* Attempt 2: remaining CMAs; timed out at CMA461
* Attempt 3: 

### Creating search lists

In [59]:
cma_count = 734
cma_range = list(range(1,cma_count + 1))    # 734 CMA's in total
cma_search_list = ['CMA' + str(s).zfill(3) for s in cma_range]
assert(len(cma_search_list) == 734)

cma_search_1 = cma_search_list[0:234]
cma_search_2 = cma_search_list[234:460]

In [102]:
cma_searchs = []
cma_searchs.append(cma_search_1)  # first attempt TimeoutException at CMA235
cma_searchs.append(cma_search_2)  # second attempt TimeoutException at CMA461

start = 460  # last succesful search
chunk_len = 150

# create searchs of length == chunk_len
while start <= (len(cma_search_list) - chunk_len):
    print(start)
    end = start + chunk_len
    cma_searchs.append(cma_search_list[start : end])
    start = end

print(start)
cma_searchs.append(cma_search_list[start:])

assert(sum(list(map(len, cma_searchs)))) == 734  # check searchs add up to 734

460
610


* https://www.afternerd.com/blog/append-vs-extend/
* https://datascience.stackexchange.com/questions/26333/convert-a-list-of-lists-into-a-pandas-dataframe

### Running searchs -- incl. timeouts
!-- Don't rerun this section --!

In [258]:
break

SyntaxError: 'break' outside loop (<ipython-input-258-6aaf1f276005>, line 1)

In [20]:
cma_data = []

for mkt_code in cma_search_list:
    cma_data.extend(
    scrape_table_for_cma(mkt_code, True))

LOG[INFO]: START CMA001 
LOG[INFO]: END CMA001 (13.518734 seconds)
LOG[INFO]: START CMA002 
LOG[INFO]: END CMA002 (13.619050 seconds)
LOG[INFO]: START CMA003 
LOG[INFO]: END CMA003 (16.284943 seconds)
LOG[INFO]: START CMA004 
LOG[INFO]: END CMA004 (14.117703 seconds)
LOG[INFO]: START CMA005 
LOG[INFO]: END CMA005 (16.986397 seconds)
LOG[INFO]: START CMA006 
LOG[INFO]: END CMA006 (13.616145 seconds)
LOG[INFO]: START CMA007 
LOG[INFO]: END CMA007 (18.761345 seconds)
LOG[INFO]: START CMA008 
LOG[INFO]: END CMA008 (14.871606 seconds)
LOG[INFO]: START CMA009 
LOG[INFO]: END CMA009 (13.199400 seconds)
LOG[INFO]: START CMA010 
LOG[INFO]: END CMA010 (13.412127 seconds)
LOG[INFO]: START CMA011 
LOG[INFO]: END CMA011 (14.319657 seconds)
LOG[INFO]: START CMA012 
LOG[INFO]: END CMA012 (14.939872 seconds)
LOG[INFO]: START CMA013 
LOG[INFO]: END CMA013 (20.347147 seconds)
LOG[INFO]: START CMA014 
LOG[INFO]: END CMA014 (23.638111 seconds)
LOG[INFO]: START CMA015 
LOG[INFO]: END CMA015 (13.886712 seco

LOG[INFO]: END CMA123 (18.032386 seconds)
LOG[INFO]: START CMA124 
LOG[INFO]: END CMA124 (16.361427 seconds)
LOG[INFO]: START CMA125 
LOG[INFO]: END CMA125 (23.105183 seconds)
LOG[INFO]: START CMA126 
LOG[INFO]: END CMA126 (18.618931 seconds)
LOG[INFO]: START CMA127 
LOG[INFO]: END CMA127 (14.693156 seconds)
LOG[INFO]: START CMA128 
LOG[INFO]: END CMA128 (25.877398 seconds)
LOG[INFO]: START CMA129 
LOG[INFO]: END CMA129 (18.279821 seconds)
LOG[INFO]: START CMA130 
LOG[INFO]: END CMA130 (14.028654 seconds)
LOG[INFO]: START CMA131 
LOG[INFO]: END CMA131 (26.179425 seconds)
LOG[INFO]: START CMA132 
LOG[INFO]: END CMA132 (15.379086 seconds)
LOG[INFO]: START CMA133 
LOG[INFO]: END CMA133 (12.921988 seconds)
LOG[INFO]: START CMA134 
LOG[INFO]: END CMA134 (16.313694 seconds)
LOG[INFO]: START CMA135 
LOG[INFO]: END CMA135 (14.698857 seconds)
LOG[INFO]: START CMA136 
LOG[INFO]: END CMA136 (18.790799 seconds)
LOG[INFO]: START CMA137 
LOG[INFO]: END CMA137 (13.887054 seconds)
LOG[INFO]: START CMA

TimeoutException: Message: 


In [68]:
tick = time.time()
print("LOG[INFO]: START LOOP from %s " % cma_search_2[0])

i = 0
for mkt_code in cma_search_2:
    cma_data.extend(
    scrape_table_for_cma(mkt_code, True))
    i += 1
    
tock = time.time()
print("LOG[INFO]: END LOOP at %s (%2f seconds)" % (cma_search_2[i], tock - tick))

LOG[INFO]: START LOOP from CMA235 
LOG[INFO]: START CMA235 
LOG[INFO]: END CMA235 (18.818759 seconds)
LOG[INFO]: START CMA236 
LOG[INFO]: END CMA236 (24.263375 seconds)
LOG[INFO]: START CMA237 
LOG[INFO]: END CMA237 (24.357678 seconds)
LOG[INFO]: START CMA238 
LOG[INFO]: END CMA238 (18.174001 seconds)
LOG[INFO]: START CMA239 
LOG[INFO]: END CMA239 (16.489651 seconds)
LOG[INFO]: START CMA240 
LOG[INFO]: END CMA240 (19.618635 seconds)
LOG[INFO]: START CMA241 
LOG[INFO]: END CMA241 (19.805890 seconds)
LOG[INFO]: START CMA242 
LOG[INFO]: END CMA242 (26.273866 seconds)
LOG[INFO]: START CMA243 
LOG[INFO]: END CMA243 (14.695370 seconds)
LOG[INFO]: START CMA244 
LOG[INFO]: END CMA244 (14.921892 seconds)
LOG[INFO]: START CMA245 
LOG[INFO]: END CMA245 (14.419010 seconds)
LOG[INFO]: START CMA246 
LOG[INFO]: END CMA246 (21.834631 seconds)
LOG[INFO]: START CMA247 
LOG[INFO]: END CMA247 (14.235567 seconds)
LOG[INFO]: START CMA248 
LOG[INFO]: END CMA248 (15.561186 seconds)
LOG[INFO]: START CMA249 
LO

LOG[INFO]: END CMA357 (25.276809 seconds)
LOG[INFO]: START CMA358 
LOG[INFO]: END CMA358 (14.931305 seconds)
LOG[INFO]: START CMA359 
LOG[INFO]: END CMA359 (14.858395 seconds)
LOG[INFO]: START CMA360 
LOG[INFO]: END CMA360 (14.138118 seconds)
LOG[INFO]: START CMA361 
LOG[INFO]: END CMA361 (14.662134 seconds)
LOG[INFO]: START CMA362 
LOG[INFO]: END CMA362 (14.312838 seconds)
LOG[INFO]: START CMA363 
LOG[INFO]: END CMA363 (13.845055 seconds)
LOG[INFO]: START CMA364 
LOG[INFO]: END CMA364 (14.044773 seconds)
LOG[INFO]: START CMA365 
LOG[INFO]: END CMA365 (14.128574 seconds)
LOG[INFO]: START CMA366 
LOG[INFO]: END CMA366 (15.980313 seconds)
LOG[INFO]: START CMA367 
LOG[INFO]: END CMA367 (17.497996 seconds)
LOG[INFO]: START CMA368 
LOG[INFO]: END CMA368 (18.599595 seconds)
LOG[INFO]: START CMA369 
LOG[INFO]: END CMA369 (15.173782 seconds)
LOG[INFO]: START CMA370 
LOG[INFO]: END CMA370 (14.954733 seconds)
LOG[INFO]: START CMA371 
LOG[INFO]: END CMA371 (13.051711 seconds)
LOG[INFO]: START CMA

TimeoutException: Message: Timeout loading page after 300000ms


In [69]:
print(i)  # 226; successful searchs in attempt two

226


In [392]:
def loop_scrape_table_for_cma(list_cma):
    
    # start search counter
    i = 0
    # log start of loop
    tick = time.time()
    print(
        "LOG[INFO]: START TIME is %s\n"
        "LOG[INFO]: START LOOP from %s"
        % (time.strftime("%I:%M %p"), list_cma[i])
    )
    
    # begin search & scrape loop over mkt_codes
    loop_data = []
    
    # succcessful loop
    try:
        while i < len(list_cma):
            mkt_code = list_cma[i]
            loop_data.extend(
                scrape_table_for_cma(mkt_code, True)
            )
            i += 1
            # wait random time before next query
            wait_secs = random.randint(1,60)
            time.sleep(wait_secs)
        # log end of loop
        tock = time.time()
        print(
            "LOG[INFO]: END LOOP at %s\n"
            "LOG[INFO]: END TIME is %s (TOTAL: %2f seconds)"
            % (list_cma[(i)], time.strftime("%I:%M %p"), tock - tick)
        )
    
    # interrupted loop logs time & raises error
    except Exception as e:
        tock = time.time()
        print(
            "LOG[INFO]: LOOP INTERRUPTED at %s\n"
            "LOG[INFO]: END TIME is %s (TOTAL: %2f seconds)\n" 
            % (list_cma[i],
               time.strftime("%I:%M %p"), tock - tick
              )
        )
        print(e)
        
    # always return any scraped data, regardless of interruptions
    finally:
        return (loop_data, i)


In [146]:
for cma_list in cma_searchs[2:]:
    print(len(cma_list), cma_list[0], cma_list[-1])
    cma_data.append(
        loop_scrape_table_for_cma(cma_list)
    )

150 CMA461 CMA610
LOG[INFO]: START LOOP from CMA461 
LOG[INFO]: START CMA461 
LOG[INFO]: END CMA461 (19.466008 seconds)
LOG[INFO]: START CMA462 
LOG[INFO]: END CMA462 (14.899654 seconds)
LOG[INFO]: START CMA463 
LOG[INFO]: END CMA463 (14.293337 seconds)
LOG[INFO]: START CMA464 
LOG[INFO]: END CMA464 (14.263479 seconds)
LOG[INFO]: START CMA465 
LOG[INFO]: END CMA465 (13.906574 seconds)
LOG[INFO]: START CMA466 
LOG[INFO]: END CMA466 (14.555156 seconds)
LOG[INFO]: START CMA467 
LOG[INFO]: END CMA467 (14.355934 seconds)
LOG[INFO]: START CMA468 
LOG[INFO]: END CMA468 (14.929456 seconds)
LOG[INFO]: START CMA469 
LOG[INFO]: END CMA469 (14.916506 seconds)
LOG[INFO]: START CMA470 
LOG[INFO]: END CMA470 (15.082025 seconds)
LOG[INFO]: START CMA471 
LOG[INFO]: END CMA471 (13.842560 seconds)
LOG[INFO]: START CMA472 
LOG[INFO]: END CMA472 (14.829918 seconds)
LOG[INFO]: START CMA473 
LOG[INFO]: END CMA473 (21.076899 seconds)
LOG[INFO]: START CMA474 
LOG[INFO]: END CMA474 (13.320583 seconds)
LOG[INFO]

In [221]:
for cma_list in (cma_search_list[567:650], cma_search_list[650:]):
    print(len(cma_list), cma_list[0], cma_list[-1])
    cma_data.append(
        loop_scrape_table_for_cma(cma_list)
    )
    time.sleep(300)

83 CMA568 CMA650
LOG[INFO]: START LOOP from CMA568 
LOG[INFO]: START CMA568 
LOG[INFO]: END CMA568 (17.365303 seconds)
LOG[INFO]: START CMA569 
LOG[INFO]: END CMA569 (15.064679 seconds)
LOG[INFO]: START CMA570 
LOG[INFO]: END CMA570 (15.127118 seconds)
LOG[INFO]: START CMA571 
LOG[INFO]: END CMA571 (14.304701 seconds)
LOG[INFO]: START CMA572 
LOG[INFO]: END CMA572 (14.603250 seconds)
LOG[INFO]: START CMA573 
LOG[INFO]: END CMA573 (14.723639 seconds)
LOG[INFO]: START CMA574 
LOG[INFO]: END CMA574 (14.916965 seconds)
LOG[INFO]: START CMA575 
LOG[INFO]: END CMA575 (14.429946 seconds)
LOG[INFO]: START CMA576 
LOG[INFO]: END CMA576 (14.812567 seconds)
LOG[INFO]: START CMA577 
LOG[INFO]: END CMA577 (14.135329 seconds)
LOG[INFO]: START CMA578 
LOG[INFO]: END CMA578 (13.716339 seconds)
LOG[INFO]: START CMA579 
LOG[INFO]: END CMA579 (13.813819 seconds)
LOG[INFO]: START CMA580 
LOG[INFO]: END CMA580 (15.140860 seconds)
LOG[INFO]: START CMA581 
LOG[INFO]: END CMA581 (17.002582 seconds)
LOG[INFO]:

LOG[INFO]: END CMA716 (15.473303 seconds)
LOG[INFO]: START CMA717 
LOG[INFO]: END CMA717 (13.905826 seconds)
LOG[INFO]: START CMA718 
LOG[INFO]: END CMA718 (15.087008 seconds)
LOG[INFO]: START CMA719 
LOG[INFO]: END CMA719 (20.125244 seconds)
LOG[INFO]: START CMA720 
LOG[INFO]: END CMA720 (16.075370 seconds)
LOG[INFO]: START CMA721 
LOG[INFO]: END CMA721 (16.092230 seconds)
LOG[INFO]: START CMA722 
LOG[INFO]: END CMA722 (13.914416 seconds)
LOG[INFO]: START CMA723 
LOG[INFO]: END CMA723 (14.643822 seconds)
LOG[INFO]: START CMA724 
LOG[INFO]: LOOP INTERUPTED at CMA724 (TOTAL: 1966.770223 seconds)


KeyboardInterrupt: 

In [233]:
for cma_list in (cma_search_list[622:650], cma_search_list[723:]):
    print(len(cma_list), cma_list[0], cma_list[-1])
    cma_data.append(
        loop_scrape_table_for_cma(cma_list)
    )
    print("LOG[INFO]: START 5 minute sleep")
    time.sleep(300)
    print("LOG[INFO]: CONTINUE to NEXT LIST")

28 CMA623 CMA650
LOG[INFO]: START TIME is 09:47 AM
LOG[INFO]: START LOOP from CMA623
LOG[INFO]: START CMA623 
LOG[INFO]: END CMA623 (17.373029 seconds)
LOG[INFO]: START CMA624 
LOG[INFO]: END CMA624 (13.507266 seconds)
LOG[INFO]: START CMA625 
LOG[INFO]: END CMA625 (15.753309 seconds)
LOG[INFO]: START CMA626 
LOG[INFO]: END CMA626 (17.415163 seconds)
LOG[INFO]: START CMA627 
LOG[INFO]: END CMA627 (17.088742 seconds)
LOG[INFO]: START CMA628 
LOG[INFO]: END CMA628 (16.648390 seconds)
LOG[INFO]: START CMA629 
LOG[INFO]: END CMA629 (14.144469 seconds)
LOG[INFO]: START CMA630 
LOG[INFO]: END CMA630 (20.557998 seconds)
LOG[INFO]: START CMA631 
LOG[INFO]: END CMA631 (15.033135 seconds)
LOG[INFO]: START CMA632 
LOG[INFO]: END CMA632 (16.207628 seconds)
LOG[INFO]: START CMA633 
LOG[INFO]: END CMA633 (14.032130 seconds)
LOG[INFO]: START CMA634 
LOG[INFO]: END CMA634 (15.928715 seconds)
LOG[INFO]: START CMA635 
LOG[INFO]: END CMA635 (13.356392 seconds)
LOG[INFO]: START CMA636 
LOG[INFO]: END CMA6

In [235]:
cma_data.append(
    loop_scrape_table_for_cma(cma_search_list[727:])
    )

LOG[INFO]: START TIME is 10:50 AM
LOG[INFO]: START LOOP from CMA728
LOG[INFO]: START CMA728 
LOG[INFO]: END CMA728 (18.184898 seconds)
LOG[INFO]: START CMA729 
LOG[INFO]: END CMA729 (14.360675 seconds)
LOG[INFO]: START CMA730 
LOG[INFO]: END CMA730 (13.599198 seconds)
LOG[INFO]: START CMA731 
LOG[INFO]: END CMA731 (15.148718 seconds)
LOG[INFO]: START CMA732 
LOG[INFO]: END CMA732 (14.039185 seconds)
LOG[INFO]: START CMA733 
LOG[INFO]: END CMA733 (13.619414 seconds)
LOG[INFO]: START CMA734 
LOG[INFO]: END CMA734 (15.987366 seconds)


In [234]:
cma_search_list[727:]

['CMA728', 'CMA729', 'CMA730', 'CMA731', 'CMA732', 'CMA733', 'CMA734']

In [220]:
len(cma_search_list[567:650])
len(cma_search_list[567:])
time.time()
print(time.ctime())
time.strftime("%I:%M %p")

Fri Apr 26 08:34:25 2019


'08:34 AM'

### Cache results of search loops to csv
!---don't rerun this section either ---!

In [259]:
break

SyntaxError: 'break' outside loop (<ipython-input-259-6aaf1f276005>, line 1)

In [38]:
import csv

In [103]:
# search attempt one results

with open("data_CMA001-CMA234.csv", newline='') as f:
    reader = csv.reader(f)
    cma001_234_data = list(reader)

len(cma001_234_data)

1743

In [105]:
# search attempt one + two results

with open("data_CMA001-CMA460.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerows(cma_data)

In [104]:
len(cma_data)

3630

In [176]:
# search attempt three
print(cma_data[-1][0])
print(cma_data[3630][0])
cma_data[-1][-1]

['CMA461', 'KNKN442', 'ALLTEL Corporation', '0002942159', 'CL', 'Active', '10/01/2020']
['CMA461', 'KNKN442', 'ALLTEL Corporation', '0002942159', 'CL', 'Active', '10/01/2020']


['CMA567',
 'WQXW508',
 'Advantage Spectrum, L.P.',
 '0023909104',
 'AT',
 'Active',
 '07/05/2028']

In [178]:
with open("data_CMA461-CMA567.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerows(cma_data[3630])

In [226]:
# search attempt four
print(cma_data[3631][0][0], cma_data[3631][-1][0])

CMA568 CMA622


In [227]:
with open("data_CMA568-CMA622.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerows(cma_data[3631])

In [229]:
with open("data_CMA568-CMA622.csv", newline='') as f:
    reader = csv.reader(f)
    cma568_622_data = list(reader)

assert(len(cma568_622_data) == len(cma_data[3631]))

In [230]:
# search attempt five
print(cma_data[3632][0][0], cma_data[3632][-1][0])

CMA651 CMA723


In [231]:
with open("data_CMA651-CMA723.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerows(cma_data[3632])

In [254]:
help(csv.writer)

Help on built-in function writer in module _csv:

writer(...)
    csv_writer = csv.writer(fileobj [, dialect='excel']
                                [optional keyword args])
        for row in sequence:
            csv_writer.writerow(row)
    
        [or]
    
        csv_writer = csv.writer(fileobj [, dialect='excel']
                                [optional keyword args])
        csv_writer.writerows(rows)
    
    The "fileobj" argument can be any object that supports the file API.



In [239]:
# search attempt six, seven, eight
print(cma_data[3633][0][0], cma_data[3633][-1][0])
print(cma_data[3634][0][0], cma_data[3634][-1][0])
print(cma_data[3635][0][0], cma_data[3635][-1][0])

CMA623 CMA650
CMA724 CMA727
CMA728 CMA734


## Combine search loops into dataframe

In [260]:
# check total number of rows scraped
len_cma_data = len(cma_data[:3629])
for l in cma_data[3630:]:
    print(len(l))
    len_cma_data += len(l)
print(len_cma_data)

882
452
604
246
26
51
5890


In [257]:
# combine all searchs in order

flat_cma = (cma_data[:3629]   # attempt one & two (CMA001-CMA460)
    + cma_data[3630][0:]     # attempt three (CMA461-CMA567)
    + cma_data[3631][0:]     # attempt four (CMA568-CMA622)
    + cma_data[3633][0:]     # attempt six (CMA623-CMA650)
    + cma_data[3632][0:]     # attempt five (CMA651-CMA723)
    + cma_data[3634][0:]     # attempt seven (CMA724-CMA727)
    + cma_data[3635][0:]    # attempt eight (CMA728-CMA734)
           )
assert(len(flat_cma) == len_cma_data)

In [261]:
import pandas as pd

In [275]:
# defining extracted table column names
# extracted names ['Market Number', 'Call Sign/Lease ID', 'FRN', 'Radio Service', 'Status', 'Expiration Date']

col_names = ['cma_no', 'callsign_leaseID', 'FRN_name', 'FRN_no', 'radio_service', 'status', 'date_expr' ]

# create dataframe as cma_df
cma_df = pd.DataFrame.from_records(flat_cma, columns=col_names)

# group by CMA & check that all CMAs were searched & scraped
assert(len(cma_df.groupby(['cma_no']).count()) == len(cma_search_list))

In [273]:
# write cma_df to csv
cma_csv_path = "./cma_df.csv"
cma_df.to_csv(cma_csv_path, index=False)

# import & check csv
cma_df1 = pd.read_csv(cma_csv_path)

assert(len(cma_df) == len(cma_df1))

## Rerun search for CMA with 10 rows
* potentially missing callsigns :(

In [336]:
rows_per_cma = cma_df.groupby(['cma_no']).count()
IS_gt_10 = (rows_per_cma['callsign_leaseID'] == 10)

cma_gt_10 = rows_per_cma[IS_gt_10].reset_index()['cma_no']
cma_gt_10_search = list(cma_gt_10)

In [393]:
loop_scrape_table_for_cma(["CMA002"])

LOG[INFO]: START TIME is 06:02 PM
LOG[INFO]: START LOOP from CMA002
LOG[INFO]: START CMA002 
LOG[INFO]: LOOP INTERRUPTED at CMA002
LOG[INFO]: END TIME is 06:02 PM (TOTAL: 22.157018 seconds)

'NoneType' object has no attribute 'find_all'


([], 0)

RuntimeError: No active exception to reraise

In [1]:
def multitry_search(search_list):
    # initialise counters & return objects
    out_list = []

    tick = time.time()

    j = 0
    tries = 1

    # Try looping scrape over search list up to 3 times
    while (j < len(search_list) and tries <= 3):
        try:
            loop_out = loop_scrape_table_for_cma(search_list[j:])
        except Exception as e:
            # update search start index and no. of tries
            print("ERROR:", e.args)
            # wait random time before next try
            wait_sec = random.randint(150,500)
            time.sleep(wait_sec)
        finally:
            out_list.append(loop_out[0])
            tries += 1
            j += loop_out[1]

    tock = time.time()

    print("LOG[INFO]: Searched (%s + 1) mkt_codes out of %s total\n"
          "LOG[INFO]: TOTAL TIME for %s attempts was %2f seconds"
          % (j, len(search_list), tries, tock-tick)
         )
    
    return out_list

In [None]:
cma_amend = []
break

In [382]:
list = [2, 3, 4, 6, 7]
test1 = []
j = 0
tries = 1
out = []
print(len(out))
while (j < len(list) and tries <= 3):
    try:
        loop_out = ([1,1,1],2)
        print(x)
    except Exception as e:
        print("in except")
        j += loop_out[1]
        tries += 1
        print("j = %s, tries = %s" % (j, tries))
        print('ERROR:', e.args)
    finally:
        print("in finally")
        out.append(loop_out[0])
out

0
in except
j = 2, tries = 2
ERROR: ("name 'x' is not defined",)
in finally
in except
j = 4, tries = 3
ERROR: ("name 'x' is not defined",)
in finally
in except
j = 6, tries = 4
ERROR: ("name 'x' is not defined",)
in finally


[[1, 1, 1], [1, 1, 1], [1, 1, 1]]

## Filter list of Call Signs from cma_df

* filter for col4: 'Radio Service' == 'CL'
```python
is_CL = cma_df['Radio Service'] == 'CL'
callsign_CL = cma_df.loc[[is_CL], 1]
```
* 

In [None]:
import pandas as pd

In [308]:
# import cma_df from csv
cma_df = pd.read_csv(cma_csv_path, dtype=object)

# filter for Cellular Licenses only
is_CL = (cma_df['radio_service'] == 'CL')
CL_df = cma_df[is_CL]

# create list of Cellular (CL) callsigns
CL_callsign = CL_df['callsign_leaseID']
CL_callsign.head()

0    KNKA206
1    KNKA310
7    KNKA209
8    KNKA351
9    KNKR294
Name: callsign_leaseID, dtype: object

In [309]:
# write list of Cellular Callsigns to csv
CL_callsign.to_csv("CL_CallSigns.csv", index=False) 


In [310]:
! open CL_CallSigns.csv

In [306]:
print(len(callsign_CL.unique()),
      len(callsign_CL)
     )

callsign_CL.head()

2445 2445


0    KNKA206
1    KNKA310
7    KNKA209
8    KNKA351
9    KNKR294
Name: callsign_leaseID, dtype: object

#### --- ignore test code ---

In [347]:
def try_test():
    y = 5
    try:
        print("x is %s" % x)
    except:
        print("error")
        time.sleep(0)
        raise
    # comment
    finally:
        print(
            "blahblah %s\n" % time.ctime()
            "something" 
        )

SyntaxError: invalid syntax (<ipython-input-347-f4b5d2edc7e2>, line 13)

In [217]:
try_test()

error
blahblah Fri Apr 26 08:25:17 2019
something


NameError: name 'x' is not defined