# FCC CMA Scraping (AMEND)

## NOTE: This is a re-run of scraping functions for CMA's with results equal to 10
* see scrape-callsigns_by_cma.ipynb for initial searchs

## Set up selenium webdriver with Firefox
* Download Firefox driver [geckodriver](https://github.com/mozilla/geckodriver/releases) for selenium
* Move the driver executable into the same directory as this notebook (or update the path to the executable in `scrape_table_for_call_sign`)
* `pip install selenium`

-- see scrape-apps_by_callsign.ipynb for more details

In [39]:
from bs4 import BeautifulSoup
import time

In [40]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options  
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException        

## Define functions for scraping individual CMA

In [3]:
def extract_mkt_code(soup):
    '''
    Extracts the market code (CMA) from market search results page
    '''
    
    # tag attributes for element containing "Market Code = " 
    mkt_code_attrs = {"align" : "left", "class" : "cell-pri-light", "valign" : "bottom"}
    
    # extract mkt code as text
    mkt_code = (
        soup.find_all(attrs=mkt_code_attrs, limit=1)[0]
        .find_all("b")[0]
        .get_text()
    )
    
    return mkt_code

In [4]:
def extract_table_data(innerHTML):
    '''
    Extracts the following details by row from innerHTML of results page:
        * Market Code (CMA)
        * Call Sign/Lease ID
        * FRN Name
        * FRN Number
        * Radio Service
        * Status
        * Expiration Date
    Returns as list of rows (as list) without col_names
    '''
    
    
    # Extract search data from innerHTML soup
    soup = BeautifulSoup(innerHTML, "lxml")
    CMA = extract_mkt_code(soup)
    
    # extract search results as list of rows
    rows = (
        soup.find("table", {"summary": "License search results"})
        .find_all("tr")[1:-1] # exclude header/footer name rows
        )

    
    # extract results by row into table, attaching cma number
    table_data = [] 
    
    for row in rows:
        
        row_data = [CMA, ]          
        cols = row.find_all("td")
        
        # extract data from each cell in a row
        for i in range(1, len(cols)):  # skip first index num column
            cell_data = cols[i].get_text().strip()
            row_data.append(cell_data)
        
        table_data.append(row_data)
    
    return(table_data)

In [5]:
def scrape_table_for_cma(mkt_code, without_browser=False):
    '''
    Scrapes the FCC License Search database for a given Cellular Market Area (mkt_code)
    Returns list of rows (as list) without col_names:
        * Market Code (CMA)
        * Call Sign/Lease ID
        * FRN Name
        * FRN Number
        * Radio Service
        * Status
        * Expiration Date
    '''

    tick = time.time()
    print("LOG[INFO]: START SCRAPE %s " % mkt_code)

          
    # driver config - start in private
    firefox_profile = webdriver.FirefoxProfile()
    firefox_profile.set_preference("browser.privatebrowsing.autostart", True)
    
    # (optional) don't open up browser
    firefox_options = Options()
    if without_browser:
        firefox_options.add_argument("--headless")

    # direct webdriver to search page
    search_url = "https://wireless2.fcc.gov/UlsApp/UlsSearch/searchMarket.jsp"
    
    firefox = webdriver.Firefox(options=firefox_options, executable_path="./geckodriver")
    firefox.get(search_url)

    # specify the market type we want to search for
    in_markettype = Select(firefox.find_element_by_id("marketType"))
    in_markettype.select_by_value("CMA")

    # specify the market we want to search for
    in_mkt = Select(firefox.find_element_by_id("ulsMarket"))
    in_mkt.deselect_by_visible_text("All")
    in_mkt.select_by_value(mkt_code)
    
    # set pagination to 100 (maximum)
    in_pagination = Select(firefox.find_element_by_name("fiRowsPerPage"))
    in_pagination.select_by_value("100")
        
    # submit search, wait for load
    firefox.find_element_by_name("marketSearch").submit()

    WebDriverWait(firefox, 15).until(EC.url_changes(search_url))
    WebDriverWait(firefox, 15).until(
        lambda _: firefox.execute_script("return document.readyState") == "complete"
        )
    
    # scrape innerHTML
    innerHTML = firefox.execute_script("return document.body.innerHTML")
    #firefox.quit()
    
    # extract table data 
    table_data = []

    table_data += extract_table_data(innerHTML)
        
    tock = time.time()
    print("LOG[INFO]: END SCRAPE %s (%2f seconds)" % (mkt_code, tock - tick))
    
    return table_data

In [6]:
scrape_table_for_cma("CMA002", True)

LOG[INFO]: START SCRAPE CMA002 


AttributeError: 'NoneType' object has no attribute 'find_all'

## Define functions for searching for list of CMAs

In [23]:
import random

In [33]:
def loop_scrape_table_for_cma(list_cma):
    
    # start search & tries counter
    i = 0
    tries = 1
    
    # log start of loop
    tick = time.time()
    print(
        "LOG[INFO]: START TIME is %s\n"
        "LOG[INFO]: START LOOP from %s"
        % (time.strftime("%I:%M %p"), list_cma[i])
    )
    
    # begin search & scrape loop over mkt_codes
    loop_data = []
    
    # Attempt to scrape all cma's in list_cma up to 3 times
    while (i < len(list_cma) and tries <=3):
        # Succcessful iteration
        try:
            print("LOG[INFO]: Attempt", tries)
            # search for i-th CMA
            mkt_code = list_cma[i]
            loop_data.append(
                scrape_table_for_cma(mkt_code, True)
            )
            i += 1
            # wait random time before next query
            wait_query = random.randint(1,60)
            print("LOG[INFO]: WAIT %2f seconds before next query" % wait_query)
            time.sleep(wait_query)
    
        # Interrupted iteration logs time & error, and waits for retry
        except Exception as e:
            print(
                "LOG[ERROR]: %s\n"
                "LOG[INFO]: LOOP INTERRUPTED at %s\n"
                % (e, time.strftime("%I:%M %p"))
            )
            tries += 1
            wait_retry = random.randint(200,500)
            print("LOG[INFO]: WAIT %2f minutes to retry" % (wait_retry/60))
            time.sleep(wait_retry)
            pass
       
    # log end of while loop
    tock = time.time()
    print(
        "LOG[INFO]: END LOOP at %s with %s ATTEMPTS\n"
        "LOG[INFO]: END TIME is %s (TOTAL: %2f minutes)"
        % (list_cma[(i)], (tries-1), time.strftime("%I:%M %p"), ((tock - tick)/60))
    )
    
    return (loop_data)


## Scraping results tables for list of CMA's

### Creating search lists

In [34]:
import pandas as pd

In [42]:
# load previous search results
cma_df = pd.read_csv("cma_df.csv", dtype='object')

# select for CMA with count >= 10, and return as list
rows_per_cma = cma_df.groupby(['cma_no']).count()
IS_gt_10 = (rows_per_cma['callsign_leaseID'] == 10)
cma_gt_10 = rows_per_cma[IS_gt_10].reset_index()['cma_no']
cma_gt_10_search = cma_gt_10.tolist()
cma_gt_10_search

['CMA002',
 'CMA014',
 'CMA015',
 'CMA024',
 'CMA026',
 'CMA028',
 'CMA029',
 'CMA039',
 'CMA041',
 'CMA045',
 'CMA053',
 'CMA071',
 'CMA074',
 'CMA077',
 'CMA081',
 'CMA086',
 'CMA089',
 'CMA090',
 'CMA097',
 'CMA098',
 'CMA101',
 'CMA106',
 'CMA113',
 'CMA127',
 'CMA128',
 'CMA141',
 'CMA152',
 'CMA161',
 'CMA162',
 'CMA165',
 'CMA166',
 'CMA168',
 'CMA171',
 'CMA174',
 'CMA176',
 'CMA181',
 'CMA185',
 'CMA187',
 'CMA188',
 'CMA217',
 'CMA221',
 'CMA227',
 'CMA234',
 'CMA245',
 'CMA246',
 'CMA255',
 'CMA264',
 'CMA267',
 'CMA268',
 'CMA273',
 'CMA274',
 'CMA281',
 'CMA283',
 'CMA285',
 'CMA286',
 'CMA289',
 'CMA290',
 'CMA295',
 'CMA297',
 'CMA298',
 'CMA299',
 'CMA306',
 'CMA307',
 'CMA311',
 'CMA313',
 'CMA315',
 'CMA316',
 'CMA317',
 'CMA318',
 'CMA319',
 'CMA320',
 'CMA322',
 'CMA323',
 'CMA336',
 'CMA337',
 'CMA341',
 'CMA342',
 'CMA344',
 'CMA348',
 'CMA351',
 'CMA352',
 'CMA353',
 'CMA354',
 'CMA355',
 'CMA356',
 'CMA361',
 'CMA362',
 'CMA363',
 'CMA367',
 'CMA369',
 'CMA374',

In [32]:
# test loop
loop_scrape_table_for_cma(["CMA002"])

LOG[INFO]: START TIME is 06:51 PM
LOG[INFO]: START LOOP from CMA002
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA002 
LOG[ERROR]: 'NoneType' object has no attribute 'find_all'
LOG[INFO]: LOOP INTERRUPTED at 06:51 PM

LOG[INFO]: WAIT 0.16666666666666666 minutes to retry
LOG[INFO]: END LOOP at CMA002, 4 ATTEMPTS
LOG[INFO]: END TIME is 06:51 PM (TOTAL: 0.563114 minutes)


[]

## Filter list of Call Signs from cma_df

* filter for col4: 'Radio Service' == 'CL'
```python
is_CL = cma_df['Radio Service'] == 'CL'
callsign_CL = cma_df.loc[[is_CL], 1]
```
* 

In [None]:
import pandas as pd

In [308]:
# import cma_df from csv
cma_df = pd.read_csv(cma_csv_path, dtype=object)

# filter for Cellular Licenses only
is_CL = (cma_df['radio_service'] == 'CL')
CL_df = cma_df[is_CL]

# create list of Cellular (CL) callsigns
CL_callsign = CL_df['callsign_leaseID']
CL_callsign.head()

0    KNKA206
1    KNKA310
7    KNKA209
8    KNKA351
9    KNKR294
Name: callsign_leaseID, dtype: object

In [309]:
# write list of Cellular Callsigns to csv
CL_callsign.to_csv("CL_CallSigns.csv", index=False) 


In [310]:
! open CL_CallSigns.csv

In [306]:
print(len(callsign_CL.unique()),
      len(callsign_CL)
     )

callsign_CL.head()

2445 2445


0    KNKA206
1    KNKA310
7    KNKA209
8    KNKA351
9    KNKR294
Name: callsign_leaseID, dtype: object

#### --- ignore test code ---

In [22]:
list = [2, 3, 4, 6, 7]
test1 = []
j = 0
tries = 1
out = []

while (j < len(list) and tries <= 3):
    print("Attempt", tries)
    try:
        loop_out = ([1,1,1],2)
        print(x)
    except Exception as e:
        print("in except")
        print('ERROR:', e.args)
    finally:
        print("in finally")
        out.append(loop_out[0])
        j += loop_out[1]
        tries += 1
        print("j = %s, tries = %s" % (j, tries))
out

Attempt 1
in except
ERROR: ("name 'x' is not defined",)
in finally
j = 2, tries = 2
Attempt 2
in except
ERROR: ("name 'x' is not defined",)
in finally
j = 4, tries = 3
Attempt 3
in except
ERROR: ("name 'x' is not defined",)
in finally
j = 6, tries = 4


[[1, 1, 1], [1, 1, 1], [1, 1, 1]]

In [1]:
def multitry_search(search_list):
    # initialise counters & return objects
    out_list = []

    tick = time.time()

    j = 0
    tries = 1

    # Try looping scrape over search list up to 3 times
    while (j < len(search_list) and tries <= 3):
        print("LOG[INFO]: Attempt 1")
        try:
            loop_out = loop_scrape_table_for_cma(search_list[j:])
        except Exception as e:
            # update search start index and no. of tries
            print("ERROR:", e.args)
            # wait random time before next try
            wait_sec = random.randint(150,500)
            time.sleep(wait_sec)
        finally:
            out_list.append(loop_out[0])
            tries += 1
            j += loop_out[1]

    tock = time.time()

    print("LOG[INFO]: Searched (%s + 1) mkt_codes out of %s total\n"
          "LOG[INFO]: TOTAL TIME for %s attempts was %2f seconds"
          % (j, len(search_list), tries, tock-tick)
         )
    
    return out_list

In [347]:
def try_test():
    y = 5
    try:
        print("x is %s" % x)
    except:
        print("error")
        time.sleep(0)
        raise
    # comment
    finally:
        print(
            "blahblah %s\n" % time.ctime()
            "something" 
        )

SyntaxError: invalid syntax (<ipython-input-347-f4b5d2edc7e2>, line 13)

In [217]:
try_test()

error
blahblah Fri Apr 26 08:25:17 2019
something


NameError: name 'x' is not defined