# [ARCHIVE] FCC CMA Scraping - AMENDMENT

## NOTE

This is a re-run of scraping for CMA's with results equal to 10, due to ommission of pagination in first run
* See scrape-callsigns_by_cma.ipynb for initial searchs
* See REF_ULS-LicenseDatabase-scraping.ipynb for reference code

## Set up selenium webdriver with Firefox
* Download Firefox driver [geckodriver](https://github.com/mozilla/geckodriver/releases) for selenium
* Move the driver executable into the same directory as this notebook (or update the path to the executable in `scrape_table_for_call_sign`)
* `pip install selenium`

-- see scrape-apps_by_callsign.ipynb for more details

In [1]:
from bs4 import BeautifulSoup
import time
import random
import csv

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options  
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException        

In [3]:
import os
import pandas as pd

## Define functions for scraping individual CMA

In [None]:
def extract_mkt_code(soup):
    '''
    Helper function that extracts the market code (CMA) from market search results page
    '''
    
    # tag attributes for element containing "Market Code = " 
    mkt_code_attrs = {"align" : "left", "class" : "cell-pri-light", "valign" : "bottom"}
    
    # extract mkt code as text
    mkt_code = (
        soup.find_all(attrs=mkt_code_attrs, limit=1)[0]
        .find_all("b")[0]
        .get_text()
    )
    
    return mkt_code

In [5]:
def extract_table_data(innerHTML):
    '''
    Extracts the following details by row from innerHTML of results page:
        * Market Code (CMA)
        * Call Sign/Lease ID
        * FRN Name
        * FRN Number
        * Radio Service
        * Status
        * Expiration Date
    Returns as list of rows (as list) without col_names
    '''
    
    
    # Extract search data from innerHTML soup
    soup = BeautifulSoup(innerHTML, "lxml")
    CMA = extract_mkt_code(soup)
    
    # extract search results as list of rows
    rows = (
        soup.find("table", {"summary": "License search results"})
        .find_all("tr")[1:-1] # exclude header/footer name rows
        )

    
    # extract results by row into table, attaching cma number
    table_data = [] 
    
    for row in rows:
        
        row_data = [CMA, ]          
        cols = row.find_all("td")
        
        # extract data from each cell in a row
        for i in range(1, len(cols)):  # skip first index num column
            cell_data = cols[i].get_text().strip()
            row_data.append(cell_data)
        
        table_data.append(row_data)
    
    return(table_data)

In [14]:
def cache_innerHTML(mkt_code, without_browser=False, cache_to='search-cache/html/'):
    '''
    Scrapes the FCC License Search database for a given Cellular Market Area (mkt_code)
    Caches innerHTML to 'search-cache/mkt_code.html'.
    Returns list of rows (as list) without col_names:
        * Market Code (CMA)
        * Call Sign/Lease ID
        * FRN Name
        * FRN Number
        * Radio Service
        * Status
        * Expiration Date
    '''

    tick = time.time()
    print("LOG[INFO]: START SCRAPE %s " % mkt_code)

          
    # driver config - start in private
    firefox_profile = webdriver.FirefoxProfile()
    firefox_profile.set_preference("browser.privatebrowsing.autostart", True)
    
    # (optional) don't open up browser
    firefox_options = Options()
    if without_browser:
        firefox_options.add_argument("--headless")

    # direct webdriver to search page
    search_url = "https://wireless2.fcc.gov/UlsApp/UlsSearch/searchMarket.jsp"
    
    firefox = webdriver.Firefox(options=firefox_options, executable_path="./geckodriver")
    firefox.get(search_url)

    # specify the market type we want to search for
    in_markettype = Select(firefox.find_element_by_id("marketType"))
    in_markettype.select_by_value("CMA")

    # specify the market we want to search for
    in_mkt = Select(firefox.find_element_by_id("ulsMarket"))
    in_mkt.deselect_by_visible_text("All")
    in_mkt.select_by_value(mkt_code)
    
    # set pagination to 100 (maximum)
    in_pagination = Select(firefox.find_element_by_name("fiRowsPerPage"))
    in_pagination.select_by_value("100")
        
    # submit search, wait for load
    firefox.find_element_by_name("marketSearch").submit()

    WebDriverWait(firefox, 15).until(EC.url_changes(search_url))
    WebDriverWait(firefox, 15).until(
        lambda _: firefox.execute_script("return document.readyState") == "complete"
        )
    
    # scrape innerHTML & cache
    innerHTML = firefox.execute_script("return document.body.innerHTML")
    
    with open("%s/%s.html" % (cache_to, mkt_code), 'w') as f:
        f.write(innerHTML)
    
    # quit webdriver
    firefox.quit()
        
    tock = time.time()
    print("LOG[INFO]: END SCRAPE %s (%2f seconds)" % (mkt_code, tock - tick))
    
    return innerHTML

In [7]:
def scrape_table_for_cma(mkt_code, without_browser=False, cache_to='search-cache'):
    '''
    Scrapes the FCC License Search database for a given Cellular Market Area (mkt_code)
    Caches innerHTML to 'search-cache/mkt_code.html'.
    Returns list of rows (as list) without col_names:
        * Market Code (CMA)
        * Call Sign/Lease ID
        * FRN Name
        * FRN Number
        * Radio Service
        * Status
        * Expiration Date
    '''
    
    # spin up webdriver & query database
    innerHTML = cache_innerHTML(mkt_code, without_browser, cache_to)
    
    # extract table data 
    table_data = []

    table_data += extract_table_data(innerHTML)
    
    return table_data

## Define functions for searching for list of CMAs

In [8]:
def loop_scrape_table_for_cma(list_cma, max_tries=3, wait_query=True, wait_retry=True):
    '''
    Attempts to scrape for cma codes in `list_cma` up to `max_tries`. 
    Caches .html for each search.
    Returns all data in loop
    '''
    try:
        # start search & tries counter
        i = 0
        tries = 1

        # log start of loop
        tick = time.time()
        print(
            "LOG[INFO]: START TIME is %s\n"
            "LOG[INFO]: START LOOP from %s"
            % (time.strftime("%I:%M %p"), list_cma[i])
        )

        # begin search & scrape loop over mkt_codes
        loop_data = []

        # Attempt to scrape all cma's in list_cma up to 3 times
        while (i < len(list_cma) and tries <= max_tries):
            print("LOG[INFO]: Attempt", tries)
            # Succcessful iteration
            try:
                # search for i-th CMA
                mkt_code = list_cma[i]
                result = scrape_table_for_cma(mkt_code, True)

                loop_data += result
            
                # wait random time before next query
                i += 1
                waitTime_query = (
                    (random.randint(1,60)) 
                    if wait_query == True 
                    else 0
                )
                print("LOG[INFO]: WAIT %2f seconds before next query" % waitTime_query)
                time.sleep(waitTime_query)

            # Interrupted iteration logs time & error, and waits for retry
            except Exception as e:
                print(
                    "LOG[ERROR]: %s\n"
                    "LOG[INFO]: LOOP INTERRUPTED at %s\n"
                    % (e, time.strftime("%I:%M %p"))
                )
                # wait for next try
                tries += 1
                waitTime_retry = (
                    (random.randint(200,500)) 
                    if wait_retry == True 
                    else 0
                )
                if tries <= max_tries:
                    print("LOG[INFO]: WAIT %2f minutes to retry" % (waitTime_retry/60))
                    time.sleep(waitTime_retry)
                else:
                    pass

        # log end of while loop
        tock = time.time()
        print(
            "LOG[INFO]: END LOOP at %s with %s ATTEMPTS\n"
            "LOG[INFO]: END TIME is %s (TOTAL: %2f minutes)"
            % (list_cma[(i-1)], (tries), time.strftime("%I:%M %p"), ((tock - tick)/60))
        )
    
    finally:
        return (loop_data)


In [29]:
def cache_to_csv(lst, path):

    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(lst)

    with open(path, 'r', newline='') as f:
        reader = csv.reader(f)
        check = list(reader)

    assert(len(lst) == len(check))
    
    print('Wrote `lst` to path: %s' % path)

## Scraping results tables for list of CMA's

### Creating search lists

In [10]:
# load previous search results
cma_df = pd.read_csv("data-cache/cma_df.csv", dtype='object')

# select for CMA with count >= 10, and return as list
rows_per_cma = cma_df.groupby(['cma_no']).count()
IS_gt_10 = (rows_per_cma['callsign_leaseID'] == 10)
cma_gt_10 = rows_per_cma[IS_gt_10].reset_index()['cma_no']
cma_gt_10_search = cma_gt_10.tolist()
cma_gt_10_search

['CMA002',
 'CMA014',
 'CMA015',
 'CMA024',
 'CMA026',
 'CMA028',
 'CMA029',
 'CMA039',
 'CMA041',
 'CMA045',
 'CMA053',
 'CMA071',
 'CMA074',
 'CMA077',
 'CMA081',
 'CMA086',
 'CMA089',
 'CMA090',
 'CMA097',
 'CMA098',
 'CMA101',
 'CMA106',
 'CMA113',
 'CMA127',
 'CMA128',
 'CMA141',
 'CMA152',
 'CMA161',
 'CMA162',
 'CMA165',
 'CMA166',
 'CMA168',
 'CMA171',
 'CMA174',
 'CMA176',
 'CMA181',
 'CMA185',
 'CMA187',
 'CMA188',
 'CMA217',
 'CMA221',
 'CMA227',
 'CMA234',
 'CMA245',
 'CMA246',
 'CMA255',
 'CMA264',
 'CMA267',
 'CMA268',
 'CMA273',
 'CMA274',
 'CMA281',
 'CMA283',
 'CMA285',
 'CMA286',
 'CMA289',
 'CMA290',
 'CMA295',
 'CMA297',
 'CMA298',
 'CMA299',
 'CMA306',
 'CMA307',
 'CMA311',
 'CMA313',
 'CMA315',
 'CMA316',
 'CMA317',
 'CMA318',
 'CMA319',
 'CMA320',
 'CMA322',
 'CMA323',
 'CMA336',
 'CMA337',
 'CMA341',
 'CMA342',
 'CMA344',
 'CMA348',
 'CMA351',
 'CMA352',
 'CMA353',
 'CMA354',
 'CMA355',
 'CMA356',
 'CMA361',
 'CMA362',
 'CMA363',
 'CMA367',
 'CMA369',
 'CMA374',

In [19]:
len(cma_gt_10_search)

238

### Running searches

```python
# search loop
results = loop_scrape_table_for_cma(cma_gt_10_search[95:100])

# cache results table as csv
cache_to_csv(results, 
             path=('search-cache/csv/amend_%s_%s' 
                   % (results[0][0], results[-1][0]))
             )          
```

1. first search: CMA002-317 
2. second search: CMA318-389; 
    * only cached html for CMA318-380
    * messed up csv caching
3. third search: CMA390-722
    * cached csv after using list.extend() to flatten list
4. fourth search: CMA381-389 (with HTML caching)
    * supplementary search to extract data; correctly cached
5. fifth search: repeat CMA002-317
    * cached to csv
    
### Caching to .csv

1. messed up first and second search -- each row was a list corresponding to a CMA; needed to flatten 

2. re-cached data from html for CMA318-722

```python
import os

html_lst = sorted((os.listdir('search-cache/html'))) # lists html files for CMA318-722

amend_2 = []

for cma in html_lst:
    with open('search-cache/html/%s' %cma, 'r') as f:
        innerHTML = f
        amend2.extend(extract_table_data(innerHTML))
```

3. unpack incorrectly cached first search (CMA002-317)
ABORT -- too difficult; rerun first search above


In [18]:
! ls -1 'search-cache/html' | wc -l

     238


# amend 3 -- recache first search

In [11]:
results = loop_scrape_table_for_cma(cma_gt_10_search[:68])



LOG[INFO]: START TIME is 05:31 PM
LOG[INFO]: START LOOP from CMA002
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA002 
LOG[INFO]: END SCRAPE CMA002 (15.675442 seconds)
LOG[INFO]: WAIT 59.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA014 
LOG[INFO]: END SCRAPE CMA014 (22.533153 seconds)
LOG[INFO]: WAIT 31.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA015 
LOG[INFO]: END SCRAPE CMA015 (27.720479 seconds)
LOG[INFO]: WAIT 7.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA024 
LOG[INFO]: END SCRAPE CMA024 (17.576399 seconds)
LOG[INFO]: WAIT 8.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA026 
LOG[INFO]: END SCRAPE CMA026 (17.784219 seconds)
LOG[INFO]: WAIT 24.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA028 
LOG[INFO]: END SCRAPE CMA028 (20.945146 seconds)
LOG[INFO]: WAIT 56.000000 seconds before next query
LOG[INFO]:

LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA285 
LOG[INFO]: END SCRAPE CMA285 (29.368664 seconds)
LOG[INFO]: WAIT 23.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA286 
LOG[INFO]: END SCRAPE CMA286 (22.028807 seconds)
LOG[INFO]: WAIT 18.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA289 
LOG[INFO]: END SCRAPE CMA289 (26.486462 seconds)
LOG[INFO]: WAIT 40.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA290 
LOG[INFO]: END SCRAPE CMA290 (17.674388 seconds)
LOG[INFO]: WAIT 27.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA295 
LOG[INFO]: END SCRAPE CMA295 (25.419932 seconds)
LOG[INFO]: WAIT 3.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA297 
LOG[INFO]: END SCRAPE CMA297 (24.557877 seconds)
LOG[INFO]: WAIT 7.000000 seconds before next query
LOG[INFO]: Attempt 1
LOG[INFO]: START SCRAPE CMA298 
LOG[INFO]: END SCRAPE CMA

In [23]:
path = ('search-cache/csv/amend_%s_%s.csv' % (results[0][0], results[-1][0]))
print(path)

search-cache/csv/amend_CMA002_CMA317.csv


In [28]:
cache_to_csv(results, path)

Wrote [['CMA002', 'KNKA209', 'Los Angeles SMSA Limited Partnership', '0002963817', 'CL', 'Active', '10/01/2024'], ['CMA002', 'KNKA351', 'AT&T Mobility Spectrum LLC', '0014980726', 'CL', 'Active', '10/01/2027'], ['CMA002', 'KNKR294', 'FOLDEN, GENE A', '', 'CL', 'Terminated', ''], ['CMA002', 'KNKR313', 'LOS ANGELES SMSA LIMITED PARTNERSHIP', '0002963817', 'CL', 'Canceled', '09/24/2007'], ['CMA002', 'L000009285', 'AirTouch Cellular', '0006146468', 'WY', 'Canceled', '06/13/2019'], ['CMA002', 'L000010911', 'Cellco Partnership', '0003290673', 'AW', 'Canceled', '01/25/2014'], ['CMA002', 'L000010913', 'New Cingular Wireless PCS, LLC', '0003291192', 'WY', 'Canceled', '01/25/2014'], ['CMA002', 'L000015067', 'Screened Images, Inc.', '0021044003', 'CL', 'Active', '10/01/2027'], ['CMA002', 'L000023743', 'Screened Images, Inc.', '0021044003', 'AW', 'Active', '11/29/2021'], ['CMA002', 'WPOI449', 'Los Angeles SMSA Limited Partnership-AirTouch Cellular', '0002963817', 'CL', 'Canceled', '02/18/2009'], [

### amend 3 // FIX IT incorrectly caching data :(

In [115]:
# read each line of file in as list

with open('search-cache/csv/amend_CMA002_317.csv', 'r') as f:
    impt = f.readline()

In [24]:
! ls 'search-cache/csv'

amend_CMA002_317.csv    amend_CMA318_CMA722.csv


In [116]:
import re

test = impt.replace('","', '\n')
test = test.replace('\"','')

In [121]:
test_row = re.search('(?:\[)(.+)(?:\]\\n)', test)[1]


In [119]:
test = test.replace("\'","")

In [112]:
test_row2 = list(test.split(sep='\n'))[0]

In [123]:
split = test_row.split(",")

In [125]:
list(split)[0]

'CMA002'

In [147]:
with open('search-cache/csv/amend_CMA002_317.csv', 'r') as f:
    lst_cma_tables = f.readlines()

In [183]:
table_str = []

for table in lst_cma_tables:
    replaced_str = table.replace('","', '\n').replace('\"','').replace("\'","")
    table_str += replaced_str.split(sep='\n')



['[CMA002, KNKA209, Los Angeles SMSA Limited Partnership, 0002963817, CL, Active, 10/01/2024]',
 '[CMA002, KNKA351, AT&T Mobility Spectrum LLC, 0014980726, CL, Active, 10/01/2027]',
 '[CMA002, KNKR294, FOLDEN, GENE A, , CL, Terminated, ]',
 '[CMA002, KNKR313, LOS ANGELES SMSA LIMITED PARTNERSHIP, 0002963817, CL, Canceled, 09/24/2007]',
 '[CMA002, L000009285, AirTouch Cellular, 0006146468, WY, Canceled, 06/13/2019]',
 '[CMA002, L000010911, Cellco Partnership, 0003290673, AW, Canceled, 01/25/2014]',
 '[CMA002, L000010913, New Cingular Wireless PCS, LLC, 0003291192, WY, Canceled, 01/25/2014]',
 '[CMA002, L000015067, Screened Images, Inc., 0021044003, CL, Active, 10/01/2027]',
 '[CMA002, L000023743, Screened Images, Inc., 0021044003, AW, Active, 11/29/2021]',
 '[CMA002, WPOI449, Los Angeles SMSA Limited Partnership-AirTouch Cellular, 0002963817, CL, Canceled, 02/18/2009]',
 '[CMA002, WPWU990, AT&T Mobility LLC, 0014980726, WZ, Active, 06/13/2019]',
 '[CMA002, WQGA742, AT&T Mobility Spectru

In [195]:
row = table_str[0]
re_row = re.search('(?:\[)(.+)(?:\])', table_str[0])
re_row[1].split(',')

['CMA002',
 ' KNKA209',
 ' Los Angeles SMSA Limited Partnership',
 ' 0002963817',
 ' CL',
 ' Active',
 ' 10/01/2024']

In [194]:


fixed = []

for row in table_str:
    print(row)
    re_row = re.search('(?:\[)(.+)(?:\])', row)
    fixed += re_row[1].split(',')



[CMA002, KNKA209, Los Angeles SMSA Limited Partnership, 0002963817, CL, Active, 10/01/2024]
[CMA002, KNKA351, AT&T Mobility Spectrum LLC, 0014980726, CL, Active, 10/01/2027]
[CMA002, KNKR294, FOLDEN, GENE A, , CL, Terminated, ]
[CMA002, KNKR313, LOS ANGELES SMSA LIMITED PARTNERSHIP, 0002963817, CL, Canceled, 09/24/2007]
[CMA002, L000009285, AirTouch Cellular, 0006146468, WY, Canceled, 06/13/2019]
[CMA002, L000010911, Cellco Partnership, 0003290673, AW, Canceled, 01/25/2014]
[CMA002, L000010913, New Cingular Wireless PCS, LLC, 0003291192, WY, Canceled, 01/25/2014]
[CMA002, L000015067, Screened Images, Inc., 0021044003, CL, Active, 10/01/2027]
[CMA002, L000023743, Screened Images, Inc., 0021044003, AW, Active, 11/29/2021]
[CMA002, WPOI449, Los Angeles SMSA Limited Partnership-AirTouch Cellular, 0002963817, CL, Canceled, 02/18/2009]
[CMA002, WPWU990, AT&T Mobility LLC, 0014980726, WZ, Active, 06/13/2019]
[CMA002, WQGA742, AT&T Mobility Spectrum LLC, 0014980726, AW, Active, 11/29/2021]
[CM

TypeError: 'NoneType' object is not subscriptable

In [141]:
for row in table_str:
    lst_strings = re.findall('(?:\[)(.+)(?:\])',row)
    print(lst_strings)
    fixed.append(re.search('(?:\[)(.+)(?:\]\\n)', row))

pd.DataFrame(fixed)

['CMA002',
 ' KNKA209',
 ' Los Angeles SMSA Limited Partnership',
 ' 0002963817',
 ' CL',
 ' Active',
 ' 10/01/2024',
 '',
 'CMA002',
 ' KNKA351',
 ' AT&T Mobility Spectrum LLC',
 ' 0014980726',
 ' CL',
 ' Active',
 ' 10/01/2027',
 '',
 'CMA002',
 ' KNKR294',
 ' FOLDEN',
 ' GENE A',
 ' ',
 ' CL',
 ' Terminated',
 ' ',
 '',
 'CMA002',
 ' KNKR313',
 ' LOS ANGELES SMSA LIMITED PARTNERSHIP',
 ' 0002963817',
 ' CL',
 ' Canceled',
 ' 09/24/2007',
 '',
 'CMA002',
 ' L000009285',
 ' AirTouch Cellular',
 ' 0006146468',
 ' WY',
 ' Canceled',
 ' 06/13/2019',
 '',
 'CMA002',
 ' L000010911',
 ' Cellco Partnership',
 ' 0003290673',
 ' AW',
 ' Canceled',
 ' 01/25/2014',
 '',
 'CMA002',
 ' L000010913',
 ' New Cingular Wireless PCS',
 ' LLC',
 ' 0003291192',
 ' WY',
 ' Canceled',
 ' 01/25/2014',
 '',
 'CMA002',
 ' L000015067',
 ' Screened Images',
 ' Inc.',
 ' 0021044003',
 ' CL',
 ' Active',
 ' 10/01/2027',
 '',
 'CMA002',
 ' L000023743',
 ' Screened Images',
 ' Inc.',
 ' 0021044003',
 ' AW',
 ' Activ