# Explore FCC query results UL201932915978.txt

In [1]:
import numpy as np
import pandas as pd
from io import StringIO
import re

In [2]:
import os
os.getcwd()

'/Users/cynthiahqy/GitHub/fcc-scraping'

In [3]:
## paths
source_file = 'data-cache/UL201932915978.txt'
decoded_file = 'data-cache/decoded-UL201932915978.txt'
none_file = 'data-cache/none-UL201932915978.txt'

## References

* http://book.pythontips.com/en/latest/enumerate.html

## Decode file bytes

In [111]:
def replace_b_lineEnd(source_b):
    WINDOWS_LINE_ENDING = b'\r\n'
    UNIX_LINE_ENDING = b'\n'
    return (source_b.replace(WINDOWS_LINE_ENDING, UNIX_LINE_ENDING))

In [128]:
# read file to $content, and fix encoding errors
with open(source_file, 'rb') as f:
    source_b = f.read()
    f.close()

# replace line endings
source_b = replace_b_lineEnd(source_b)

# decode from bytes to str
decoded_str = source_b.decode(errors='replace')

# write str to file
with open(decoded_file, 'w') as f:
    f.write(decoded_str)
    f.close()

# check decoded file contents matches str written to file
with open(decoded_file, 'r') as f:
    assert(re.search(f.readline(), decoded_str[:150]))
    f.close()

HD|11494|||KNNA289|C|CL|||02/07/1997||||||||||||Y||||Y|||N|Y||||||||||||||10/01/2003||||||

HD|11493|||KNNA286|C|CL|||02/07/1997||||||||||||Y||||Y|||N|Y||||||||||||||10/01/2003||||||
HD|11494|||KNNA289|C|CL|||02/07/1997||||||||||||Y||||Y|||N|


## Extract info from decoded str lines in DataFrame

### Define functions to extract info from str lines

In [11]:
def calc_line_cols(lst_lines, re_delim='\|'):
    '''
    Counts regex matches of delimiters in each str line (as given by `re_delim=`, default is '|') 
    Returns list of int values.
    '''
    no_cols = []

    for line in lst_lines:
        no_cols.append(len(re.findall(re_delim, line)))

    assert(len(lst_lines) == len(no_cols))
    
    return(no_cols)

In [5]:
## CHECK match list flattening if using for len(match) > 2
def extract_callsigns(lst_lines, re_callsign='(?:\|)([A-Z]{4}\d{3}|L\d{9})'):
    '''
    Searchs for callsign-like strings, and returns the following for each line:
        * number of matches found
        * matches as list
        * matches as concatenated str, with duplicate matches collapsed 
        
    For example:
        * `|KNKN334|WPOI449|L000015067|` returns list `['KNKN334', 'WPOI449', 'L000015067']`
        * `|KNKN334|KNKN334|` returns str `'KNKN334'`
    '''
    
    # store regex results as
    no_callsigns = []
    lst = []
    callsigns = []
    
    # extract callsigns line by line 
    # index can be used to find lines of interest
    for index, line in enumerate(lst_lines):
        # regex returns list of matches
        m = re.findall(re_callsign, line)
        
        # append match results as values
        no_callsigns.append(len(m))
        lst.append(m)
        if len(m) == 0:
            callsigns.append(None)
        elif len(m) == 1:
            callsigns.append(m[0])
        elif len(m) == 2:
            if m[0] == m[1]:
                callsigns.append(m[0])
            else:
                callsigns.append(', '.join(m))
        else:
            callsigns.append(', '.join(m))
    
    # check length of appended results matches number of lines
    assert(len(lst_lines) == len(callsigns))
    
    return(no_callsigns, callsigns, lst)

In [6]:
def extract_record_type(lst_lines, re_recordType='[A-Z]{2}'):
    m_lst = []
    
    for index, line in enumerate(lst_lines):
        # regex returns first match
        m = re.search(re_recordType, line)
        
        # append match to results
        if m:
            m_lst.append(m[0])
        else:
            m_lst.append(None)
    
    assert(len(m_lst) == len(lst_lines))
    
    return(m_lst)

### Record Types

* record type is given by 2-letter code at the start of each entry 
* reference for names of columns given record type can be found:
https://www.fcc.gov/sites/default/files/public_access_database_definitions_v2.pdf
* reference for code columns in given record type can be found:
https://www.fcc.gov/sites/default/files/pubacc_uls_code_def_02162017.txt
though these appear to truncated at SH
* more information on how to read database files can be found at:
https://www.fcc.gov/sites/default/files/pubacc_intro_11122014.pdf

### Extract info into DataFrame

In [7]:
# read lines
dcd_lines = open(decoded_file, 'r').readlines()

In [13]:
line_cols = calc_line_cols(dcd_lines)

In [8]:
# extract info 
line_cols = calc_line_cols(dcd_lines)
extracted_callsigns = extract_callsigns(dcd_lines)
record_type = extract_record_type(dcd_lines)

In [14]:
# make dataframe
re_dcd_lines = pd.DataFrame(dcd_lines, columns=['str'])
re_dcd_lines['line_cols'] = line_cols  
re_dcd_lines['callsigns_no'] = extracted_callsigns[0]
re_dcd_lines['callsigns_str'] = extracted_callsigns[1]
re_dcd_lines['callsigns_lst'] = extracted_callsigns[2]
re_dcd_lines['rec_type'] = record_type

In [15]:
re_dcd_lines

Unnamed: 0,str,line_cols,callsigns_no,callsigns_str,callsigns_lst,rec_type
0,HD|11493|||KNNA286|C|CL|||02/07/1997||||||||||...,49,1,KNNA286,[KNNA286],HD
1,HD|11494|||KNNA289|C|CL|||02/07/1997||||||||||...,49,1,KNNA289,[KNNA289],HD
2,HD|11495|||KNKQ208|E|CL|06/04/1991|10/01/2000|...,49,1,KNKQ208,[KNKQ208],HD
3,HD|11496|||KNKQ210|E|CL|06/04/1991|10/01/2001|...,49,1,KNKQ210,[KNKQ210],HD
4,HD|11497|0005733417||KNKN848|A|CL|10/17/2011|1...,49,1,KNKN848,[KNKN848],HD
5,HD|11498|||KNNA275|C|CL|||02/07/1997||||||||||...,49,1,KNNA275,[KNNA275],HD
6,HD|11499|||KNNA346|E|CL||09/15/1999|12/18/1999...,49,1,KNNA346,[KNNA346],HD
7,HD|11500|||KNKQ246|C|CL|09/28/1992|10/01/2001|...,49,1,KNKQ246,[KNKQ246],HD
8,HD|11501|||KNKQ223|C|CL|02/21/1992|10/01/2001|...,49,1,KNKQ223,[KNKQ223],HD
9,HD|11502|||KNNA330|C|CL|||03/11/1997||||||||||...,49,1,KNNA330,[KNNA330],HD


## Estimate number of missed callsigns in CMA scrape

In [100]:
# extract draft list of unique callsigns from lines with 1 callsign

lst_callsigns = re_dcd_lines[re_dcd_lines['callsigns_no'] == 1]['callsigns_str'].unique()
print("Found approx %s unique callsigns in downloaded ULS query" % len(scraped_callsigns))

# compare with scraped callsigns from cma searchs

with open("data-cache/CL_CallSigns.csv", "r") as f:
    scraped_callsigns = f.readlines()
    f.close()

print("Scraped %s callsigns from cma searchs" % len(scraped_callsigns))

# estimate number of callsigns missing from scraped list

print("Missed %s callsigns in cma scrape" % (len(lst_callsigns) - len(scraped_callsigns)))

Found approx 2445 unique callsigns in downloaded ULS query
Scraped 2445 callsigns from cma searchs
Missed 310 callsigns in cma scrape


## Observations about columns per line

### number of columns
* there is a lot of variance in number of cols by line
```python
gb_cols = re_dcd_lines.groupby(by=['line_cols'])
print(gb_cols.ngroups)  # 274 different col lengths of lines
print(re_dcd_lines['line_cols'].max())  # max number of delimiters/cols is 2931-1
print(re_dcd_lines['line_cols'].min())  # min number of delim/cols is 2-1
```

* Some lines have very few cols, and tend to have empty or incomplete strings, with no callsigns:

```python
re_dcd_lines.sort_values(by='line_cols') 

In [24]: dcd_lines[1389923]
Out[24]: '\n'
```


## location of callsign value

In [87]:
dcd_lines[8650:9000]
#8657 / 8658 switch from license in column 3 to column 4

['EN|4084418|||L000032268|O|L01984978|CellBlox Acquisitions, LLC|||||||||||||||||||\n',
 'EN|4084419|||L000032269|O|L01984978|CellBlox Acquisitions, LLC|||||||||||||||||||\n',
 'EN|4101100|||L000032767|O|L01758932|Screened Images, Inc.|||||||||||||||||||\n',
 'EN|4113126|||L000033073|O|L01984978|CellBlox Acquisitions, LLC|||||||||||||||||||\n',
 'EN|4113127|||L000033074|O|L01758932|Screened Images, Inc.|||||||||||||||||||\n',
 'EN|4113128|||L000033075|O|L01758932|Screened Images, Inc.|||||||||||||||||||\n',
 'EN|4138569|||L000033684|O|L01674355|Shawntech Communications, Inc|||||||||||||||||||\n',
 'EN|4138570|||L000033685|O|L01674355|Shawntech Communications, Inc|||||||||||||||||||\n',
 'HS|11493||KNNA286|04/07/2001|LITRAN\n',
 'HS|11493||KNNA286|12/07/2001|COR\n',
 'HS|11493||KNNA286|12/07/2001|COR\n',
 'HS|11494||KNNA289|04/07/2001|LITRAN\n',
 'HS|11494||KNNA289|12/07/2001|COR\n',
 'HS|11494||KNNA289|12/07/2001|COR\n',
 'HS|11495||KNKQ208|02/02/2000|COR\n',
 'HS|11495||KNKQ208|07/09/

### announcement-like strings
* Using rows with (callsigns_no == 0), found announcement like information in lines around 

```python
dcd_lines[1388127:1399015]

[...
 'SF|11792|||KNKP986|P|10260397|3|ferring customer. The interim operator shall maintain records of sufficient detail to permit an audit that such costs have been incurred with respect to a particular subscriber.||\n',
 'SF|11792|||KNKP986|P|10260398|1|This authorization does not convey to the licensee the right to receive protection from the capture of subscriber traffic, co-channel interference or first-adjacent-channel interference in any area outside of the authorized Cellular Geographical Service A||\n',
 'SF|11792|||KNKP986|P|10260398|2|rea (CGSA) of the system. Moreover, any facility authorized herein with a service area boundary (SAB) extending into the CGSA of any other operating cellular system on the same channel block, regardless of when such other cellular system was/is authorized||\n',
 'SF|11792|||KNKP986|P|10260398|3|, is subject to the following condition: In the event that the licensee of the other cellular system requests that the SAB of the facilities authorized herein be removed from its CGSA, the licensee herein must reduce transmitting power or antenna height (||\n',
 'SF|11792|||KNKP986|P|10260398|4|or both) as necessary to remove the SAB from the CGSA, unless written consent from the licensee of the other cellular system, allowing the SAB extension, is obtained.||\n',
 'SF|11796|||KNKN448|P|11745651|1|The action taken with respect to Application #0004952609 does not preclude or prejudice any potential enforcement action regarding this Application, and does not constitute a waiver of any of the Commission�s rules with respect to this Application.||\n',
 'SF|11804|||KNKA684||2500315|1|PARAGRAPH A MODIFIED TO REQUIRE USE OF L-865 MEDIUM INTENSITY LIGHTS IN LIEU OF L-856.  LIGHTS SHALL EMIT A PEAK INTENSITY OF APPROXIMATELY 2,000 CANDELAS IN LIEU OF 4,000.||\n',
 'SF|11806|||KNKN663||10320593|1|THE FOLLOWING CELLULAR GEOGRAPHIC SERVICE AREAS HAVE BEEN COMBINED              (LISTED BY CALL SIGN, MARKET NUMBER AND BLOCK, AND MARKET NAME):               KNKN755    571A   NC 7                                                         KNKN558    572A||\n',
 'SF|11806|||KNKN663||10320593|2|||\n',
 'SF|11806|||KNKN663||10320594|1|The following Cellular Geographic Service Areas have been combine (listed by call sign, market number and block and market name): KNKN473 570A - NC 6, KNKN755 571A - NC 7, KNKN558 572A - NC 8, KNKN470 574A - NC 10, KNKN855 575A - NC 11, KNKN632 576A - NC||\n',
 'SF|11806|||KNKN663||10320594|2|||\n',
 'SF|11807|||KNKN882||10059309|1|THE FOLLOWING CELLULAR GEOGRAPHIC SERVICE AREAS HAVE BEEN COMBINED              (LISTED BY CALL SIGN, MARKET NUMBER AND BLOCK, AND MARKET NAME):               KNKA577    233B    WICHITA FALLS                                               KNKA724    260B||\n',
 'SF|11807|||KNKN882||10059309|2|||\n',
 'SF|11814|||KNKA537|P|10265055|1|This license is conditionally granted subject to the resolution of petitions for reconsideration filed against the Report and Order  in WT Docket No. 97 112 and CC Docket 90 6. See  Cellular Service and Other Commercial Mobile Radio Services in the Gulf o||\n',
 ...]
```

```python
dcd_lines[1389919:1389930]

['\n',
 'Special Condition Statement L23\n',
 'The ERPd is limited to 1924 Watts, 850_3G (UMT||\n',
 'SF|13062|||KNKN941|P|11771200|2|S) at Azimuth 164.2 degree True.\n',
 '\n',
 'Special Condition Statement L24\n',
 'The ERPd is limited to 215 Watts 850_3G (UMTS) at Azimuth 172.4 degrees True.\n',
 '\n',
 'To meet this Special Condition, the Applicant shall:\n',
 '1.  Use the final engineering submitted by Christia||\n',
 'SF|13062|||KNKN941|P|11771200|3|n LaTendresse on 24JUN2014 indicating that all facilities meet the ERP restriction.\n']
```


These lines correspond to multiple

TODO: 
* extract two letter code at beginning of each line; missing two letter code might indicate extraneous line break requiring correction

In [86]:
none_callsigns = re_dcd_lines[(re_dcd_lines['callsigns_no'] == 0)]
none_callsigns.reset_index(level=0, inplace=True)
none_callsigns = none_callsigns.sort_values(by=['index'])
none_callsigns

Unnamed: 0,index,str,line_cols,callsigns_no,callsigns_str,callsigns_lst
0,1928,HD|13422|||B|C|CL|||11/26/2002||||||||||||Y|||...,86,0,,[]
1,1932,HD|13426|||M|C|CL|||06/30/2005||||||||||||Y|||...,86,0,,[]
2,4481,"EN|13422|||B|CL||KOTEEN & NAFTALIN, LLP|PETER|...",141,0,,[]
3,4485,EN|13426|||M|CL||DRINKER BIDDLE & REATH LLP|||...,95,0,,[]
4,7221,EN|13422|||B|L|L00325780|DOBSON CELLULAR OF SA...,172,0,,[]
5,7225,"EN|13426|||M|L|L00126030|MISSOURI RSA 11, INC....",141,0,,[]
6,103881,HS|13422||B|12/15/1998|COR\n,28,0,,[]
7,103882,HS|13422||B|11/26/2002|COR\n,28,0,,[]
8,103883,HS|13422||B|11/26/2002|COR\n,28,0,,[]
9,104121,HS|13426||M|12/15/1998|COR\n,28,0,,[]


In [70]:
# ranges of interest due to missing callsigns -- possible broken lines?
dcd_lines[1928:1932]  # start of str = HD
dcd_lines[4481:4485]  # start of str = EN
dcd_lines[7221:7225]
dcd_lines[103881:103885] # start of str = HS
dcd_lines[104121:104123]
dcd_lines[1389919:1389930] 

Unnamed: 0,index,str,line_cols,callsigns_no,callsigns_str,callsigns_lst
0,1928,HD|13422|||B|C|CL|||11/26/2002||||||||||||Y|||...,86,0,,[]
1,1932,HD|13426|||M|C|CL|||06/30/2005||||||||||||Y|||...,86,0,,[]
2,4481,"EN|13422|||B|CL||KOTEEN & NAFTALIN, LLP|PETER|...",141,0,,[]
3,4485,EN|13426|||M|CL||DRINKER BIDDLE & REATH LLP|||...,95,0,,[]
4,7221,EN|13422|||B|L|L00325780|DOBSON CELLULAR OF SA...,172,0,,[]
5,7225,"EN|13426|||M|L|L00126030|MISSOURI RSA 11, INC....",141,0,,[]
6,103881,HS|13422||B|12/15/1998|COR\n,28,0,,[]
7,103882,HS|13422||B|11/26/2002|COR\n,28,0,,[]
8,103883,HS|13422||B|11/26/2002|COR\n,28,0,,[]
9,104121,HS|13426||M|12/15/1998|COR\n,28,0,,[]


In [84]:
dcd_lines[1389919:1389930]

['\n',
 'Special Condition Statement L23\n',
 'The ERPd is limited to 1924 Watts, 850_3G (UMT||\n',
 'SF|13062|||KNKN941|P|11771200|2|S) at Azimuth 164.2 degree True.\n',
 '\n',
 'Special Condition Statement L24\n',
 'The ERPd is limited to 215 Watts 850_3G (UMTS) at Azimuth 172.4 degrees True.\n',
 '\n',
 'To meet this Special Condition, the Applicant shall:\n',
 '1.  Use the final engineering submitted by Christia||\n',
 'SF|13062|||KNKN941|P|11771200|3|n LaTendresse on 24JUN2014 indicating that all facilities meet the ERP restriction.\n']

In [72]:
# The following lines all have announcement like information in them, and are unnecessarily split by '\n'
# they all begin with code SF

dcd_lines[1388127:1399015]

['SF|11792|||KNKP986|P|10260395|1|2. The interim operator must fully cooperate with the permanent licensee\n',
 'in effectuating a smooth transition to the provision of service in the market by the permanent licensee without disruption of service to the public. The interim operator must cease||\n',
 'SF|11792|||KNKP986|P|10260395|2|operations in the market on the date of initiation of permanent service or within 30 days of written notice by the permanent permittee to the interim operator of the day and time that it intends to initiate service, whichever date occurs later.||\n',
 'SF|11792|||KNKP986|P|10260396|1|3. This authorization may only be transferred in conjunction with the transfer of the authorization in the adjacent market and then only after approval is obtained from the Commission. This authorization must be identified as an interim operation authoriz||\n',
 'SF|11792|||KNKP986|P|10260396|2|ation in the transfer application filed with the Commission.||\n',
 'SF|11792|||KNKP9

### Observations about callsign info:
Note there are str lines with:
* 0, 1, 2 callsigns found using callsign_re: `'(?:\|)([A-Z]{4}\d{3}|L\d{9})'`
* for lines with 2 callsigns found, there are 1 or 2 unique callsigns
`re_dcd_lines.groupby(['callsigns_no','callsigns_str']).count()`
    
[TODO]: 
* difference between lines with 1 callsign found; and 2 identical callsigns

### --- ignore below here ----

## EXPLORING column misalignment error

In [243]:
content_list = open(decoded_file, 'rb').readlines()
len(content_list) #1559145
content_list[8657:8659]

1559145

In [196]:
A = df.loc[0:8657, 4]

In [197]:
A.head()

0    KNNA286
1    KNNA289
2    KNKQ208
3    KNKQ210
4    KNKN848
Name: 4, dtype: object

In [198]:
A.unique()

array(['KNNA286', 'KNNA289', 'KNKQ208', ..., 'L000033075', 'L000033684',
       'L000033685'], dtype=object)

In [208]:
licenses = pd.DataFrame(A.unique())

In [209]:
len(licenses)

2757

In [214]:
licenses.sort_values(by=0)

Unnamed: 0,0
1928,B
1676,KNKA200
1677,KNKA201
930,KNKA202
1157,KNKA203
1702,KNKA204
1442,KNKA205
1704,KNKA206
1124,KNKA207
45,KNKA208


In [211]:
type(licenses)

pandas.core.frame.DataFrame

In [212]:
print(licenses)

               0
0        KNNA286
1        KNNA289
2        KNKQ208
3        KNKQ210
4        KNKN848
5        KNNA275
6        KNNA346
7        KNKQ246
8        KNKQ223
9        KNNA330
10       KNKN893
11       KNNA308
12       KNKP995
13       KNKQ209
14       KNKQ206
15       KNNA254
16       KNKA667
17       KNKA679
18       KNKA700
19       KNKA755
20       KNKA817
21       KNKA467
22       KNKA627
23       KNKA362
24       KNKA581
25       KNKA529
26       KNKA469
27       KNKA214
28       KNKA490
29       KNKA777
...          ...
2727  L000024958
2728  L000024959
2729  L000025328
2730  L000025499
2731  L000025500
2732     WRAQ634
2733  L000026538
2734  L000026539
2735  L000026540
2736  L000026570
2737  L000026613
2738  L000030940
2739  L000030941
2740  L000030943
2741  L000030944
2742  L000030945
2743  L000030946
2744  L000031407
2745  L000031408
2746     WRBT330
2747     WRBT612
2748     WRBZ793
2749  L000032268
2750  L000032269
2751  L000032767
2752  L000033073
2753  L0000330

### import errors

In [115]:
# import a single line as str
with open(file_path, 'rb') as open_file:
    content1 = open_file.readline().decode(errors='replace')
    open_file.close()

print(content1)
    
# convert single line to 
data = StringIO(content1)
df = pd.read_csv(data, sep="|", header=None, lineterminator="\n")
df.head()

HD|11493|NONE|NONE|KNNA286|C|CL|NONE|NONE|02/07/1997|NONE|NONE|NONE|NONE|NONE|NONE|NONE|NONE|NONE|NONE|NONE|Y|NONE|NONE|NONE|Y|NONE|NONE|N|Y|NONE|NONE|NONE|NONE|NONE|NONE|NONE|NONE|NONE|NONE|NONE|NONE|NONE|10/01/2003|NONE|NONE|NONE|NONE|NONE|



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,HD,11493,NONE,NONE,KNNA286,C,CL,NONE,NONE,02/07/1997,...,NONE,NONE,NONE,10/01/2003,NONE,NONE,NONE,NONE,NONE,


In [247]:
N=7300

with open(source_file) as myfile:
    head = [next(myfile) for x in range(N)]
    myfile.close()

N_lines = "".join(head)
print(N_lines)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb0 in position 7330: invalid start byte

In [None]:
def insert_none_cols(source_str):
    # replace empty columns with NONE values
    none_str = source_str.replace('||', '|NONE|').replace('||', '|NONE|')
    
    return(none_str)