# Processing the EDGAR Data Set

Given a year, get data for the first day of the month for every month in the year. Example URL: http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2003/Qtr1/log20030101.zip

### Generate URLs

In [1]:
year = 2005 # environmental variable

In [2]:
url_base = 'http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/'

In [3]:
url_mid = '/log' + str(year)

In [4]:
url_final = '01.zip'

In [5]:
urls = []
# use range to get all 12 months
for x in range(1,13):
    if (x < 4):
        urls.append(url_base + str(year) + '/Qtr1' + url_mid + '0' + str(x) + url_final)
    elif (x >=4 and x < 7):
        urls.append(url_base + str(year) + '/Qtr2' + url_mid + '0' + str(x) + url_final)
    elif (x >= 7 and x < 10):
        urls.append(url_base + str(year) + '/Qtr3' + url_mid + '0' + str(x) + url_final)
    else:
        urls.append(url_base + str(year) + '/Qtr4' + url_mid + str(x) + url_final)
        
print(urls)

['http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2005/Qtr1/log20050101.zip', 'http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2005/Qtr1/log20050201.zip', 'http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2005/Qtr1/log20050301.zip', 'http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2005/Qtr2/log20050401.zip', 'http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2005/Qtr2/log20050501.zip', 'http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2005/Qtr2/log20050601.zip', 'http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2005/Qtr3/log20050701.zip', 'http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2005/Qtr3/log20050801.zip', 'http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2005/Qtr3/log20050901.zip', 'http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2005/Qtr4/log20051001.zip', 'http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2005/Qtr4/log20051101.zip', 'http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2005/Qt

The log files are large, so we will read them from the zip files instead of extracting them

In [6]:
import zipfile
import requests
from io import StringIO,BytesIO
import pandas as pd
from pandas import *
import re
import numpy
import scipy
from scipy import stats

In [7]:
column_names = ['ip','date','time','zone','cik','accession','doc/extention','code','filesize','idx','norefer','noagent','find','crawler','browser']
full_logs = pd.DataFrame(columns=column_names)

Default Values:
* ip: unknown
* date: derive from file date
* time: unknown
* zone: 0
* cik: unknown
* accession: unknown
* doc: unknown
* code: unknown
* size: mean
* idx: 0
* noreferer: 0
* noagent: 0
* find: 0
* crawler: 0
* browser: unknown

Numeric defaults are based on the README that comes with each log

In [8]:
dataframe_collection = []
for url in urls:
    #print url
    r = requests.get(url, stream=True)
    z = zipfile.ZipFile(BytesIO(r.content))
    st = pd.read_csv(z.open(url[-15:-3]+'csv'))
    
    # Handle Missing Data
    
    # Replace missing numeric values and categorical that have default 0
    st.fillna({'zone': 0, 'idx': 0,'noreferer': 0,'noagent': 0,'find': 0,'crawler': 0}, inplace=True)
    # Replace missing categorical values
    st.fillna({'ip':'unknown', 'time':'unknown','cik':'unknown','accession':'unknown','code':'unknown','browser':'unknown'}, inplace=True)
    st.fillna({'doc':'unknown', 'extention':'unknown'}, inplace=True)
    # Replace missing dates from file name
    date_formatted = url[-12:-8]+'-'+url[-8:-6] + '-' + url[-6:-4]
    st.fillna({'date': date_formatted}, inplace=True)
    # Replace missing file sizes with mean
    st.fillna({'size': scipy.mean(st['size'])}, inplace=True)
    dataframe_collection.append(st)

In [16]:
nullvaluesindf=[]
alldfs=[]
for i in range(0,len(dataframe_collection)):
    nullvaluesindf.append(pd.isnull(dataframe_collection[i]).sum()>0)
    alldfs.append(nullvaluesindf)

In [18]:
dataframe_collection[0].describe()

Unnamed: 0,zone,cik,code,size,idx,norefer,noagent,find,crawler
count,168657.0,168657.0,168657.0,168657.0,168657.0,168657.0,168657.0,168657.0,168657.0
mean,500.0,855770.9,219.970639,175819.3,0.464143,0.849725,0.069004,0.605946,0.040485
std,0.0,372546.6,41.105357,533327.9,0.498714,0.357342,0.253462,2.06583,0.197093
min,500.0,20.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,500.0,775181.0,200.0,2648.0,0.0,1.0,0.0,0.0,0.0
50%,500.0,929351.0,200.0,4673.0,0.0,1.0,0.0,0.0,0.0
75%,500.0,1104730.0,200.0,175819.3,1.0,1.0,0.0,0.0,0.0
max,500.0,1313239.0,404.0,38103650.0,1.0,1.0,1.0,10.0,1.0


In [19]:
dataframe_collection[0]['accession'].value_counts()

0001193125-04-221736    727
0001125282-04-006596    726
0000891618-04-001402    726
0001193125-04-221735    726
0001193125-04-221734    724
0000882377-04-002779    724
0001071787-04-001609    723
0001071787-04-001608    722
0000950144-04-012484    722
0000922423-04-002111    722
0001071787-04-001610    722
0000909334-04-000362    721
0000882377-04-002780    721
0001117768-03-000002    322
0001214305-04-000013    273
0001312289-04-000002    213
0001254639-04-000012    211
0001209191-04-059962    197
0000909450-04-000102    192
0001047469-02-004136    162
0001047469-03-005185    153
0000950127-04-001171    128
0001047469-04-017079    124
0001047469-02-008768    120
0000945234-04-000784    117
0001047469-04-014104    114
0001047469-03-005454    113
0001047469-03-002575    111
0001169232-04-006220    110
0001047469-03-013785    110
                       ... 
0000950109-00-001367      1
0000950129-02-004400      1
0000906648-03-000080      1
0000950112-99-000719      1
0000891804-03-000918

For each log file:
* Handle missing data xx
* Compute summary metrics (Decide which ones)
* Check for any observable anomalies
* Your program should log all the operations (with time stamps) into a log file.
* Compile all the data and summaries of the 12 files into one file

In [75]:
for i in range(0,len(dataframe_collection)):
    df=dataframe_collection[i]
    full_logs=pd.concat([full_logs,df])

In [76]:
full_logs

Unnamed: 0,accession,browser,cik,code,crawler,date,doc/extention,extention,filesize,find,idx,ip,noagent,norefer,size,time,zone
0,0000879123-04-000076,mie,933239.0,200.0,0.0,2005-01-01,,-index.htm,,0.0,1.0,68.70.131.gjc,0.0,1.0,2963.0,00:00:00,500.0
1,0001209191-04-059913,mie,932696.0,200.0,0.0,2005-01-01,,-index.htm,,0.0,1.0,68.70.131.gjc,0.0,1.0,2579.0,00:00:00,500.0
2,0000928022-04-000071,mie,928022.0,200.0,0.0,2005-01-01,,may327.xml,,0.0,0.0,68.70.131.gjc,0.0,1.0,12615.0,00:00:00,500.0
3,0001179110-04-024293,mie,924901.0,200.0,0.0,2005-01-01,,edgar.xml,,0.0,0.0,68.70.131.gjc,0.0,1.0,10032.0,00:00:00,500.0
4,0001276466-04-000005,mie,929351.0,200.0,0.0,2005-01-01,,primary_doc.xml,,0.0,0.0,68.70.131.gjc,0.0,1.0,8307.0,00:00:00,500.0
5,0000909654-04-003140,mie,930305.0,200.0,0.0,2005-01-01,,.txt,,0.0,0.0,68.70.131.gjc,0.0,1.0,5558.0,00:00:00,500.0
6,0000928022-04-000070,mie,928022.0,200.0,0.0,2005-01-01,,cal326.xml,,0.0,0.0,68.70.131.gjc,0.0,1.0,13899.0,00:00:00,500.0
7,0001276466-04-000005,mie,929351.0,200.0,0.0,2005-01-01,,-index.htm,,0.0,1.0,68.70.131.gjc,0.0,1.0,3074.0,00:00:00,500.0
8,0000928022-04-000073,mie,928022.0,200.0,0.0,2005-01-01,,-index.htm,,0.0,1.0,68.70.131.gjc,0.0,1.0,2724.0,00:00:00,500.0
9,0001254639-04-000012,mie,929351.0,200.0,0.0,2005-01-01,,.txt,,0.0,0.0,68.70.131.gjc,0.0,1.0,14260.0,00:00:00,500.0
