In [4]:
import pandas as pd

# Load the crawl index from the local Parquet file into a DataFrame.
# engine='auto' lets pandas choose whichever Parquet backend is installed.
parquet_file = 'EOTNL.parquet'
df = pd.read_parquet(path=parquet_file, engine='auto')


In [33]:
# List every column name in the DataFrame, for reference in the cells below.
list(df.columns)

['url_surtkey',
 'url',
 'url_host_name',
 'url_host_tld',
 'url_host_2nd_last_part',
 'url_host_3rd_last_part',
 'url_host_4th_last_part',
 'url_host_5th_last_part',
 'url_host_registry_suffix',
 'url_host_registered_domain',
 'url_host_private_suffix',
 'url_host_private_domain',
 'url_host_name_reversed',
 'url_protocol',
 'url_port',
 'url_path',
 'url_query',
 'fetch_time',
 'fetch_status',
 'content_digest',
 'content_mime_type',
 'content_mime_detected',
 'content_charset',
 'content_languages',
 'content_puid',
 'warc_filename',
 'warc_record_offset',
 'warc_record_length',
 'warc_segment',
 'crawl',
 'subset']

In [10]:
# List URL SURT keys with their fetch times in chronological order
# (ascending sort: earliest captures first, latest captures last).
df[['url_surtkey', 'fetch_time']].sort_values(by='fetch_time')

Unnamed: 0,url_surtkey,fetch_time
797592,"gov,bnl)/robots.txt",2012-09-13 07:38:00
36794,"gov,ameslab)/robots.txt",2012-09-13 07:38:00
103211,"gov,anl)/robots.txt",2012-09-13 07:38:00
746360,"gov,bnl)/",2012-09-13 07:38:03
75751,"gov,anl)/",2012-09-13 07:38:40
...,...,...
2133206,"gov,fnal,www-ese)/eseproj/dart/dcdm/img/r_tabl...",2013-03-31 16:26:57
2133208,"gov,fnal,www-ese)/eseproj/dart/dcdm/img/select...",2013-03-31 16:26:58
2133210,"gov,fnal,www-ese)/eseproj/dart/dcdm/img/term1.gif",2013-03-31 17:27:04
2133406,"gov,fnal,www-ese)/eseproj/svx/ddr/..\..\..\..\...",2013-03-31 20:27:19


In [26]:
# Earliest fetch_time per domain (minimum over each url_host_private_domain group).
(
    df
    .groupby(by='url_host_private_domain')['fetch_time']
    .agg('min')
)

url_host_private_domain
ameslab.gov    2012-09-13 07:38:00
anl.gov        2012-09-13 07:38:00
bnl.gov        2012-09-13 07:38:00
doe.gov        2012-09-15 12:12:14
fnal.gov       2012-09-13 07:40:13
inl.gov        2012-09-13 07:43:40
jlab.org       2012-09-13 07:49:00
lanl.gov       2012-09-13 07:43:42
lbl.gov        2012-09-15 13:41:14
llnl.gov       2012-09-13 07:43:42
nrel.gov       2012-09-13 07:45:42
ornl.gov       2012-09-13 07:45:43
pnnl.gov       2012-09-13 07:46:11
pppl.gov       2012-10-26 22:19:32
sandia.gov     2012-09-13 07:46:33
stanford.edu   2012-10-02 06:26:44
Name: fetch_time, dtype: datetime64[ns]

In [56]:
# Latest fetch_time per domain (maximum over each url_host_private_domain group).
(
    df
    .groupby(by='url_host_private_domain')['fetch_time']
    .agg('max')
)

url_host_private_domain
ameslab.gov    2013-03-21 01:23:30
anl.gov        2013-03-25 06:56:19
bnl.gov        2013-03-21 20:12:30
doe.gov        2013-03-21 01:19:42
fnal.gov       2013-03-31 21:27:25
inl.gov        2013-03-21 02:11:20
jlab.org       2013-03-22 15:39:22
lanl.gov       2013-03-31 07:23:28
lbl.gov        2013-03-21 02:05:11
llnl.gov       2013-03-21 10:10:12
nrel.gov       2013-03-21 00:36:25
ornl.gov       2013-03-21 03:45:31
pnnl.gov       2013-03-21 00:37:49
pppl.gov       2013-03-20 23:03:56
sandia.gov     2013-03-21 01:23:38
stanford.edu   2013-03-20 23:07:29
Name: fetch_time, dtype: datetime64[ns]

In [57]:
# Count URLs for every (domain, fetch_status) combination.
# NOTE: this raises the pandas row display limit to 500 for the whole session,
# so cells executed after this one are affected as well.
pd.options.display.max_rows = 500
(
    df
    .groupby(by=['url_host_private_domain', 'fetch_status'])['fetch_status']
    .count()
)

url_host_private_domain  fetch_status
ameslab.gov              -1                 479
                          200             68126
                          300                 6
                          301               372
                          302              3784
                          400                 4
                          403               466
                          404              1852
                          500                60
anl.gov                  -1                8117
                          200            620147
                          300                 9
                          301              4867
                          302             11405
                          303               125
                          400               547
                          401              1107
                          403              2356
                          404             21387
                          405                21
  

In [60]:
# Count URLs for every (registered domain, detected MIME type) combination.
(
    df
    .groupby(by=['url_host_registered_domain', 'content_mime_detected'])['content_mime_detected']
    .count()
)

url_host_registered_domain  content_mime_detected   
ameslab.gov                 application/CDFV2            295
                            application/mac-binhex40       6
                            application/octet-stream      35
                            application/pdf             5356
                            application/postscript         2
                                                        ... 
stanford.edu                text/html                    156
                            text/plain                    47
                            text/troff                     1
                            text/x-c++                     1
                            text/xml                      23
Name: content_mime_detected, Length: 793, dtype: int64

In [65]:
# Count PDFs per registered domain, identified by the detected MIME type
# (content_mime_detected column). The filtered frame is kept as pdf_df.
pdf_df = df[df['content_mime_detected'].eq('application/pdf')]
(
    pdf_df
    .groupby(by=['url_host_registered_domain', 'content_mime_detected'])['content_mime_detected']
    .count()
)

url_host_registered_domain  content_mime_detected
ameslab.gov                 application/pdf           5356
anl.gov                     application/pdf          26092
bnl.gov                     application/pdf          21691
doe.gov                     application/pdf          11367
fnal.gov                    application/pdf          36146
inl.gov                     application/pdf          10845
jlab.org                    application/pdf           8277
lanl.gov                    application/pdf          20094
lbl.gov                     application/pdf              9
llnl.gov                    application/pdf           8952
nrel.gov                    application/pdf          10507
ornl.gov                    application/pdf          36128
pnnl.gov                    application/pdf           4935
sandia.gov                  application/pdf          11054
Name: content_mime_detected, dtype: int64