* [Link to download this document](https://bit.ly/48OowL5)
* [Code repo](https://bit.ly/3SNWFV4)


#### Crawling code

In [1]:
import advertools as adv
import adviz
import pandas as pd
import plotly.express as px
import networkx as nx
from IPython.display import display_markdown
pd.options.display.max_columns = None

def md(text):
    return display_markdown(f'{text}', raw=True)

def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val, val)



```python
adv.crawl(
    url_list='https://www.nasa.gov/',
    output_file='nasa_crawl.jl',
    follow_links=True,
    custom_settings={
        'CLOSESPIDER_PAGECOUNT': 1000,
        'LOG_FILE': 'nasa_crawl.log',
        'JOBDIR': 'nasa_crawl',
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': 6
    })

# convert from .jl to .parquet:
adv.crawlytics.jl_to_parquet('nasa_crawl.jl', 'nasa_crawl.parquet')
```

In [2]:
crawldf = pd.read_parquet('nasa_crawl.parquet')
crawldf.head(3)

Unnamed: 0,url,title,meta_desc,viewport,charset,h2,h3,canonical,alt_href,og:locale,og:type,og:title,og:description,og:url,og:site_name,og:updated_time,og:image,og:image:secure_url,og:image:width,og:image:height,og:image:alt,og:image:type,og:video,og:video_1,og:video_2,og:video_3,og:video_4,og:video_5,og:video_6,og:video_7,og:video_8,og:video_9,og:video_10,og:video_11,og:video_12,og:video_13,og:video_14,og:video_15,og:video_16,og:video_17,og:video_18,og:video_19,og:video_20,og:video_21,og:video_22,og:video_23,twitter:card,twitter:title,twitter:description,twitter:image,jsonld_@context,jsonld_@graph,body_text,size,download_timeout,download_slot,download_latency,depth,status,links_url,links_text,links_nofollow,nav_links_url,nav_links_text,nav_links_nofollow,header_links_url,header_links_text,header_links_nofollow,footer_links_url,footer_links_text,footer_links_nofollow,img_fetchpriority,img_width,img_height,img_alt,img_src,img_srcset,img_decoding,img_sizes,img_loading,ip_address,crawl_time,resp_headers_Content-Length,resp_headers_Server,resp_headers_Date,resp_headers_Content-Type,resp_headers_Host-Header,resp_headers_X-Launch-Status,resp_headers_Link,resp_headers_X-Rq,resp_headers_Cache-Control,resp_headers_Age,resp_headers_X-Cache,resp_headers_Vary,resp_headers_Accept-Ranges,resp_headers_Strict-Transport-Security,request_headers_Accept,request_headers_Accept-Language,request_headers_User-Agent,request_headers_Accept-Encoding,h1,h4,request_headers_Referer,twitter:label1,twitter:data1,twitter:label2,twitter:data2,redirect_times,redirect_ttl,redirect_urls,redirect_reasons,resp_headers_Last-Modified,resp_headers_Etag,resp_headers_Access-Control-Allow-Origin,resp_headers_Access-Control-Allow-Methods,resp_headers_Content-Disposition,twitter:image:alt,twitter:image:width,twitter:image:height,resp_headers_X-Powered-By,h5,jsonld_@type,jsonld_headline,jsonld_url,jsonld_thumbnailUrl,jsonld_articleSection,jsonld_author,jsonld_creator,jsonld_keywords,jsonld_dateCreated,jsonld_datePublished,jsonld_dateModified,jsonld_mainEntityOfPage.@type,jsonld_mainEntityOfPage.@id,jsonld_image.@type,jsonld_image.url,jsonld_publisher.@type,jsonld_publisher.name,jsonld_publisher.logo,resp_headers_X-Hacker,img_align,resp_headers_X-Content-Type-Options,resp_headers_X-Xss-Protection,resp_headers_Content-Security-Policy,resp_headers_Set-Cookie,resp_headers_Expires,resp_headers_X-Frame-Options,resp_headers_Server-Timing,resp_headers_Timing-Allow-Origin,request_headers_Cookie,resp_headers_X-Amz-Id-2,resp_headers_X-Amz-Request-Id,resp_headers_X-Amz-Replication-Status,resp_headers_X-Amz-Server-Side-Encryption,resp_headers_X-Amz-Meta-Cb-Modifiedtime,resp_headers_X-Amz-Version-Id,resp_headers_X-Ua-Compatible,resp_headers_Referrer-Policy,resp_headers_Via,resp_headers_X-Amz-Cf-Pop,resp_headers_X-Amz-Cf-Id,h6,img_border,img_hspace,img_vspace,resp_headers_Content-Language,resp_headers_Cf-Cache-Status,resp_headers_Cf-Ray,img_ismap,img_usemap
0,https://www.nasa.gov/,NASA,"NASA.gov brings you the latest news, images and videos from America's space agency, pioneering the future in space e...","width=device-width, initial-scale=1",UTF-8,Suggested Searches@@Martians Wanted@@Featured News@@NASA’s SpaceX Crew-8@@Image Of The Day@@Image Of The Day@@Our Ch...,"News & Events@@Multimedia@@Featured@@Teams Add Iconic NASA ‘Worm’ Logo to Artemis II Rocket, Spacecraft@@Flame Burns...",https://www.nasa.gov/,https://www.nasa.gov/feed/@@https://www.nasa.gov/wp-json/wp/v2/pages/128943@@https://www.nasa.gov/wp-json/oembed/1.0...,en_US,website,NASA,"NASA.gov brings you the latest news, images and videos from America's space agency, pioneering the future in space e...",https://www.nasa.gov/,NASA,2024-02-17T01:14:00-05:00,https://www.nasa.gov/wp-content/uploads/2018/07/174116main_2006_01777_highres.jpg,https://www.nasa.gov/wp-content/uploads/2018/07/174116main_2006_01777_highres.jpg,640.0,512.0,NASA Meatball paint refresh,image/jpeg,https://www.youtube.com/embed/21X5lGlDOfg,https://www.youtube.com/embed/NpHFB_DYXhY,https://www.youtube.com/embed/_LJHRpDvPCw,https://www.youtube.com/embed/bTQjiMtpMG0,https://www.nasa.gov/wp-content/uploads/2023/11/final-nasa-15-sec-horizontal-16-9.mp4,https://www.youtube.com/embed/1fOWosS_f1Y,https://www.youtube.com/embed/31b1yjUBlO0,https://www.youtube.com/embed/MTyzq4ey9RE,https://www.youtube.com/embed/OffTxAiAQfM,https://www.youtube.com/embed/ZbBx4sW68uw,https://www.youtube.com/embed/vUYcQ_ehArw,https://www.youtube.com/embed/YQWespzOtzI,https://www.youtube.com/embed/CRZYw9fEBe4,https://www.youtube.com/embed/IGuHErKAiHs,https://www.youtube.com/embed/R-TOoGTvFL8,https://www.youtube.com/embed/p566jU9pylY,https://www.youtube.com/embed/_tdsia6EZY8,https://www.youtube.com/embed/hW5akI5Rnyg,https://www.youtube.com/embed/WQR_iNjEjlw,https://www.youtube.com/embed/iDAKTLmt2hs,https://www.youtube.com/embed/VwVL0UBVVLA,https://www.youtube.com/embed/Ha4mXufQp6c,https://www.youtube.com/embed/sgp_2OBxKeM,https://www.youtube.com/embed/UyXS2tYggiE,summary_large_image,NASA,"NASA.gov brings you the latest news, images and videos from America's space agency, pioneering the future in space e...",https://www.nasa.gov/wp-content/uploads/2018/07/174116main_2006_01777_highres.jpg,https://schema.org,"[{'@id': 'https://www.nasa.gov/#organization', '@type': 'Organization', 'about': None, 'articleSection': None, 'auth...",\n\t\t\t\t\t \n\t\t\t\t\t\t Explore \n\t\t\t\t\t\t \n\t\t\t\t\t \n\t\t\t\t \n\t\t\t\t\t \n\t\t\t\t\t\t N...,299218,180,www.nasa.gov,0.064869,0,200,https://www.nasa.gov/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https...,\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https://plus.nasa.gov/series/@@https://www.nasa.gov...,\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\t\t\t\t\t\t\tAll NASA News\n\t\t\t\t\t\t...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,https://www.nasa.gov/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https...,\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,https://www.nasa.gov/about/@@https://www.nasa.gov/get-involved/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@...,About NASA's Mission@@\n\t\t\t\t\t\t\tJoin Us\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t@@Home@@News & Events@@Multimedia@@NASA+@...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,high@@high@@high@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@high@@@@@@@@@@@@@@@@@@@@@@@@...,60@@60@@640@@640@@640@@@@640@@640@@640@@640@@640@@640@@@@@@@@@@@@@@@@640@@@@@@@@1920@@1920@@640@@640@@@@@@640@@640@@...,50.58@@50.58@@960@@519@@360@@@@427@@960@@427@@960@@519@@427@@@@@@@@@@@@@@@@360@@@@@@@@1280@@1627@@360@@640@@@@@@481@...,NASA Logo@@NASA Logo@@@@A sample of fabric burns inside Spacecraft Fire Experiment-IV (Saffire-IV). The sample is a ...,https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo@2x.png@@https://www.nasa.gov/wp-content/themes/n...,https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo.svg@@https://www.nasa.gov/wp-content/themes/nasa...,@@@@async@@async@@async@@@@async@@async@@async@@async@@async@@async@@@@@@@@@@@@@@@@async@@@@@@@@async@@async@@async@...,"@@@@(max-width: 640px) 100vw, 640px@@(max-width: 640px) 100vw, 640px@@(max-width: 640px) 100vw, 640px@@@@(max-width:...",@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@eager@@lazy@@@@@@@@lazy@@lazy@@lazy@@laz...,192.0.66.108,2024-02-19 08:46:43,31913.0,nginx,"Mon, 19 Feb 2024 08:46:43 GMT",text/html; charset=UTF-8,a9130478a60e5f9135f765b23f26593b,Go Flight!,"<https://www.nasa.gov/wp-json/>; rel=""https://api.w.org/"",<https://www.nasa.gov/wp-json/wp/v2/pages/128943>; rel=""al...",hhn1 85 187 443,"max-age=300, must-revalidate",666.0,hit,Accept-Encoding,bytes,max-age=31536000,"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",en,advertools/0.14.0,"gzip, deflate, br",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,https://www.nasa.gov/?search=SpaceX%20Crew-2,"3522 Search Results for ""SpaceX Crew-2""",,"width=device-width, initial-scale=1",UTF-8,Suggested Searches@@\n\t\t\t\t3522 results found\t\t\t\t\t,"News & Events@@Multimedia@@Featured@@Teams Add Iconic NASA ‘Worm’ Logo to Artemis II Rocket, Spacecraft@@Flame Burns...",,https://www.nasa.gov/feed/,en_US,website,,,,NASA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,summary_large_image,,,,https://schema.org,"[{'@id': 'https://www.nasa.gov/#organization', '@type': 'Organization', 'about': None, 'articleSection': None, 'auth...",\n\t\t\t\t\t \n\t\t\t\t\t\t Explore \n\t\t\t\t\t\t \n\t\t\t\t\t \n\t\t\t\t \n\t\t\t\t\t \n\t\t\t\t\t\t N...,234115,180,www.nasa.gov,1.301717,1,200,https://www.nasa.gov/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https...,\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https://plus.nasa.gov/series/@@https://www.nasa.gov...,\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\t\t\t\t\t\t\tAll NASA News\n\t\t\t\t\t\t...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,https://www.nasa.gov/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https...,\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,https://www.nasa.gov/about/@@https://www.nasa.gov/get-involved/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@...,About NASA's Mission@@\n\t\t\t\t\t\t\tJoin Us\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t@@Home@@News & Events@@Multimedia@@NASA+@...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,high@@high@@high@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...,60@@60@@640@@640@@640@@@@640@@640@@640@@640@@640@@640@@@@@@@@@@@@@@@@640@@@@@@@@1920@@1920@@640@@640@@@@@@640@@640@@...,50.58@@50.58@@960@@519@@360@@@@427@@960@@427@@960@@519@@427@@@@@@@@@@@@@@@@360@@@@@@@@1280@@1627@@360@@640@@@@@@481@...,NASA Logo@@NASA Logo@@@@A sample of fabric burns inside Spacecraft Fire Experiment-IV (Saffire-IV). The sample is a ...,https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo@2x.png@@https://www.nasa.gov/wp-content/themes/n...,https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo.svg@@https://www.nasa.gov/wp-content/themes/nasa...,@@@@async@@async@@async@@@@async@@async@@async@@async@@async@@async@@@@@@@@@@@@@@@@async@@@@@@@@async@@async@@async@...,"@@@@(max-width: 640px) 100vw, 640px@@(max-width: 640px) 100vw, 640px@@(max-width: 640px) 100vw, 640px@@@@(max-width:...",,192.0.66.108,2024-02-19 08:46:47,,nginx,"Mon, 19 Feb 2024 08:46:47 GMT",text/html; charset=UTF-8,a9130478a60e5f9135f765b23f26593b,Go Flight!,"<https://www.nasa.gov/wp-json/>; rel=""https://api.w.org/""",hhn1 85 187 443,"max-age=300, must-revalidate",0.0,miss,Accept-Encoding,bytes,max-age=31536000,"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",en,advertools/0.14.0,"gzip, deflate, br",\n\t\t\t\t\tSearch Results for: SpaceX Crew-2\t\t\t\t,The SpaceX Freedom Dragon crew ship with the Axiom Mission-2 crew - NASA@@NASA's SpaceX Crew-7@@The SpaceX Freedom D...,https://www.nasa.gov/,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,https://www.nasa.gov/?search=International%20Space%20Station,"28773 Search Results for ""International Space Station""",,"width=device-width, initial-scale=1",UTF-8,Suggested Searches@@\n\t\t\t\t28773 results found\t\t\t\t\t,"News & Events@@Multimedia@@Featured@@Teams Add Iconic NASA ‘Worm’ Logo to Artemis II Rocket, Spacecraft@@Flame Burns...",,https://www.nasa.gov/feed/,en_US,website,,,,NASA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,summary_large_image,,,,https://schema.org,"[{'@id': 'https://www.nasa.gov/#organization', '@type': 'Organization', 'about': None, 'articleSection': None, 'auth...",\n\t\t\t\t\t \n\t\t\t\t\t\t Explore \n\t\t\t\t\t\t \n\t\t\t\t\t \n\t\t\t\t \n\t\t\t\t\t \n\t\t\t\t\t\t N...,233543,180,www.nasa.gov,2.509646,1,200,https://www.nasa.gov/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https...,\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https://plus.nasa.gov/series/@@https://www.nasa.gov...,\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\t\t\t\t\t\t\tAll NASA News\n\t\t\t\t\t\t...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,https://www.nasa.gov/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https...,\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,https://www.nasa.gov/about/@@https://www.nasa.gov/get-involved/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@...,About NASA's Mission@@\n\t\t\t\t\t\t\tJoin Us\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t@@Home@@News & Events@@Multimedia@@NASA+@...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,high@@high@@high@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...,60@@60@@640@@640@@640@@@@640@@640@@640@@640@@640@@640@@@@@@@@@@@@@@@@640@@@@@@@@1920@@1920@@640@@640@@@@@@640@@640@@...,50.58@@50.58@@960@@519@@360@@@@427@@960@@427@@960@@519@@427@@@@@@@@@@@@@@@@360@@@@@@@@1280@@1627@@360@@640@@@@@@481@...,NASA Logo@@NASA Logo@@@@A sample of fabric burns inside Spacecraft Fire Experiment-IV (Saffire-IV). The sample is a ...,https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo@2x.png@@https://www.nasa.gov/wp-content/themes/n...,https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo.svg@@https://www.nasa.gov/wp-content/themes/nasa...,@@@@async@@async@@async@@@@async@@async@@async@@async@@async@@async@@@@@@@@@@@@@@@@async@@@@@@@@async@@async@@async@...,"@@@@(max-width: 640px) 100vw, 640px@@(max-width: 640px) 100vw, 640px@@(max-width: 640px) 100vw, 640px@@@@(max-width:...",,192.0.66.108,2024-02-19 08:46:47,,nginx,"Mon, 19 Feb 2024 08:46:47 GMT",text/html; charset=UTF-8,a9130478a60e5f9135f765b23f26593b,Go Flight!,"<https://www.nasa.gov/wp-json/>; rel=""https://api.w.org/""",hhn1 85 188 443,"max-age=300, must-revalidate",0.0,miss,Accept-Encoding,bytes,max-age=31536000,"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",en,advertools/0.14.0,"gzip, deflate, br",\n\t\t\t\t\tSearch Results for: International Space Station\t\t\t\t,International Space Station - NASA@@20 Years of Observing Earth from the International Space Station - NASA@@SpaceX ...,https://www.nasa.gov/,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [3]:
md(f'''
### Rows: {crawldf.shape[0]:,}
### Columns: {crawldf.shape[1]}
--- ''')


### Rows: 9,942
### Columns: 170
--- 

## Link summary table
Maps links on all crawled URLs together with anchor text, rel nofollow attribute, and whether or not it's an internal link.

In [4]:
link_df = adv.crawlytics.links(crawldf, internal_url_regex=r'nasa\.gov')
link_df

Unnamed: 0,url,link,text,nofollow,internal
0,https://www.nasa.gov/,https://www.nasa.gov/,\n\t\t\t\t\n\t\t\t,False,True
0,https://www.nasa.gov/,https://www.nasa.gov/,\n\t\t\t\t\n\t\t\t,False,True
0,https://www.nasa.gov/,https://www.nasa.gov/news/,\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t,False,True
0,https://www.nasa.gov/,https://www.nasa.gov/news/all-news/,\n\t\t\t\t\t\t\t\t\t\tAll NASA News\n\t\t\t\t\t\t\t\t\t,False,True
0,https://www.nasa.gov/,https://plus.nasa.gov/series/,\n\t\t\t\t\t\t\t\t\t\tVideo Series on NASA+\n\t\t\t\t\t\t\t\t\t,False,True
...,...,...,...,...,...
9941,https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/,http://oig.nasa.gov/,Office of the IG,False,True
9941,https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/,https://www.nasa.gov/budgets-plans-and-reports/,Budget & Annual Reports,False,True
9941,https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/,https://www.nasa.gov/organizations/budget-annual-reports/agency-financial-reports/,Agency Financial Reports,False,True
9941,https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/,https://www.nasa.gov/contact-nasa/,Contact NASA,False,True


## External URLs split into their components

In [5]:
external = link_df[~link_df['internal']]
external_urldf = adv.url_to_df(external['link'].fillna('').drop_duplicates())
external_urldf

Unnamed: 0,url,scheme,netloc,path,query,fragment,dir_1,dir_2,dir_3,dir_4,dir_5,dir_6,dir_7,dir_8,dir_9,dir_10,dir_11,dir_12,dir_13,last_dir,query_v,query_id,query_list,query_solId,query_mt,query_index,query_path,query_method,query_t,query_feature,query_hl,query_aff,query_keywords,query_sort,query_is_active,query_page,query_app,query_lang,query_in,query_pp,query_ls,query_utm_campaign,query_utm_medium,query_p,query_utm_source,query_f,query_type,query_si,query_date_filter_index,query_opp_response_date_filter_model,query_inactive_filter_values,query_d,query_organization_id,query_amp;si,query_s,query_mc_cid,query_sit,query_m,query_llr,query_ref_src,query_ab_channel,query_mc_eid,query_goal,query_utm_term,query_context,query_user,query_ved,query_sa,query_u,query_rtc,query_gl,query_ign-mpt,query_as_sdtp,query_as_sdtAAP,query_as_occt,query_as_sauthors,query_as_q,query_btnG,query_sciodt,query_as_sdt,query_cites,query_via,query_dbname,query_docid,query_sh,query_URI,query_c,query_view,query_news,query_doi,query_mobileUi,query_r,query_cluster,query_time_continue,query_variant,query_itemsPerPage,query_k,query_screen_name,query_access,query_source,query_linkId,query_key,query_redirect_after_login,query_articleID,query_qt-staff_profile_science_products,query_facultyid,query_arnumber,query_oi,query_tab_body,query_idno,query_node,query_rgn,query_SID,query_div,query_observatory,query_tpl,query_mode,query_tab,query_size,query_category,query_token,query_collectionCode,query_Sect1,query_Sect2,query_SSO,query_Itemid,query_l,query__cview,query_co1,query_s1,query_fileID,query_OS,query_RS,query_articleid,query_journalid,query_org,query_cws,query_clientId,query_option,query_edition,query_IssueID,query_ident_number,query_commid,query_EventID,query_jwsource,query_PubType,query_year,query_dod-date,query_explnum_id,query_TopID,query_SubID,query_q,query_proj,query_sub,query_y,query_ListGuid,query_AudienceTarget,query_req,query_num,query_historical,query_st,query_,query_sb,query_ps,query_newsId,hostname,port,query_query,query_record_id,query_read,query_Ident,query_term,query_mc,query_submit,query_saved,query_link,query_show,query_acontext,query_cfr[date],query_oq,query_cfr[reference],query_CFRPart,query_sharing_token,query_code,query_url,query_verb,query_metadataPrefix,query_identifier,query_toc,query_sid,query_fobjectid,query_last_nm,query_first_nm,query_ref,query_vm,query_by_subject,query_by_type,query_tabs
0,https://www.facebook.com/NASA,https,www.facebook.com,/NASA,,,NASA,,,,,,,,,,,,,NASA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,https://www.instagram.com/nasa/,https,www.instagram.com,/nasa/,,,nasa,,,,,,,,,,,,,nasa,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,https://x.com/NASA,https,x.com,/NASA,,,NASA,,,,,,,,,,,,,NASA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,https://www.youtube.com/@NASA,https,www.youtube.com,/@NASA,,,@NASA,,,,,,,,,,,,,@NASA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,https://touchpoints.app.cloud.gov/touchpoints/5d5606f0/submit,https,touchpoints.app.cloud.gov,/touchpoints/5d5606f0/submit,,,touchpoints,5d5606f0,submit,,,,,,,,,,,submit,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3918,https://www.flickr.com/photos/nasafo/albums,https,www.flickr.com,/photos/nasafo/albums,,,photos,nasafo,albums,,,,,,,,,,,albums,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3919,https://www.faa.gov/news/fact_sheets/news_story.cfm?newsId=20074,https,www.faa.gov,/news/fact_sheets/news_story.cfm,newsId=20074,,news,fact_sheets,news_story.cfm,,,,,,,,,,,news_story.cfm,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,20074,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3920,http://adsabs.harvard.edu/abs/2015AGUFMSA34A..04I,http,adsabs.harvard.edu,/abs/2015AGUFMSA34A..04I,,,abs,2015AGUFMSA34A..04I,,,,,,,,,,,,2015AGUFMSA34A..04I,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3921,http://onlinelibrary.wiley.com/doi/10.1002/2016JA022363/full,http,onlinelibrary.wiley.com,/doi/10.1002/2016JA022363/full,,,doi,10.1002,2016JA022363,full,,,,,,,,,,full,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Most linked-to domains

In [6]:
adviz.value_counts_plus(
    external_urldf['netloc'],
    show_top=20,
    size=14,
    name='External link domains',
    background_gradient='Blues'
).set_caption('')

Unnamed: 0,External link domains,count,cum. count,%,cum. %
1,www.youtube.com,258,258,6.6%,6.6%
2,doi.org,133,391,3.4%,10.0%
3,twitter.com,118,509,3.0%,13.0%
4,sam.gov,89,598,2.3%,15.2%
5,www.flickr.com,83,681,2.1%,17.4%
6,youtu.be,81,762,2.1%,19.4%
7,www.facebook.com,77,839,2.0%,21.4%
8,www.nature.com,64,903,1.6%,23.0%
9,dx.doi.org,61,964,1.6%,24.6%
10,www.esa.int,54,1018,1.4%,25.9%


### Crawling headers code (running `HEAD` requests to external links)

```python
adv.crawl_headers(
    external['link'].drop_duplicates().dropna(),
    'nasa_external_link_headers.jl',
    custom_settings={
        'AUTOTHROTTLE_ENABLED': False,
        'LOG_FILE': 'nasa_external_link_headers.log',
    })
headers = pd.read_json('nasa_external_link_headers.jl', lines=True)
```

## External link status code counts

In [10]:
headers = pd.read_json('nasa_external_link_headers.jl', lines=True)
adviz.value_counts_plus(
    headers['status'].dropna().astype(int).astype(str),
    name='External link status codes<br><u>nasa.gov</u>',
    show_top=20,
    size=14,
    background_gradient='Blues').set_caption('')

Unnamed: 0,External link status codes nasa.gov,count,cum. count,%,cum. %
1,200,2200,2200,80.6%,80.6%
2,403,336,2536,12.3%,92.9%
3,404,149,2685,5.5%,98.3%
4,405,10,2695,0.4%,98.7%
5,429,10,2705,0.4%,99.0%
6,400,9,2714,0.3%,99.4%
7,500,8,2722,0.3%,99.7%
8,503,4,2726,0.1%,99.8%
9,410,1,2727,0.0%,99.9%
10,522,1,2728,0.0%,99.9%


### External links with status code != 200

In [11]:
external_links_errors = (headers
                         [headers['status'].ne(200)]
                         .dropna(subset=['status'])
                         [['url', 'status']]
                        .rename(columns={'url': 'link'}))
external_links_errors['status'] = external_links_errors['status'].astype(int)
external_links_errors.head()

Unnamed: 0,link,status
7,https://apps.apple.com/us/app/nasa-technology-innovation/id626341580?mt=,404
9,https://play.google.com/store/apps/details?id=gov.nasa.gsfc.iswa.NASASpaceWeather,404
21,https://apps.apple.com/us/app/rescue-406/id694112931,404
22,https://apps.apple.com/us/app/nasa-space-weather-media-viewer/id398687618,404
23,https://apps.apple.com/us/app/satellite-insight/id463588902,404


## Locating the broken external links (sample of 25 links)
* url: where the broken link is
* link: the broken link
* text: its anchor text

In [12]:
external_broken_links = pd.merge(link_df, external_links_errors, left_on='link', right_on='link')
external_broken_links.drop('internal', axis=1).head(25)

Unnamed: 0,url,link,text,nofollow,status
0,https://www.nasa.gov/apps/,https://play.google.com/store/apps/details?id=gov.nasa.gsfc.iswa.NASASpaceWeather,Android,False,404
1,https://www.nasa.gov/directorates/stmd/just-updated-nasas-technology-innovation/,https://nasa.us5.list-manage.com/subscribe?u=b64e374f623dc7bf83fa94fc1&id=b1fe681edb,Click HERE to sign up to receive updates to the digital publication.,False,404
2,https://www.nasa.gov/centers-and-facilities/ames/nasa-study-reveals-compounding-climate-risks-at-two-degrees-of-warm...,https://agupubs.onlinelibrary.wiley.com/doi/10.1029/2022EF003330,study,False,403
3,https://www.nasa.gov/earth-and-climate/the-climate-events-of-2020-show-how-excess-heat-is-expressed-on-earth/,https://nsidc.org/greenland-today/2020/11/greenlands-2020-melt-season-in-review/,23.1 million square kilometers of Greenland’s ice sheet,False,404
4,https://www.nasa.gov/niac-symposium/,https://livestream.com/viewnow/niac2023,2023 Symposium,False,404
5,https://www.nasa.gov/niac-symposium/,https://livestream.com/viewnow/niac2022,2022 Symposium,False,404
6,https://www.nasa.gov/niac-symposium/,https://livestream.com/viewnow/niac2021,2021 Symposium,False,404
7,https://www.nasa.gov/niac-symposium/,https://livestream.com/viewnow/niac2020,2020 Symposium,False,404
8,https://www.nasa.gov/news-release/2020-tied-for-warmest-year-on-record-nasa-analysis-shows/,https://www.noaa.gov/news/2020-was-earth-s-2nd-hottest-year-just-behind-2016,"separate, independent analysis",False,403
9,https://www.nasa.gov/news-release/2020-tied-for-warmest-year-on-record-nasa-analysis-shows/,https://www.noaa.gov/,National Oceanic and Atmospheric Administration,False,403
