#### Crawling code

In [1]:
import advertools as adv
import adviz
import pandas as pd
import plotly.express as px
import networkx as nx
from IPython.display import display_markdown
pd.options.display.max_columns = None

def md(text):
    """Display ``text`` as Markdown in the notebook output.

    The value is first formatted to a string, then handed to
    ``display_markdown`` with ``raw=True``. Whatever that call returns
    is returned to the caller.
    """
    markdown_source = f'{text}'
    return display_markdown(markdown_source, raw=True)

def make_clickable(val):
    """Return an HTML anchor element that links to ``val`` and shows ``val`` as its text.

    The value is converted to a string and HTML-escaped before being placed in
    both the ``href`` attribute and the link text. Crawled URLs routinely
    contain ``&`` (query strings) and may contain quotes or angle brackets;
    without escaping these produce malformed markup or allow markup injection.

    Parameters
    ----------
    val : object
        The URL (or any value) to turn into a link.

    Returns
    -------
    str
        ``<a href="...">...</a>`` with ``val`` escaped in both positions.
    """
    import html

    # quote=True also escapes double quotes, which would otherwise end the href attribute early.
    escaped = html.escape(str(val), quote=True)
    return f'<a href="{escaped}">{escaped}</a>'

```python
# Crawl starting from the NASA home page, with link following enabled, and
# write the results to a JSON-lines file.
# NOTE(review): the page-count setting below is 1000, yet the DataFrame preview
# further down reports 9,942 rows -- the crawl that produced that file may have
# used a different limit; confirm.
adv.crawl(
    url_list='https://www.nasa.gov/',
    output_file='nasa_crawl.jl',
    follow_links=True,
    # Same settings mapping as a dict literal, written in keyword form.
    custom_settings=dict(
        CLOSESPIDER_PAGECOUNT=1000,
        LOG_FILE='nasa_crawl.log',
        JOBDIR='nasa_crawl',
        AUTOTHROTTLE_ENABLED=True,
        AUTOTHROTTLE_TARGET_CONCURRENCY=6,
    ),
)

# Convert the JSON-lines crawl output to Parquet.
adv.crawlytics.jl_to_parquet('nasa_crawl.jl', 'nasa_crawl.parquet')
```

In [2]:
# Load the crawl data from the Parquet file and preview its first three rows.
crawldf = pd.read_parquet(path='nasa_crawl.parquet')
crawldf.head(n=3)

Unnamed: 0,url,title,meta_desc,viewport,charset,h2,h3,canonical,alt_href,og:locale,og:type,og:title,og:description,og:url,og:site_name,og:updated_time,og:image,og:image:secure_url,og:image:width,og:image:height,og:image:alt,og:image:type,og:video,og:video_1,og:video_2,og:video_3,og:video_4,og:video_5,og:video_6,og:video_7,og:video_8,og:video_9,og:video_10,og:video_11,og:video_12,og:video_13,og:video_14,og:video_15,og:video_16,og:video_17,og:video_18,og:video_19,og:video_20,og:video_21,og:video_22,og:video_23,twitter:card,twitter:title,twitter:description,twitter:image,jsonld_@context,jsonld_@graph,body_text,size,download_timeout,download_slot,download_latency,depth,status,links_url,links_text,links_nofollow,nav_links_url,nav_links_text,nav_links_nofollow,header_links_url,header_links_text,header_links_nofollow,footer_links_url,footer_links_text,footer_links_nofollow,img_fetchpriority,img_width,img_height,img_alt,img_src,img_srcset,img_decoding,img_sizes,img_loading,ip_address,crawl_time,resp_headers_Content-Length,resp_headers_Server,resp_headers_Date,resp_headers_Content-Type,resp_headers_Host-Header,resp_headers_X-Launch-Status,resp_headers_Link,resp_headers_X-Rq,resp_headers_Cache-Control,resp_headers_Age,resp_headers_X-Cache,resp_headers_Vary,resp_headers_Accept-Ranges,resp_headers_Strict-Transport-Security,request_headers_Accept,request_headers_Accept-Language,request_headers_User-Agent,request_headers_Accept-Encoding,h1,h4,request_headers_Referer,twitter:label1,twitter:data1,twitter:label2,twitter:data2,redirect_times,redirect_ttl,redirect_urls,redirect_reasons,resp_headers_Last-Modified,resp_headers_Etag,resp_headers_Access-Control-Allow-Origin,resp_headers_Access-Control-Allow-Methods,resp_headers_Content-Disposition,twitter:image:alt,twitter:image:width,twitter:image:height,resp_headers_X-Powered-By,h5,jsonld_@type,jsonld_headline,jsonld_url,jsonld_thumbnailUrl,jsonld_articleSection,jsonld_author,jsonld_creator,jsonld_keywords,jsonld_dateCreated,jsonld_date
Published,jsonld_dateModified,jsonld_mainEntityOfPage.@type,jsonld_mainEntityOfPage.@id,jsonld_image.@type,jsonld_image.url,jsonld_publisher.@type,jsonld_publisher.name,jsonld_publisher.logo,resp_headers_X-Hacker,img_align,resp_headers_X-Content-Type-Options,resp_headers_X-Xss-Protection,resp_headers_Content-Security-Policy,resp_headers_Set-Cookie,resp_headers_Expires,resp_headers_X-Frame-Options,resp_headers_Server-Timing,resp_headers_Timing-Allow-Origin,request_headers_Cookie,resp_headers_X-Amz-Id-2,resp_headers_X-Amz-Request-Id,resp_headers_X-Amz-Replication-Status,resp_headers_X-Amz-Server-Side-Encryption,resp_headers_X-Amz-Meta-Cb-Modifiedtime,resp_headers_X-Amz-Version-Id,resp_headers_X-Ua-Compatible,resp_headers_Referrer-Policy,resp_headers_Via,resp_headers_X-Amz-Cf-Pop,resp_headers_X-Amz-Cf-Id,h6,img_border,img_hspace,img_vspace,resp_headers_Content-Language,resp_headers_Cf-Cache-Status,resp_headers_Cf-Ray,img_ismap,img_usemap
0,https://www.nasa.gov/,NASA,"NASA.gov brings you the latest news, images and videos from America's space agency, pioneering the future in space e...","width=device-width, initial-scale=1",UTF-8,Suggested Searches@@Martians Wanted@@Featured News@@NASA’s SpaceX Crew-8@@Image Of The Day@@Image Of The Day@@Our Ch...,"News & Events@@Multimedia@@Featured@@Teams Add Iconic NASA ‘Worm’ Logo to Artemis II Rocket, Spacecraft@@Flame Burns...",https://www.nasa.gov/,https://www.nasa.gov/feed/@@https://www.nasa.gov/wp-json/wp/v2/pages/128943@@https://www.nasa.gov/wp-json/oembed/1.0...,en_US,website,NASA,"NASA.gov brings you the latest news, images and videos from America's space agency, pioneering the future in space e...",https://www.nasa.gov/,NASA,2024-02-17T01:14:00-05:00,https://www.nasa.gov/wp-content/uploads/2018/07/174116main_2006_01777_highres.jpg,https://www.nasa.gov/wp-content/uploads/2018/07/174116main_2006_01777_highres.jpg,640.0,512.0,NASA Meatball paint refresh,image/jpeg,https://www.youtube.com/embed/21X5lGlDOfg,https://www.youtube.com/embed/NpHFB_DYXhY,https://www.youtube.com/embed/_LJHRpDvPCw,https://www.youtube.com/embed/bTQjiMtpMG0,https://www.nasa.gov/wp-content/uploads/2023/11/final-nasa-15-sec-horizontal-16-9.mp4,https://www.youtube.com/embed/1fOWosS_f1Y,https://www.youtube.com/embed/31b1yjUBlO0,https://www.youtube.com/embed/MTyzq4ey9RE,https://www.youtube.com/embed/OffTxAiAQfM,https://www.youtube.com/embed/ZbBx4sW68uw,https://www.youtube.com/embed/vUYcQ_ehArw,https://www.youtube.com/embed/YQWespzOtzI,https://www.youtube.com/embed/CRZYw9fEBe4,https://www.youtube.com/embed/IGuHErKAiHs,https://www.youtube.com/embed/R-TOoGTvFL8,https://www.youtube.com/embed/p566jU9pylY,https://www.youtube.com/embed/_tdsia6EZY8,https://www.youtube.com/embed/hW5akI5Rnyg,https://www.youtube.com/embed/WQR_iNjEjlw,https://www.youtube.com/embed/iDAKTLmt2hs,https://www.youtube.com/embed/VwVL0UBVVLA,https://www.youtube.com/embed/Ha4mXufQp6c,https://www.youtube.com/embed/sgp_2OBxKeM,ht
tps://www.youtube.com/embed/UyXS2tYggiE,summary_large_image,NASA,"NASA.gov brings you the latest news, images and videos from America's space agency, pioneering the future in space e...",https://www.nasa.gov/wp-content/uploads/2018/07/174116main_2006_01777_highres.jpg,https://schema.org,"[{'@id': 'https://www.nasa.gov/#organization', '@type': 'Organization', 'about': None, 'articleSection': None, 'auth...",\n\t\t\t\t\t \n\t\t\t\t\t\t Explore \n\t\t\t\t\t\t \n\t\t\t\t\t \n\t\t\t\t \n\t\t\t\t\t \n\t\t\t\t\t\t N...,299218,180,www.nasa.gov,0.064869,0,200,https://www.nasa.gov/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https...,\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https://plus.nasa.gov/series/@@https://www.nasa.gov...,\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\t\t\t\t\t\t\tAll NASA News\n\t\t\t\t\t\t...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,https://www.nasa.gov/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https...,\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,https://www.nasa.gov/about/@@https://www.nasa.gov/get-involved/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@...,About NASA's Mission@@\n\t\t\t\t\t\t\tJoin Us\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t@@Home@@News & 
Events@@Multimedia@@NASA+@...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,high@@high@@high@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@high@@@@@@@@@@@@@@@@@@@@@@@@...,60@@60@@640@@640@@640@@@@640@@640@@640@@640@@640@@640@@@@@@@@@@@@@@@@640@@@@@@@@1920@@1920@@640@@640@@@@@@640@@640@@...,50.58@@50.58@@960@@519@@360@@@@427@@960@@427@@960@@519@@427@@@@@@@@@@@@@@@@360@@@@@@@@1280@@1627@@360@@640@@@@@@481@...,NASA Logo@@NASA Logo@@@@A sample of fabric burns inside Spacecraft Fire Experiment-IV (Saffire-IV). The sample is a ...,https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo@2x.png@@https://www.nasa.gov/wp-content/themes/n...,https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo.svg@@https://www.nasa.gov/wp-content/themes/nasa...,@@@@async@@async@@async@@@@async@@async@@async@@async@@async@@async@@@@@@@@@@@@@@@@async@@@@@@@@async@@async@@async@...,"@@@@(max-width: 640px) 100vw, 640px@@(max-width: 640px) 100vw, 640px@@(max-width: 640px) 100vw, 640px@@@@(max-width:...",@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@eager@@lazy@@@@@@@@lazy@@lazy@@lazy@@laz...,192.0.66.108,2024-02-19 08:46:43,31913.0,nginx,"Mon, 19 Feb 2024 08:46:43 GMT",text/html; charset=UTF-8,a9130478a60e5f9135f765b23f26593b,Go Flight!,"<https://www.nasa.gov/wp-json/>; rel=""https://api.w.org/"",<https://www.nasa.gov/wp-json/wp/v2/pages/128943>; rel=""al...",hhn1 85 187 443,"max-age=300, must-revalidate",666.0,hit,Accept-Encoding,bytes,max-age=31536000,"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",en,advertools/0.14.0,"gzip, deflate, br",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,https://www.nasa.gov/?search=SpaceX%20Crew-2,"3522 Search Results for ""SpaceX Crew-2""",,"width=device-width, initial-scale=1",UTF-8,Suggested Searches@@\n\t\t\t\t3522 results found\t\t\t\t\t,"News & Events@@Multimedia@@Featured@@Teams Add Iconic NASA ‘Worm’ Logo to Artemis II Rocket, Spacecraft@@Flame Burns...",,https://www.nasa.gov/feed/,en_US,website,,,,NASA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,summary_large_image,,,,https://schema.org,"[{'@id': 'https://www.nasa.gov/#organization', '@type': 'Organization', 'about': None, 'articleSection': None, 'auth...",\n\t\t\t\t\t \n\t\t\t\t\t\t Explore \n\t\t\t\t\t\t \n\t\t\t\t\t \n\t\t\t\t \n\t\t\t\t\t \n\t\t\t\t\t\t N...,234115,180,www.nasa.gov,1.301717,1,200,https://www.nasa.gov/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https...,\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https://plus.nasa.gov/series/@@https://www.nasa.gov...,\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\t\t\t\t\t\t\tAll NASA News\n\t\t\t\t\t\t...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,https://www.nasa.gov/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https...,\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,https://www.nasa.gov/about/@@https://www.nasa.gov/get-involved/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@...,About NASA's Mission@@\n\t\t\t\t\t\t\tJoin Us\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t@@Home@@News & 
Events@@Multimedia@@NASA+@...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,high@@high@@high@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...,60@@60@@640@@640@@640@@@@640@@640@@640@@640@@640@@640@@@@@@@@@@@@@@@@640@@@@@@@@1920@@1920@@640@@640@@@@@@640@@640@@...,50.58@@50.58@@960@@519@@360@@@@427@@960@@427@@960@@519@@427@@@@@@@@@@@@@@@@360@@@@@@@@1280@@1627@@360@@640@@@@@@481@...,NASA Logo@@NASA Logo@@@@A sample of fabric burns inside Spacecraft Fire Experiment-IV (Saffire-IV). The sample is a ...,https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo@2x.png@@https://www.nasa.gov/wp-content/themes/n...,https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo.svg@@https://www.nasa.gov/wp-content/themes/nasa...,@@@@async@@async@@async@@@@async@@async@@async@@async@@async@@async@@@@@@@@@@@@@@@@async@@@@@@@@async@@async@@async@...,"@@@@(max-width: 640px) 100vw, 640px@@(max-width: 640px) 100vw, 640px@@(max-width: 640px) 100vw, 640px@@@@(max-width:...",,192.0.66.108,2024-02-19 08:46:47,,nginx,"Mon, 19 Feb 2024 08:46:47 GMT",text/html; charset=UTF-8,a9130478a60e5f9135f765b23f26593b,Go Flight!,"<https://www.nasa.gov/wp-json/>; rel=""https://api.w.org/""",hhn1 85 187 443,"max-age=300, must-revalidate",0.0,miss,Accept-Encoding,bytes,max-age=31536000,"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",en,advertools/0.14.0,"gzip, deflate, br",\n\t\t\t\t\tSearch Results for: SpaceX Crew-2\t\t\t\t,The SpaceX Freedom Dragon crew ship with the Axiom Mission-2 crew - NASA@@NASA's SpaceX Crew-7@@The SpaceX Freedom D...,https://www.nasa.gov/,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,https://www.nasa.gov/?search=International%20Space%20Station,"28773 Search Results for ""International Space Station""",,"width=device-width, initial-scale=1",UTF-8,Suggested Searches@@\n\t\t\t\t28773 results found\t\t\t\t\t,"News & Events@@Multimedia@@Featured@@Teams Add Iconic NASA ‘Worm’ Logo to Artemis II Rocket, Spacecraft@@Flame Burns...",,https://www.nasa.gov/feed/,en_US,website,,,,NASA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,summary_large_image,,,,https://schema.org,"[{'@id': 'https://www.nasa.gov/#organization', '@type': 'Organization', 'about': None, 'articleSection': None, 'auth...",\n\t\t\t\t\t \n\t\t\t\t\t\t Explore \n\t\t\t\t\t\t \n\t\t\t\t\t \n\t\t\t\t \n\t\t\t\t\t \n\t\t\t\t\t\t N...,233543,180,www.nasa.gov,2.509646,1,200,https://www.nasa.gov/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https...,\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https://plus.nasa.gov/series/@@https://www.nasa.gov...,\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\t\t\t\t\t\t\tAll NASA News\n\t\t\t\t\t\t...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,https://www.nasa.gov/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https...,\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,https://www.nasa.gov/about/@@https://www.nasa.gov/get-involved/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@...,About NASA's Mission@@\n\t\t\t\t\t\t\tJoin 
Us\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t@@Home@@News & Events@@Multimedia@@NASA+@...,False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals...,high@@high@@high@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...,60@@60@@640@@640@@640@@@@640@@640@@640@@640@@640@@640@@@@@@@@@@@@@@@@640@@@@@@@@1920@@1920@@640@@640@@@@@@640@@640@@...,50.58@@50.58@@960@@519@@360@@@@427@@960@@427@@960@@519@@427@@@@@@@@@@@@@@@@360@@@@@@@@1280@@1627@@360@@640@@@@@@481@...,NASA Logo@@NASA Logo@@@@A sample of fabric burns inside Spacecraft Fire Experiment-IV (Saffire-IV). The sample is a ...,https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo@2x.png@@https://www.nasa.gov/wp-content/themes/n...,https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo.svg@@https://www.nasa.gov/wp-content/themes/nasa...,@@@@async@@async@@async@@@@async@@async@@async@@async@@async@@async@@@@@@@@@@@@@@@@async@@@@@@@@async@@async@@async@...,"@@@@(max-width: 640px) 100vw, 640px@@(max-width: 640px) 100vw, 640px@@(max-width: 640px) 100vw, 640px@@@@(max-width:...",,192.0.66.108,2024-02-19 08:46:47,,nginx,"Mon, 19 Feb 2024 08:46:47 GMT",text/html; charset=UTF-8,a9130478a60e5f9135f765b23f26593b,Go Flight!,"<https://www.nasa.gov/wp-json/>; rel=""https://api.w.org/""",hhn1 85 188 443,"max-age=300, must-revalidate",0.0,miss,Accept-Encoding,bytes,max-age=31536000,"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",en,advertools/0.14.0,"gzip, deflate, br",\n\t\t\t\t\tSearch Results for: International Space Station\t\t\t\t,International Space Station - NASA@@20 Years of Observing Earth from the International Space Station - NASA@@SpaceX ...,https://www.nasa.gov/,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [3]:
# Show the number of rows and columns of the crawl DataFrame as Markdown headings,
# followed by a horizontal rule.
md(
    f'\n### Rows: {len(crawldf):,}'
    f'\n### Columns: {len(crawldf.columns)}'
    '\n--- '
)


### Rows: 9,942
### Columns: 170
--- 

## Link summary table
Lists every link found on each crawled URL, together with the link's anchor text, its `rel="nofollow"` attribute, and whether or not it is an internal link.

In [8]:
# Build the link summary table: one row per link found on each crawled page.
#
# The internal-link pattern is anchored to the URL's scheme and host so that
# only nasa.gov and its subdomains (www., plus., oig., ...) are flagged as
# internal. An unanchored r'nasa\.gov' would also match external URLs that
# merely mention the domain in their path or query string, e.g. a share link
# such as https://twitter.com/share?url=https://www.nasa.gov/.
# Non-capturing groups are used so the pattern contains no match groups.
# NOTE(review): assumes advertools applies this pattern as a regex search on
# each link URL -- confirm against the advertools documentation.
link_df = adv.crawlytics.links(
    crawldf,
    internal_url_regex=r'(?i)^https?://(?:[^/?#@]+\.)?nasa\.gov(?=[:/?#]|$)',
)
link_df

Unnamed: 0,url,link,text,nofollow,internal
0,https://www.nasa.gov/,https://www.nasa.gov/,\n\t\t\t\t\n\t\t\t,False,True
0,https://www.nasa.gov/,https://www.nasa.gov/,\n\t\t\t\t\n\t\t\t,False,True
0,https://www.nasa.gov/,https://www.nasa.gov/news/,\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t,False,True
0,https://www.nasa.gov/,https://www.nasa.gov/news/all-news/,\n\t\t\t\t\t\t\t\t\t\tAll NASA News\n\t\t\t\t\t\t\t\t\t,False,True
0,https://www.nasa.gov/,https://plus.nasa.gov/series/,\n\t\t\t\t\t\t\t\t\t\tVideo Series on NASA+\n\t\t\t\t\t\t\t\t\t,False,True
...,...,...,...,...,...
9941,https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/,http://oig.nasa.gov/,Office of the IG,False,True
9941,https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/,https://www.nasa.gov/budgets-plans-and-reports/,Budget & Annual Reports,False,True
9941,https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/,https://www.nasa.gov/organizations/budget-annual-reports/agency-financial-reports/,Agency Financial Reports,False,True
9941,https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/,https://www.nasa.gov/contact-nasa/,Contact NASA,False,True


In [13]:
# Crawled pages whose response status is anything other than 200.
# `url` is renamed to `link` so this table can be joined on the `link`
# column of the link summary table.
error_urls = (
    crawldf
    .loc[crawldf['status'] != 200, ['url', 'status']]
    .rename(columns={'url': 'link'})
)
error_urls

Unnamed: 0,link,status
16,https://www.nasa.gov/search/?search=SpaceX%20Crew-2&search_page=177,404
17,https://www.nasa.gov/search/?search=International%20Space%20Station&search_page=1439,404
19,https://www.nasa.gov/search/?search=Mars%20perseverance&search_page=289,404
21,https://www.nasa.gov/search/?search=Expedition%2064&search_page=78,404
25,https://www.nasa.gov/search/?search=Expedition%2064&search_page=3,404
...,...,...
9837,https://www.nasa.gov/nasa-directorates/armd/advanced-air-vehicles-program/hi-rate-composite-aircraft-manufacturing-p...,404
9868,https://www.nasa.gov/jpl/news/ed-stone-colbert-20131204.html,404
9869,https://www.nasa.gov/aeroresearch/stem/seeing-sound,404
9870,https://www.nasa.gov/qsf18,404


In [15]:
# Keep only the links whose target appears in `error_urls`, then attach the
# target's status code to each of those rows.
broken = (
    link_df
    .loc[link_df['link'].isin(error_urls['link'])]
    .merge(error_urls, on='link', how='left')
)
broken

Unnamed: 0,url,link,text,nofollow,internal,status
0,https://www.nasa.gov/?search=SpaceX%20Crew-2,https://www.nasa.gov/search/?search=SpaceX%20Crew-2&search_page=177,177,False,True,404
1,https://www.nasa.gov/?search=International%20Space%20Station,https://www.nasa.gov/search/?search=International%20Space%20Station&search_page=1439,1439,False,True,404
2,https://www.nasa.gov/?search=Mars%20perseverance,https://www.nasa.gov/search/?search=Mars%20perseverance&search_page=289,289,False,True,404
3,https://www.nasa.gov/?search=Expedition%2064,https://www.nasa.gov/search/?search=Expedition%2064&search_page=3,3,False,True,404
4,https://www.nasa.gov/?search=Expedition%2064,https://www.nasa.gov/search/?search=Expedition%2064&search_page=78,78,False,True,404
...,...,...,...,...,...,...
4128,https://www.nasa.gov/feature/goddard/2018/nasa-gold-mission-to-image-earth-s-interface-to-space,https://www.nasa.gov/feature/goddard/2018/nasa-gold-mission-to-image-earth-s-interface-to-space,\n\t\t\t\t\tNews & Events\n\t\t\t\t\t\n\t\t\t\t,False,True,404
4129,https://www.nasa.gov/feature/goddard/2018/nasa-gold-mission-to-image-earth-s-interface-to-space,https://www.nasa.gov/feature/goddard/2018/nasa-gold-mission-to-image-earth-s-interface-to-space,\n\t\t\t\t\tMultimedia\n\t\t\t\t\t\n\t\t\t\t,False,True,404
4130,https://www.nasa.gov/image-article/nasa-going-green-space/,https://www.nasa.gov/spacex,technology missions,False,True,404
4131,https://www.nasa.gov/image-article/nasa-going-green-space/,https://www.nasa.gov/spacex,Space Test Program-2,False,True,404


## Cleaning up:

* There are many 404 pages linked to from the final pagination links on search results
* There are also many 404s where not-found pages link to themselves. Once we handle these we automatically remove the broken links (to themselves)
* The final `broken` DataFrame is the same as the one above, but with source URLs containing "search=" filtered out and rows where `url` is equal to `link` removed (a reduction from 4,133 to 1,867 rows)

In [64]:
# Drop rows whose source URL contains "search=" and rows where a page links to itself.
broken.loc[
    ~broken['url'].str.contains('search=', regex=False)
    & (broken['url'] != broken['link'])
]

Unnamed: 0,url,link,text,nofollow,internal,status
5,https://www.nasa.gov/audio-and-ringtones/,https://www.nasa.gov/590319main_ringtone_apollo11_countdown.m4r,Apollo 11: We Have a Lift-Off,False,True,404
6,https://www.nasa.gov/audio-and-ringtones/,https://www.nasa.gov/connect/sounds/iphone_install_directions.html,Please visit our iPhone directions for downloading and installing M4R ringtones page.,False,True,404
28,https://www.nasa.gov/centers-and-facilities/ames/nasa-study-reveals-compounding-climate-risks-at-two-degrees-of-warm...,https://www.nasa.gov/nex/gddp,found online,False,True,404
29,https://www.nasa.gov/centers-and-facilities/ames/nasa-study-reveals-compounding-climate-risks-at-two-degrees-of-warm...,https://www.nasa.gov/nex/gddp,NEX downscaled dataset,False,True,404
44,https://www.nasa.gov/early-stage-innovations-esi/,https://www.nasa.gov/directorates/spacetech/strg/2012_stro_esi_selections.html#.U3IgSySnw0U,ESI 2012,False,True,404
...,...,...,...,...,...,...
4120,https://www.nasa.gov/centers-and-facilities/johnson/nasa-goes-quiet-over-galveston-for-flight-series/,https://www.nasa.gov/aeroresearch/stem/seeing-sound,“Seeing Sound” learning module,False,True,404
4127,https://www.nasa.gov/solar-system/why-nasa-watches-airglow-the-colors-of-the-upper-atmospheric-wind/,https://www.nasa.gov/feature/goddard/2018/nasa-gold-mission-to-image-earth-s-interface-to-space,GOLD Images Earth’s Interface to Space,False,True,404
4130,https://www.nasa.gov/image-article/nasa-going-green-space/,https://www.nasa.gov/spacex,technology missions,False,True,404
4131,https://www.nasa.gov/image-article/nasa-going-green-space/,https://www.nasa.gov/spacex,Space Test Program-2,False,True,404
