In [16]:
# Widen the notebook container to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Blog Scraping Notebook

In [23]:
import requests
from bs4 import BeautifulSoup
from dateutil import parser
from textstat.textstat import textstat
import lxml

In [24]:
# Request headers sent with every page fetch.
# The HTTP request header is spelled "Referer" (single "r" in the middle);
# "referrer" is not a recognised header name, so it had no effect.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'referer': 'https://google.com'
}

## Scraping One Article

Show how to extract the header content (title and author) from a single article.


In [25]:
# Fetch one article to explore its markup.
# A timeout is passed because requests waits indefinitely by default.
url = 'https://blog.frame.io/2021/11/15/color-grading-strategies-for-client-notes/'
r = requests.get(url, headers=headers, timeout=30)

In [26]:
# Display the HTTP status code of the response fetched above.
r.status_code

200

In [27]:
# Take the decoded response body, trim leading/trailing whitespace,
# and print the raw HTML for inspection.
html = r.text.strip()
print(html)

<!DOCTYPE html>

<script>
 var fioPageType = ''; 
var fioPageSlug = ''; 
fioPageType = 'article';
 fioPageSlug = 'color-grading-strategies-for-client-notes';
 </script>
<html lang="en-US">
	<head>
		<meta charset="UTF-8">
		<meta name="viewport" content="width=device-width, initial-scale=1">

		<link rel="profile" href="//gmpg.org/xfn/11">

		

		<meta name='robots' content='index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1' />

	<!-- This site is optimized with the Yoast SEO plugin v16.6 - https://yoast.com/wordpress/plugins/seo/ -->
	<title>Essential Color Grading Skills: 8 Strategies for Managing Client Notes</title>
	<meta name="description" content="Veteran colorist Cullen Kelly offers 8 lessons on dealing with common client requests like &quot;make it pop&quot; and &quot;the colors don&#039;t match.&quot;" />
	<link rel="canonical" href="https://blog.frame.io/2021/11/15/color-grading-strategies-for-client-notes/" />
	<meta property="og:locale" content="

In [29]:
# Parse the HTML string with the 'html.parser' backend and confirm the
# resulting object type.
soup = BeautifulSoup(html, 'html.parser')
print(type(soup))

<class 'bs4.BeautifulSoup'>


In [30]:
# Header Content
# Locate the element with class 'entry-header', then the title element
# (class 'post-meta-title') nested inside it, and print that element.
header = soup.find(class_='entry-header')
title_html = header.find(class_='post-meta-title')
print(title_html)

<h1 class="post-meta-title">
								Essential Color Grading Skills: 8 Strategies for Managing Client Notes							</h1>


In [31]:
# Inspect the title element's child nodes; the output shows the title text
# as the first entry, surrounded by whitespace.
title_html.contents

['\r\n\t\t\t\t\t\t\t\tEssential Color Grading Skills: 8 Strategies for Managing Client Notes\t\t\t\t\t\t\t']

In [32]:
# Take the first child node of the title element and strip the surrounding
# whitespace to get the clean title text.
title_str = title_html.contents[0].strip()
print(title_str)

Essential Color Grading Skills: 8 Strategies for Managing Client Notes


In [33]:
# The author name is the first child of the <a> tag inside the element
# with class 'author-name'.
author_html = header.find(class_='author-name')
author_str = author_html.find('a').contents[0].strip()
print(author_str)

Cullen Kelly


### Modularize Code

In [34]:
def parse_page(url, timeout=30):
    """Download one article page and return its extracted metadata.

    Parameters
    ----------
    url : str
        Address of the article to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        a stalled connection would block the whole crawl.

    Returns
    -------
    dict
        Keys: 'reading_time', 'title', 'date', 'month', 'weekday', 'author',
        'categories', 'word_count', 'reading_level', 'link_count',
        'image_count'.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page has no 'entry-header' or 'entry-content' element.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on error responses instead of parsing an error page.
    r.raise_for_status()
    html = r.text.strip()
    soup = BeautifulSoup(html, 'html.parser')

    header = soup.find(class_='entry-header')
    content = soup.find(class_='entry-content')
    if header is None or content is None:
        # Name the offending URL rather than failing later with an opaque
        # AttributeError on None.
        raise ValueError('Unexpected page layout (missing header or body): ' + url)

    # Header Content
    read_time = extract_read_time(header)
    title = extract_title(header)

    author = extract_author(header)
    categories = extract_categories(header)

    # The date string is kept as published; month and weekday names are
    # derived from the parsed datetime.
    date = extract_date(header)
    dt = parser.parse(date)
    month = dt.strftime("%B")
    weekday = dt.strftime("%A")

    # Body Content
    body_text = content.text
    word_count = len(body_text.split())
    reading_level = textstat.flesch_kincaid_grade(body_text)

    link_count = len(content.find_all("a"))
    image_count = len(content.find_all("img"))

    page_data = {
        'reading_time' : read_time,
        'title': title,
        'date': date,
        'month': month,
        'weekday': weekday,
        'author': author,
        'categories': categories,
        'word_count': word_count,
        'reading_level': reading_level,
        'link_count': link_count,
        'image_count': image_count
    }

    return page_data
    
def extract_read_time(header):
    """Return the article's reading time as an integer.

    Looks up the element with class 'read-time' inside ``header``, takes its
    first child, and converts the first whitespace-delimited token to ``int``.
    """
    badge = header.find(class_='read-time')
    label = badge.contents[0]
    tokens = label.strip().lower().split()
    return int(tokens[0])

def extract_title(header):
    """Return the article title with surrounding whitespace removed.

    The title is the first child of the element with class
    'post-meta-title' inside ``header``.
    """
    heading = header.find(class_='post-meta-title')
    raw_title = heading.contents[0]
    return raw_title.strip()

def extract_date(header):
    """Return the publication date string with surrounding whitespace removed.

    The date is the first child of the element with class
    'single-post-date' inside ``header``.
    """
    date_node = header.find(class_='single-post-date')
    raw_date = date_node.contents[0]
    return raw_date.strip()

def extract_author(header):
    """Return the author's name with surrounding whitespace removed.

    The name is the first child of the <a> tag found inside the element
    with class 'author-name'.
    """
    byline = header.find(class_='author-name')
    author_link = byline.find('a')
    raw_name = author_link.contents[0]
    return raw_name.strip()

def extract_categories(header):
    """Return the article's category names, lower-cased and stripped.

    Category names are read from the first child of every <a> tag inside
    the element with class 'single-post-cat'.

    Returns an empty list when the header has no such element, so an
    article without categories does not raise.
    """
    container = header.find(class_='single-post-cat')
    if container is None:
        return []
    # find_all is the current Beautiful Soup name (findAll is a legacy alias)
    # and matches the spelling used in parse_page.
    return [
        cat_link.contents[0].strip().lower()
        for cat_link in container.find_all('a')
    ]

In [35]:
# Run the modularized parser on the same article explored above.
url = 'https://blog.frame.io/2021/11/15/color-grading-strategies-for-client-notes/'
wmn_exp = parse_page(url)

In [36]:
# Show the metadata dict returned by parse_page for that article.
print(wmn_exp)

{'reading_time': 14, 'title': 'Essential Color Grading Skills: 8 Strategies for Managing Client Notes', 'date': 'November 15, 2021', 'month': 'November', 'weekday': 'Monday', 'author': 'Cullen Kelly', 'categories': ['business', 'color'], 'word_count': 2743, 'reading_level': 10.9, 'link_count': 6, 'image_count': 4}


## Scraping One Category

In [37]:
# Module-level list that parse_category() appends each parsed article dict to.
articles_store = []

In [38]:
def parse_category(url, timeout=30):
    """Crawl a category listing and every following page of that listing.

    Each article linked from a listing page is parsed with ``parse_page``
    and the resulting dict is appended to the module-level
    ``articles_store`` list. Pagination is followed with a loop rather than
    recursion, so the call depth does not grow with the number of pages.

    Parameters
    ----------
    url : str
        Address of the first listing page of the category.
    timeout : float, optional
        Seconds to wait for each listing-page request.

    Returns
    -------
    None
        Results are collected in ``articles_store``.

    Raises
    ------
    requests.HTTPError
        If a listing page answers with a 4xx/5xx status.
    """
    page_url = url

    while page_url is not None:
        r = requests.get(page_url, headers=headers, timeout=timeout)
        r.raise_for_status()
        html = r.text.strip()
        soup = BeautifulSoup(html, 'html.parser')

        for article in soup.find_all(class_='post-content'):
            title = article.find(class_='post-meta-title')
            # Look the link up by tag name: indexing contents[0] breaks when
            # the first child of the title is whitespace or plain text.
            link_tag = title.find('a') if title is not None else None
            if link_tag is None:
                # Card without a title link: nothing to follow.
                continue
            link = link_tag['href']
            print('Parsing URL:', link)
            page = parse_page(link)
            articles_store.append(page)

        page_url = find_next_link(soup)
        if page_url is not None:
            print('Next page:', page_url)

    return None

def find_next_link(soup_item):
    """Return the URL of the next listing page, or None if there is none.

    The last <a> tag inside the element with class 'navigation' is treated
    as the "next" link only when its first child equals the string 'Next'.

    Returns None when the navigation element is missing, when it contains
    no links, or when its last link is not a 'Next' link.
    """
    bottom_nav = soup_item.find(class_='navigation')

    if bottom_nav is None:
        return None

    links = bottom_nav.find_all('a')
    if not links:
        # Navigation block without any links: links[-1] would raise.
        return None

    next_page = links[-1]

    if next_page.contents and next_page.contents[0] == 'Next':
        return next_page['href']

    return None

In [39]:
# Crawl the "behind-the-scenes" category; results accumulate in articles_store.
bts = 'https://blog.frame.io/category/behind-the-scenes/'
parse_category(bts)

Parsing URL: https://blog.frame.io/2021/11/10/art-of-the-cut-edgar-wright-last-night-in-soho/
Parsing URL: https://blog.frame.io/2021/11/03/art-of-the-cut-andy-weisblum-wes-anderson-the-french-dispatch/
Parsing URL: https://blog.frame.io/2021/10/27/art-of-the-cut-dune-joe-walker/
Parsing URL: https://blog.frame.io/2021/10/20/art-of-the-cut-the-harder-they-fall/
Parsing URL: https://blog.frame.io/2021/10/13/art-of-the-cut-james-bond-no-time-to-die/
Parsing URL: https://blog.frame.io/2021/10/12/made-in-frame-yeti-content-marketing/
Parsing URL: https://blog.frame.io/2021/10/06/art-of-the-cut-the-many-saints-of-newark/
Parsing URL: https://blog.frame.io/2021/10/04/made-in-frame-old-fast-glass-lens-test-remote-workflow/
Parsing URL: https://blog.frame.io/2021/09/29/art-of-the-cut-britney-spears-documentary/
Parsing URL: https://blog.frame.io/2021/09/22/art-of-the-cut-summer-of-soul/
Parsing URL: https://blog.frame.io/2021/09/15/art-of-the-cut-shang-chi-legend-ten-rings/
Parsing URL: https:

Parsing URL: https://blog.frame.io/2019/03/18/nicholas-monsour-us/
Next page: https://blog.frame.io/category/behind-the-scenes/page/9/
Parsing URL: https://blog.frame.io/2019/02/18/tom-cross-editing-first-man/
Parsing URL: https://blog.frame.io/2019/02/14/editing-widows/
Parsing URL: https://blog.frame.io/2019/02/11/editing-buster-scruggs/
Parsing URL: https://blog.frame.io/2019/02/04/adam-gough-roma/
Parsing URL: https://blog.frame.io/2019/01/28/mi6-editorial-crew/
Parsing URL: https://blog.frame.io/2018/12/17/made-in-frame-gizmodo/
Parsing URL: https://blog.frame.io/2018/11/19/made-in-frame-cinelab/
Parsing URL: https://blog.frame.io/2018/10/01/womans-experience-cutting-blockbusterrs/
Parsing URL: https://blog.frame.io/2018/09/10/made-in-frame-searching/
Parsing URL: https://blog.frame.io/2018/07/30/inside-mission-impossible-fallout/
Parsing URL: https://blog.frame.io/2018/07/16/made-in-frame-film-riot/
Parsing URL: https://blog.frame.io/2018/07/02/bts-hotel-artemis/
Next page: https

In [41]:
# Report how many articles were collected and show the first record.
print(len(articles_store))
print(articles_store[0])

122
{'reading_time': 24, 'title': 'Art of the Cut: Inside Edgar Wright’s “Last Night in Soho”', 'date': 'November 10, 2021', 'month': 'November', 'weekday': 'Wednesday', 'author': 'Steve Hullfish', 'categories': ['art of the cut'], 'word_count': 4797, 'reading_level': 11.1, 'link_count': 4, 'image_count': 21}


## Scraping All Categories

In [42]:
# Reset the shared store so the records from the single-category run above
# are not carried into the all-categories crawl.
articles_store = []

In [43]:
# Category slugs; each is inserted into the category URL by the crawl loop.
categories = ['post-production', 'color-correction', 'business', 'workflow', 'behind-the-scenes', 'production', 'announcement']

In [44]:
# Crawl every category listing in turn; parse_category() appends the parsed
# articles to articles_store.
for category in categories:
    print('Parsing category', category)
    url = ''.join(['https://blog.frame.io/category/', category, '/'])
    parse_category(url)

Parsing category post-production
Parsing URL: https://blog.frame.io/2021/11/08/work-faster-in-davinci-resolve/
Parsing URL: https://blog.frame.io/2021/10/18/edit-faster-premiere-pro-keyboard-shortcuts/
Parsing URL: https://blog.frame.io/2021/09/27/davinci-resolve-ai-tools/
Parsing URL: https://blog.frame.io/2021/09/20/adobe-premiere-pro-speech-to-text/
Parsing URL: https://blog.frame.io/2021/08/23/aces-after-effects-resolve-workflow/
Parsing URL: https://blog.frame.io/2021/08/16/premiere-pro-audio-mixing-basics/
Parsing URL: https://blog.frame.io/2021/07/22/art-of-the-cut-werner-herzog-fireball-documentary/
Parsing URL: https://blog.frame.io/2021/06/14/photogrammetry-future-of-filmmaking/
Parsing URL: https://blog.frame.io/2021/06/07/premiere-pro-max-render-quality-max-bit-depth/
Parsing URL: https://blog.frame.io/2021/05/17/made-in-frame-what-drives-us-dave-grohl/
Parsing URL: https://blog.frame.io/2021/05/03/made-in-frame-wolfwalkers/
Parsing URL: https://blog.frame.io/2021/04/28/mad

Parsing URL: https://blog.frame.io/2017/10/25/4-reality-tv-editing-techniques/
Parsing URL: https://blog.frame.io/2017/10/23/7-prep-mistakes-to-avoid/
Parsing URL: https://blog.frame.io/2017/10/16/fcpx-magnetic-timeline/
Parsing URL: https://blog.frame.io/2017/10/11/better-audio-in-premiere/
Parsing URL: https://blog.frame.io/2017/10/09/prepping-social-media-video/
Parsing URL: https://blog.frame.io/2017/10/04/turbo-charge-fcpx-workflow-davinci-resolve/
Parsing URL: https://blog.frame.io/2017/10/02/speeding-up-stringouts-keyboard-maestro/
Parsing URL: https://blog.frame.io/2017/09/13/give-fcpx-second-look/
Parsing URL: https://blog.frame.io/2017/09/11/how-to-speed-up-premiere-pro-exports/
Next page: https://blog.frame.io/category/post-production/page/10/
Parsing URL: https://blog.frame.io/2017/09/06/davinci-resolve-may-be-most-powerful/
Parsing URL: https://blog.frame.io/2017/08/23/editing-laughter-love-loneliness/
Parsing URL: https://blog.frame.io/2017/08/21/editor-as-writer/
Parsing

Parsing URL: https://blog.frame.io/2021/03/08/hdr-whitepaper-preserving-creative-intent/
Parsing URL: https://blog.frame.io/2021/03/01/frameio-c2c-dailies-workflows/
Parsing URL: https://blog.frame.io/2021/02/11/frameio-c2c-technical-guide/
Parsing URL: https://blog.frame.io/2021/01/29/5g-video-production-white-paper/
Parsing URL: https://blog.frame.io/2021/01/25/remote-workflows-in-final-cut-pro-white-paper/
Parsing URL: https://blog.frame.io/2021/01/11/how-to-develop-your-own-style-as-a-colorist/
Next page: https://blog.frame.io/category/workflow/page/2/
Parsing URL: https://blog.frame.io/2020/12/14/explained-remote-asset-sharing-frame-io-transfer-app/
Parsing URL: https://blog.frame.io/2020/11/30/explained-workflow-security-watermark-id/
Parsing URL: https://blog.frame.io/2020/11/16/color-grading-lessons-from-celluloid-film/
Parsing URL: https://blog.frame.io/2020/10/19/fcp-x-10-4-9-remote-proxy-editing/
Parsing URL: https://blog.frame.io/2020/10/12/5-reasons-to-care-about-hdr/
Pars

Parsing URL: https://blog.frame.io/2021/11/10/art-of-the-cut-edgar-wright-last-night-in-soho/
Parsing URL: https://blog.frame.io/2021/11/03/art-of-the-cut-andy-weisblum-wes-anderson-the-french-dispatch/
Parsing URL: https://blog.frame.io/2021/10/27/art-of-the-cut-dune-joe-walker/
Parsing URL: https://blog.frame.io/2021/10/20/art-of-the-cut-the-harder-they-fall/
Parsing URL: https://blog.frame.io/2021/10/13/art-of-the-cut-james-bond-no-time-to-die/
Parsing URL: https://blog.frame.io/2021/10/12/made-in-frame-yeti-content-marketing/
Parsing URL: https://blog.frame.io/2021/10/06/art-of-the-cut-the-many-saints-of-newark/
Parsing URL: https://blog.frame.io/2021/10/04/made-in-frame-old-fast-glass-lens-test-remote-workflow/
Parsing URL: https://blog.frame.io/2021/09/29/art-of-the-cut-britney-spears-documentary/
Parsing URL: https://blog.frame.io/2021/09/22/art-of-the-cut-summer-of-soul/
Parsing URL: https://blog.frame.io/2021/09/15/art-of-the-cut-shang-chi-legend-ten-rings/
Parsing URL: https:

Parsing URL: https://blog.frame.io/2019/03/18/nicholas-monsour-us/
Next page: https://blog.frame.io/category/behind-the-scenes/page/9/
Parsing URL: https://blog.frame.io/2019/02/18/tom-cross-editing-first-man/
Parsing URL: https://blog.frame.io/2019/02/14/editing-widows/
Parsing URL: https://blog.frame.io/2019/02/11/editing-buster-scruggs/
Parsing URL: https://blog.frame.io/2019/02/04/adam-gough-roma/
Parsing URL: https://blog.frame.io/2019/01/28/mi6-editorial-crew/
Parsing URL: https://blog.frame.io/2018/12/17/made-in-frame-gizmodo/
Parsing URL: https://blog.frame.io/2018/11/19/made-in-frame-cinelab/
Parsing URL: https://blog.frame.io/2018/10/01/womans-experience-cutting-blockbusterrs/
Parsing URL: https://blog.frame.io/2018/09/10/made-in-frame-searching/
Parsing URL: https://blog.frame.io/2018/07/30/inside-mission-impossible-fallout/
Parsing URL: https://blog.frame.io/2018/07/16/made-in-frame-film-riot/
Parsing URL: https://blog.frame.io/2018/07/02/bts-hotel-artemis/
Next page: https

Parsing URL: https://blog.frame.io/2016/08/18/how-to-move-everything-but-the-camera/
Next page: https://blog.frame.io/category/production/page/7/
Parsing URL: https://blog.frame.io/2016/08/12/improve-visual-storytelling-daily/
Parsing URL: https://blog.frame.io/2016/08/01/180-degree-rule-and-when-to-break-it/
Parsing URL: https://blog.frame.io/2016/07/26/one-essential-principle-create-indelible-slow-motion/
Parsing URL: https://blog.frame.io/2016/02/04/cast-great-actors-by-holding-better-auditions/
Parsing URL: https://blog.frame.io/2016/01/28/how-to-make-great-storyboards-even-if-you-cant-draw/
Parsing URL: https://blog.frame.io/2016/01/25/6-photographers-who-will-make-you-a-better-filmmaker/
Parsing URL: https://blog.frame.io/2016/01/18/6-steps-to-writing-powerful-loglines/
Parsing URL: https://blog.frame.io/2015/12/31/say-yes-to-vertical-video/
Parsing URL: https://blog.frame.io/2015/12/28/how-to-launch-your-career-with-a-jaw-dropping-spec-piece/
Parsing URL: https://blog.frame.io/2

In [45]:
# Total number of records collected across all categories.
len(articles_store)

498

In [46]:
# Spot-check the first collected record.
articles_store[0]

{'reading_time': 16,
 'title': 'How To Work Faster in DaVinci Resolve (Without Upgrading Your Hardware)',
 'date': 'November 8, 2021',
 'month': 'November',
 'weekday': 'Monday',
 'author': 'Dan Swierenga',
 'categories': ['davinci resolve', 'post-production'],
 'word_count': 3202,
 'reading_level': 10.1,
 'link_count': 13,
 'image_count': 15}

In [47]:
import json
from pathlib import Path

# Persist the scraped records as JSON.
# The target directory is created first so the write does not fail on a
# fresh checkout, and the encoding is set explicitly so the output does not
# depend on the platform default.
output_path = Path('data/articles.json')
output_path.parent.mkdir(parents=True, exist_ok=True)

with output_path.open('w', encoding='utf-8') as f:
    json.dump(articles_store, f)