In [6]:
import pandas as pd
import requests
import xmltodict
from bs4 import BeautifulSoup

In [7]:
import sys

sys.path.append('..')

from logger import logger

## Scrape essays from paulgraham.com

In [4]:
# RSS feed hosted on aaronsw.com; its items are parsed into a nested dict below
sitemap = 'http://www.aaronsw.com/2002/feeds/pgessays.rss'
# Bound the wait so an unresponsive server cannot hang the notebook
r = requests.get(sitemap, timeout=30)
# Stop on an HTTP error instead of handing an error page to the XML parser
r.raise_for_status()
xml = r.text
xml_dict = xmltodict.parse(xml)

In [114]:
# Download one essay and parse it so the page structure can be inspected by hand
sample_url = 'http://www.paulgraham.com/hwh.html'
html = requests.get(url=sample_url).text
soup = BeautifulSoup(markup=html, features='html.parser')

In [116]:
# Text content of the parsed sample page, kept for interactive inspection
text = soup.get_text()

In [53]:
def extract_text_from_url(url, min_line_length=5, timeout=30):
    """Download a page and return its text content with short lines removed.

    Args:
        url: Address of the page to fetch.
        min_line_length: A line is kept only when its stripped length is
            strictly greater than this value.
        timeout: Seconds handed to ``requests.get`` so a stalled server
            cannot block the caller indefinitely.

    Returns:
        The kept lines, stripped of surrounding whitespace and joined with
        newline characters.
    """
    html = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(html, 'html.parser')

    # Text content of the whole document
    text = soup.get_text()

    # Trim surrounding whitespace from every non-empty line
    stripped = (line.strip() for line in text.splitlines() if line)

    # Drop lines that became empty after stripping and lines that are too short
    return '\n'.join(line for line in stripped if line and len(line) > min_line_length)

In [64]:
# One record per feed item: its title, its link and the text scraped from that link
pages = []

for entry in xml_dict['rss']['channel']['item']:
    title, link = entry['title'], entry['link']
    logger.info(f'Title: {title}, url: {link}')
    record = {'title': title, 'url': link, 'text': extract_text_from_url(link)}
    pages.append(record)

2023-03-27 09:00:49,970 - Title: How to Get New Ideas, url: http://www.paulgraham.com/getideas.html
2023-03-27 09:00:50,409 - Title: The Need to Read, url: http://www.paulgraham.com/read.html
2023-03-27 09:00:50,644 - Title: What You (Want to)* Want, url: http://www.paulgraham.com/want.html
2023-03-27 09:00:50,879 - Title: Alien Truth, url: http://www.paulgraham.com/alien.html
2023-03-27 09:00:51,102 - Title: What I've Learned from Users, url: http://www.paulgraham.com/users.html
2023-03-27 09:00:51,339 - Title: Heresy, url: http://www.paulgraham.com/heresy.html
2023-03-27 09:00:51,531 - Title: Putting Ideas into Words, url: http://www.paulgraham.com/words.html
2023-03-27 09:00:51,789 - Title: Is There Such a Thing as Good Taste?, url: http://www.paulgraham.com/goodtaste.html
2023-03-27 09:00:52,085 - Title: Beyond Smart, url: http://www.paulgraham.com/smart.html
2023-03-27 09:00:52,337 - Title: Weird Languages, url: http://www.paulgraham.com/weird.html
2023-03-27 09:00:52,645 - Title:

In [82]:
# Tabulate the scraped records and persist them as gzip-compressed parquet
df = pd.DataFrame(data=pages)
df.to_parquet(path='../data/paulgraham.parquet', compression='gzip')

## Scrape essays from lethain.com

In [3]:
sitemap = 'https://lethain.com/sitemap.xml'
# Bound the wait so an unresponsive server cannot hang the notebook
r = requests.get(sitemap, timeout=30)
# Stop on an HTTP error instead of handing an error page to the XML parser
r.raise_for_status()
xml = r.text
xml_dict = xmltodict.parse(xml)

In [4]:
def extract_text_from_url(url, min_line_length=5, last_paragraph="Hi folks. I'm Will aka @lethain", timeout=30):
    """Download a page and return the text of its <p> elements up to a stop marker.

    Args:
        url: Address of the page to fetch.
        min_line_length: A paragraph is kept only when its stripped length is
            strictly greater than this value.
        last_paragraph: Marker text. The first paragraph containing it, and
            every paragraph after it, is left out.
        timeout: Seconds handed to ``requests.get`` so a stalled server
            cannot block the caller indefinitely.

    Returns:
        The kept paragraphs, stripped of surrounding whitespace and joined
        with newline characters.
    """
    html = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(html, 'html.parser')

    # Collect paragraph text in document order until the marker paragraph shows up
    lines = []
    for p in soup.find_all('p'):
        text = p.get_text()
        if last_paragraph in text:
            break
        lines.append(text)

    # Trim surrounding whitespace from every non-empty paragraph
    stripped = (line.strip() for line in lines if line)

    # Drop paragraphs that became empty after stripping and those that are too short
    return '\n'.join(line for line in stripped if line and len(line) > min_line_length)

In [5]:
print(extract_text_from_url('https://lethain.com/building-exec-network/'))

In most of my roles, I’ve learned more from my peers than from my manager. Even when you get along well with your manager, your peers’ perspective will usually be closer to yours than your manager’s. Once you transition into an engineering executive role, you’ll still have peers, but they’re a different sort of peer, who will look at problems from a very different perspective than yours. If you ask the head of product for feedback, they will give it, but it’ll come from a product perspective. This will make your peer executives’ feedback valuable, but valuable in a very different way than peer engineering leader feedback you’ll have gotten in previous roles.
When I started my first engineering executive role, I spent time building a learning circle of industry peers, which was fundamental in my success.
Whenever I got stuck, I was able to quickly poll their perspectives, and find new ideas to address my problem.
Whether you build a learning community or rely on cold outreach,
building 

In [6]:
pages = []

for entry in xml_dict['urlset']['url']:
    url, lastmod = entry['loc'], entry.get('lastmod')

    # Skip URLs under /tags/ and sitemap entries without a modification date
    if 'https://lethain.com/tags/' in url or not lastmod:
        continue

    logger.info(f'Url: {url}, lastmod: {lastmod}')
    record = {'title': 'NA', 'url': url, 'text': extract_text_from_url(url)}
    pages.append(record)

2023-03-27 19:24:07,481 - Url: https://lethain.com/, lastmod: 2023-03-27T06:00:00-06:00
2023-03-27 19:24:07,828 - Url: https://lethain.com/onboarding-peer-executives/, lastmod: 2023-03-27T06:00:00-06:00
2023-03-27 19:24:07,984 - Url: https://lethain.com/posts/, lastmod: 2023-03-27T06:00:00-06:00
2023-03-27 19:24:08,294 - Url: https://lethain.com/leaving-the-executive-job/, lastmod: 2023-03-20T06:00:00-06:00
2023-03-27 19:24:08,455 - Url: https://lethain.com/planning/, lastmod: 2023-03-14T06:00:00-06:00
2023-03-27 19:24:08,598 - Url: https://lethain.com/using-cultural-survey-data/, lastmod: 2023-03-13T06:00:00-06:00
2023-03-27 19:24:08,754 - Url: https://lethain.com/engineering-onboarding-programs/, lastmod: 2023-03-06T11:00:00-06:00
2023-03-27 19:24:08,905 - Url: https://lethain.com/engineering-in-mergers-and-acquisition/, lastmod: 2023-02-27T06:00:00-06:00
2023-03-27 19:24:09,062 - Url: https://lethain.com/building-exec-network/, lastmod: 2023-02-21T06:00:00-06:00
2023-03-27 19:24:09,

In [7]:
df = pd.DataFrame(pages)

# Keep posts with more than 1000 characters of text; shorter ones may be
# talks or mostly images
is_long_enough = df['text'].map(len) > 1000
df = df[is_long_enough]

# Remove these specific URLs regardless of their text length
excluded_urls = {
    'https://lethain.com/posts/',
    'https://lethain.com/',
    'https://lethain.com/about/',
}
df = df[~df['url'].isin(excluded_urls)]

In [9]:

# df.sort_values('text_len', ascending=False).head(10)

In [10]:
# Persist the filtered posts as gzip-compressed parquet
df.to_parquet(path='../data/willlarson.parquet', compression='gzip')

## Scrape data from https://charity.wtf/sitemap-1.xml

In [176]:
sitemap = 'https://charity.wtf/sitemap-1.xml'
# Bound the wait so an unresponsive server cannot hang the notebook
r = requests.get(sitemap, timeout=30)
# Stop on an HTTP error instead of handing an error page to the XML parser
r.raise_for_status()
xml = r.text
xml_dict = xmltodict.parse(xml)

In [17]:
# Download one post and parse it so the page structure can be inspected by hand
sample_url = 'https://charity.wtf/2019/01/04/engineering-management-the-pendulum-or-the-ladder/'
html = requests.get(url=sample_url).text
soup = BeautifulSoup(markup=html, features='html.parser')

In [22]:
# Gather every <p> element of the sample page, unmodified, for inspection
_paragraphs = soup.find_all('p')
lines = list(_paragraphs)

In [23]:
lines

[<p>Last night I was out with a dear friend who has been an engineering manager for a year now, and by two drinks in I was rattling off a long list things I <em>always</em> say to newer engineering managers.</p>,
 <p>Then I remembered: I should write a post! It’s one of my goals this year to write more long form instead of just twittering off into the abyss.<img alt="Buffy Jaguar 3.5x5" class="wp-image-5147 alignright jetpack-lazy-image" data-attachment-id="5147" data-comments-opened="1" data-image-caption="" data-image-description="" data-image-meta='{"aperture":"0","credit":"","camera":"","caption":"","created_timestamp":"0","copyright":"","focal_length":"0","iso":"0","shutter_speed":"0","title":"","orientation":"0"}' data-image-title="Buffy Jaguar 3.5×5" data-large-file="https://i0.wp.com/charity.wtf/wp-content/uploads/2019/01/Buffy-Jaguar-3.5x5.png?fit=375%2C450&amp;ssl=1" data-lazy-src="https://i0.wp.com/charity.wtf/wp-content/uploads/2019/01/Buffy-Jaguar-3.5x5.png?resize=203%2C24

In [19]:
print(extract_text_from_url(sample_url, last_paragraph='[…]'))

Last night I was out with a dear friend who has been an engineering manager for a year now, and by two drinks in I was rattling off a long list things I always say to newer engineering managers.
Then I remembered: I should write a post! It’s one of my goals this year to write more long form instead of just twittering off into the abyss.
There’s a piece I wrote two years ago, The Engineer/Manager Pendulum,  which is probably my all time favorite.  It was a love letter to a friend who I desperately wanted to see go back to engineering, for his own happiness and mental health.  Well, this piece is a sequel to that one.
It’s primarily aimed at new managers, who aren’t sure what their career options look like or how to evaluate the opportunities that come their way, or how it may expand or shrink their future opportunities.
Every manager reaches a point where they need to choose: do they want to manage engineers (a “line manager”), or do they want to try to climb the org chart? — manage man

In [190]:
pages = []

for entry in xml_dict['urlset']['url']:
    url = entry['loc']
    lastmod = entry.get('lastmod')

    # Only sitemap entries that carry a modification date are scraped
    if lastmod:
        logger.info(f'Url: {url}, lastmod: {lastmod}')
        page_text = extract_text_from_url(url, last_paragraph='[…]')
        pages.append({'title': 'NA', 'url': url, 'text': page_text})


2023-03-27 10:34:22,376 - Url: https://charity.wtf/about/, lastmod: 2022-11-24T12:43:58Z
2023-03-27 10:34:22,991 - Url: https://charity.wtf/2015/12/27/hello-world/, lastmod: 2015-12-27T03:57:37Z
2023-03-27 10:34:23,348 - Url: https://charity.wtf/2015/12/29/2015-what-happened/, lastmod: 2015-12-30T21:58:37Z
2023-03-27 10:34:23,765 - Url: https://charity.wtf/2016/02/03/how-to-survive-an-acquisition/, lastmod: 2017-08-28T14:43:57Z
2023-03-27 10:34:24,294 - Url: https://charity.wtf/2016/02/23/two-weeks-with-terraform/, lastmod: 2020-04-29T20:17:07Z
2023-03-27 10:34:24,803 - Url: https://charity.wtf/2016/03/23/aws-networking-environments-and-you/, lastmod: 2017-10-27T06:31:42Z
2023-03-27 10:34:25,355 - Url: https://charity.wtf/2016/03/30/terraform-vpc-and-why-you-want-a-tfstate-file-per-env/, lastmod: 2019-01-16T08:31:00Z
2023-03-27 10:34:26,131 - Url: https://charity.wtf/2016/04/04/nail-polish-the-superior-paint/, lastmod: 2016-05-11T15:27:10Z
2023-03-27 10:34:26,652 - Url: https://charity

In [203]:
df = pd.DataFrame(pages)

# Keep posts with more than 1000 characters of text; shorter ones may be
# talks or mostly images
is_long_enough = df['text'].map(len) > 1000
df = df[is_long_enough]

# Remove this specific URL regardless of its text length
excluded_urls = {'https://charity.wtf/about/'}
df = df[~df['url'].isin(excluded_urls)]

In [204]:
# `df` carries no `text_len` column at this point, so sorting on it directly
# raises KeyError on a fresh run; derive the length for this view only.
df.assign(text_len=df['text'].map(len)).sort_values('text_len', ascending=True).head(10)

Unnamed: 0,title,url,text,text_len
16,,https://charity.wtf/2018/03/02/money-power-and...,I don’t really do “women stuff” (awkward umbre...,1133
1,,https://charity.wtf/2015/12/27/hello-world/,“Start a blog” has been on my todo list for so...,1356
36,,https://charity.wtf/2019/10/30/a-managers-bill...,"Over a year and a half ago, I wrote up a post ...",1971
61,,https://charity.wtf/2021/04/16/questionable-ad...,I recently received this gem of a note::\nHi C...,2186
71,,https://charity.wtf/2022/08/15/giving-good-fee...,"You work with someone great. If someone asked,...",2473
48,,https://charity.wtf/2020/09/02/questionable-ad...,My company has recently begun pushing for us t...,2529
18,,https://charity.wtf/2018/03/30/an-engineers-bi...,Power has a way of flowing towards people mana...,2553
56,,https://charity.wtf/2021/02/12/questionable-ad...,Some interesting followup questions arose from...,2949
7,,https://charity.wtf/2016/04/04/nail-polish-the...,Here is a thing that more people need to know:...,3056
44,,https://charity.wtf/2020/05/30/trolley-problem...,Consider:\nIt’s not getting any simpler to liv...,3235


In [205]:
# Persist the filtered posts as gzip-compressed parquet
df.to_parquet(path='../data/charitymajors.parquet', compression='gzip')

## Scrape data from https://nav.al/sitemap-1.xml

In [216]:
sitemap = 'https://nav.al/sitemap-1.xml'
# Bound the wait so an unresponsive server cannot hang the notebook
r = requests.get(sitemap, timeout=30)
# Stop on an HTTP error instead of handing an error page to the XML parser
r.raise_for_status()
xml = r.text
xml_dict = xmltodict.parse(xml)

In [227]:
# Download one post and parse it so the page structure can be inspected by hand
sample_url = 'https://nav.al/david-deutsch'
html = requests.get(url=sample_url).text
soup = BeautifulSoup(markup=html, features='html.parser')

In [234]:
print(extract_text_from_url('https://nav.al/rich', last_paragraph='Modal body text goes here.'))

Dec 28 2019
A collection of all my interviews about my ‘How to Get Rich’ tweetstorm.
Wealth is assets that earn while you sleep
Naval is a prolific tech investor and founder of AngelList
Nivi: You probably know Naval from his Twitter account.
We’re going to talk about his tweetstorm, “How To Get Rich (without getting lucky).” We’ll go through most of the tweets in detail, give Naval a chance to expand on them and generally riff on the topic. He’ll probably throw in ideas he hasn’t published before.
Naval’s the co-founder of AngelList and Epinions. He’s also a prolific tech investor in companies like Twitter, Uber and many more.
I’m the co-founder of AngelList with Naval. And I co-authored the Venture Hacks blog with him back in the day.
Naval: The “How to Get Rich” tweetstorm definitely hit a nerve and went viral. A lot of people say it was helpful and reached across aisles.
People outside of the tech industry—people in all walks of life—want to know how to solve their money problems. 

In [235]:
pages = []

for entry in xml_dict['urlset']['url']:
    url, lastmod = entry['loc'], entry.get('lastmod')

    # Sitemap entries without a modification date are not scraped
    if not lastmod:
        continue

    logger.info(f'Url: {url}, lastmod: {lastmod}')
    page_text = extract_text_from_url(url, last_paragraph='Modal body text goes here.')
    pages.append({'title': 'NA', 'url': url, 'text': page_text})

2023-03-27 10:50:07,287 - Url: https://nav.al/american-culture, lastmod: 2018-01-27T22:53:15Z
2023-03-27 10:50:07,712 - Url: https://nav.al/hong-kong-is-civilized, lastmod: 2012-02-09T13:18:16Z
2023-03-27 10:50:08,173 - Url: https://nav.al/natural-beauty, lastmod: 2012-02-09T09:14:54Z
2023-03-27 10:50:08,586 - Url: https://nav.al/do-animals-laugh, lastmod: 2006-01-23T20:45:00Z
2023-03-27 10:50:08,947 - Url: https://nav.al/unquantifiable-risk, lastmod: 2020-01-01T03:05:53Z
2023-03-27 10:50:09,378 - Url: https://nav.al/securitize-citizenship, lastmod: 2012-02-09T13:01:23Z
2023-03-27 10:50:09,841 - Url: https://nav.al/the-80-hour-myth, lastmod: 2019-09-24T21:31:06Z
2023-03-27 10:50:10,290 - Url: https://nav.al/lawyers-or-insurance-salesmen, lastmod: 2012-02-09T12:53:13Z
2023-03-27 10:50:10,653 - Url: https://nav.al/vc-bundling, lastmod: 2012-02-09T12:56:28Z
2023-03-27 10:50:11,145 - Url: https://nav.al/how-microsoft-can-obliterate-google, lastmod: 2019-09-22T03:53:44Z
2023-03-27 10:50:11,

In [246]:
df = pd.DataFrame(pages)

# Keep posts with more than 500 characters of text; shorter ones may be
# talks or mostly images
df = df[df['text'].map(len) > 500]

# No nav.al page is excluded by URL. The previous entry was a charity.wtf
# address left over from another section and could never match a row here.
# Add nav.al URLs to this set to exclude them.
excluded_urls = set()
df = df[~df['url'].isin(excluded_urls)]

In [247]:
# Persist the filtered posts as gzip-compressed parquet
df.to_parquet(path='../data/naval.parquet', compression='gzip')

In [264]:
# Read the file back to confirm it was written and looks as expected
check = pd.read_parquet(path='../data/naval.parquet')
check.head(5)

Unnamed: 0,title,url,text
0,,https://nav.al/american-culture,Nov 29 2005\nA caucasian and U.S. born friend ...
1,,https://nav.al/hong-kong-is-civilized,"Nov 29 2005\nFree wifi in the airport, need I ..."
2,,https://nav.al/natural-beauty,Nov 29 2005\nLooking out of an airplane window...
4,,https://nav.al/unquantifiable-risk,Nov 29 2005\nA lot of the Web 2.0 startups get...
5,,https://nav.al/securitize-citizenship,Nov 29 2005\nPeople hate immigration and immig...


## Scrape data from https://pmarchive.com/

In [309]:
# The parsed landing page is the source of the article links collected next
sample_url = 'https://pmarchive.com'
# Bound the wait so an unresponsive server cannot hang the notebook
response = requests.get(sample_url, timeout=30)
# Stop on an HTTP error instead of harvesting links from an error page
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html.parser')

In [310]:
urls = []

# href=True restricts the search to anchors that have an href attribute;
# indexing a['href'] on an anchor without one raises KeyError.
for a in soup.find_all('a', href=True):
    href = a['href']
    # Skip links containing a fragment marker and links that do not mention 'html'
    if '#' in href or 'html' not in href:
        continue
    # Links without a scheme are prefixed with the site root
    if 'https' not in href:
        href = f'https://pmarchive.com/{href}'
    urls.append(href)

In [311]:
urls

['https://pmarchive.com/guide_to_startups_part1.html',
 'https://pmarchive.com/guide_to_startups_part2.html',
 'https://pmarchive.com/guide_to_startups_part3.html',
 'https://pmarchive.com/guide_to_startups_part4.html',
 'https://pmarchive.com/guide_to_startups_part5.html',
 'https://pmarchive.com/guide_to_startups_part6.html',
 'https://pmarchive.com/guide_to_startups_part7.html',
 'https://pmarchive.com/guide_to_startups_part8.html',
 'https://pmarchive.com/guide_to_startups_part9.html',
 'https://pmarchive.com/truth_about_vcs_part1.html',
 'https://pmarchive.com/truth_about_vcs_part2.html',
 'https://pmarchive.com/truth_about_vcs_part3.html',
 'https://pmarchive.com/how_to_hire_the_best_people.html',
 'https://pmarchive.com/serial_entrepreneurs_and_todays_silicon_valley.html',
 'https://pmarchive.com/psychology_of_entrepreneurial_misjudgment.html',
 'https://pmarchive.com/age_and_the_entrepreneur.html',
 'https://pmarchive.com/luck_and_the_entrepreneur.html',
 'https://pmarchive.com

In [312]:
def _repair_mojibake(text):
    """Re-decode UTF-8 text that was read as Latin-1; return other text unchanged."""
    try:
        return text.encode('latin-1').decode('utf-8')
    except UnicodeError:
        # Either the text holds characters outside Latin-1 or its bytes are not
        # valid UTF-8, so it was not garbled this way. Keep it as it is.
        return text


def extract_text_from_url(url, min_line_length=5, last_paragraph='This article was written by Marc Andreessen and originally published on his blog, blog.pmarca.com.', timeout=30):
    """Download a page and return the text of its <p> elements up to a stop marker.

    Args:
        url: Address of the page to fetch.
        min_line_length: A paragraph is kept only when its stripped length is
            strictly greater than this value.
        last_paragraph: Marker text. The first paragraph containing it, and
            every paragraph after it, is left out.
        timeout: Seconds handed to ``requests.get`` so a stalled server
            cannot block the caller indefinitely.

    Returns:
        The kept paragraphs, stripped of surrounding whitespace and joined
        with newline characters.
    """
    html = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(html, 'html.parser')

    lines = []
    for p in soup.find_all('p'):
        text = p.get_text()
        # Two boilerplate paragraphs are skipped wherever they appear
        if 'An archive of the best articles from Marc' in text:
            continue
        if 'Maintained by your friends at' in text:
            continue
        if last_paragraph in text:
            break
        # Previously an unconditional encode/decode, which raised on any
        # paragraph that was not garbled in exactly this way
        lines.append(_repair_mojibake(text))

    # Trim surrounding whitespace from every non-empty paragraph
    stripped = (line.strip() for line in lines if line)

    # Drop paragraphs that became empty after stripping and those that are too short
    return '\n'.join(line for line in stripped if line and len(line) > min_line_length)

In [314]:
print(extract_text_from_url('https://pmarchive.com/guide_to_startups_part1.html'))

In this series of posts I will walk through some of my accumulated knowledge and experience in building high-tech startups.
My specific experience is from three companies I have co-founded: Netscape, sold to America Online in 1998 for $4.2 billion; Opsware (formerly Loudcloud), a public software company with an approximately $1 billion market cap; and now Ning, a new, private consumer Internet company.
But more generally, I’ve been fortunate enough to be involved in and exposed to a broad range of other startups—maybe 40 or 50 in enough detail to know what I’m talking about—since arriving in Silicon Valley in 1994: as a board member, as an angel investor, as an advisor, as a friend of various founders, and as a participant in various venture capital funds.
This series will focus on lessons learned from this entire cross-section of Silicon Valley startups—so don’t think that anything I am talking about is referring to one of my own companies: most likely when I talk about a scenario I h

In [315]:
# One record per collected article URL; titles are not extracted here
pages = []

for url in urls:
    logger.info(f'Url: {url}')
    record = {'title': 'NA', 'url': url, 'text': extract_text_from_url(url)}
    pages.append(record)

2023-03-27 13:39:43,127 - Url: https://pmarchive.com/guide_to_startups_part1.html
2023-03-27 13:39:43,247 - Url: https://pmarchive.com/guide_to_startups_part2.html
2023-03-27 13:39:43,520 - Url: https://pmarchive.com/guide_to_startups_part3.html
2023-03-27 13:39:43,856 - Url: https://pmarchive.com/guide_to_startups_part4.html
2023-03-27 13:39:44,073 - Url: https://pmarchive.com/guide_to_startups_part5.html
2023-03-27 13:39:44,299 - Url: https://pmarchive.com/guide_to_startups_part6.html
2023-03-27 13:39:44,622 - Url: https://pmarchive.com/guide_to_startups_part7.html
2023-03-27 13:39:44,904 - Url: https://pmarchive.com/guide_to_startups_part8.html
2023-03-27 13:39:45,301 - Url: https://pmarchive.com/guide_to_startups_part9.html
2023-03-27 13:39:45,511 - Url: https://pmarchive.com/truth_about_vcs_part1.html
2023-03-27 13:39:45,747 - Url: https://pmarchive.com/truth_about_vcs_part2.html
2023-03-27 13:39:46,090 - Url: https://pmarchive.com/truth_about_vcs_part3.html
2023-03-27 13:39:46,42

In [321]:
df = pd.DataFrame(pages)

# Keep posts with more than 500 characters of text; shorter ones may be
# talks or mostly images
is_long_enough = df['text'].map(len) > 500
df = df[is_long_enough]

# URL-based exclusion; the set holds only the empty string
excluded_urls = {''}
df = df[~df['url'].isin(excluded_urls)]

In [324]:
# Persist the filtered posts as gzip-compressed parquet
df.to_parquet(path='../data/pmarca.parquet', compression='gzip')

In [325]:
# Read the file back and show its (rows, columns) shape
check = pd.read_parquet(path='../data/pmarca.parquet')
check.shape

(33, 3)

In [326]:
check.head()

Unnamed: 0,title,url,text
0,,https://pmarchive.com/guide_to_startups_part1....,In this series of posts I will walk through so...
1,,https://pmarchive.com/guide_to_startups_part2....,This post is about what to do between when the...
2,,https://pmarchive.com/guide_to_startups_part3....,"In my last post in this series, When the VCs s..."
3,,https://pmarchive.com/guide_to_startups_part4....,This post is all about the only thing that mat...
4,,https://pmarchive.com/guide_to_startups_part5....,"“There she blows,” was sung out from the mast-..."


## Scrape data from http://eugeneyan.com/sitemap.xml

In [2]:
# Bound the wait so an unresponsive server cannot hang the notebook
r = requests.get("http://eugeneyan.com/sitemap.xml", timeout=30)
# Stop on an HTTP error instead of handing an error page to the XML parser
r.raise_for_status()
xml = r.text
raw = xmltodict.parse(xml)

In [8]:
def extract_text_from(url, min_line_length=20, last_paragraph='To cite this content, please use:', timeout=30):
    """Download a page and return the text of its <p> elements up to a stop paragraph.

    Args:
        url: Address of the page to fetch.
        min_line_length: A paragraph is kept only when its stripped length is
            strictly greater than this value.
        last_paragraph: Stop text. The first paragraph whose text equals it
            exactly, and every paragraph after it, is left out.
        timeout: Seconds handed to ``requests.get`` so a stalled server
            cannot block the caller indefinitely.

    Returns:
        The kept paragraphs, stripped of surrounding whitespace and joined
        with newline characters. An empty string when nothing is kept.
    """
    html = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(html, features="html.parser")

    paragraphs = []
    for p in soup.find_all('p'):
        # Paragraphs carrying the `date` class are skipped
        if 'class' in p.attrs and 'date' in p['class']:
            continue
        # Stop at the paragraph that matches the stop text exactly
        if p.get_text() == last_paragraph:
            break
        paragraphs.append(p.get_text())

    # Guarded: indexing an empty list here raised IndexError for pages from
    # which no paragraph was collected
    if paragraphs:
        logger.debug(f'Paragraphs: {paragraphs[0]}')

    # Trim surrounding whitespace from every paragraph
    lines = (line.strip() for line in paragraphs)

    # Drop empty paragraphs and those not longer than min_line_length characters
    return '\n'.join(line for line in lines if line and len(line) > min_line_length)

In [10]:
print(extract_text_from('https://eugeneyan.com/writing/bandits/'))

Recommender systems work well when we have a lot of data on user-item preferences. With a lot of data, we have high certainty about what users like. Conversely, with very little data, we have low certainty. Despite the low certainty, recommenders tend to greedily promote items that received higher engagement in the past. And because they influence how much exposure an item gets, potentially relevant items that aren’t recommended continue getting no to low engagement, perpetuating the feedback loop.
Bandits address this by modeling uncertainty and exploration. By acknowledging the uncertainty in the data and deliberately exploring to reduce it, bandits learn about the relevance of unexplored items.
This is especially applicable when the item set changes quickly, such as for news, ads, and tweets, or when the rate of traffic is low. If new items are constantly added, waiting to collect batch data before retraining the model can be too slow. Bandits are a good fit as they can incrementall

In [11]:
pages = []

for info in raw['urlset']['url']:
    url = info['loc']
    # Only URLs under /writing/ are scraped
    if 'https://eugeneyan.com/writing/' not in url:
        continue
    record = {'text': extract_text_from(url), 'url': url}
    pages.append(record)

In [23]:
df = pd.DataFrame(pages)

# Keep posts with more than 500 characters of text; shorter ones may be
# talks or mostly images. The filtered frame is assigned back to `df`:
# the earlier bare `df[...]` expression discarded its result, so no row
# was ever removed.
df = df[df['text'].map(len) > 500]

# No page is excluded by URL; add addresses to this set to exclude them
excluded_urls = set()
df = df[~df['url'].isin(excluded_urls)]

In [24]:
# Persist the posts as gzip-compressed parquet
df.to_parquet(path='../data/eugeneyan.parquet', compression='gzip')