In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

# Pre-process the scraped data

## Reading

In [2]:
# Load the scraped newsletter table; the sender filters further down all slice this frame.
df = pd.read_csv("scraped_newsletters.csv", sep=",")

In [18]:
# Remove the helper columns and write the slimmed table back over the input file.
# errors="ignore" keeps this cell re-runnable: because the output replaces the very
# file that is read above, the columns are already gone on every later run and a
# plain drop would raise KeyError.
df.drop(columns=["file_path", "text"], errors="ignore").to_csv("scraped_newsletters.csv", sep=",", index=False)

In [3]:
# Peek at the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,from,date,subject,content
0,tldr crypto ⚡ <dan@tldrnewsletter.com>,2022-05-11 08:05:41+00:00,"Luna 33% drop in hour 🌕, US losing crypto domi...","<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
1,info@odsc.com,2022-05-11 04:02:37-04:00,ODSC Europe: Check Out Our New Sessions and 40...,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
2,promotions@technologyreview.com,2022-05-10 19:03:29+00:00,Creativity for distributed teams,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
3,message@knivesandtools.com,2022-05-10 17:16:07+02:00,"Letzte Chance, unsere Angebote zu nutzen!","<!DOCTYPE html>\n<html lang=en xmlns=""http://w..."
4,list@ben-evans.com,2022-05-10 15:06:59+00:00,Benedict's Newsletter: No. 438,"<!doctype html>\n<html xmlns=""http://www.w3.or..."


## Cleaning and filtering the sender

In [4]:
# Rows whose sender mentions "tldr" but not "crypto", labelled with the newsletter name.
is_tldr_sender = df["from"].str.contains("tldr")
is_crypto_sender = df["from"].str.contains("crypto")
tldr_newsletter_df = (
    df[is_tldr_sender & ~is_crypto_sender]
    .reset_index(drop=True)
    .assign(newsletter="tldr")
)

In [5]:
# Rows sent by "datascienceweekly" whose subject contains "Issue", labelled with the newsletter name.
is_dsw_sender = df["from"].str.contains("datascienceweekly")
is_issue_subject = df["subject"].str.contains("Issue")
datascienceweekly_newsletter_df = (
    df[is_dsw_sender & is_issue_subject]
    .reset_index(drop=True)
    .assign(newsletter="datascienceweekly")
)

In [6]:
# Rows from rahim@rahimhirji.com dated after 2022-04-01, labelled with the newsletter name.
# NOTE(review): "date" is loaded without date parsing, so this looks like a string comparison - confirm.
sent_after_cutoff = df["date"] > "2022-04-01"
is_rahim_sender = df["from"] == "rahim@rahimhirji.com"
box_of_amazing_newsletter_df = (
    df[sent_after_cutoff & is_rahim_sender]
    .reset_index(drop=True)
    .assign(newsletter="box of amazing")
)

## Save the original HTMLs for later lookup

In [9]:
# Persist the raw HTML per issue so it can be looked up later.
# quoting=1 is csv.QUOTE_ALL: every field is wrapped in quotes.
lookup_columns = ["newsletter", "date", "content"]
datascienceweekly_newsletter_df[lookup_columns].to_csv(
    "datascienceweekly_lookup.csv",
    index=False,
    quoting=1,
)

## Processing the HTMLs

### DataScienceWeekly

In [None]:
# Draw one random issue to develop the parser against (unseeded, so it changes per run).
example = datascienceweekly_newsletter_df.sample(n=1)

In [53]:
def get_split_stories_datascienceweekly(html_string, date):
    """Split one DataScienceWeekly issue into its individual stories.

    Every ``<ul>`` in the document is inspected. When the element two siblings
    before it is an ``<h2>`` whose inline style contains ``#34495e``, that
    header's text becomes the current topic. The story itself is read from the
    first ``<font>`` tag inside the ``<ul>``: its first child is the headline,
    its second child is skipped, and the remaining children form the body.

    Lists that appear before any topic header, or that contain no (or an
    empty) ``<font>`` tag, are skipped.

    Args:
        html_string: HTML source of a single newsletter issue.
        date: Value copied unchanged into the ``"date"`` field of every story.

    Returns:
        A list of dicts with the keys ``newsletter``, ``date``, ``topic``,
        ``headline`` and ``body``, in document order.
    """
    soup = BeautifulSoup(html_string, "html.parser")
    processed_data = []
    # No topic is known until the first matching <h2> has been seen.
    current_topic = None

    for ul in soup.find_all("ul"):
        # Necessary to get the previous tag and not filler, see
        # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#next-sibling-and-previous-sibling
        # Either hop can be missing (e.g. the <ul> is its parent's first child),
        # so each step is guarded instead of chaining attribute access.
        filler = ul.previous_sibling
        previous = filler.previous_sibling if filler is not None else None

        # Get the topic header. .get() tolerates an <h2> without a style attribute.
        if getattr(previous, "name", None) == "h2" and "#34495e" in previous.get("style", ""):
            current_topic = previous.text.strip()

        # Deprecated HTML tag, but they use it anyway
        font = ul.find("font")
        if current_topic is None or font is None:
            # Not a story list: no topic seen yet, or the expected markup is absent.
            continue

        children = list(font.children)
        if not children:
            continue

        headline = children[0].text.strip()

        # <br> elements are Tag objects, so a substring test against them never
        # matches; filter on the tag name and drop empty fragments so the body
        # is joined with single spaces.
        body_parts = []
        for node in children[2:]:
            if getattr(node, "name", None) == "br":
                continue
            fragment = node.text.strip()
            if fragment:
                body_parts.append(fragment)

        # TODO maybe add the full html here, or a link to the full html
        processed_data.append({
            "newsletter": "datascienceweekly",
            "date": date,
            "topic": current_topic,
            "headline": headline,
            "body": " ".join(body_parts),
        })

    return processed_data

In [56]:
# Run the parser on the sampled issue and tabulate its stories.
processed_datascienceweekly_df = pd.DataFrame(
    data=get_split_stories_datascienceweekly(
        html_string=example["content"].item(),
        date=example["date"].item(),
    )
)

In [58]:
# Show the first five parsed stories of the sampled issue.
processed_datascienceweekly_df.head(n=5)

Unnamed: 0,newsletter,date,topic,headline,body
0,datascienceweekly,2022-03-17 22:52:02+00:00,Editor Picks,A Deep Dive into NLP Tokenization and Encoding...,"This is a deep dive: over 8,000 words long. Do..."
1,datascienceweekly,2022-03-17 22:52:02+00:00,Editor Picks,Making Deep Learning Go Brrrr From First Princ...,"So, you want to improve the performance of you..."
2,datascienceweekly,2022-03-17 22:52:02+00:00,Editor Picks,Announcing the 2022 AI Index Report,The AI Index is an independent initiative at t...
3,datascienceweekly,2022-03-17 22:52:02+00:00,Data Science Articles & Videos,The “0 / 1 / Done” Strategy for Data Science,To achieve operational excellence in applied d...
4,datascienceweekly,2022-03-17 22:52:02+00:00,Data Science Articles & Videos,Data salaries at FAANG companies in 2022,What 4000 data points can tell us about the st...


In [62]:
# Parse every issue row by row; each row's result is that issue's list of story dicts.
temp_df = datascienceweekly_newsletter_df.apply(
    lambda row: get_split_stories_datascienceweekly(row["content"], row["date"]),
    axis="columns",
)

In [71]:
# Flatten the per-issue story lists into one table with a row per story.
# The Series is iterated directly: Series.ravel() is deprecated in recent pandas
# and yields exactly the same elements.
final_df = pd.DataFrame([story for stories in temp_df for story in stories])

In [81]:
# Write all parsed stories to disk; quoting=1 is csv.QUOTE_ALL (every field quoted).
final_df.to_csv(
    "datascienceweekly_stories.csv",
    index=False,
    quoting=1,
)

In [80]:
# Spot-check five random stories from the combined table.
final_df.sample(n=5)

Unnamed: 0,newsletter,date,topic,headline,body
152,datascienceweekly,2022-03-11 00:04:20+00:00,Training & Resources,Google AI for JavaScript developers with Tenso...,Get productive with TensorFlow.js - Google's M...
14,datascienceweekly,2022-04-28 23:21:02+00:00,Data Science Articles & Videos,"Smitha Shyam, Director of Engineering at Uber:",
101,datascienceweekly,2022-03-25 02:50:51+00:00,Data Science Articles & Videos,"How a Kalman filter works, in pictures","I have to tell you about the Kalman filter, be..."
47,datascienceweekly,2022-04-14 18:42:04+00:00,Data Science Articles & Videos,Improving Code Reviews with Github’s Copilot,"In this episode, I talk to Paige Bailey, the d..."
41,datascienceweekly,2022-04-14 18:42:04+00:00,Editor Picks,Playing with DALL·E 2,I got access to Dall·E 2 yesterday. Here are s...


### TLDR

### Box of Amazing

## Put it all together

In [10]:
# Parse a single saved issue from disk.
# A forward slash is used as the separator: the previous "HTMLs\d..." literal relied
# on "\d" being an unrecognised escape sequence (which Python warns about) and only
# resolved as a directory path on Windows. "/" works on Windows and POSIX alike.
html_path = "HTMLs/datascienceweekly.html"
with open(html_path, "r") as f:
    data = BeautifulSoup(f, "html.parser")

In [6]:
# Scratch check: how a list of tuples is turned into rows and columns.
pd.DataFrame(
    data=[
        ("test", 1),
        ("test", 2),
        ("haha", 3),
    ]
)

Unnamed: 0,0,1
0,test,1
1,test,2
2,haha,3


In [11]:
# Scratch version of the DataScienceWeekly story parser, run on the single
# HTML document held in `data`.
all_uls = data.find_all("ul")
processed_data = []
# Reset explicitly: as a notebook global, a topic left over from an earlier run of
# this cell would otherwise be attached to lists that precede the first header.
current_topic = None

for ul in all_uls:
    # Step back two siblings to reach the element before the <ul>; either hop
    # may be missing, so each one is guarded.
    filler = ul.previous_sibling
    previous = filler.previous_sibling if filler is not None else None

    # A matching <h2> starts a new topic. .get() tolerates a missing style attribute.
    if getattr(previous, "name", None) == "h2" and "#34495e" in previous.get("style", ""):
        current_topic = previous.text.strip()

    font = ul.find("font")
    if current_topic is None or font is None:
        # Not a story list: no topic seen yet, or no <font> wrapper inside the list.
        continue

    children = list(font.children)
    if not children:
        continue

    headline = children[0].text

    # <br> elements are Tag objects, so they have to be filtered by tag name;
    # a substring test against the element never matches them.
    body = " ".join(x.text for x in children[2:] if getattr(x, "name", None) != "br")

    processed_data.append({
        "newsletter_name": "datascienceweekly",
        "date": "",
        "topic": current_topic,
        "headline": headline,
        "body": body.strip(),
    })

In [12]:
# Tabulate the stories collected by the scratch loop.
pd.DataFrame(data=processed_data)

Unnamed: 0,topic,headline,body
0,Editor Picks,Deep Learning Is Hitting a Wall,What would it take for artificial intelligence...
1,Editor Picks,MLOps Is a Mess But That's to be\n ...,Today machine learning continues to be one of ...
2,Editor Picks,Anatomy of an AI System,The Amazon Echo as an anatomical map of human\...
3,Data Science Articles & Videos,µTransfer: A technique for hyperparameter\n ...,"In this post, we relay how our fundamental\n ..."
4,Data Science Articles & Videos,Restoring and attributing ancient texts using\...,Ancient history relies on disciplines such as\...
5,Data Science Articles & Videos,The 2030 Self-Driving Car Bet,It's my honor to announce that John Carmack\n ...
6,Data Science Articles & Videos,Data Visualization Standards,The Data Visualization Standards (DVS) are a\n...
7,Data Science Articles & Videos,NN Template - Generic template to bootstrap\n ...,Generic cookiecutter template to bootstrap PyT...
8,Data Science Articles & Videos,A visual introduction to machine learning,"n machine learning, computers apply statistica..."
9,Data Science Articles & Videos,A Concrete Introduction to Probability (using\...,This notebook will explore these concepts in a...
