# Scrape the Malmi Emails

In [26]:
import re
import requests
import json

from pathlib import Path
from bs4 import BeautifulSoup

## Step 1: Bare Extraction

In [9]:
MALMI_URL = "https://mmalmi.github.io/satoshi/"

# Download the archive page. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the email archive.
malmi_response = requests.get(MALMI_URL, timeout=30)
malmi_response.raise_for_status()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and emits a warning), so the parsed tree can differ
# between machines.
malmi_html = BeautifulSoup(malmi_response.text, "html.parser")

In [10]:
# One compiled pattern per header field. Each pattern is the capitalised field
# name followed by ": " and captures the rest of the line, e.g. "Date: (.+)".
meta_fields = {
    field: re.compile(f"{field.capitalize()}: (.+)")
    for field in ("date", "subject", "from", "to")
}


def try_match_metadata(div, re_pattern):
    """Return the first capture group if ``div.text`` matches, else ``None``.

    The pattern is applied with ``match``, so it has to match at the very
    start of the text.
    """
    found = re_pattern.match(div.text)
    return found.group(1) if found else None

def extract_header_metadata(email_div):
    """Build a dict of header fields found in an email's header div.

    Looks at the first ``div`` with class ``header`` inside ``email_div`` and
    tests every ``div`` nested in it against each pattern in ``meta_fields``.
    Fields that never match are simply absent from the result.
    """
    header = email_div.find_all('div', class_='header')[0]
    found = {}
    for line_div in header.find_all('div'):
        for field, matcher in meta_fields.items():
            value = try_match_metadata(line_div, matcher)
            if value:
                found[field] = value
    return found


def extract_email_data(parsed_html):
    """Collect the Satoshi emails from the parsed page.

    Returns a dict keyed by the stripped text of each message's
    ``email-index`` div. Every value is a dict with the header ``metadata``
    and the stripped ``text`` of the message's ``body`` div.
    """
    def first_div(parent, css_class):
        # First matching div; raises IndexError when there is none.
        return parent.find_all("div", class_=css_class)[0]

    satoshi_emails = {}
    for message in parsed_html.find_all('div', class_='message satoshi'):
        header_metadata = extract_header_metadata(message)
        body_text = first_div(message, "body").text.strip()
        index_label = first_div(message, "email-index").text.strip()
        satoshi_emails[index_label] = {
            "metadata": header_metadata,
            "text": body_text,
        }
    return satoshi_emails

In [11]:
# Run the extraction over the downloaded, parsed page.
satoshi_emails = extract_email_data(malmi_html)

# Display how many emails were extracted.
len(satoshi_emails)

144

In [12]:
# Inspect one extracted record, looked up by its index label.
satoshi_emails["Email #1"]

{'metadata': {'date': 'Sat, 02 May 2009 18:06:58 +0100',
  'from': 'Satoshi Nakamoto <satoshin@gmx.com>',
  'subject': 'Re: Bitcoin',
  'to': 'Martti Malmi <sirius-m@users.sourceforge.net>'},
 'text': 'Thanks for starting that topic on ASC, your understanding of bitcoin is \nspot on.  Some of their responses were rather Neanderthal, although I \nguess they\'re so used to being anti-fiat-money that anything short of \ngold isn\'t good enough.  They concede that something is flammable, but \nargue that it\'ll never burn because there\'ll never be a spark.  Once \nit\'s backed with cash, that might change, but I\'d probably better \nrefrain from mentioning that in public anymore until we\'re closer to \nready to start.  I think we\'ll get flooded with newbies and we need to \nget ready first.\n\nWhat we need most right now is website writing.  My writing is not that \ngreat, I\'m a much better coder.  Maybe you could create the website on \nsourceforge, which is currently blank.  If you c

## Step 2: Cleaning

The extracted email bodies still contain reply content from Malmi. We want to isolate the parts of the text that are unique to Satoshi, and this has to be done by hand.

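We export a plain-text file for this cleaning work. Each email is wrapped in `%%%%%` delimiter lines and tagged with its ID, so the cleaned text can be matched back to its email afterwards.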

In [172]:
# Wrap every email body in "%%%%%" delimiter lines and tag it with its ID, so
# the hand-edited file can later be split back into (ID, text) pairs.
review_lines = [
    f"%%%%%\nID={email}\n\n{email_data['text']}\n%%%%%"
    for email, email_data in satoshi_emails.items()
]

# Write UTF-8 explicitly. Without an encoding, open() uses the platform's
# locale encoding, which may be unable to represent non-ASCII characters in
# the email text and makes the output differ between machines.
with open("manual_cleanup.txt", "w", encoding="utf-8") as outfile:
    outfile.write("\n\n".join(review_lines))

In [3]:
# Matches one delimited record in the cleanup file and captures (ID, text).
# DOTALL lets the text group span several lines.
email_re = re.compile(
    r"%%%%%\n"        # opening delimiter line
    r"ID=(.+?)\n\n"   # group 1: email ID, followed by a blank line
    r"(.+?)\n"        # group 2: email text (lazy, up to the closing delimiter)
    r"%%%%%",         # closing delimiter line
    flags=re.DOTALL,
)

In [5]:
# Read the hand-cleaned file with an explicit encoding instead of the
# platform's locale default, so the result does not depend on the machine.
# NOTE(review): assumes the hand-edited file was saved as UTF-8 - confirm.
completed_cleanup_text = Path("manual_cleanup_DONE.txt").read_text(encoding="utf-8")

# One tuple of captured groups per record matched by email_re.
cleaned_emails = email_re.findall(completed_cleanup_text)

In [6]:
# Display how many cleaned records were parsed back in.
len(cleaned_emails)

144

In [7]:
# Inspect the first cleaned record.
cleaned_emails[0]

('Email #1',
 "Thanks for starting that topic on ASC, your understanding of bitcoin is \nspot on.  Some of their responses were rather Neanderthal, although I \nguess they're so used to being anti-fiat-money that anything short of \ngold isn't good enough.  They concede that something is flammable, but \nargue that it'll never burn because there'll never be a spark.  Once \nit's backed with cash, that might change, but I'd probably better \nrefrain from mentioning that in public anymore until we're closer to \nready to start.  I think we'll get flooded with newbies and we need to \nget ready first.\n\nWhat we need most right now is website writing.  My writing is not that \ngreat, I'm a much better coder.  Maybe you could create the website on \nsourceforge, which is currently blank.  If you can write a FAQ, I can \ngive you a compilation of my replies to questions in e-mail and forums \nfor facts and details and ideas.\n\nCodewise, there's not much that's easy right now.  One thing th

In [50]:
# Combine each hand-cleaned text with the header metadata scraped earlier,
# looked up by the email's index label.
final_email_data = []
for id_, cleaned_text in cleaned_emails:
    # Copy the header dict so the export does not share it with satoshi_emails.
    header = satoshi_emails[id_]["metadata"].copy()
    record = {
        "author": "Satoshi Nakamoto",
        "metadata": {
            "emailHeader": header,
            "source": "https://mmalmi.github.io/satoshi/",
            "emailIndex": id_,
        },
        "authorText": cleaned_text.strip(),
    }
    final_email_data.append(record)

In [51]:
# Inspect one assembled record.
final_email_data[5]

{'author': 'Satoshi Nakamoto',
 'metadata': {'emailHeader': {'date': 'Sun, 24 May 2009 23:03:38 +0100',
   'from': 'Satoshi Nakamoto <satoshin@gmx.com>',
   'subject': 'Re: Bitcoin',
   'to': 'mmalmi@cc.hut.fi'},
  'source': 'https://mmalmi.github.io/satoshi/',
  'emailIndex': 'Email #15'},
 'authorText': "You're right, that was it.  I went in and granted us access using the \nalternate account.\n\nI like your idea of at least moving the FAQ into the wiki.  I've seen \nother projects that use the wiki for the FAQ or even the whole site.  If \nyou can figure out how to make it so regular users can edit things, then \nanyone who wants to can help."}

In [52]:
# Serialise the assembled records to disk as a single JSON document.
with Path("malmi_emails.json").open("w") as outfile:
    json.dump(final_email_data, outfile)