In [21]:
import time
import tqdm
import traceback
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

In [3]:
# Substances to scrape. Each key is the display name that report-listing rows
# are compared against (case-insensitively); each value is the slug
# interpolated into the per-substance index URL (exp_<slug>.shtml).
SUBSTANCES = {
    "MDMA": "MDMA",
    "Mushrooms": "Mushrooms",
    "LSD": "LSD",
    "DMT": "DMT",
    "5-MeO-DMT": "5MeODMT",
    "Ketamine": "Ketamine",
    "Salvia divinorum": "Salvia_divinorum",
    "Ibogaine": "Ibogaine",
}

def substance_query(substance):
    """Return the URL of the report-listing query for ``substance``.

    Fetches the substance's experience index page and returns the first link
    on it whose href contains ``exp.cgi?S1=``, resolved against the site root.

    Parameters:
        substance: a key of ``SUBSTANCES``.

    Returns:
        The absolute URL of the listing query, as a string.

    Raises:
        KeyError: ``substance`` is not a key of ``SUBSTANCES``.
        requests.HTTPError: the index page answered with an error status.
        IndexError: the index page contains no matching link.
    """
    from urllib.parse import urljoin

    # A timeout keeps a stalled connection from hanging the notebook forever.
    page = requests.get(
        f"""https://erowid.org/experiences/subs/exp_{SUBSTANCES[substance]}.shtml""",
        timeout=30,
    )
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "html5lib")

    href = next(
        (anchor["href"] for anchor in soup.find_all("a", href=True) if "exp.cgi?S1=" in anchor["href"]),
        None,
    )
    if href is None:
        # Same exception type the original list-index lookup produced, but
        # with a message that says what was actually missing.
        raise IndexError(f"no report-listing link found on the index page for {substance!r}")

    # urljoin avoids the doubled slash that plain concatenation produces when
    # the href already starts with "/".
    return urljoin("https://erowid.org/", href)

def get_substance_reports(substance):
    """Collect the experience IDs listed for ``substance``.

    Requests the substance's report listing (up to 10000 rows in a single
    page) and keeps the rows whose substance cell is exactly one of the
    ``SUBSTANCES`` keys, compared case-insensitively.

    Parameters:
        substance: a key of ``SUBSTANCES``.

    Returns:
        A list of experience IDs as strings, in listing order. Each ID is the
        text after the first ``=`` in the row's title link.

    Raises:
        requests.HTTPError: the listing answered with an error status.
        AttributeError: the listing page has no ``exp-list-table`` table.
    """
    page = requests.get(
        substance_query(substance),
        params=dict(ShowViews=0, Cellar=0, Start=0, Max=10000),
        timeout=30,
    )
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "html5lib")
    table = soup.find("table", attrs={"class": "exp-list-table"}).find("tbody")

    # Built once instead of once per table row.
    tracked_names = {key.lower() for key in SUBSTANCES}

    experience_IDs = []
    for row in table.find_all("tr"):
        substance_cell = row.find("td", attrs={"class": "exp-substance"})
        title_cell = row.find("td", attrs={"class": "exp-title"})
        if substance_cell is None or title_cell is None:
            # Not a report row (e.g. a header or spacer row): skip it rather
            # than crash on a missing cell.
            continue
        if substance_cell.text.strip().lower() not in tracked_names:
            continue
        link = title_cell.find("a", href=True)
        if link is None:
            continue
        experience_IDs.append(link.get("href").split("=")[1])

    return experience_IDs

# get_substance_reports("Salvia divinorum")
# get_substance_reports("DMT")
# get_substance_reports("MDMA")

In [56]:
# Shared HTTP session with retries (up to 5 attempts, exponential backoff).
# The adapter is mounted for both schemes: report pages are fetched over
# HTTPS, so an "http://"-only mount would never apply to them.
session = requests.Session()
_retry_adapter = requests.adapters.HTTPAdapter(max_retries=requests.adapters.Retry(total=5, backoff_factor=1))
session.mount("http://", _retry_adapter)
session.mount("https://", _retry_adapter)

def _footdata_field(soup, css_class):
    """Return the text after the first ':' in the footer cell with ``css_class``."""
    return soup.find("td", {"class": css_class}).getText().split(":")[1].strip()

def extract_experience_report(experience_ID):
    """Download one experience report and return its fields as a dict.

    Parameters:
        experience_ID: the report ID, sent as the ``ID`` query parameter.

    Returns:
        A dict with the keys ``experience_ID``, ``author``, ``substance``,
        ``content``, ``time_of_experience``, ``time_of_submission``,
        ``age_at_experience`` and ``gender``; or ``None`` when the report
        cannot be fetched or parsed, or when it lists more than one distinct
        substance in its dose chart.

    Any failure is reported on stdout (report ID and exception class name)
    instead of being raised, so one bad page does not abort a long scrape.
    """
    try:
        # Go through the shared session so the retry policy actually applies.
        response = session.get("https://erowid.org/experiences/exp.php", params=dict(ID=experience_ID), timeout=10)
        response.raise_for_status()
        page_text = response.text
        soup = BeautifulSoup(page_text, "html5lib")

        # Distinct substances in the dose chart. This must be a plain list: a
        # trailing comma here would wrap it in a 1-tuple and make the length
        # check below always pass.
        substances = sorted({
            element.getText().strip().lower()
            for element in soup.find_all("td", {"class": "dosechart-substance"})
        })
        if len(substances) > 1:
            raise Exception("multiple drugs combined")

        # The report body sits between two HTML comment markers in the raw page.
        body_start = page_text.index("<!-- Start Body -->") + len("<!-- Start Body -->")
        body_end = page_text.index("<!-- End Body -->")

        data = dict(
            experience_ID = experience_ID,
            author = soup.find("div", {"class": "author"}).find("a").getText().strip(),
            substance = soup.find("td", {"class": "dosechart-substance"}).getText().strip(),
            content = page_text[body_start:body_end].strip(),
            time_of_experience = _footdata_field(soup, "footdata-expyear"),
            time_of_submission = _footdata_field(soup, "footdata-pubdate"),
            age_at_experience = _footdata_field(soup, "footdata-ageofexp"),
            gender = _footdata_field(soup, "footdata-gender").lower(),
        )

        return data
    except Exception as e:
        # Deliberate best-effort: log and signal failure with None.
        print(f"[ERROR] Experience {experience_ID}")
        print(type(e).__name__)
        # print(traceback.format_exc())  # uncomment for the full stack trace
        return None

# extract_experience_report(100641)
# extract_experience_report(107156)
# extract_experience_report(57297)
# extract_experience_report(17787)
# extract_experience_report(103491)
# extract_experience_report(373)
# extract_experience_report(52011)
# extract_experience_report(58802)

In [58]:
# reports = [extract_experience_report(experience_ID) for substance in SUBSTANCES for experience_ID in tqdm.tqdm(get_substance_reports(substance), desc = substance)]

# Scrape every substance in SUBSTANCES, appending new reports to `df`.
# If `df` already exists in the kernel (e.g. from an earlier partial run) it
# is extended; otherwise start from an empty frame so the cell also runs on a
# fresh kernel.
if "df" not in globals():
    df = pd.DataFrame(columns=["experience_ID"])

# IDs that are already stored; a set gives constant-time membership checks.
known_IDs = set(df["experience_ID"]) if "experience_ID" in df.columns else set()

for substance in SUBSTANCES:
    experience_IDs = get_substance_reports(substance)
    reports = []
    for idx, experience_ID in enumerate(experience_IDs, start = 1):
        print(f"""[{time.strftime("%H:%M:%S")}] {experience_ID.rjust(6)} ({idx}/{len(experience_IDs)} {substance})""")
        if experience_ID in known_IDs:
            continue
        experience_report = extract_experience_report(experience_ID)
        if experience_report:
            reports.append(experience_report)
            # Only successful downloads are marked as known, so failed
            # reports are retried on the next run.
            known_IDs.add(experience_ID)
        time.sleep(1)  # be polite to the server between requests
    substance_reports = pd.DataFrame.from_records(reports)
    if reports:
        # Skip the concat when nothing new was fetched for this substance.
        df = pd.concat([df, substance_reports], ignore_index=True)

[18:56:35] 109031 (1/633 MDMA)
[18:56:35]  60495 (2/633 MDMA)
[ERROR] Experience 60495
AttributeError
[18:56:39] 117799 (3/633 MDMA)
[18:56:39] 115298 (4/633 MDMA)
[18:56:39] 112960 (5/633 MDMA)
[18:56:39] 109469 (6/633 MDMA)
[18:56:39] 106113 (7/633 MDMA)
[18:56:39]  22590 (8/633 MDMA)
[18:56:39]  26702 (9/633 MDMA)
[18:56:39]  22864 (10/633 MDMA)
[18:56:39]  18564 (11/633 MDMA)
[18:56:39]   9913 (12/633 MDMA)
[ERROR] Experience 9913
AttributeError
[18:56:43]   8732 (13/633 MDMA)
[18:56:43]   8424 (14/633 MDMA)
[18:56:43]   4897 (15/633 MDMA)
[18:56:43]   2019 (16/633 MDMA)
[18:56:43] 116889 (17/633 MDMA)
[18:56:43] 115280 (18/633 MDMA)
[18:56:43]  92745 (19/633 MDMA)
[18:56:43] 103197 (20/633 MDMA)
[18:56:43] 100641 (21/633 MDMA)
[18:56:43]  96315 (22/633 MDMA)
[18:56:43]  85912 (23/633 MDMA)
[18:56:43] 108086 (24/633 MDMA)
[18:56:43] 112022 (25/633 MDMA)
[18:56:43] 111464 (26/633 MDMA)
[18:56:43] 110554 (27/633 MDMA)
[18:56:43]  71723 (28/633 MDMA)
[18:56:43]  84081 (29/633 MDMA)
[1

In [15]:
# Load DMT.parquet for inspection; as the cell's last expression, the
# resulting frame is displayed as the cell output.
pd.read_parquet("DMT.parquet")

In [14]:
# df.to_parquet("dat.parquet")
# df = pd.read_parquet("dat.parquet")
# df = pd.concat([pd.read_parquet(f"{substance}.parquet") for substance in SUBSTANCES.values()])
# df
# df["substance"].value_counts()

FileNotFoundError: [Errno 2] No such file or directory: '5MeODMT.parquet'

In [24]:
# database = df.drop_duplicates(subset = "experience_ID")
# database = database[database["substance"].isin(SUBSTANCES.keys())]

# Rebuild database.parquet from the backup copy: rename two substances and
# normalise the gender column (lower-cased, then stripped of surrounding
# whitespace).
substance_renames = {
    "Mushrooms": "Psilocybin",
    "Salvia divinorum": "Salvia A",
}
database = pd.read_parquet("database.parquet.bak").assign(
    substance=lambda frame: frame["substance"].replace(substance_renames),
    gender=lambda frame: frame["gender"].str.lower().str.strip(),
)

database.to_parquet("database.parquet")

# Number of reports per substance after renaming.
database["substance"].value_counts()

LSD           1177
Psilocybin    1103
DMT            630
Salvia A       609
MDMA           569
Ketamine       344
5-MeO-DMT      257
Ibogaine        29
Name: substance, dtype: int64