In [None]:
import time
import tqdm
import traceback
import requests
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [None]:
# Substances to scrape. Each key is the substance name that listing rows are
# compared against (case-insensitively) and that is used to filter the stored
# database; each value is the slug interpolated into the
# "exp_<slug>.shtml" index-page URL by substance_query().
# Commented-out entries are currently excluded from the scrape.
SUBSTANCES = {
    "MDMA": "MDMA",
    "Ketamine": "Ketamine",
    "LSD": "LSD",
    "Mushrooms": "Mushrooms",
    "DMT": "DMT",
    # "5-MeO-DMT": "5MeODMT",
    # "Mescaline": "Mescaline",
    # "Salvia divinorum": "Salvia_divinorum",
    # "Oxycodone": "Oxycodone",
    # "Alcohol": "Alcohol",
    # "Cocaine": "Cocaine",
    # "Methamphetamine": "Methamphetamine",
    # "Cannabis": "Cannabis",
}

def substance_query(substance):
    """Return the URL of the experience-search listing for ``substance``.

    Downloads the substance's experience index page (its slug is looked up in
    ``SUBSTANCES``) and returns the first link on that page whose ``href``
    contains ``"exp.cgi?S1="``, resolved against the site root.

    Args:
        substance: A key of ``SUBSTANCES``.

    Returns:
        The absolute URL of the first matching link, as a string.

    Raises:
        KeyError: ``substance`` is not a key of ``SUBSTANCES``.
        requests.RequestException: the request timed out, failed, or the
            server answered with an error status.
        IndexError: the page contains no link to ``exp.cgi?S1=``.
    """
    from urllib.parse import urljoin

    # Without a timeout a stalled connection would block the notebook forever.
    page = requests.get(
        f"https://erowid.org/experiences/subs/exp_{SUBSTANCES[substance]}.shtml",
        timeout=10,
    )
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "html5lib")

    for anchor in soup.find_all("a", href=True):
        if "exp.cgi?S1=" in anchor["href"]:
            # urljoin copes with root-relative ("/experiences/...") and
            # absolute hrefs, where plain concatenation would produce
            # "https://erowid.org//..." or a malformed URL.
            return urljoin("https://erowid.org/", anchor["href"])

    # Same exception type as indexing an empty list, but with context.
    raise IndexError(f"no 'exp.cgi?S1=' link found on the index page for {substance!r}")

def get_substance_reports(substance):
    """Collect experience-report IDs from the listing page of ``substance``.

    Requests the URL returned by ``substance_query(substance)`` with the query
    parameters ``ShowViews=0, Cellar=0, Start=0, Max=10000`` and walks the rows
    of the ``exp-list-table`` table. A row is kept when the stripped,
    lower-cased text of its ``exp-substance`` cell equals one of the
    lower-cased keys of ``SUBSTANCES``.

    Args:
        substance: A key of ``SUBSTANCES``.

    Returns:
        A list of strings: for every kept row, the part of the title link's
        ``href`` that follows the first ``"="``.

    Raises:
        requests.RequestException: the request timed out, failed, or the
            server answered with an error status.
        AttributeError: the page has no ``exp-list-table`` table.
    """
    page = requests.get(
        substance_query(substance),
        params=dict(ShowViews=0, Cellar=0, Start=0, Max=10000),
        # The listing can be large (up to Max rows), so allow more time than
        # for a single page, but never wait indefinitely.
        timeout=30,
    )
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "html5lib")
    table = soup.find("table", attrs={"class": "exp-list-table"}).find("tbody")

    # Loop-invariant: build the lower-cased lookup once instead of per row.
    wanted_substances = {key.lower() for key in SUBSTANCES}

    experience_IDs = []
    for row in table.find_all("tr"):
        substance_cell = row.find("td", attrs={"class": "exp-substance"})
        title_cell = row.find("td", attrs={"class": "exp-title"})
        # Rows without the expected cells (e.g. header rows) are skipped
        # rather than aborting the whole listing with an AttributeError.
        if substance_cell is None or title_cell is None:
            continue
        if substance_cell.text.strip().lower() not in wanted_substances:
            continue
        link = title_cell.find("a", href=True)
        if link is None:
            continue
        experience_IDs.append(link.get("href").split("=")[1])

    return experience_IDs

In [None]:
# HTTP session carrying a retry policy (total=5, backoff_factor=1).
session = requests.Session()
retry_adapter = requests.adapters.HTTPAdapter(
    max_retries=requests.adapters.Retry(
        total=5,
        backoff_factor=1,
    )
)
# Mount the adapter for both schemes: the URLs requested in this notebook are
# https://, which an adapter mounted only on "http://" would never serve.
session.mount("http://", retry_adapter)
session.mount("https://", retry_adapter)

def extract_experience_report(experience_ID, verbose = False):
    """Download one experience report and parse it into a flat dict.

    The request is sent through the module-level ``session``, so whatever
    adapters are mounted on it for the URL's scheme apply. Reports whose dose
    chart lists more than one distinct substance are rejected.

    Args:
        experience_ID: Report ID, sent as the ``ID`` query parameter.
        verbose: When true, print the raw page, the parsed soup and a summary
            of the extracted fields.

    Returns:
        A dict with the keys ``experience_ID``, ``author``, ``substance``,
        ``content``, ``time_of_experience``, ``time_of_submission``,
        ``age_at_experience`` and ``gender``; or ``None`` when anything goes
        wrong (network error, missing page element, combined substances).
        Failures are reported on stdout and never raised.
    """
    try:
        response = session.get(
            "https://erowid.org/experiences/exp.php",
            params=dict(ID=experience_ID),
            timeout=10,
        )
        response.raise_for_status()
        page_text = response.text
        if verbose:
            print("found page", page_text)
        soup = BeautifulSoup(page_text, "html5lib")
        if verbose:
            print("got soup", soup)

        # Distinct substances in the dose chart. This must be a plain list:
        # a trailing comma here would wrap it in a 1-tuple and make the
        # length check below always pass.
        substances = sorted({
            element.getText().strip().lower()
            for element in soup.find_all("td", {"class": "dosechart-substance"})
        })
        if len(substances) > 1:
            raise Exception("multiple drugs combined")

        def footer_field(css_class):
            # Footer cells read "<label>: <value>"; keep the text after the first colon.
            return soup.find("td", {"class": css_class}).getText().split(":")[1].strip()

        data = dict(
            experience_ID = experience_ID,
            author = soup.find("div", {"class": "author"}).find("a").getText().strip(),
            substance = soup.find("td", {"class": "dosechart-substance"}).getText().strip(),
            # The report body is delimited by HTML comments in the raw page text.
            content = page_text[page_text.index("<!-- Start Body -->") + len("<!-- Start Body -->"):page_text.index("<!-- End Body -->")].strip(),
            time_of_experience = footer_field("footdata-expyear"),
            time_of_submission = footer_field("footdata-pubdate"),
            age_at_experience = footer_field("footdata-ageofexp"),
            gender = footer_field("footdata-gender").lower(),
        )

        if verbose:
            print(time.strftime("%H:%M:%S"), experience_ID)
            print(data["substance"])
            print(data["content"])
            print(" ".join(data["content"].split("\n")[:5]))

        return data
    except Exception as e:
        # Deliberate best-effort: one bad report must not stop the scrape.
        print(f"[ERROR] Experience {experience_ID}")
        print(type(e).__name__)
        # print(traceback.format_exc())
        return None

# reports = [extract_experience_report(experience_ID) for substance in SUBSTANCES for experience_ID in tqdm.tqdm(get_substance_reports(substance), desc = substance)]

In [None]:
# Scrape every configured substance and accumulate the parsed reports in `df`.
df = pd.DataFrame()
# df = pd.read_parquet("data/database.parquet")

# IDs that are already stored. A freshly created frame has no "experience_ID"
# column, so indexing it directly would raise KeyError on the first iteration.
known_IDs = set(df["experience_ID"]) if "experience_ID" in df.columns else set()

for substance in reversed(SUBSTANCES.keys()):
    experience_IDs = get_substance_reports(substance)
    reports = []
    for idx, experience_ID in enumerate(experience_IDs, start = 1):
        print(f"""[{time.strftime("%H:%M:%S")}] {experience_ID.rjust(6)} ({idx}/{len(experience_IDs)} {substance})""")
        if experience_ID in known_IDs:
            continue
        experience_report = extract_experience_report(experience_ID)
        if experience_report:
            reports.append(experience_report)
            # Only successful downloads are remembered, so a failed ID is
            # retried if it shows up again in another listing.
            known_IDs.add(experience_ID)
        # Throttle: pause after every attempted download.
        time.sleep(1)
    substance_reports = pd.DataFrame.from_records(reports)
    if substance_reports.empty:
        continue
    # Merge once per substance so progress survives an interrupted run;
    # empty frames are kept out of the concatenation.
    df = substance_reports if df.empty else pd.concat([df, substance_reports], ignore_index=True)

In [None]:
# Number of reports per substance in `df`; missing values get their own row.
# df = pd.read_parquet("data/database.parquet")
df.loc[:, "substance"].value_counts(dropna=False)

In [None]:
# Alternative starting point (disabled): build the database from the scraped frame.
# database = df.drop_duplicates(subset = "experience_ID")

# Substance labels to rename before filtering; commented-out pairs are inactive.
remapping = {
    "Mushrooms": "Psilocybin",
    # "Salvia divinorum": "Salvinorin A",
    # "Cannabis": "THC",
}

# Load the stored reports, normalise the substance and gender columns, and keep
# only rows whose substance is a SUBSTANCES key or a remapped label.
database = pd.read_parquet("data/database.parquet")
database = database.assign(
    substance=database["substance"].replace(remapping),
    gender=database["gender"].str.lower().str.strip(),
)
database = database.loc[
    database["substance"].isin([*SUBSTANCES.keys(), *remapping.values()])
]

# Write the cleaned table back to the same file, then summarise it.
database.to_parquet("data/database.parquet")
print(f"""{len(database["substance"])} experience reports""")
database["substance"].value_counts(dropna=False)