In [13]:
import time
import tqdm
import traceback
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

In [31]:
SUBSTANCES = {
    "MDMA"              : "MDMA",
    "Mushrooms"         : "Mushrooms",
    "LSD"               : "LSD",
    "DMT"               : "DMT",
    "5-MeO-DMT"         : "5MeODMT",
    "Ketamine"          : "Ketamine",
    "Salvia divinorum"  : "Salvia_divinorum",
    "Ibogaine"          : "Ibogaine",
    "Oxycodone"         : "Oxycodone",
    "Methamphetamine"   : "Methamphetamine",
    "Alprazolam"        : "Pharms_Alprazolam",
    "Cocaine"           : "Cocaine",
    # "2C-I"              : "2C-I",
}

def substance_query(substance):
    page = requests.get(f"""https://erowid.org/experiences/subs/exp_{SUBSTANCES[substance]}.shtml""")
    soup = BeautifulSoup(page.text, "html5lib")
    url = "https://erowid.org/" + [anchor["href"] for anchor in soup.find_all("a", href=True) if "exp.cgi?S1=" in anchor["href"]][0]
    return url

def get_substance_reports(substance):
    page = requests.get(substance_query(substance), dict(ShowViews=0, Cellar=0, Start=0, Max=10000)) 
    soup = BeautifulSoup(page.text, "html5lib")
    table = soup.find("table", attrs={"class": "exp-list-table"}).find("tbody")

    experience_IDs = []
    # for row in table.find_all("tr")[:5]:
    for row in table.find_all("tr"):
        if row.find("td", attrs={"class": "exp-substance"}).text.strip().lower() in [key.lower() for key in SUBSTANCES.keys()]:
            experience_IDs.append(row.find("td", attrs={"class": "exp-title"}).find("a", href = True).get("href").split("=")[1])

    return experience_IDs

# get_substance_reports("Salvia divinorum")
# get_substance_reports("DMT")
# get_substance_reports("MDMA")

In [14]:
session = requests.Session()
session.mount("http://", requests.adapters.HTTPAdapter(max_retries=requests.adapters.Retry(total=5, backoff_factor=1)))

def extract_experience_report(experience_ID):
    try:
        response = requests.get("https://erowid.org/experiences/exp.php", params=dict(ID=experience_ID), timeout=10)
        response.raise_for_status()
        page_text = response.text
        # print("found page", page_text)
        soup = BeautifulSoup(page_text, "html5lib")
        # print("got soup", soup)
        substances = sorted(set([element.getText().strip().lower() for element in soup.find_all("td", {"class": "dosechart-substance"})])),
        if len(substances) > 1:
            raise Exception("multiple drugs combined")

        data = dict(
            experience_ID = experience_ID,
            author = soup.find("div", {"class": "author"}).find("a").getText().strip(),
            substance = soup.find("td", {"class": "dosechart-substance"}).getText().strip(),
            content = page_text[page_text.index("<!-- Start Body -->") + len("<!-- Start Body -->"):page_text.index("<!-- End Body -->")].strip(),
            time_of_experience = soup.find("td", {"class": "footdata-expyear"}).getText().split(":")[1].strip(),
            time_of_submission = soup.find("td", {"class": "footdata-pubdate"}).getText().split(":")[1].strip(),
            age_at_experience = soup.find("td", {"class": "footdata-ageofexp"}).getText().split(":")[1].strip(),
            gender = soup.find("td", {"class": "footdata-gender"}).getText().split(":")[1].strip().lower(),
        )

        # print(time.strftime("%H:%M:%S"), experience_ID)
        # print(data["substance"])
        # print(data["content"])
        # print(" ".join(data["content"].split("\n")[:5]))

        return data
    except Exception as e:
        print(f"[ERROR] Experience {experience_ID}")
        print(type(e).__name__)
        # print(traceback.format_exc())

# extract_experience_report(100641)
# extract_experience_report(107156)
# extract_experience_report(57297)
# extract_experience_report(17787)
# extract_experience_report(103491)
# extract_experience_report(373)
# extract_experience_report(52011)
# extract_experience_report(58802)

In [32]:
# reports = [extract_experience_report(experience_ID) for substance in SUBSTANCES for experience_ID in tqdm.tqdm(get_substance_reports(substance), desc = substance)]

# df = pd.DataFrame()
# df = pd.read_parquet("database.parquet")
# for substance in SUBSTANCES:
for substance in reversed(SUBSTANCES.keys()):
    experience_IDs = get_substance_reports(substance)
    reports = []
    for idx, experience_ID in enumerate(experience_IDs, start = 1):
        print(f"""[{time.strftime("%H:%M:%S")}] {experience_ID.rjust(6)} ({idx}/{len(experience_IDs)} {substance})""")
        if experience_ID in df["experience_ID"].values:
            continue
        experience_report = extract_experience_report(experience_ID) 
        if experience_report:
            reports.append(experience_report)
        time.sleep(1)
    substance_reports = pd.DataFrame.from_records(reports)
    df = pd.concat([df, substance_reports], ignore_index=True)

[09:58:55]  93801 (1/442 Cocaine)
[09:58:58]  43976 (2/442 Cocaine)
[09:59:10]   1643 (3/442 Cocaine)
[09:59:17]  72339 (4/442 Cocaine)
[09:59:26]  66181 (5/442 Cocaine)
[09:59:31]  92241 (6/442 Cocaine)
[09:59:39]  68053 (7/442 Cocaine)
[09:59:45]  51994 (8/442 Cocaine)
[ERROR] Experience 51994
ConnectionError
[10:00:00]  53211 (9/442 Cocaine)
[ERROR] Experience 53211
ConnectionError
[10:00:08]  37703 (10/442 Cocaine)
[10:00:17]  49482 (11/442 Cocaine)
[10:00:29]  24253 (12/442 Cocaine)
[10:00:42]  21524 (13/442 Cocaine)
[10:00:54]  16525 (14/442 Cocaine)
[10:01:04]  19495 (15/442 Cocaine)
[10:01:15]  26166 (16/442 Cocaine)
[ERROR] Experience 26166
ConnectionError
[10:01:30]  20909 (17/442 Cocaine)
[ERROR] Experience 20909
ReadTimeout
[10:01:43]  17790 (18/442 Cocaine)
[10:01:56]    269 (19/442 Cocaine)
[10:02:02]   1017 (20/442 Cocaine)
[ERROR] Experience 1017
AttributeError
[10:02:11]   2359 (21/442 Cocaine)
[ERROR] Experience 2359
ReadTimeout
[10:02:29]  85460 (22/442 Cocaine)
[ERR

KeyboardInterrupt: 

In [39]:
# df = pd.read_parquet("database.parquet")
df["substance"].value_counts()

LSD                                            1177
Psilocybin                                     1103
DMT                                             630
Salvinorin A                                    609
MDMA                                            569
Cocaine                                         388
Methamphetamine                                 367
Ketamine                                        344
5-MeO-DMT                                       257
Oxycodone                                       155
Cannabis                                         75
Pharms - Alprazolam                              38
Ibogaine                                         29
Alcohol - Beer/Wine                              20
Alcohol                                          16
Alcohol - Hard                                    8
Amphetamines                                      7
Tobacco                                           4
2-Me-DMT                                          3
Pharms - Clo

In [41]:
database = df.drop_duplicates(subset = "experience_ID")
database = database[database["substance"].isin(SUBSTANCES.keys())]
# database = database[database["substance"].isin(["Psilocybin", "Salvinorin A"] + list(SUBSTANCES.keys()))]

# database = pd.read_parquet("database.parquet")
database["substance"] = database["substance"].replace({
    "Mushrooms"         : "Psilocybin",
    "Salvia divinorum"  : "Salvinorin A",
})
database["gender"] = database["gender"].str.lower()
database["gender"] = database["gender"].str.strip()

database.to_parquet("database.parquet")
database["substance"].value_counts()

LSD                1177
Psilocybin         1103
DMT                 630
Salvinorin A        609
MDMA                569
Cocaine             388
Methamphetamine     367
Ketamine            344
5-MeO-DMT           257
Oxycodone           155
Ibogaine             29
Name: substance, dtype: int64