In [4]:
import time
import json
from collections import defaultdict

from selenium import webdriver
from bs4 import BeautifulSoup
from pymongo import MongoClient

In [None]:
import os
import environ

env = environ.Env()
env.read_env(env.str('ENV_PATH', '.env'))

In [None]:
mongo_cli_username = os.environ.get('MONGO_CLI_USERNAME')
mongo_cli_password = os.environ.get('MONGO_CLI_PASSWORD')

In [5]:
client = MongoClient("mongodb+srv://{}:{}@cluster0.plop5.mongodb.net/myFirstDatabase?retryWrites=true&w=majority".format(mongo_cli_username, mongo_cli_password))
db = client['healdash']

In [26]:
keyword_list = []
for x in db.mesh_synonyms.find({},{ "synonyms.word": 1 }):
    for synonym in x['synonyms']:
        keyword_list.append(synonym['word'])

In [27]:
len(keyword_list)

876

In [7]:
# limit links after keyword selection
# limit posts in a link
limit_links, limit_posts = 5, 10

In [8]:
class PLM:
    def __init__(self, username, password, limit_links = 50, limit_posts = 50):
        self.login_url = "https://www.patientslikeme.com/users/sign_in"
        self.forum_url = "https://www.patientslikeme.com/forum/plm/topics"
        self.stem_url = "https://www.patientslikeme.com/"
        self.username = username
        self.password = password
        self.keywords = []
        self.recent_keyword = ""
        self.relevancy_limit_links = limit_links
        self.relevancy_limit_posts = limit_posts
        self.keyword_dict = defaultdict(list)
        self.link_list = []
        self.current_url = ""
        
        # init the browser
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--incognito')
        options.add_argument('--headless')

        driver = webdriver.Chrome() # initialize the driver
        driver.get(self.login_url) # go to the login url
        self.driver = driver
        self.login()
        
    def login(self):
        time.sleep(1) # small delay before getting the page source
        username = self.driver.find_element_by_id("user_email_or_login")
        password = self.driver.find_element_by_id("user_password")
        sign_in_button = self.driver.find_element_by_class_name("button-primary-action")
        username.send_keys(self.username)
        password.send_keys(self.password)  # enter the password
        sign_in_button.click()
        
    def keyword_exists(self) -> bool:
        all_links = self.page_source.find_all('a', {"class": "header-3"}) # get all links
        if len(all_links):
            return True
        else:
            return False
        
    def search_on_forum(self, keyword):
        self.recent_keyword = keyword
        time.sleep(0.5) # small delay before getting the page source
        self.driver.get(self.forum_url) # go to the forum url
        search_input = self.driver.find_elements_by_tag_name("input")[2]
        search_input.send_keys(keyword)
        search_input.submit()
        
    def get_link_list(self):
        all_links = self.page_source.find_all('a', {"class": "header-3"}) # get all links
        
        for link in all_links[:self.relevancy_limit_links]:
            self.link_list.append(self.stem_url + link["href"])
            
        if self.max_pages_lists > 1 and len(all_links) < self.relevancy_limit_links:
            for i in range(2, self.max_pages_lists + 1):
                link = "https://www.patientslikeme.com/forum/plm/topics/search?page={}&search%5Btext%5D={}".format(i ,self.recent_keyword)
                self.driver.get(link) 
                self.compile_page_source()
                all_links = self.page_source.find_all('a', {"class": "header-3"}) # get all links
                for link in all_links:
                    self.link_list.append(self.stem_url + link["href"])
        
    def compile_page_source(self) -> object:
        soup = BeautifulSoup(self.driver.page_source.encode('utf-8','ignore')) # compile it with bs4
        try:
            self.max_pages_lists = int(soup.find('div', {"class": "pagination"})('a')[-2].text)
        except:
            self.max_pages_lists = 1
        self.page_source = soup
        return self
    
    def clean_entry(self, entry):
        return (
            entry
            .replace("\n", "") # remove new lines
            .replace("\'", "'") # fix apostrophe
            .replace("\xa0", "")
            .strip() # remove spaces
        )
    
    def scrape_link_entries(self, link):
        self.driver.get(link)
        self.current_url = self.driver.current_url
        self.page_source = BeautifulSoup(self.driver.page_source.encode('utf-8','ignore')) #update page source
        
        try:
            self.max_pages_posts = int(self.page_source.find('div', {"class": "pagination"})('a')[-2].text)
        except:
            self.max_pages_posts = 1
            
        all_entries = self.page_source.find_all('div', {"class": "js-no-observer"})
        all_usernames = self.page_source.find_all('a', {"class": "username"})

        for entry, username in zip(all_entries[:self.relevancy_limit_posts], all_usernames[:self.relevancy_limit_posts]):
            self.keyword_dict[self.recent_keyword].append({
                "username": username.text, 
                "entry": self.clean_entry(entry.text)
            })
            
        if self.max_pages_posts > 1 and len(all_entries) < self.relevancy_limit_posts:
            for i in range(2, self.max_pages_posts + 1):
                link = self.current_url + "&page={}".format(i)
                self.driver.get(link) 
                self.compile_page_source()
                   
                all_entries = self.page_source.find_all('div', {"class": "js-no-observer"})
                all_usernames = self.page_source.find_all('a', {"class": "username"})

                for entry, username in zip(all_entries, all_usernames):
                    self.keyword_dict[self.recent_keyword].append({
                        "username": username.text, 
                        "entry": self.clean_entry(entry.text)
                    })
        
    def scrape_keywords(self, keywords):
        self.keywords = keywords
        
        for keyword in keywords:
            self.search_on_forum(keyword)
            self.compile_page_source()
            self.get_link_list()

            for link in self.link_list:
                self.scrape_link_entries(link)
                
            # add data to mongodb
            db.plm_entries.update_many({"keyword": keyword}, {"$set": {"objects": self.keyword_dict[keyword]}}, upsert=True)

In [9]:
plm = PLM("drgoktugasci@gmail.com", mongo_cli_password, limit_links, limit_posts)

In [10]:
plm.scrape_keywords(keywords)

IndexError: list index out of range

In [9]:
plm.keyword_dict

defaultdict(list,
            {'gastritis': [{'username': 'Rachel96', 'entry': ''},
              {'username': 'JeanineBrennan',
               'entry': 'Hi @Racchel96,Here is our condition page for Gastritis:www.patientslikeme.com/.../overviewYou can also see treatments on this page and symptoms.I hope this helps! Have you been diagnosed with Gastritis?Warm Regards,Jeanine, from the Community Team'},
              {'username': 'hello1965',
               'entry': 'Ten years ago I had to take the antibiotic Clarithromycin and it had a severe reaction leaving be with gastritis and ulcers, I live with the consequences of this today. The reason I am writing this is that I have to have an operation in the coming weeks and have been told that I will have to have an antibiotic drip, which is fine, but I will have to take antibiotics afterwards for some considerable time. My question is this? What foods or supplements would be best to take the antibiotics with? I assume there is no such thing