In [1]:
import os
import re
import json

import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess



In [1]:
# Root of the WebMD site; used below as the spider's only start URL.
webmd_url = 'https://www.webmd.com/'

class ReviewsSpider(scrapy.Spider):
    """Scrape WebMD user reviews for the drugs listed in a local CSV file.

    Crawl chain, one callback per page type:

    1. ``parse``         -- reads the brand names from
                            ``brand_to_generic_drug_names.csv`` and requests
                            the alphabetical drug index page matching each
                            name's two-letter prefix.
    2. ``parse_list``    -- finds the drug's details link on the index page.
    3. ``parse_med``     -- finds the link to the drug's review pages.
    4. ``parse_reviews`` -- extracts the reviews on one page, merges them into
                            ``drug_reviews/<drug>-reviews.json`` and follows
                            the "next page" link.

    The lower-cased drug name travels between callbacks in
    ``response.meta['drug']``.
    """

    # NOTE(review): this runs as a side effect of executing the class body and
    # changes the working directory of the whole process. The relative paths
    # used below (the CSV file and the ``drug_reviews/`` directory) rely on it.
    os.chdir("/Users/gogrean/Documents/Insight_Fellowship/Research/Mental_Health/NHANES_Survey/data")
    name = "reviews"
    start_urls = [
        webmd_url,
    ]

    # Patterns applied to the raw HTML of the extracted elements. Raw strings
    # avoid the invalid escape sequences (``\<``, ``\/``, ``\>``) of the
    # previous non-raw patterns while matching exactly the same text.
    _CONDITION_RE = re.compile(r"Condition: (.*)</div>")
    _REVIEWER_RE = re.compile(r"Reviewer:(.*)</p>")
    _RATING_RE = re.compile(r"Current Rating:(.*)</span>")
    _REVIEW_TEXT_RE = re.compile(r"</strong><br>(.*)<br>")
    _NEXT_PAGE_RE = re.compile(r"aspx(.*)")

    # (key in the stored rating dict, exact ``class`` attribute of its <div>)
    _RATING_BLOCKS = (
        ('effectiveness', 'catRatings firstEl clearfix'),
        ('ease of use', 'catRatings clearfix'),
        ('satisfaction', 'catRatings lastEl clearfix'),
    )

    def parse(self, response):
        """Request the alphabetical index page for every brand name in the CSV.

        Missing (NaN) and blank names are skipped, and each distinct name is
        requested once.
        """
        meds_df = pd.read_csv("brand_to_generic_drug_names.csv")
        seen = set()
        for raw_name in meds_df["Brand Name"].dropna():
            med = str(raw_name).strip().lower()
            if not med or med in seen:
                continue
            seen.add(med)
            list_link = response.urljoin("/drugs/2/alpha/" + med[0] + "/" + med[0:2])
            # Every drug sharing the same first two letters maps to the same
            # index URL. Without dont_filter=True Scrapy's duplicate filter
            # would drop all but the first of those requests, so only one drug
            # per prefix would ever be scraped.
            yield scrapy.Request(list_link, callback=self.parse_list,
                                 meta={'drug': med}, dont_filter=True)

    def parse_list(self, response):
        """Follow the link to the drug's details page, if the index lists it."""
        med = response.meta['drug']
        css_selector = 'div[class="drug-list-container"] a[href*="/' + med + '/"]'
        med_details_link = response.css(css_selector).css("a::attr(href)").extract_first()
        if med_details_link is None:
            self.logger.warning("No details link for %r on %s", med, response.url)
            return
        yield scrapy.Request(response.urljoin(med_details_link),
                             callback=self.parse_med, meta={'drug': med})

    def parse_med(self, response):
        """Follow the drug's reviews link with the condition filter set to -1."""
        med = response.meta['drug']
        reviews_href = response.css('a[class="drug-review"]').css("a::attr(href)").extract_first()
        if reviews_href is None:
            # Previously ``None + "&conditionFilter=-1"`` raised TypeError here.
            self.logger.warning("No reviews link for %r on %s", med, response.url)
            return
        med_reviews_link = reviews_href + "&conditionFilter=-1"
        yield scrapy.Request(response.urljoin(med_reviews_link),
                             callback=self.parse_reviews, meta={'drug': med})

    def parse_reviews(self, response):
        """Store the reviews found on one page and follow the next-page link.

        Reviews are merged into ``drug_reviews/<drug>-reviews.json``, a JSON
        object keyed by a running integer ID (stored as a string). New reviews
        continue from the highest ID already present in the file. Posts that
        lack any expected field are skipped with a warning.
        """
        med = response.meta['drug']
        fname = "drug_reviews/" + med + "-reviews.json"
        try:
            with open(fname) as f:
                data = json.load(f)
        except FileNotFoundError:
            data = {}
        # ``default=-1`` handles a file that exists but holds an empty object.
        next_user_id = max((int(u_id) for u_id in data), default=-1) + 1

        # Parse each post on its own so the fields of one review can never be
        # paired with the fields of another when a post is missing an element.
        for post in response.css('div.userPost'):
            review = self._parse_post(post)
            if review is None:
                self.logger.warning("Skipping incomplete review for %r on %s",
                                    med, response.url)
                continue
            # JSON object keys are strings; keep the in-memory keys consistent
            # with the ones loaded from the file.
            data[str(next_user_id)] = review
            next_user_id += 1

        os.makedirs(os.path.dirname(fname), exist_ok=True)
        with open(fname, 'w') as f:
            json.dump(data, f)

        next_href = response.css("""div[class="postSortPaging clearfix"] a[onclick="ctrs('dr-pagenum_next')"]""").css("a::attr(href)").extract_first()
        if next_href is None:
            return
        # Only the part after "aspx" (the query string) is joined to the
        # current page URL.
        next_match = self._NEXT_PAGE_RE.search(next_href)
        if next_match is None:
            return
        yield scrapy.Request(response.urljoin(next_match.group(1)),
                             callback=self.parse_reviews, meta={'drug': med})

    def _parse_post(self, post):
        """Return the review dict for one ``div.userPost`` selector, or None.

        None is returned when any expected element is missing, does not match
        its pattern, or (for ratings) is not an integer.
        """
        condition = self._first_group(
            self._CONDITION_RE, post.css('div.conditionInfo').extract_first())
        # Reviewer text carries the treatment duration and sometimes the
        # reviewer's gender; either may be N/A.
        reviewer_info = self._first_group(
            self._REVIEWER_RE, post.css('p.reviewerInfo').extract_first())

        rating = {}
        for key, css_class in self._RATING_BLOCKS:
            selector = 'div[class="' + css_class + '"] span[class="current-rating"]'
            value = self._first_group(self._RATING_RE, post.css(selector).extract_first())
            try:
                rating[key] = int(value)
            except (TypeError, ValueError):
                return None

        review_html = post.css('p[id^="comFull"]').extract_first()
        if review_html is not None:
            # The pattern has no DOTALL flag, so line breaks are removed first
            # to let it span a multi-line comment.
            review_html = review_html.replace("\r", "").replace("\n", "")
        userreview = self._first_group(self._REVIEW_TEXT_RE, review_html)

        if condition is None or reviewer_info is None or userreview is None:
            return None
        return {'condition': condition,
                'reviewer info': reviewer_info,
                'rating': rating,
                'review': userreview}

    @staticmethod
    def _first_group(pattern, text):
        """Return the stripped first capture group of *pattern* in *text*, or None."""
        if text is None:
            return None
        match = pattern.search(text)
        if match is None:
            return None
        return match.group(1).strip()
            
# Run the spider from this script/notebook cell: create a crawler process with
# default settings, register the spider class, then start the crawl.
process = CrawlerProcess()

process.crawl(ReviewsSpider)
# NOTE(review): presumably blocks until the crawl finishes and cannot be
# re-run in the same interpreter session -- confirm against the Scrapy docs.
process.start()

2018-01-16 09:15:44 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: scrapybot)
2018-01-16 09:15:44 [scrapy.utils.log] INFO: Versions: lxml 3.5.0.0, libxml2 2.9.2, cssselect 1.0.3, parsel 1.3.1, w3lib 1.18.0, Twisted 17.9.0, Python 3.5.2 |Anaconda custom (x86_64)| (default, Jul  2 2016, 17:52:12) - [GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)], pyOpenSSL 17.0.0 (OpenSSL 1.0.2g  1 Mar 2016), cryptography 1.7.1, Platform Darwin-17.3.0-x86_64-i386-64bit
2018-01-16 09:15:44 [scrapy.crawler] INFO: Overridden settings: {}
2018-01-16 09:15:44 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.memusage.MemoryUsage']
2018-01-16 09:15:44 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloade