# Data Collection

## Part 1: Web Scraping

In the following section, I scrape the Mayo Clinic's Symptoms and Causes pages under all of their indexed diseases and conditions.

In [None]:
# import the necessary libraries
import sys
import requests
from bs4 import BeautifulSoup
from string import ascii_uppercase as upp
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

The first step is to get the long list of symptoms-causes URLs that are linked from the Mayo Clinic's indexed Diseases and Conditions lookup. I begin by saving the URL up to and including the query for the letter, but not the letter itself. I also save the root of the URL for later use.

In [None]:
# Base pieces of the Mayo Clinic URLs: the site root, and the alphabetical
# diseases-conditions index query, which stops just before the letter itself
# (the index also has one "#" entry besides the letters).
root = 'https://www.mayoclinic.org'
url = root + '/diseases-conditions/index?letter='

Next, I write a function to extract the URLs of the pages I want from the letter index pages.

In [None]:
def extract(letter, addresses, timeout=30):
    """Append the article URLs listed on one index page to ``addresses``.

    Fetches the index page for ``letter`` (the module-level ``url`` prefix
    followed by ``letter``), reads the ``href`` of every link inside the
    elements whose class is ``"index content-within"``, resolves each one
    against the module-level ``root``, and appends it to ``addresses`` unless
    it is already there.

    Args:
        letter: Value for the index page's ``letter=`` query.
        addresses: List of URLs, extended in place. Order of first
            appearance is preserved.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    from urllib.parse import urljoin

    response = requests.get(url + letter, timeout=timeout)
    # Fail loudly: an error page contains no index container, so without this
    # check it would look exactly like a letter that has no articles.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "lxml")

    seen = set(addresses)  # O(1) membership tests instead of list.count()
    for container in soup.find_all(class_="index content-within"):
        # Read href from the parsed tags rather than running a regex over the
        # serialized HTML: this still works when a link carries other
        # attributes, and returns the href with entities already unescaped.
        for anchor in container.find_all("a", href=True):
            full = urljoin(root, anchor["href"])  # symptoms-causes URL
            if full not in seen:
                seen.add(full)
                addresses.append(full)

I use the function to extract the URL from the # index page (requested as `letter=0`), the only index item that isn't listed under a capital letter of the alphabet. This helps me to ensure the function is behaving correctly without scraping too much.

In [None]:
# Start from an empty list and try the function on a single index page first;
# "0" is the letter value passed for the index's non-letter "#" entry.
addresses = []
extract("0", addresses)
addresses

Now that I know it works, I can extract the other URLs:

In [None]:
# Repeat for every capital letter A-Z (upp is string.ascii_uppercase);
# extract() only appends URLs that are not already in the list.
for letter in upp:
    extract(letter, addresses)

Now it's time to put these URLs into an initial dataframe that I will merge with the relevant Spider and Moz data.

In [None]:
# One-column dataframe of the scraped URLs; "url" is the key used for the
# merges further down.
mayo_data = pd.DataFrame(addresses, columns=["url"])
mayo_data.head()

In [None]:
# (rows, columns) -- the row count is the number of URLs collected
mayo_data.shape

## Part 2: SEO Spider Data

#### Raw Source Data: https://drive.google.com/open?id=1vlTTVOf3L2TnJxRma4TyJPCsgKpVvM19

Below are all of the columns provided by SEO Spider, not all of which will be useful for my purposes.

In [None]:
# Load the SEO Spider export and display a single row to see what it contains.
raw = pd.read_csv("symptoms-causes.csv")
raw.head(1)

In [None]:
# full list of column names available in the export
raw.columns

I select which columns I want to keep:

In [None]:
# Columns to keep from the SEO Spider export: the address, the header and
# meta description (each with its length), page size, word count, and the
# inbound / outbound / external link counts.
keep_cols = [
    "URL Encoded Address",
    "H1-1",
    "H1-1 length",
    "Meta Description 1",
    "Meta Description 1 Length",
    "Size (bytes)",
    "Word Count",
    "Inlinks",
    "Unique Inlinks",
    "Outlinks",
    "Unique Outlinks",
    "External Outlinks",
    "Unique External Outlinks",
]
trimmed = raw[keep_cols]
trimmed.head()

...and convert them into more coding-friendly formats:

In [None]:
# Work on a copy: assigning to .columns on an alias of `trimmed` would
# silently rename the columns of `trimmed` as well.
cleaned = trimmed.copy()

# lowercase, spaces to underscores
new_colnames = [name.lower().replace(' ', '_') for name in cleaned.columns]
cleaned.columns = new_colnames

# shorter names for the columns that need more than the generic cleanup
cleaned = cleaned.rename(columns={
    'url_encoded_address': 'url',
    'h1-1': 'header',
    'h1-1_length': 'header_len',
    'meta_description_1': 'meta',
    'meta_description_1_length': 'meta_len',
    'size_(bytes)': 'bytes',
    'unique_inlinks': 'unique_in',
    'unique_outlinks': 'unique_out',
    'external_outlinks': 'ext_links',
    'unique_external_outlinks': 'unique_ext',
})

# lowercase the headers; .str.lower() leaves a missing header as NaN, where
# calling .lower() on each value would raise on the first missing one
cleaned["header"] = cleaned["header"].str.lower()

cleaned.head(3)

In [None]:
# (rows, columns) of the cleaned SEO Spider data
cleaned.shape

## Part 3: Merge Datasets

In [None]:
# Keep only the URLs scraped from the Mayo Clinic site that also appear in the
# SEO Spider crawl: an inner join on the shared "url" column.
data = mayo_data.merge(cleaned, how='inner', on='url')
data.head(3)

In [None]:
# (rows, columns) after the merge
data.shape

In [None]:
# keep only the first row for each URL, dropping any repeats
data = data.drop_duplicates(subset="url", keep="first")

In [None]:
# No duplicates were found, but the inner merge did drop the URLs that were
# scraped from the Mayo Clinic site but were not reached by the crawl.
data.shape

In [None]:
# check for any NaNs in the dataset (a single True/False for the whole frame)
data.isnull().values.any()

In [None]:
# check where: one True/False per column
data.isnull().any()

In [None]:
# check which it is: show the rows whose meta description is missing
data[data["meta"].isnull()]

In [None]:
# Replace the null meta description with a single-space string (not an empty
# string), assigning through .loc so the change is written into `data` itself
# rather than into a copy.
data.loc[data['meta'].isnull(), 'meta'] = " "

In [None]:
# show the "tapeworm infection" row
# (presumably the page whose meta description was missing -- confirm)
data[data["header"]=="tapeworm infection"]

In [None]:
# make sure that was it: no nulls should be left anywhere in the dataset
data.isnull().values.any()

## Part 4: Moz Data
### Top 500 Ranking Pages on the Mayo Clinic Domain

In [None]:
# Load the Moz export of top pages and preview the first rows.
moz = pd.read_csv("moz-top-pages.csv")
moz.head()

In [None]:
# Normalize the Moz column names directly on the dataframe:
# lowercase, with underscores in place of spaces.
colnames = [name.lower().replace(' ', '_') for name in moz.columns]
moz.columns = colnames

In [None]:
# Make sure the URL format matches the scraped URLs by adding the scheme.
# This is one vectorized operation, so it does not depend on the index labels
# being 0..n-1. URLs that already carry a scheme are left alone, so
# re-running this cell cannot double the prefix; missing URLs stay missing.
has_scheme = moz["url"].str.match(r"https?://", na=False)
moz["url"] = moz["url"].where(has_scheme, "https://" + moz["url"])

In [None]:
# preview the Moz data with the adjusted URLs
moz.head()

In [None]:
# Trial merge: how many pages are left when the symptoms-causes dataset is
# inner-joined with the Moz data on "url"?
experiment = data.merge(moz, how='inner', on='url')
experiment.shape

In [None]:
# keep the first row for each URL and drop any repeats
experiment = experiment.drop_duplicates(subset="url", keep="first")

In [None]:
# By merging the datasets and keeping only the top-ranking pages,
# I've lost a lot of data.
experiment.shape

In [None]:
# check the range of the page authority ("pa") variable that I wanted:
# lowest value first
experiment['pa'].min()

In [None]:
# Highest value. A range of 60-67 is something, since these rankings are
# logarithmic and it therefore gets harder to increase page rank as you
# near 100,
experiment['pa'].max()

In [None]:
# but the variation between successful pages is wider than it seems
# (variance of the page authority scores)
experiment['pa'].var()

Fewer than half of the 500 top-ranking pages on the Mayo Clinic website are Symptoms and Causes pages. Of those pages, the Page Authority ranking (out of 100) ranges only from 60 to 67, which is not very wide. This is a very small sample with not much variance, and I have similar data already from SEO Spider, so I will use the data from Moz about Page Authority in a different way. What I want to know is which pages in my dataset are in the top 500, and since the Page Authority scores of those pages are fairly close together, I will mark them as top-ranking with a 1 (True), and the rest of the pages in the dataset with a 0 (False). This way I am not losing very much information, and I am keeping my sample size much closer to the population size.

It would be most desirable if I had the page authority ranking from Moz for all of the Mayo Clinic pages I am interested in looking at, but unfortunately they only provide the top 500 for the domain.

In [None]:
# list the columns available after the merge
experiment.columns

In [None]:
# Smaller dataset: the URL and its page authority, followed by the page
# features and link counts to keep.
top_cols = [
    "url", "pa",
    "header", "header_len",
    "meta", "meta_len",
    "bytes", "word_count",
    "total_links", "linking_domains_to_page",
    "ext_links", "unique_ext",
    "inlinks", "unique_in",
    "outlinks", "unique_out",
]
top_data = experiment[top_cols]

In [None]:
# rename column: 'linking_domains_to_page' becomes the shorter 'inbound_domains'
top_data = top_data.rename(columns = {'linking_domains_to_page' : 'inbound_domains'})

In [None]:
# preview the trimmed, renamed dataset
top_data.head(3)

## Part 5: More Web Scraping (publication date)

In [None]:
# The date is taken as the first run of text enclosed by \r\n line breaks in
# the "pubdate" element; compile the pattern once for the whole loop.
date_line = re.compile(r"(?<=\r\n).*?(?=\r\n)")

dates = []

for page in top_data["url"]:
    # the timeout keeps one stalled request from hanging the whole scrape
    content = requests.get(page, timeout=30).content  # page content
    file = BeautifulSoup(content, "lxml")  # in lxml
    date = file.find("div", class_='pubdate')
    # A page with no pubdate element, or one without the expected line
    # layout, is recorded as None instead of raising IndexError and
    # aborting the loop.
    match = date_line.search(date.get_text()) if date is not None else None
    if match is not None:
        dates.append(match.group(0).strip())  # get rid of \r\n and spaces
    else:
        dates.append(None)

# add new column to dataframe
top_data['pub_date'] = dates
top_data.head()

In [None]:
# Check if any publication dates didn't get scraped (they were stored as
# None). The scraped dates live in top_data["pub_date"]; `data` does not
# contain them and already had its nulls filled in.
top_data["pub_date"].isnull().any()

In [None]:
# convert strings to comparable datetime objects
import datetime


def _parse_pub_date(text):
    """Turn a scraped publication date string into a datetime.

    Strings containing a period are parsed as an abbreviated month followed
    by a period (jan, feb, aug, sept, oct, nov, dec) using ``%b.``; all
    other strings are parsed with the full month name using ``%B``.

    Raises:
        ValueError: If ``text`` matches neither format.
    """
    if "." in text:
        # special case: datetime recognizes "Sep" not "Sept"; replace only
        # that abbreviation rather than every "t" in the string
        text = text.replace("Sept.", "Sep.")
        return datetime.datetime.strptime(text, '%b. %d, %Y')
    return datetime.datetime.strptime(text, '%B %d, %Y')


# pages without a scraped date (anything that is not a string) stay null
top_data["pub_date"] = [
    _parse_pub_date(date) if isinstance(date, str) else None
    for date in top_data["pub_date"]
]

In [None]:
# preview the dataset with the converted publication dates
top_data.head()

In [None]:
# final (rows, columns) of the dataset about to be saved
top_data.shape

## Part 6: Save dataset

In [None]:
# Write the final dataset to ranked.csv in the working directory.
# NOTE(review): with to_csv's defaults the dataframe index is written as an
# extra unnamed first column -- confirm whether whatever reads this file
# expects it before passing index=False.
top_data.to_csv('ranked.csv')