In [0]:
!apt install chromium-chromedriver
!pip install selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import requests
from bs4 import BeautifulSoup
from google.colab import files
# Chrome launch flags: run without a display, without the sandbox, and
# without relying on /dev/shm.
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)

In [0]:
# Start Chrome with the options configured above. The driver path is NOT
# passed positionally: Selenium 4.10+ no longer accepts it and raises a
# TypeError, while the keyword-only form works on old and new releases
# (the chromedriver binary is located via PATH / Selenium's own lookup).
driver = webdriver.Chrome(options=options)
driver.get("http://www.pharmcas.org/school-directory/#/pharmd/general-information")
# Element lookups made after this point poll for up to 10 seconds before
# giving up.
driver.implicitly_wait(10)

In [0]:
# Grab every anchor inside the PharmD schools table and keep its href.
numbers = driver.find_elements(
    By.XPATH,
    '//table[@class="phc-schools-table phc-schools-table-pharmd"]//a',
)
links = [anchor.get_attribute('href') for anchor in numbers]

# Cell output: how many links were collected.
len(links)

In [0]:
def _joined_text(soup, selector):
    """Return the concatenated text of every element matching ``selector``.

    The result is an empty string when nothing matches.
    """
    return ''.join(node.get_text() for node in soup.select(selector))


def _first_text(soup, selector):
    """Return the text of the first element matching ``selector``, or ''."""
    matches = soup.select(selector)
    return matches[0].get_text() if matches else ''


def _parse_school_page(soup):
    """Extract one 16-field result row from a parsed school page.

    Fields are returned in this order: alerts, school name, state,
    application deadline, class size, updates, institution type, early
    decision, specialty programs, minimum overall GPA, minimum
    prerequisite GPA, prerequisite hours, PCAT required, minimum PCAT
    score, letters of reference, deposit. A field whose selector matches
    nothing is an empty string (``'N/A'`` for the state).
    """
    info_row = '.prog-information > table > tbody > tr:nth-of-type({}) > '
    criteria_row = '.prog-criteria > table > tbody > tr:nth-of-type({}) > '
    test_row = '.test > table > tbody > tr:nth-of-type({}) > '

    all_alerts = _joined_text(soup, 'aside:nth-of-type(1)').strip()
    all_alerts = all_alerts.replace('\n                        ', '')

    # Only the first <h1> is used as the school name.
    all_names = _first_text(soup, 'h1').strip()

    region = soup.find("span", itemprop="addressRegion")
    state = region.get_text() if region is not None else 'N/A'

    all_deadlines = _joined_text(
        soup, '.deadline > table > tbody > tr > td:nth-of-type(2)')
    all_seats = _joined_text(
        soup,
        '.prog-statistics > table > tbody > tr:nth-of-type(1) > '
        'td:nth-of-type(2)')
    all_updates = _joined_text(soup, 'aside:nth-of-type(2)').strip()

    all_statuses = _joined_text(
        soup, info_row.format(3) + 'td:nth-of-type(2)').strip()
    all_early = _joined_text(soup, info_row.format(11) + 'td:nth-of-type(2)')
    all_eas = _joined_text(
        soup, info_row.format(12) + 'td:nth-of-type(2) > ul > li')

    all_min_ovr = _joined_text(
        soup, criteria_row.format(1) + 'td:nth-of-type(2)')
    all_min_req = _joined_text(
        soup, criteria_row.format(2) + 'td:nth-of-type(2)')

    all_hours = _joined_text(
        soup,
        '.prog-prerequisites > table:nth-of-type(1) > tbody > '
        'tr:nth-of-type(1) > td:nth-of-type(2)')

    all_scores = _joined_text(
        soup, test_row.format(1) + 'td:nth-of-type(2) > p').strip()
    all_pcat_scores = _joined_text(
        soup, test_row.format(2) + 'td:nth-of-type(2)')

    all_lors = _joined_text(
        soup,
        '.letters-of-reference > table > tbody > tr:nth-of-type(2) > '
        'td:nth-of-type(2)').strip()

    # NOTE(review): the second alternative ('table > tbody > tr') matches
    # rows of any table, so the first match in document order may not be
    # the deposit cell -- kept as-is, confirm against the live pages.
    all_deps = _first_text(
        soup,
        '.accepted-applicants > table > tbody > tr > td:nth-of-type(2), '
        'table > tbody > tr').replace('\n', ' ')

    return (all_alerts, all_names, state, all_deadlines,
            all_seats, all_updates, all_statuses,
            all_early, all_eas, all_min_ovr, all_min_req,
            all_hours, all_scores, all_pcat_scores,
            all_lors, all_deps)


def pharm_crawl(a_list, data_list):
    """Fetch each school page and append one parsed row per page.

    Args:
        a_list: iterable of page URLs to request.
        data_list: list that is extended in place with one 16-tuple per
            successfully fetched page (see ``_parse_school_page`` for the
            field order).

    Returns:
        None. Results are delivered through ``data_list``.

    A URL whose request fails (network error, timeout, or a non-OK HTTP
    status) is reported and skipped, so it never produces a row built
    from a previous page's values.
    """
    headers = {'user-agent': 'jobscraper - school project (gndumbri@gmail.com)'}
    for url in a_list:
        try:
            # The timeout keeps one unresponsive server from stalling the crawl.
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            print('Skipping {}: {}'.format(url, exc))
            continue

        if not response.ok:
            print('Skipping {}: HTTP {}'.format(url, response.status_code))
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        data_list.append(_parse_school_page(soup))


In [0]:
# Start from an empty result list so this cell works on a fresh kernel and
# re-running it does not append duplicate rows.
data_list = []
# NOTE(review): the slice skips the first 11 collected links -- presumably
# they are not school pages; confirm against the directory table.
pharm_crawl(links[11:], data_list)

In [0]:
# Column headers, listed in the same order as the tuples that pharm_crawl
# appends to data_list.
column_names = [
    'Alerts',
    'School',
    'State',
    'Application Deadline',
    'Class Size',
    'Updates',
    'Inst Type',
    'Early Decision',
    'Specialty Programs',
    'Min Overall GPA',
    'Min Pre-Req GPA',
    'Pre-req hours',
    'PCAT Required',
    'Minimum Composite PCAT Score Considered',
    'LORs',
    'Deposit',
]
df = pd.DataFrame(data_list, columns=column_names)
df.head()

In [0]:
# Write the scraped table to a UTF-8 CSV with a header row and no index
# column.
df.to_csv(
    'Pharmacy_School_Surveillance_Program.csv',
    index=None,
    header=True,
    encoding='utf-8',
)

In [0]:
# Hand the CSV written by the previous cell to google.colab's download
# helper.
files.download('Pharmacy_School_Surveillance_Program.csv')