In [None]:
import pandas as pd
import numpy as np
import time

# API access
import requests
import warnings
import json

# Reading a website
from bs4 import BeautifulSoup
import re
import multiprocess
import os
import csv

# Progress bar and timing
from tqdm import tqdm
tqdm.pandas(desc="Progress")

warnings.simplefilter(action='ignore', category=FutureWarning)

### Overview: Scraping sites for handbooks

We are starting with a list of school records, some of which include websites. There are several steps we need to complete for each school:
  1) Obtain the content of the school's website, if it exists
  
  2) Check whether the content contains any links that match our terms of interest (handbook, conduct, etc.)
  
  3) Merge the links we found back into the school records
  
  4) For schools with handbooks, determine whether the link takes us to a webpage or a document we can download, or to multiple documents/webpages
  
  5) Download handbook documents according to format (html, doc, pdf, google doc)

### Function definitions

In [None]:
# Obtain the content of the website, if it exists
def get_url_content(url):
    '''
    Requests a web page and parses the response body as HTML
    Accepts: str
    Returns: beautiful soup object, or the exception object raised while
             requesting/parsing the page
    '''
    try:
        response = requests.get(url, timeout=180)
        return BeautifulSoup(response.content, 'html.parser')
    except Exception as err:
        # Best effort: the failure is handed back to the caller instead of raised
        return err

In [None]:
# Check whether the content contains any links that match our terms of interest
def term_search(url, soup, regex):
    '''
    Returns a list of urls from a webpage (soup) that match search terms
    Accepts: url str (address the page was fetched from, used to resolve
             relative links), soup object (anything that is not a
             BeautifulSoup object yields an empty list), regex str
    Returns: list of str (absolute urls, duplicates removed, first-seen order)
    '''
    # Local import: only this function needs it
    from urllib.parse import urljoin

    if not isinstance(soup, BeautifulSoup):
        return []

    pattern = re.compile(regex, re.IGNORECASE)
    # search for matches in anchor text and/or link target
    anchor_matches = soup.find_all('a', string=pattern)
    link_matches = soup.find_all(href=pattern)

    links = []
    for tag in anchor_matches + link_matches:
        href = tag.get('href')
        if not isinstance(href, str):
            # the anchor text matched but the tag carries no link target
            continue
        if href.startswith('http'):
            links.append(href)
        elif href.startswith('www.'):
            links.append('https://' + href)
        elif href.startswith('//www'):
            links.append('https:' + href)
        else:
            # Resolve relative paths against the page address; plain string
            # concatenation glues '../x' onto the end of the page's file name
            try:
                links.append(urljoin(url, href))
            except (TypeError, ValueError):
                # unusable base/href combination: skip rather than abort the page
                continue
    # dict.fromkeys drops duplicate urls while keeping first-seen order
    return list(dict.fromkeys(links))

In [None]:
# Select the url most likely to be a downloadable handbook
def select_url(links):
    '''
    Return url most likely to be downloadable document
    Accepts: list of str
    Returns: str -- 'None' for an empty list, the only entry of a
             one-element list, otherwise the first entry containing
             '.pdf' or '.doc' (case-insensitive), or 'Multiple' when
             no entry does
    '''
    if len(links) == 0:
        return 'None'
    if len(links) == 1:
        return links[0]

    # Escaped dots so that only real '.pdf' / '.doc' fragments count
    document_pattern = re.compile(r'\.pdf|\.doc', re.IGNORECASE)
    for link in links:
        # str() so that a non-string entry cannot raise a TypeError here
        if document_pattern.search(str(link)):
            return link
    # Only reached after every link has been checked
    return 'Multiple'

In [None]:
def doc_type(url):
    '''
    Classify a link by the kind of resource its text suggests
    Accepts: any value (converted with str() before matching)
    Returns: 'gdrive', 'pdf', 'doc', 'web', or None when nothing matches
    '''
    text = str(url)
    # Ordered by priority: the first pattern that matches decides the label
    checks = (
        ('gdrive', r'\.google\.com'),
        ('pdf', r'\.pdf'),
        ('doc', r'\.doc|\.rtf'),
        ('web', r'http|www\.'),
    )
    for label, pattern in checks:
        if re.search(pattern, text, re.IGNORECASE):
            return label
    return None

In [None]:
def get_docs(url_list, doc_list=None):
    '''
    Pick out the urls that doc_type() labels as documents
    Accepts: url_list list of urls; doc_list optional list of documents
             found earlier (left unmodified)
    Returns: new list -- the 'gdrive', 'pdf' and 'doc' urls from url_list,
             followed by the entries of doc_list
    '''
    # None instead of a mutable [] default, so no list object is shared between calls
    earlier_docs = [] if doc_list is None else doc_list
    docs = [url for url in url_list if doc_type(url) in ('gdrive', 'pdf', 'doc')]
    return [*docs, *earlier_docs]
            
def get_sites(url_list, site_list=None):
    '''
    Pick out the urls that doc_type() labels as web pages
    Accepts: url_list list of urls; site_list optional list of pages
             found earlier (left unmodified)
    Returns: new list -- the 'web' urls from url_list, followed by the
             entries of site_list
    '''
    # None instead of a mutable [] default, so no list object is shared between calls
    earlier_sites = [] if site_list is None else site_list
    sites = [url for url in url_list if doc_type(url) == 'web']
    return [*sites, *earlier_sites]

In [None]:
def is_url(url):
    '''
    Rough validity check: a usable url contains no space characters
    Accepts: any value
    Returns: True if url is a str with no space once trailing whitespace
             is stripped, False if a space remains, None if url is not a str
    '''
    if not isinstance(url, str):
        return None
    return ' ' not in url.rstrip()

In [None]:
def handbook_search(df):
    '''
    Single-argument wrapper around term_search() so it can be mapped over
    dataframe chunks by a process pool
    Accepts: Pandas dataframe with 'final_website' column
    Returns: the same dataframe with a 'handbooks' column added (list of
             matching links per row)
    '''
    search_re = r'handbook|conduct'

    def find_links(site):
        # Fetch the page, then collect the links matching the search terms
        return term_search(site, get_url_content(site), search_re)

    df['handbooks'] = df['final_website'].progress_apply(find_links)
    return df

In [None]:
# def handbook_search_recursive(df):
#     '''
#     Wrapper for term_search() for use with single-argument multiprocessing
#     Accepts: Pandas dataframe with 'source_url' column
#     Returns: Pandas dataframe
#     '''
#     search_re = r'handbook|conduct'
#     df = df.progress_apply(lambda x: recursive_search(depth=3, doc_list=[], site_list=[x['source_url']], 
#                                                       search_re=r'handbook|conduct', search_list=[]),
#                            axis=1, result_type='expand')
#     return df

In [None]:
def handbook_search_recursive(df):
    '''
    Wrapper for recursive_search() for use with single-argument multiprocessing.
    Each row's result is also appended to 'output.csv' as soon as it is computed.
    Accepts: Pandas dataframe with 'source_url' column
    Returns: Pandas dataframe with the input columns plus 'docs' and 'search_list'
    '''
    search = r'handbook|conduct'
    fieldnames = ['docs', 'search_list']
    records = df.to_dict('records')
    for record in tqdm(records):
        result = recursive_search(depth=3, doc_list=[], site_list=[record.get('source_url')],
                                  search_re=search, search_list=[])
        # dict.update works in place and returns None, so its result must not be assigned
        record.update(result)
        # Open the file only for the write, so no handle is held during the crawl;
        # appending per row keeps finished results on disk if a later row fails
        with open('output.csv', 'a', newline='') as csvfile:
            csv.DictWriter(csvfile, fieldnames=fieldnames).writerow(result)
    return pd.DataFrame.from_records(records)

In [None]:
def parallel_search(df, func, n_cores=11):
    '''
    Applies func to row-wise chunks of a dataframe in parallel and recombines the results
    Accepts: Pandas dataframe; func taking a dataframe and returning a dataframe
             (e.g. handbook_search or handbook_search_recursive); n_cores int,
             used both as the number of chunks and of worker processes
    Returns: Pandas dataframe (chunk results concatenated in chunk order)
    '''
    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame goes through DataFrame.swapaxes, which pandas has deprecated
    row_groups = np.array_split(np.arange(len(df)), n_cores)
    df_split = [df.iloc[rows] for rows in row_groups]

    pool = multiprocess.Pool(processes=n_cores)
    try:
        results = pool.map(func, df_split)
        pool.close()
    except BaseException:
        # Do not leave worker processes behind when a chunk fails or the run is interrupted
        pool.terminate()
        raise
    finally:
        pool.join()
    return pd.concat(results)

In [None]:
def recursive_search(depth, doc_list, site_list, search_re, search_list):
    '''
    Follows matching links level by level, collecting document urls
    Accepts: depth int (levels still allowed), doc_list list (document urls
             found so far), site_list list (pages to visit at this level),
             search_re regex str, search_list list (pages already visited;
             appended to in place)
    Returns: dict with keys 'docs' and 'search_list', each a de-duplicated list
    '''
    # Stop when the depth budget is used up or there is nothing left to visit
    if depth == 0 or len(site_list) == 0:
        return {'docs': list(set(doc_list)), 'search_list': list(set(search_list))}

    next_level = []
    for page in site_list:
        if page in search_list:
            # already visited
            continue
        search_list.append(page)
        found = term_search(page, get_url_content(page), search_re)
        # Only pages with between 1 and 10 matching links contribute
        if not 0 < len(found) <= 10:
            continue
        doc_list = [doc for doc in get_docs(found, doc_list) if is_url(doc)]
        next_level.extend(get_sites(found))
    return recursive_search(depth - 1, doc_list, next_level, search_re, search_list)

### Testing

In [None]:
# Testing different scenarios: sample school pages for manual checks of the search functions
# NOTE(review): the 'href' / 'anchor' counts presumably record how many links each
# matching strategy in term_search() found on that page -- confirm
sch0 = 'https://www.duneland.k12.in.us/Domain/13' #(href=0, anchor=0)
sch1 = 'http://www.springlakeparkschools.org/' #handbook is website, not document
sch2 = 'http://www.huntsvillecityschools.org' #(href=0, anchor=1)
sch3 = 'http://www.randolphacademy.org' #(href=1, anchor=1)
sch4 = 'https://www.floraschools.com/FHS/'
sch5 = 'https://www.floraschools.com/fes/handbook.cfm'

# Test get_url_content() and term_search()
# search_re = r'handbook|conduct'
# for school in [sch0, sch1, sch2, sch3, sch4, sch5]:
#     results = term_search(school, get_url_content(school), search_re)
#     print('Results:', results)

In [None]:
# for school in [sch0, sch1, sch2, sch3, sch4, sch5]:
#     print(recursive_search(depth=3, doc_list=[], site_list=[school], 
#                            search_re=r'handbook|conduct', search_list=[]))

In [None]:
# Sample inputs for is_url(): the first two contain spaces (and a relative '../'
# path glued onto the page address), the third is the percent-encoded form
test_urls = ['https://www.floraschools.com/fes/handbook.cfm../cms_files/resources/Elementary Handbook-Student 2021-22.doc', 
             'https://www.floraschools.com/FHS/fhs-studenthandbook.cfm../cms_files/resources/FHS Student Handbook 2021-2022.pdf', 
             'https://www.floraschools.com/cms_files/resources/FHS%20Student%20Handbook%202021-2022.pdf']
# for t in test_urls:
#     print(is_url(t))

## Parallelize handbook search

In [None]:
# Load the school website data produced by the previous website scraper,
# keeping one row per (school_id, source_url) pair with no missing values
sites = (
    pd.read_csv('./data/school_websites.csv', dtype=object, usecols=['NCESSCH', 'final_website'])
    .rename(columns={'final_website':'source_url', 'NCESSCH':'school_id'})
    .drop_duplicates()
    .dropna()
    .copy()
    .reset_index(drop=True)
)

sites.shape

In [None]:
# Preview the first rows of the cleaned website table
sites.head()

### Testing with sample from website data

In [None]:
# Add new columns to dataframe with list of links and final selection of url (or "Multiple")
# temp = sites.sample(20, random_state=123).reset_index(drop=True)
# search_re = r'handbook|conduct|code'
# temp['links'] = temp['final_website'].progress_apply(lambda x: term_search(x, get_url_content(x), search_re))
# temp['handbooks'] = temp['final_website'].progress_apply(lambda x: select_url(term_search(x, get_url_content(x), search_re)))

In [None]:
# Start a fresh output.csv holding only the header row;
# handbook_search_recursive() appends one row per school to this file
with open('output.csv', 'w', newline='') as csvfile:
    fieldnames = ['docs', 'search_list']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    
# Run the recursive search serially on a reproducible 5-school sample
temp = sites.sample(5, random_state=5).reset_index(drop=True)
all_results = handbook_search_recursive(temp)
all_results

In [None]:
# Reset output.csv to just the header row before the parallel test run
with open('output.csv', 'w', newline='') as csvfile:
    fieldnames = ['docs', 'search_list']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    
# Reproducible 30-school sample for the parallel test
temp = sites.sample(30, random_state=10).reset_index(drop=True)

In [None]:
# NOTE(review): cpu is computed but not used below; the worker count passed
# to parallel_search() is hard-coded to 10
cpu = multiprocess.cpu_count() - 1

# Time the parallel search over the sample (elapsed seconds are printed)
t = time.time()
temp = parallel_search(temp, handbook_search_recursive, 10)
print(time.time()-t)

In [None]:
# Display the search results for the sample
temp

In [None]:
# Save the sample results (one row per school) without the dataframe index
temp.to_csv('./data/handbook_url_search_results_test.csv', index=False)

In [None]:
# Save one row per document url: drop the visited-pages column and explode the 'docs' lists
temp.drop(['search_list'], axis=1).explode('docs').to_csv('./data/handbook_url_docs_test.csv', index=False)

### Application with all school websites

In [None]:
# Reset output.csv to just the header row before the full run;
# handbook_search_recursive() appends one row per school to this file
with open('output.csv', 'w', newline='') as csvfile:
    fieldnames = ['docs', 'search_list']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

In [None]:
# NOTE(review): cpu is computed but not used below; the worker count passed
# to parallel_search() is hard-coded to 32
cpu = multiprocess.cpu_count() - 1

# Time the parallel search over every school website (elapsed seconds are printed)
t = time.time()
docs = parallel_search(sites, handbook_search_recursive, 32)
print(time.time()-t)

In [None]:
# Save the full results (one row per school) without the dataframe index
docs.to_csv('./data/handbook_url_search_results.csv', index=False)

In [None]:
# Preview the first rows of the full results
docs.head()

In [None]:
# Save one row per document url: drop the visited-pages column and explode the 'docs' lists
docs.drop(['search_list'], axis=1).explode('docs').to_csv('./data/handbook_url_docs.csv', index=False)