In [None]:
# Standard library
import json
import os.path
import re
import shutil
import warnings

# Data handling
import numpy as np
import pandas as pd

# API access and reading a website
import requests
from bs4 import BeautifulSoup

# Progress bar and timing
from tqdm import tqdm

# Notebook-wide setup: enable tqdm's pandas integration, then hide FutureWarning output
tqdm.pandas(desc="Progress")
warnings.simplefilter(action='ignore', category=FutureWarning)

Note: I manually created the `handbooks` folder and its subfolders (`doc` and `pdf`) beforehand, as that was easier at this point than creating those folders in the code.

## Function definitions

In [None]:
def download_doc(df, timeout=60):
    '''
    The purpose of this function is to download handbooks with .doc file type.

    Every row whose 'handbook_type' is 'doc' is fetched from the URL in its
    'handbooks' column and written to ./handbooks/doc/<NCESSCH>.doc. If a file
    with that name already exists, a numeric suffix (_1, _2, ...) is appended
    so that nothing is overwritten. The ./handbooks/doc folder must already
    exist; a missing folder is reported as a failed download.

    Inputs -
    df: dataframe with 'handbook_type', 'handbooks' and 'NCESSCH' columns
    timeout: seconds passed to requests.get as its timeout

    Outputs -
    None. The dataframe is modified in place: its 'status' column is
    (re)created with 'Complete' or 'Failed:<error>' for the doc rows and NaN
    for every other row.
    '''
    doc_status = []

    # Walk the columns positionally so the function does not depend on the
    # dataframe having a default 0..n-1 index.
    rows = zip(df['handbook_type'], df['handbooks'], df['NCESSCH'])

    for handbook_type, url, school_id in tqdm(rows, total=len(df)):
        if handbook_type != 'doc':
            doc_status.append(np.nan)
            continue

        try:
            # The request lives inside the try block so that one unreachable
            # URL is recorded as a failure instead of aborting the whole run.
            response = requests.get(url, timeout=timeout)
            # Do not save an HTTP error page under a .doc name.
            response.raise_for_status()

            d_name = r'./handbooks/doc/%i.doc' % school_id

            # Checking if the file already exists
            filename, extension = os.path.splitext(d_name)
            i = 1

            while os.path.exists(d_name):
                d_name = filename + '_' + str(i) + extension
                i += 1

            with open(d_name, 'wb') as t:
                t.write(response.content)

            doc_status.append('Complete')
        except Exception as e:
            # Exactly one entry per row keeps the list aligned with the dataframe.
            doc_status.append('Failed:' + str(e))

    df['status'] = doc_status

In [None]:
def download_pdf(df, timeout=60):
    '''
    The purpose of this function is to download the handbooks with a .pdf file type.

    Every row whose 'handbook_type' is 'pdf' is fetched from the URL in its
    'handbooks' column and written to ./handbooks/pdf/<NCESSCH>.pdf. If a file
    with that name already exists, a numeric suffix (_1, _2, ...) is appended
    so that nothing is overwritten. The ./handbooks/pdf folder must already
    exist; a missing folder is reported as a failed download.

    Inputs -
    df: the full dataframe, with 'handbook_type', 'handbooks' and 'NCESSCH'
        columns and, optionally, an existing 'status' column
    timeout: seconds passed to requests.get as its timeout

    Outputs -
    None. The dataframe is modified in place: 'status' becomes 'Complete' or
    'Failed:<error>' for the pdf rows, while every other row keeps the status
    it already had (NaN when there was no 'status' column yet).
    '''
    # Rows that are not pdfs keep whatever an earlier step recorded; without a
    # 'status' column there is nothing to keep, so they default to NaN.
    if 'status' in df.columns:
        previous_status = df['status'].tolist()
    else:
        previous_status = [np.nan] * len(df)

    pdf_status = []

    # Walk the columns positionally so the function does not depend on the
    # dataframe having a default 0..n-1 index.
    rows = zip(df['handbook_type'], df['handbooks'], df['NCESSCH'], previous_status)

    for handbook_type, url, school_id, old_status in tqdm(rows, total=len(df)):
        if handbook_type != 'pdf':
            pdf_status.append(old_status)
            continue

        try:
            # The request lives inside the try block so that one unreachable
            # URL is recorded as a failure instead of aborting the whole run.
            response = requests.get(url, timeout=timeout)
            # Do not save an HTTP error page under a .pdf name.
            response.raise_for_status()

            p_name = r'./handbooks/pdf/%i.pdf' % school_id

            # Checking if the file already exists
            filename, extension = os.path.splitext(p_name)
            i = 1

            while os.path.exists(p_name):
                p_name = filename + '_' + str(i) + extension
                i += 1

            with open(p_name, 'wb') as t:
                t.write(response.content)

            pdf_status.append('Complete')
        except Exception as e:
            # Exactly one entry per row keeps the list aligned with the dataframe.
            pdf_status.append('Failed:' + str(e))

    df['status'] = pdf_status

In [None]:
# def download_html(df):
#     '''
#     The purpose of this function is to download the handbooks with a .html file type.
#     html_url represents a url link to the handbook, and html_name is the name of the downloaded 
#     handbook (school name).
#     '''
#     df = df[df['handbook_type'] == 'HTML'].reset_index(drop = True)
#     html_status = []
        
#     for html in tqdm(range(len(df))): 
#         response = requests.get(df.iloc[html,9])
#         response.encoding = 'utf-8'
#         try:
#             with open(df.iloc[html,1] + '.html', 'w', encoding = 'utf-8') as t:
#                 t.write(response.text)
#             shutil.move(df.iloc[html,1] + '.html', r'./handbooks/html')
#             html_status.append('Complete')
#         except:
#             html_status.append('Failed')
#     df['status'] = html_status

## Testing

In [None]:
# Reading in 'sample_df'. read_csv ignores the order of `usecols` and returns
# the columns in file order, so the frame is re-selected with the same list to
# pin the column order that the positional row inserts further down rely on.
SAMPLE_COLUMNS = ['SCH_NAME', 'NCESSCH', 'final_website', 'handbooks', 'handbook_type']
sample_df = pd.read_csv('./files/sample_df.csv', usecols=SAMPLE_COLUMNS)[SAMPLE_COLUMNS]

In [None]:
# Casting 'NCESSCH' to a string is disabled: the download functions format it with '%i', which needs a number
# sample_df['NCESSCH'] = sample_df['NCESSCH'].astype(str)

In [None]:
# Two synthetic doc rows; values follow the column order
# SCH_NAME, NCESSCH, final_website, handbooks, handbook_type.
# Both use NCESSCH 12345, so they map to the same target file name.
sample_df.loc[135] = [
    'Test',
    12345,
    'https://gradschool.wsu.edu/',
    'https://gradschool.wsu.edu/documents/2017/02/student-handbook-template.docx/',
    'doc',
]

sample_df.loc[136] = [
    'Test',
    12345,
    'https://gradschool.wsu.edu/',
    'https://www.uab.edu/shp/hsa/images/documents/HCM/student-handbook.docx',
    'doc',
]

# Two synthetic pdf rows, again sharing one NCESSCH (12346)
sample_df.loc[137] = [
    'Test',
    12346,
    'https://webster.edu/',
    'https://webster.edu/documents/student-handbook/webster-student-handbook.pdf',
    'pdf',
]

sample_df.loc[138] = [
    'Test',
    12346,
    'https://webster.edu/',
    'https://www.fletc.gov/sites/default/files/Glynco%20Student%20Handbook%2027Mar15.pdf',
    'pdf',
]

In [None]:
# Run both download functions on the sample. download_doc goes first because
# it (re)creates the 'status' column that download_pdf then fills in for pdf rows.
for downloader in (download_doc, download_pdf):
    downloader(sample_df)

In [None]:
# Write the per-row download status to disk, then show the frame as the cell output
sample_df.to_csv(path_or_buf='./files/sample_status.csv', index=False)
sample_df

In [None]:
# Creating a stratified sample for testing
# temp = handbook_urls.groupby('handbook_type', group_keys = False).apply(lambda x: x.sample(5))