In [None]:
import numpy as np
import pandas as pd
import requests
import re
import warnings
import urllib

from tqdm import tqdm
# Register `progress_apply` on pandas objects; the cells below use it in place
# of `apply` so long-running column transforms show a progress bar.
tqdm.pandas(desc="Progress")

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

### Handbook URL Source Data

In [None]:
# Load the handbook URL search results. Later cells read the 'handbooks'
# column (the link itself) and drop an 'index' column from this frame.
hbooks = pd.read_csv('./data/handbook_url_search.csv')

### Function definitions

In [None]:
def get_params(url_segment):
    '''
    Extracts key-value pairs from a string formatted 'key=value&key=value&...'

    Each '&'-separated item is split on its first '=' only, so a value that
    itself contains '=' (for example an embedded URL with its own query
    string) is kept whole. An item with no '=' is stored with an empty-string
    value, and empty items (from a leading, trailing or doubled '&') are
    skipped. If a key repeats, the last occurrence wins.

    Accepts: str
    Returns: dict
    '''
    params = {}
    for item in url_segment.split('&'):
        if not item:
            continue
        # str.partition splits on the first '=' and does not raise when the
        # separator is absent (value is then '').
        key, _, value = item.partition('=')
        params[key] = value
    return params

In [None]:
# test1 = 'usp=sharing'
# test2 = 'usp=sharing&start=false&loop=false'

# print(get_params(test1))
# print(get_params(test2))

In [None]:
def longest_str(str_list):
    '''
    Returns the longest string in a list of strings

    When several strings tie for the greatest length, the earliest one is
    returned. An empty list yields '' instead of raising.

    Accepts: list of str
    Returns: str
    '''
    # max() keeps the first maximal element, i.e. the same winner as a
    # left-to-right scan that only replaces on a strictly longer string.
    return max(str_list, key=len, default='')

In [None]:
# str0 = ['', 'document', 'd', '1qswHw9m_3ZjzgPcL4Ud2cpSCqF0fj9FRieuwDBOYygo', 'edit']
# longest_str(str0)

In [None]:
def parse_url(url, min_id_len=20):
    '''
    Splits a URL into its domain, a candidate document ID and its parameters

    The document ID is chosen as follows:
      * a 'url' parameter is present -> doc_id is None (the document is
        addressed by that parameter instead)
      * an 'id' parameter is present -> doc_id is that value
      * otherwise -> the longest '/'-separated segment between the domain and
        the parameter string, provided it has at least min_id_len characters;
        else None

    A URL without a 'scheme://host' prefix is scanned from its first
    character and gets no 'domain' key.

    Accepts: str; optional int min_id_len (default 20)
    Returns: dict with 'doc_id', 'domain' (when a scheme://host prefix was
             found) and one key per parsed parameter. On a key collision the
             parsed parameter overrides 'domain' / 'doc_id'.
    '''
    url_segs = {}
    params = {}

    # scheme://host. The host is matched in full whatever its TLD; the earlier
    # '[\.com|\.net|\.org]' was a character class (one character out of
    # ". c o m | n e t r g"), which cut hosts such as '.edu' or '.us' short.
    m_beg = re.search(r'\w+://[\w\-.]+', url)
    if m_beg:
        url_segs['domain'] = m_beg.group()

    # Everything from the first '?' or '#' onward is the parameter string.
    # ('|' is not a delimiter; inside [...] it was being matched literally.)
    m_end = re.search(r'[?#].*', url)
    if m_end:
        param_list = m_end.group()
        if '=' in param_list:
            params = get_params(param_list[1:])

    if 'url' in params:
        url_segs['doc_id'] = None
    elif 'id' in params:
        url_segs['doc_id'] = params['id']
    else:
        # Fall back to the whole string on either side when the domain or the
        # parameter string is missing, instead of dereferencing a None match.
        path_start = m_beg.end() if m_beg else 0
        path_end = m_end.start() if m_end else len(url)
        candidate = longest_str(url[path_start:path_end].split('/'))
        url_segs['doc_id'] = candidate if len(candidate) >= min_id_len else None

    return {**url_segs, **params}

In [None]:
# url0 = 'https://drive.google.com/drive/folders/18MvvHcv1M3ePoZQXqCXQKwjOiSrsd4hC'
# url1 = 'https://drive.google.com/file/d/1kqI5H-NHTc7hwxWG5_cHjOwMYYnc69wS/view'
# url2 = 'https://docs.google.com/document/d/1qswHw9m_3ZjzgPcL4Ud2cpSCqF0fj9FRieuwDBOYygo/edit?usp=sharing'
# url3 = 'https://docs.google.com/a/fayette.k12.al.us/document/d/146GiYW5gDxRLppGD-Y23Ga-HA3g0wqo6kC-cTVzGWrw/edit?usp=drive_web'
# url4 = 'https://docs.google.com/viewerng/viewer?url=https://www.walkercountyschools.com//cms/lib/AL02210233/Centricity/Domain/75/2017-18%2520Handbook.pdf'
# url5 = 'https://docs.google.com/document/d/1Nrz_WtoLC010eigdM2s8muvUbGsqGPRSQzp6YNte3JE/edit#heading=h.aze8y9dllu8'

# print(parse_url(url0))
# print(parse_url(url1))
# print(parse_url(url2))
# print(parse_url(url3))
# print(parse_url(url4))
# print(parse_url(url5))

In [None]:
def doc_type(url):
    '''
    Classifies a handbook link by the kind of document it points to

    The input is converted with str() and then tested, case-insensitively,
    against each pattern in priority order; the first hit decides the label.

    Accepts: any value (stringified before matching)
    Returns: 'gdrive', 'pdf', 'doc', 'web', or None when nothing matches
    '''
    text = str(url)
    labelled_patterns = (
        (r'\.google\.com', 'gdrive'),
        (r'\.pdf', 'pdf'),
        (r'\.doc|\.rtf', 'doc'),
        (r'http', 'web'),
    )
    for pattern, label in labelled_patterns:
        if re.search(pattern, text, re.IGNORECASE):
            return label
    return None

In [None]:
# hbooks = pd.read_csv('./data/handbook_url_search.csv')
# temp = hbooks.sample(100, random_state = 234).reset_index(drop=True).dropna()

In [None]:
# temp['doc_type'] = temp['handbooks'].progress_apply(lambda x: doc_type(x))
# temp.head()

In [None]:
# temp.groupby(['doc_type']).count()

In [None]:
def get_drive_id(url):
    '''
    Wrapper to extract the Google doc ID from a url

    Preference order: the 'id' parameter, then 'srcid' when the 'pid'
    parameter equals 'sites', then the 'doc_id' found by parse_url.
    Any exception raised while parsing is printed together with the url
    and the function falls through to return None.

    Accepts: str
    Returns: str, or None when no ID is found or parsing fails
    '''
    try:
        info = parse_url(url)
        explicit_id = info.get('id')
        if explicit_id:
            return explicit_id
        source_id = info.get('srcid')
        if info.get('pid') == 'sites' and source_id:
            return source_id
        return info.get('doc_id')
    except Exception as e:
        print((url, e))

In [None]:
def encode_url(url):
    '''
    Decodes a percent-encoded URL string (e.g. '%20' -> ' ')

    Only one level of encoding is removed, so a double-encoded sequence such
    as '%2520' becomes '%20'. Byte sequences that are not valid UTF-8 are
    substituted with the replacement character instead of raising.

    Accepts: str (any other value, e.g. None or NaN, is passed over)
    Returns: str, or None when the input is not a str
    '''
    # Imported here so the function does not depend on some other module
    # having already loaded the urllib.parse submodule; a bare
    # `import urllib` does not bind it.
    from urllib.parse import unquote

    if not isinstance(url, str):
        return None
    return unquote(url, encoding='utf-8', errors='replace')

### Get Google drive IDs
Note: Some schools upload PDFs or Word docs to Google Drive; these do not have IDs. However, their URLs are specified in the handbook url string. The final dataset should have two columns, one for Google Drive ID if present (to pass to Google Drive API in the next step) and another one for the document URL for those Google Drive links without IDs.

In [None]:
# Label every handbook link with the kind of document it points to
hbooks['doc_type'] = hbooks['handbooks'].progress_apply(doc_type)

In [None]:
# Non-null value counts for each column, per detected document type
hbooks.groupby(['doc_type']).count()

In [None]:
# Keep only the links classified as 'gdrive', without the 'index' column,
# as an independent copy renumbered from 0
is_gdrive = hbooks['doc_type'] == 'gdrive'
gdocs = (
    hbooks[is_gdrive]
    .drop(['index'], axis=1)
    .copy()
    .reset_index(drop=True)
)

In [None]:
def _get_doc_url(url):
    '''
    Returns the 'url' parameter of a handbook link, if it has one

    Parsing errors are printed with the offending url, in the same way as
    get_drive_id does, so one malformed link cannot abort the whole column.

    Accepts: str
    Returns: str, or None when there is no 'url' parameter or parsing fails
    '''
    try:
        return parse_url(url).get('url')
    except Exception as e:
        print((url, e))
        return None


# Get google drive IDs from handbooks created in Google Workspaces
gdocs['doc_id'] = gdocs['handbooks'].progress_apply(get_drive_id)

# Extract urls for files uploaded to Google Drive but not created in Google Workspaces
gdocs['doc_url'] = gdocs['handbooks'].progress_apply(_get_doc_url)

# Decode percent-encoded characters in the extracted urls
gdocs['doc_url'] = gdocs['doc_url'].progress_apply(encode_url)

In [None]:
# Preview the first rows, including the new doc_id / doc_url columns
gdocs.head()

In [None]:
# Save the Google Drive subset with its doc_id / doc_url columns;
# the DataFrame's row index is not written to the file.
gdocs.to_csv('./data/handbook_google_docs.csv', index=False)

### Get handbooks via Google Drive API

Install the Google Drive API Python client (optional)
https://developers.google.com/docs/api/quickstart/python

Download files
https://developers.google.com/drive/api/guides/manage-downloads#python

Note: In Google Cloud Console, Google Drive API must be enabled.

Install the Python client from the command line
```
pip3 install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib
```

Python client documentation
https://github.com/googleapis/google-api-python-client/tree/main/docs

In [None]:
import googleapiclient, httplib2, oauth2client
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaIoBaseDownload
from google.oauth2 import service_account

In [None]:
# NOTE(review): absolute, machine-specific path to a service-account key file
SERVICE_ACCOUNT_KEY = '/home/nb775_georgetown_edu/auth/gcp-gu-ppalab-b168ee778ab5.json'
DRIVE_SCOPES = ['https://www.googleapis.com/auth/drive.readonly']

credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_KEY)

# The read-only scoped credentials are bound to a separate name;
# `credentials` keeps referring to the object loaded from the key file.
scoped_credentials = credentials.with_scopes(DRIVE_SCOPES)

In [None]:
def _download_media(request, fh):
    '''
    Streams the response of a Drive media request into an open file, chunk by chunk
    Accepts: request object from drive_service.files(), writable binary file object
    Returns: None
    '''
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print( "Download %d%%." % int(status.progress() * 100))


# Use the read-only scoped credentials created above rather than the
# unscoped `credentials` object.
drive_service = build('drive', 'v3', credentials=scoped_credentials)

test_url = 'https://drive.google.com/file/d/163QosK8EuTWmFFkyl6yAWb6bVfzOBb8I/view?usp=sharing'
file_id = get_drive_id(test_url)

# file_id = '1qswHw9m_3ZjzgPcL4Ud2cpSCqF0fj9FRieuwDBOYygo'

try:
    # get_drive_id returns None when it cannot find an ID; stop before
    # creating an empty output file or calling the API without a fileId.
    if not file_id:
        raise ValueError('No Google Drive ID found in %r' % test_url)

    with open('test.pdf', 'wb') as fh:
        try:
            # First ask Drive to export the file as PDF.
            _download_media(
                drive_service.files().export_media(fileId=file_id, mimeType='application/pdf'),
                fh)
        except HttpError:
            # Export was rejected: drop anything the failed attempt wrote,
            # then request the file's content directly.
            fh.seek(0)
            fh.truncate()
            _download_media(drive_service.files().get_media(fileId=file_id), fh)
finally:
    # Release the client's connections even when the download fails.
    drive_service.close()