In [1]:
import json
import mimetypes
import os
import time
from html import escape as html_escape
from urllib.parse import quote_plus, urljoin, urlsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

In [2]:
# Load environment variables from the local `google.env` file.
# Presumably this is where ENGINE_ID and API_KEY (read via os.getenv in
# assemble_query) come from — confirm against the env file.
load_dotenv('google.env')

True

In [3]:
# Institution directory shared by every lookup helper in this notebook.
# The helpers read its UNITID, INSTNM, IALIAS, STABBR, WEBADDR and EIN columns.
directory = pd.read_csv('../data/HD2024.csv')

In [65]:
def search_directory(query):
    """Find institutions whose name or alias contains ``query``.

    The comparison is a case-insensitive substring test against the
    ``INSTNM`` and ``IALIAS`` columns of the module-level ``directory`` frame.

    Parameters
    ----------
    query : str
        Text to look for. An empty string is a substring of every string,
        so it matches every row that has a string name or alias.

    Returns
    -------
    pandas.DataFrame
        Rows matched on ``INSTNM`` first, followed by rows matched on
        ``IALIAS``, with exact duplicate rows removed and a fresh 0-based index.
    """
    needle = query.lower()

    def contains_needle(cell):
        # Cells that are not plain strings can never match.
        if type(cell) != str:
            return False
        return needle in cell.lower()

    name_hits = directory[directory['INSTNM'].apply(contains_needle)]
    alias_hits = directory[directory['IALIAS'].apply(contains_needle)]

    # A row matched by both columns appears twice after concat; keep it once.
    combined = pd.concat([name_hits, alias_hits])
    return combined.drop_duplicates().reset_index(drop=True)

In [None]:
def search_directory_ui(query):
    """Run ``search_directory`` and return the matches as plain records.

    Parameters
    ----------
    query : str
        Search text forwarded unchanged to ``search_directory``.

    Returns
    -------
    list[dict]
        One dict per match holding only ``UNITID``, ``INSTNM`` and ``STABBR``.
    """
    ui_columns = ['UNITID', 'INSTNM', 'STABBR']
    matches = search_directory(query)
    return matches[ui_columns].to_dict(orient='records')

In [76]:
def param_string(parameters):
    """Serialise a mapping as URL query-string pairs, each prefixed with ``&``.

    Keys and values are converted with ``str`` and then percent-encoded, so
    characters that have a meaning inside a query string (``&``, ``=``, ``#``,
    ``+``, non-ASCII text) can no longer split or truncate the parameter they
    belong to. Spaces are encoded as ``+``.

    Parameters
    ----------
    parameters : mapping
        Parameter names mapped to their values. Values must be passed raw
        (not already percent-encoded), otherwise they are encoded twice.

    Returns
    -------
    str
        ``'&key=value&key2=value2...'`` in iteration order, or ``''`` when
        ``parameters`` is empty. The leading ``&`` lets the result be appended
        directly to a URL that already has a query string.
    """
    return ''.join(
        '&' + quote_plus(str(key)) + '=' + quote_plus(str(parameters[key]))
        for key in parameters
    )

def assemble_query(parameters):
    """Build a Custom Search request URL for the given search parameters.

    The engine id and API key are read from the ``ENGINE_ID`` and ``API_KEY``
    environment variables at call time. If either is unset, the literal text
    ``None`` ends up in the URL.

    Parameters
    ----------
    parameters : mapping
        Extra query parameters, rendered by ``param_string``.

    Returns
    -------
    str
        The complete request URL with every space replaced by ``+``.
    """
    engine_id = os.getenv("ENGINE_ID")
    api_key = os.getenv("API_KEY")
    endpoint = 'https://customsearch.googleapis.com/customsearch/v1'

    request_url = f'{endpoint}?cx={engine_id}&key={api_key}' + param_string(parameters)
    return request_url.replace(' ', '+')

def send_query(query, timeout=30):
    """Send a search request and return the result items as a DataFrame.

    Parameters
    ----------
    query : str
        Full request URL, e.g. as built by ``assemble_query``.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up. Without
        a timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame or None
        One row per entry of the response's ``items`` list plus a ``Query``
        column holding the request URL. ``None`` when the response is
        successful but has no ``items`` key.

    Raises
    ------
    ValueError
        If the HTTP status code is not 200.
    """
    response = requests.get(query, timeout=timeout)
    # Pause after every request (successful or not); presumably this throttles
    # consecutive calls to respect an API rate limit — confirm.
    time.sleep(1)

    if response.status_code != 200:
        raise ValueError(f'query failed with status: {response.status_code}')

    payload = response.json()
    if 'items' not in payload:
        return None

    results = pd.DataFrame(payload['items'])
    # NOTE: URLs built by assemble_query embed the API key, so this column
    # carries the credential — avoid displaying or exporting it.
    results['Query'] = query
    return results

In [203]:
def retrieve_cds(unitid):
    """Search an institution's own website for "common data set" PDFs.

    Parameters
    ----------
    unitid : str or int
        Institution id, matched against the ``UNITID`` column of
        ``directory``. Both sides are compared as strings, so ``204501`` and
        ``'204501'`` are equivalent.

    Returns
    -------
    list[dict]
        One ``{'htmlTitle': ..., 'link': ...}`` record per search hit; an
        empty list when the search returns no items.

    Raises
    ------
    KeyError
        If no row of ``directory`` has the requested ``UNITID``.
    ValueError
        Propagated from ``send_query`` when the HTTP request fails.
    """
    matches = directory[directory['UNITID'].astype(str) == str(unitid)]
    if matches.empty:
        raise KeyError(f'no institution with UNITID {unitid!r} in directory')

    query = assemble_query({
        # Restrict results to the institution's own web address ('i' = include).
        'siteSearch': matches['WEBADDR'].iloc[0],
        'siteSearchFilter': 'i',
        'q': 'common data set',
        'fileType': 'pdf'
    })

    results = send_query(query)
    # send_query returns None when the response carries no 'items'.
    if results is None:
        return []

    return results[['htmlTitle', 'link']].to_dict(orient='records')

In [None]:
def frame_url(url):
    """Wrap a document URL in an HTML "View Document" link.

    Parameters
    ----------
    url : str or None
        Link target. ``None`` and NaN both mean "no document".

    Returns
    -------
    str
        ``'<a href="...">View Document</a>'`` with the URL HTML-escaped for
        use inside the attribute, or ``''`` when there is no document.
    """
    # pd.isna also catches NaN, which pandas uses for missing cells and which
    # would otherwise be rendered as href="nan".
    if url is None or pd.isna(url):
        return ''
    # Escape &, <, > and quotes so the URL cannot break out of the attribute.
    return f'<a href="{html_escape(str(url), quote=True)}">View Document</a>'

In [None]:
def retrieve_propublica_summary(unitid, timeout=30):
    """Render an institution's ProPublica filing summary as an HTML table.

    The institution's ``EIN`` is looked up in ``directory`` and used to query
    the ProPublica Nonprofit Explorer organization endpoint. Each entry of the
    response's ``filings_with_data`` list becomes one table row.

    Parameters
    ----------
    unitid : int or str
        Institution id, matched against ``directory['UNITID']``. Both sides
        are compared as strings, so ``204501`` and ``'204501'`` are equivalent.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    str
        HTML ``<table>`` with year, revenue, expenses, assets, liabilities and
        a "View Document" link per filing. The table has only a header row
        when the organization has no filings with data.

    Raises
    ------
    KeyError
        If no row of ``directory`` has the requested ``UNITID``.
    ValueError
        If the HTTP status code is not 200.
    """
    column_labels = {
        'tax_prd_yr': 'Year',
        'totrevenue': 'Total revenue',
        'totfuncexpns': 'Total expenses',
        'totassetsend': 'Total assets, end of year',
        'totliabend': 'Total liabilities, end of year',
        'pdf_url': 'Original Filing'
    }

    matches = directory[directory['UNITID'].astype(str) == str(unitid)]
    if matches.empty:
        raise KeyError(f'no institution with UNITID {unitid!r} in directory')
    ein = matches['EIN'].iloc[0]

    url = f'https://projects.propublica.org/nonprofits/api/v2/organizations/{ein}.json'

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise ValueError(f'query failed with status: {response.status_code}')
    data = response.json()

    # Passing `columns` selects just these fields and still yields a
    # well-formed (empty) frame when the filings list is empty.
    summary = pd.DataFrame(
        data['filings_with_data'], columns=list(column_labels)
    ).rename(columns=column_labels)

    # Normalise missing links to None so frame_url renders them as ''.
    summary['Original Filing'] = summary['Original Filing'].map(
        lambda link: frame_url(None if pd.isna(link) else link)
    )

    # to_html escapes markup by default and shortens cells longer than
    # display.max_colwidth; both would destroy the <a> tags built above.
    # The remaining cells come straight from the API response.
    with pd.option_context('display.max_colwidth', None):
        return summary.to_html(escape=False)

In [204]:
# Try the Common Data Set search for UNITID 204501; the hits shown in the
# output are all on www.oberlin.edu.
retrieve_cds('204501')

[{'htmlTitle': '<b>Common Data Set</b> 2024-2025',
  'link': 'https://www.oberlin.edu/media/34940/download?inline'},
 {'htmlTitle': 'Updated CDS AB 7.10.xlsx - Group',
  'link': 'https://www.oberlin.edu/sites/default/files/content/office/institutional-research/documents/2023-24_oc_cds.pdf'},
 {'htmlTitle': 'RStudio IDE Cheat Sheet',
  'link': 'https://www.oberlin.edu/sites/default/files/content/biography/faculty_docs/Jeff_Witmer/rstudio.pdf'},
 {'htmlTitle': 'Instructions for Form I-9, Employment Eligibility Verification',
  'link': 'https://www.oberlin.edu/sites/default/files/2025-06/I-9instr%20%283%29_1.pdf'},
 {'htmlTitle': 'Automating Reading Digital and Analog Meters using Machine ...',
  'link': 'https://www.oberlin.edu/sites/default/files/content/arts-and-sciences/departments/physics/student_projects/2021/thesis_loubna_el_meddah_el_idrissi.pdf'},
 {'htmlTitle': 'ggplot2-cheatsheet-2.0.pdf - <b>Data</b> Visualization',
  'link': 'https://www.oberlin.edu/sites/default/files/conten

In [231]:
# Same search as retrieve_cds, but hard-coded to oberlin.edu and without the
# fileType=pdf restriction, so ordinary web pages can be returned as well.
query = assemble_query({
    'siteSearch': 'oberlin.edu',
    'siteSearchFilter': 'i',
    'q': 'common data set'
})

results = send_query(query)

In [233]:
# Link of the first hit in `results`.
results['link'][0]

'https://www.oberlin.edu/institutional-effectiveness/institutional-research-analytics/common-data-set'

In [220]:
# Experiment: pass a full page URL (not a bare domain) as siteSearch.
# NOTE(review): confirm the search API honours a page URL here.
# This rebinds `query` and `results`, replacing the Oberlin search results.
query = assemble_query({
    'siteSearch': 'https://www.reed.edu/ir/cds/cdsindex.html',
    'siteSearchFilter': 'i',
    'q': 'common data set',
    'fileType': 'pdf'
})

results = send_query(query)

In [234]:
# Oberlin's Common Data Set landing page (the first hit of the site-wide
# search); its HTML is fetched and parsed in the following cells.
url = 'https://www.oberlin.edu/institutional-effectiveness/institutional-research-analytics/common-data-set'

In [235]:
# Fetch the page and parse it. The parser is named explicitly so the result
# does not depend on which optional parsers (lxml, html5lib) are installed,
# and so bs4 does not emit its "no parser was explicitly specified" warning.
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

In [250]:
# Every <a> element in the parsed page; candidates for PDF links.
anchors = soup.select('a')

In [239]:
# Example link whose path has no file extension, so its type cannot be read
# off the URL; presumably used to try get_content_type — confirm.
# NOTE: this rebinds `url`, which previously held the landing-page address.
url = 'https://oberlin.edu/media/20096/download'

In [249]:
def get_content_type(url, timeout=30):
    """Return the ``Content-Type`` response header for ``url``.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    str
        The raw header value, including any parameters such as ``charset``.

    Raises
    ------
    KeyError
        If the response has no ``Content-Type`` header.
    """
    # stream=True defers the body download: only the headers are needed, and
    # the `with` block releases the connection without reading the document.
    with requests.get(url, stream=True, timeout=timeout) as response:
        return response.headers['content-type']

In [266]:
def is_pdf_anchor(anchor, base, timeout=30):
    """Tell whether an ``<a>`` element links to a PDF document.

    The link is requested and judged by the ``Content-Type`` the server
    reports, so PDFs served from extension-less URLs are detected too.

    Parameters
    ----------
    anchor : object with an ``attrs`` mapping
        The anchor element; its ``href`` attribute is the link to test.
    base : str
        URL of the page the anchor came from, used to resolve relative links.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    bool
        ``True`` only when the link resolves to an http(s) URL that answers
        with status 200 and a media type of ``application/pdf``. Anchors with
        no ``href``, fragment-only links, other schemes (``mailto:``,
        ``javascript:`` ...) and unreachable links all yield ``False``.
    """
    href = anchor.attrs.get('href')
    if not href:
        return False
    href = href.strip()

    # A fragment-only link ('#main-content') points back into the same page.
    if href.startswith('#'):
        return False

    # urljoin handles absolute, scheme-relative, root-relative and relative
    # links; plain concatenation mangled 'http://' links and doubled slashes.
    target = urljoin(base, href)
    if urlsplit(target).scheme not in ('http', 'https'):
        return False

    try:
        # stream=True: only the headers are needed, skip the body download.
        with requests.get(target, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                return False
            content_type = response.headers.get('content-type', '')
    except requests.RequestException:
        # An unreachable link is simply not a PDF link; one bad link must not
        # abort a scan over all anchors of a page.
        return False

    # The header may carry parameters ('application/pdf; charset=binary') and
    # media types are case-insensitive.
    return content_type.split(';', 1)[0].strip().lower() == 'application/pdf'

In [258]:
# Inspect one raw href: links on the page are not all absolute URLs
# (this one is a fragment-only link).
anchors[1].attrs['href']

'#main-content'

In [267]:
# Classify every anchor on the page. is_pdf_anchor makes up to one HTTP
# request per anchor, so this cell is slow and its output is one boolean per
# anchor, in page order.
[is_pdf_anchor(anchor, 'https://oberlin.edu/') for anchor in anchors]

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 Fa