In [74]:
import pandas as pd
import requests
import json
import time
from dotenv import load_dotenv
import os

In [75]:
load_dotenv('google.env')

True

In [3]:
directory = pd.read_csv('../data/HD2024.csv')

In [65]:
def search_directory(query):
    query = query.lower()
    term_checker = lambda row: query in row.lower() if type(row) == str else False

    search_results = pd.concat([
        directory[directory['INSTNM'].apply(term_checker)],
        directory[directory['IALIAS'].apply(term_checker)]
    ]).drop_duplicates().reset_index(drop=True)

    return search_results

In [76]:
def param_string(parameters):
    string = ''
    for key in parameters:
        string += '&' + str(key) + '=' + str(parameters[key])
    return string

def assemble_query(parameters):
    url = f'https://customsearch.googleapis.com/customsearch/v1?cx={os.getenv("ENGINE_ID")}&key={os.getenv("API_KEY")}'
    url += param_string(parameters)
    url = url.replace(' ','+')
    return url

def send_query(query):
    data = requests.get(query)
    time.sleep(1)
    if data.status_code == 200:
        data = data.json()
        if 'items' in data:
            results = pd.DataFrame(data['items'])
            results['Query'] = query
            return results
    else:
        raise ValueError(f'query failed with status: {data.status_code}')

In [None]:
def retrieve_cds(unitid):
    curr_college = directory[directory['UNITID'] == unitid].reset_index(drop=True)

    query = assemble_query({
        'siteSearch': curr_college['WEBADDR'][0],
        'siteSearchFilter': 'i',
        'q': 'common data set',
        'fileType': 'pdf'
    })

    results = send_query(query)

    return results[['htmlTitle', 'link']].to_dict(orient='records')

In [None]:
def frame_url(url):
    if url is None:
        return ''
    else:
        return f'<a href="{url}">View Document</a>'

In [None]:
def retrieve_propublica_summary(unitid):
    curr_college = directory[directory['UNITID'] == unitid].reset_index(drop=True)
    ein = curr_college['EIN'][0]

    url = f'https://projects.propublica.org/nonprofits/api/v2/organizations/{ein}.json'

    data = requests.get(url).json()

    df = pd.DataFrame(data['filings_with_data'])

    df = df[['tax_prd_yr', 'totrevenue', 'totfuncexpns', 'totassetsend', 'totliabend', 'pdf_url']].rename({
        'tax_prd_yr': 'Year',
        'totrevenue': 'Total revenue',
        'totfuncexpns': 'Total expenses',
        'totassetsend': 'Total assets, end of year',
        'totliabend': 'Total liabilities, end of year',
        'pdf_url': 'Original Filing'
    }, axis=1)

    df['Original Filing'] = df['Original Filing'].apply(frame_url)

    return df.to_html()

In [96]:
result = {}

In [97]:
search_results = search_directory('Reed College')

In [98]:
unitid = 209922

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Year</th>\n      <th>Total revenue</th>\n      <th>Total expenses</th>\n      <th>Total assets, end of year</th>\n      <th>Total liabilities, end of year</th>\n      <th>Original Filing</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>2023</td>\n      <td>5921562389</td>\n      <td>5495599586</td>\n      <td>7027102373</td>\n      <td>2925918685</td>\n      <td>&lt;a href="https://projects.propublica.org/nonprofits/download-filing?path=IRS%2F590624458_202305_990_2024042322370929.pdf"&gt;View Document&lt;/a&gt;</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2022</td>\n      <td>5466904190</td>\n      <td>4996561078</td>\n      <td>6582591173</td>\n      <td>2847537255</td>\n      <td>&lt;a href="https://projects.propublica.org/nonprofits/download-filing?path=download990pdf_06_2023_prefixes_58-66%2F590624458_202205_990_2023060621383044.pd