In [715]:
from bs4 import BeautifulSoup
import requests
import random
import time
import pandas as pd

#### Abilene Christian University as an example school from which to extract features for the final dataset

In [579]:
url = 'https://nces.ed.gov/collegenavigator/?s=all&l=93&ct=2&ic=1&id=222178' 

response = requests.get(url)

In [580]:
response.status_code  #200 = success!

200

In [581]:
response.text[:1000]  #First 1000 characters of the HTML

'\r\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\r\n<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en" ><head id="ctl00_hd"><meta http-equiv="Content-type" content="text/html;charset=UTF-8" /><title>\r\n\tCollege Navigator - Abilene Christian University\r\n</title><link href="css/md0.css" type="text/css" rel="stylesheet" /><meta name="robots" content="noindex,nofollow"></meta><link href="css/collegenav.css?v=2.4" type="text/css" rel="stylesheet" /><link href="css/mapstyle.css" type="text/css" rel="stylesheet" /><!--[if lte IE 8]><link href="css/mainIE6.css" text="text/css" rel="stylesheet" /><![endif]--><link rel="search" type="application/opensearchdescription+xml" title="NCES College Navigator" href="http://nces.ed.gov/collegenavigator/opensearch.ashx?md=0" /></head>\r\n<body id="ctl00_bodyMain" onload="if(typeof imgPL==\'function\')imgPL();if(typeof sp==\'function\')sp();"><div class="headC

In [582]:
page = response.text

In [584]:
soup = BeautifulSoup(page, "lxml")


#### Initial table at top of institution's page
- Institution's name
- Student-to-faculty ratio

In [668]:
private_university_string = soup.find('title').text
private_university = private_university_string.split('-')[1].strip()
print(private_university)

Abilene Christian University


In [732]:
raw_stu_fac_ratio = soup.find('table', class_='layouttab').find_all('td')[-1].text
print(stu_fac_ratio)
stu_fac_ratio_ls = raw_stu_fac_ratio.split()
stu_fac_ratio_ls.remove('to')
stu_fac_ratio_ls = [int(x) for x in stu_fac_ratio_ls]
stu_fac_ratio_dec = stu_fac_ratio_ls[0] / stu_fac_ratio_ls[1]
print(stu_fac_ratio_dec)

13 to 1
13.0


#### 'TUITION, FEES AND ESTIMATED STUDENT EXPENSES' collapsable table
- tuition and fees in 2019-20
- books and supplies in 2019-20
- tuition and fees in 2020-21
- books and supplies in 2020-21

In [593]:
#tuition_table = soup.find(id='divctl00_cphCollegeNavBody_ucInstitutionMain_ctl00')
#tuition_rows = [row for row in tuition_table.find_all('tr')]
#tuition_rows[1]

<tr><td scope="row">Tuition and fees</td><td>$33,330</td><td>$34,850</td><td>$36,300</td><td>$37,800</td><td>4.1%</td></tr>

In [752]:
tuition_19_20 = soup.find(text='Tuition and fees').findNext().findNext().findNext().text
#tuition_19_20 = int(tuition_rows[1].find_all('td')[3].text.replace('$','').replace(',',''))
print(tuition_19_20)
books_19_20 = soup.find(text='Books and supplies').findNext().findNext().findNext().text
#books_19_20 = int(tuition_rows[2].find_all('td')[3].text.replace('$','').replace(',',''))
print(books_19_20)
tuition = soup.find(text='Tuition and fees').findNext().findNext().findNext().findNext().text
#tuition = int(tuition_rows[1].find_all('td')[4].text.replace('$','').replace(',',''))
print(tuition)
books = soup.find(text='Books and supplies').findNext().findNext().findNext().findNext().text
#books = int(tuition_rows[2].find_all('td')[4].text.replace('$','').replace(',',''))
print(books)

$36,300
$1,250
$37,800
$1,250


In [831]:
def get_tuition_books_19_20_value(soup, field_name):
    
    '''Grab tuition and fees and books and supplies values from 2019-20'''
    
    obj = soup.find(text=field_name)
    
    if not obj: 
        return None
    
    # this works for most of the values
    element = obj.findNext().findNext().findNext()
    
    if element:
        return element.text 
    else:
        return None

In [758]:
get_tuition_books_19_20_value(soup, 'Tuition and fees')

'$36,300'

In [757]:
get_tuition_books_19_20_value(soup, 'Books and supplies')

'$1,250'

In [832]:
def get_tuition_books_20_21_value(soup, field_name):
    
    '''Grab tuition and fees and books and supplies values from 20-21'''
    
    obj = soup.find(text=field_name)
    
    if not obj: 
        return None
    
    # this works for most of the values
    element = obj.findNext().findNext().findNext().findNext()
    
    if element:
        return element.text 
    else:
        return None

In [760]:
get_tuition_books_20_21_value(soup, 'Tuition and fees')

'$37,800'

In [761]:
get_tuition_books_20_21_value(soup, 'Books and supplies')

'$1,250'

#### 'GENERAL INFORMATION' collapsable table'
- number of full-time faculty
- number part-time faculty

In [594]:
#faculty_table = soup.find(id='ctl00_cphCollegeNavBody_ucInstitutionMain_divFaculty')
#faculty_table = [row for row in faculty_table.find_all('tr')]
#faculty_table[1]

<tr><td>Total faculty</td><td>277</td><td>137</td></tr>

In [602]:
ft_faculty = soup.find(text='Total faculty').findNext().text
#ft_faculty = int(faculty_table[1].find_all('td')[1].text)
pt_faculty = soup.find(text='Total faculty').findNext().findNext().text
#pt_faculty = int(faculty_table[1].find_all('td')[2].text)

In [833]:
def get_school_value_one_next(soup, field_name):
    
    '''Grab full-time faculty number, total enrollment, undergrad enrollment, undergrad transfer-in enrollment,
        grad enrollment, number of undergrad applicants, percent admitted
    '''
    
    obj = soup.find(text=field_name)
    
    if not obj: 
        return None
    
    # this works for most of the values
    element = obj.findNext()
    
    if element:
        return element.text 
    else:
        return None

In [771]:
get_school_value_one_next(soup, 'Total faculty')

'277'

In [834]:
def get_school_value_two_nexts(soup, field_name):
    
    '''Grab part-time faculty number, % of full-time beginning undergrad students receiving grant or 
        scholarship aid
    '''
    
    obj = soup.find(text=field_name)
    
    if not obj: 
        return None
    
    # this works for most of the values
    element = obj.findNext().findNext()
    
    if element:
        return element.text 
    else:
        return None

In [774]:
get_school_value_two_nexts(soup, 'Total faculty')

'137'

#### 'FINANCIAL AID' collapsable table - ONLY 2019-20 DATA
- number of all undergraduate students receiving grant or scholarship aid
- average amount of grant or scholarship aid among all undergraduate students

In [781]:
#aid_table = soup.find(id='divctl00_cphCollegeNavBody_ucInstitutionMain_ctl01')
#aid_rows = [row for row in aid_table.find_all('tr')]
#aid_rows[13].find_all('td')

[<td scope="row">Pell grants</td>,
 <td>899</td>,
 <td>$4,226,334</td>,
 <td>$4,701</td>]

In [782]:
#grant_schol_aid_19_20 = int(aid_rows[12].find_all('td')[1].text.replace(',',''))
#print(grant_scholarship_aid_19_20)
#avg_amt_grant_schol_aid_19_20 = int(aid_rows[12].find_all('td')[3].text.replace('$','').replace(',',''))
#print(avg_amt_grant_scholarship_aid_19_20)

3436
21092
899
4701


In [887]:
def get_avg_amt_grant_schol_aid_beg_under_value(soup, field_name):
    
    '''Grab the average amount of grant or scholarship aid a full-time beginning undergrad student receives'''
    
    obj = soup.find(text=field_name)
    
    if not obj: 
        return None
    
    # this works for most of the values
    element = obj.findNext().findNext().findNext().findNext()
    
    if element:
        return element.text 
    else:
        return None

In [888]:
get_avg_amt_grant_schol_aid_value(soup,'Grant or scholarship aid')

'$22,887'

In [889]:
def get_avg_amt_grant_schol_aid_all_under_value(soup, field_name):
    
    '''Grab the average amount of grant or scholarship aid among all undergrad students'''
    
    obj = soup.find(text=field_name)
    
    if not obj: 
        return None
    
    # this works for most of the values
    element = obj.findNext().find_all('td')[3]
    
    if element:
        return element.text 
    else:
        return None

In [891]:
get_avg_amt_grant_schol_aid_all_under_value(soup, 'All Undergraduate Students')

'$21,092'

#### 'ENROLLMENT' collapsable table
- total enrollment
- undergraduate enrollment
- undergraduate transfer-in enrollment
- graduate enrollment

In [863]:
soup.find(text='Total enrollment').findNext()

<th scope="col">5,291</th>

In [617]:
#enroll_table = soup.find(id='divctl00_cphCollegeNavBody_ucInstitutionMain_ctl03')
#enroll_rows = [row for row in enroll_table.find_all('tr')]
#enroll_rows[0]

<tr><th scope="col">Total enrollment</th><th scope="col">5,291</th></tr>

In [622]:
#total_enroll = int(enroll_rows[0].find_all('th')[1].text.replace(',',''))
#print(total_enroll)
#under_enroll = int(enroll_rows[1].find_all('td')[1].text.replace(',',''))
#print(under_enroll)
#under_trans_enroll = int(enroll_rows[2].find_all('td')[1].text.replace(',',''))
#print(under_trans_enroll)
#grad_enroll = int(enroll_rows[3].find_all('td')[1].text.replace(',',''))
#print(grad_enroll)

5291
3494
143
1797


In [801]:
#def get_total_enroll_value(soup, element_id):
    
 #   '''Grab the total enrollment'''
    
 #  table = soup.find(id=element_id)
 
 #   if not table: 
 #       return None
    
    # this works for most of the values
 #   rows = [row for row in table.find_all('tr')]
    
 #   if rows:
 #       return rows[0].find_all('th')[1].text
 #   else:
 #       return None

In [802]:
#get_total_enroll_value(soup, 'divctl00_cphCollegeNavBody_ucInstitutionMain_ctl03')

'5,291'

In [803]:
#def get_under_enroll_value(soup, element_id):
    
#    '''Grab the undergraduate enrollment'''
    
#    table = soup.find(id=element_id)
    
#    if not table: 
#        return None
    
    # this works for most of the values
#    rows = [row for row in table.find_all('tr')]
    
#    if rows:
#        return enroll_rows[1].find_all('td')[1].text
#    else:
#        return None

In [804]:
#get_under_enroll_value(soup, 'divctl00_cphCollegeNavBody_ucInstitutionMain_ctl03')

'3,494'

In [805]:
#def get_under_trans_enroll_value(soup, element_id):
    
#    '''Grab the undergraduate transfer-in enrollment'''
    
#    table = soup.find(id=element_id)
    
#    if not table: 
#        return None
    
    # this works for most of the values
#    rows = [row for row in table.find_all('tr')]
    
#    if rows:
#        return enroll_rows[2].find_all('td')[1].text
#    else:
#        return None

In [806]:
#get_under_trans_enroll_value(soup, 'divctl00_cphCollegeNavBody_ucInstitutionMain_ctl03')

'143'

In [807]:
#def get_grad_enroll_value(soup, element_id):
    
#    '''Grab the graduate enrollment'''
    
#    table = soup.find(id=element_id)
    
#    if not table: 
#        return None
    
    # this works for most of the values
#    rows = [row for row in table.find_all('tr')]
    
#    if rows:
#        return enroll_rows[3].find_all('td')[1].text
#    else:
#        return None

In [808]:
#get_grad_enroll_value(soup, 'divctl00_cphCollegeNavBody_ucInstitutionMain_ctl03')

'1,797'

#### 'ADMISSIONS' collapsable table - **NEED TO MODIFY THIS CODE FOR SCHOOLS LIKE AMERICA EVANGELICAL UNIVERSITY THAT DON'T HAVE ADMISSIONS DATA BECAUSE THEY HAVE AN OPEN ADMISSION POLICY**
- number of undergraduate applicants
- percent of undergraduate applicants admitted
- number of first-time students who submitted their SAT score
- % of first-time students who submitted their SAT score
- number of first-time students who submitted their ACT score
- % of first-time students who submitted their ACT score
- SAT Evidence-Based Reading and Writing 25th percentile score
- SAT Evidence-Based Reading and Writing 75th percentile score
- SAT Math 25th percentile score
- SAT Math 75th percentile score
- ACT Composite 25th percentile score
- ACT Composite 75th percentile score
- ACT English 25th percentile score
- ACT English 75th percentile score
- ACT Math 25th percentile score
- ACT Math 75th percentile score

In [659]:
#admissions_table = soup.find(id='divctl00_cphCollegeNavBody_ucInstitutionMain_ctl04')
#admissions_rows = [row for row in admissions_table.find_all('tr')]

In [693]:
#under_apps = int(admissions_rows[3].find_all('td')[1].text.replace(',',''))
#print(applicants)
#perc_admitted = float(admissions_rows[4].find_all('td')[2].text.replace('%',''))
#print(perc_admitted)

10534
61.0


In [705]:
soup.find(text='SAT Evidence-Based Reading and Writing').findNext().text

'510'

In [855]:
get_school_value_one_next(soup, 'Undergraduate enrollment')

'3,494'

In [819]:
get_school_value_two_nexts(soup, 'ACT')

'43%'

In [738]:
sat_scores_submit = int(soup.find(text='SAT').findNext().text)
print(sat_scores_submit)
perc_sat_scores_submit = float(soup.find(text='SAT').findNext().findNext().text.replace('%',''))
print(perc_sat_scores_submit)
act_scores_submit = int(soup.find(text='ACT').findNext().text)
print(act_scores_submit)
perc_act_scores_submit = float(soup.find(text='ACT').findNext().findNext().text.replace('%',''))
print(perc_act_scores_submit)
sat_read_write_25_perc = int(soup.find(text='SAT Evidence-Based Reading and Writing').findNext().text)
print(sat_read_write_25_perc)
sat_read_write_75_perc = int(soup.find(text='SAT Evidence-Based Reading and Writing').findNext().findNext().text)
print(sat_read_write_75_perc)
sat_math_25_perc = int(soup.find(text='SAT Math').findNext().text)
print(sat_math_25_perc)
sat_math_75_perc = int(soup.find(text='SAT Math').findNext().findNext().text)
print(sat_math_75_perc)
act_comp_25_perc = int(soup.find(text='ACT Composite').findNext().text)
print(act_comp_25_perc)
act_comp_75_perc = int(soup.find(text='ACT Composite').findNext().findNext().text)
print(act_comp_75_perc)
act_eng_25_perc = int(soup.find(text='ACT English').findNext().text)
print(act_eng_25_perc)
act_eng_75_perc = int(soup.find(text='ACT English').findNext().findNext().text)
print(act_eng_75_perc)
act_math_25_perc = int(soup.find(text='ACT Math').findNext().text)
print(act_math_25_perc)
act_math_75_perc = int(soup.find(text='ACT Math').findNext().findNext().text)
print(act_math_75_perc)

432
50.0
369
43.0
510
620
500
600
21
27
20
28
18
26


In [741]:
url_test = 'https://nces.ed.gov/collegenavigator/?s=KS&l=93&ct=2&id=155308#admsns' 

response_test = requests.get(url_test)

soup_test = BeautifulSoup(response_test.text, "lxml")

In [745]:
soup_test.find(text='SAT Evidence-Based Reading and Writing')

##### Step 2: create helper functions to parse strings into appropriate data types

In [883]:
def money_to_int(moneystring):
    try:
        moneystring = moneystring.replace('$', '').replace(',', '')
        return int(moneystring)
    except:
        return None

def percent_to_float(percentstring):
    try:
        percentstring = percentstring.replace('%','')
        return float(percentstring)
    except:
        return None

def number_to_int(numberstring):
    try:
        numberstring = numberstring.replace(',', '')
        return int(numberstring)
    except:
        return None

In [882]:
#tuition_19_20 = soup.find(text='Tuition and fees').findNext().findNext().findNext().text
#print(money_to_int(tuition_19_20))
#print(tuition_19_20)
#tuition_19_20 = int(tuition_rows[1].find_all('td')[3].text.replace('$','').replace(',',''))
#books_19_20 = soup.find(text='Books and supplies').findNext().findNext().findNext().text
#print(money_to_int(books_19_20))
#print(books_19_20)
#books_19_20 = int(tuition_rows[2].find_all('td')[3].text.replace('$','').replace(',',''))
#tuition = soup.find(text='Tuition and fees').findNext().findNext().findNext().findNext().text
#print(money_to_int(tuition))
#print(tuition)
#tuition = int(tuition_rows[1].find_all('td')[4].text.replace('$','').replace(',',''))
#books = soup.find(text='Books and supplies').findNext().findNext().findNext().findNext().text
#print(money_to_int(books))
#print(books)
#books = int(tuition_rows[2].find_all('td')[4].text.replace('$','').replace(',',''))

### Private non-profit colleges/university by region

In [379]:
# URLs of lists of schools by region
    # Far West
url_fw = 'https://nces.ed.gov/collegenavigator/?s=AK+CA+HI+NV+OR+WA&l=93&ct=2'

    # Rocky Mountains
url_rm = 'https://nces.ed.gov/collegenavigator/?s=CO+ID+MT+UT+WY&l=93&ct=2'

    # Southwest
url_sw = 'https://nces.ed.gov/collegenavigator/?s=AZ+NM+OK+TX&l=93&ct=2'

    # Plains
url_p = 'https://nces.ed.gov/collegenavigator/?s=IA+KS+MN+MO+NE+ND+SD&l=93&ct=2'

    # Great Lakes
url_gl = 'https://nces.ed.gov/collegenavigator/?s=IL+IN+MI+OH+WI&l=93&ct=2'

    # Southeast
url_se = 'https://nces.ed.gov/collegenavigator/?s=AL+AR+FL+GA+KY+LA+MS+NC+SC+TN+VA+WV&l=93&ct=2'
   
    # Mideast (including DC)
url_me = 'https://nces.ed.gov/collegenavigator/?s=DE+DC+MD+NJ+NY+PA&l=93&ct=2'
    
    # New England
url_ne = 'https://nces.ed.gov/collegenavigator/?s=CT+ME+MA+NH+RI+VT&l=93&ct=2'

In [504]:
# create a list of the url of each region's initial page of private schools

regions_urls = ['https://nces.ed.gov/collegenavigator/?s=AK+CA+HI+NV+OR+WA&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=CO+ID+MT+UT+WY&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=AZ+NM+OK+TX&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=IA+KS+MN+MO+NE+ND+SD&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=IL+IN+MI+OH+WI&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=AL+AR+FL+GA+KY+LA+MS+NC+SC+TN+VA+WV&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=DE+DC+MD+NJ+NY+PA&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=CT+ME+MA+NH+RI+VT&l=93&ct=2']                

In [427]:
# loop through each url of each region's initial page of private schools to get the count of the # of pages' 
    # worth of schools each region has

pages_regions = []

for url_reg in regions_urls:
    response_reg = requests.get(url_reg)
    soup_reg = BeautifulSoup(response_reg.text, 'html5lib')
    pages_str = soup_reg.find(id='ctl00_cphCollegeNavBody_ucResultsMain_divPagingControls').find_all('a')[-1]['href']
    pages_idx = pages_str.find('&pg=')
    pages = pages_str[pages_idx+4:]
    pages_regions.append(int(pages))

In [505]:
# loop through the list of initial region urls and each region's corresponding number of pages' worth of schools to
    # create a new url for each region's page(s) and add the new url to the original list of initial region urls

for idx in range(0, len(regions_urls)):
    for i in range(2, pages_regions[idx]+1):
        new_url = (''.join(regions_urls[idx] + '&pg=' + str(i)))
        regions_urls.append(new_url)

In [726]:
# loop through each region url to create a dictionary of all school names and their corresponding url

schools = {}
names = []
urls = []

for reg_url in regions_urls:
    response_reg = requests.get(reg_url)
    soup_reg = BeautifulSoup(response_reg.text, "lxml")
    table_reg = soup_reg.find(id='ctl00_cphCollegeNavBody_ucResultsMain_tblResults')
    rows_reg = [row for row in table_reg.find_all('tr')]

    for row in rows_reg:
        link = row.find('a')
        if link != None:
            name, url = link.text, link['href']
            names.append(name)
            urls.append(url)
            for idx in range(1, len(names)):
                if names[idx] == names[idx-1]:
                    names[idx] = names[idx] + '_b'

for a, b in zip(names, urls):
    schools[a] = [b]

In [730]:
# loop through the new dictionary of school names and urls to add beginning part of nces' website to the urls

#for key, val in schools.items():
 #   schools[key] = 'https://nces.ed.gov/collegenavigator/' + val

In [821]:
# check first five key, value pairs in schools dict

{k: schools[k] for k in list(schools)[:5]}

{'Alaska Bible College': ['?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=102580'],
 'Alaska Pacific University': ['?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=102669'],
 'America Evangelical University': ['?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=490081'],
 'American Jewish University': ['?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=116846'],
 'Antioch University-Los Angeles': ['?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=245838']}

In [822]:
schools_df = pd.DataFrame(schools).T  #transpose
schools_df.columns = ['link_stub']

schools_df.head()

Unnamed: 0,link_stub
Alaska Bible College,?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=102580
Alaska Pacific University,?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=102669
America Evangelical University,?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=490081
American Jewish University,?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=116846
Antioch University-Los Angeles,?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=245838


In [892]:
def get_school_dict(link):
    '''
    From the National Center for Educaiton Statistics link stub, request school html, parse with BeautifulSoup, and
    collect 
        - school name 
        - student-to-faculty ratio as string
        - tuition and fees in 2019-20
        - books and supplies in 2019-20
        - tuition and fees in 2020-21
        - books and supplies in 2020-21
        - number of full-time faculty
        - number part-time faculty
        - number of all undergraduate students receiving grant or scholarship aid
        - average amount of grant or scholarship aid among all undergraduate students
        - total enrollment
        - undergraduate enrollment
        - undergraduate transfer-in enrollment
        - graduate enrollment
        - number of undergraduate applicants
        - percent of undergraduate applicants admitted
        - number of first-time students who submitted their SAT score
        - % of first-time students who submitted their SAT score
        - number of first-time students who submitted their ACT score
        - % of first-time students who submitted their ACT score
        - SAT Evidence-Based Reading and Writing 25th percentile score
        - SAT Evidence-Based Reading and Writing 75th percentile score
        - SAT Math 25th percentile score
        - SAT Math 75th percentile score
        - ACT Composite 25th percentile score
        - ACT Composite 75th percentile score
        - ACT English 25th percentile score
        - ACT English 75th percentile score
        - ACT Math 25th percentile score
        - ACT Math 75th percentile score
    Return information as a dictionary.
    '''
    
    base_url = 'https://nces.ed.gov/collegenavigator/'
    
    #Create full url to scrape
    url = base_url + link
    
    #Request HTML and parse
    response = requests.get(url)
    time.sleep(3)
    page = response.text
    soup = BeautifulSoup(page,"lxml")

    
    headers = ['school', 'raw_stu_to_fac_ratio', 'tuition_fees_19_20', 'books_supplies_19_20', 
               'tuition_fees', 'books_supplies','ft_faculty', 'pt_faculty','avg_amt_grant_schol_aid_beg_under_19_20',
               'perc_grant_schol_aid_beg_under_19_20', 'avg_amt_grant_schol_aid_all_under_19_20', 'total_enroll',
               'under_enroll', 'under_trans_enroll','grad_enroll', 'under_apps', 'perc_admitted',
               'sat_scores_submit', 'perc_sat_scores_submit','act_scores_submit','perc_act_scores_submit',
               'sat_read_write_25_perc', 'sat_read_write_75_perc','sat_math_25_perc','sat_math_75_perc', 
               'act_comp_25_perc','act_comp_75_perc', 'act_eng_25_perc','act_eng_75_perc', 'act_math_25_perc', 
               'act_math_75_perc']
    
    #Get school name
    school_string = soup.find('title').text
    school = school_string.split('-')[1].strip()

    #Get student to faculty ratio
    raw_stu_fac_ratio = soup.find('table', class_='layouttab').find_all('td')[-1].text
    
    #Get price of tuition and fees
    raw_tuition_fees_19_20 = get_tuition_books_19_20_value(soup, 'Tuition and fees')
    tuition_fees_19_20 = money_to_int(raw_tuition_fees_19_20)
    
    raw_tuition_fees = get_tuition_books_20_21_value(soup, 'Tuition and fees')
    tuition_fees = money_to_int(raw_tuition_fees)
    
    #Get price of books and supplies
    raw_books_supplies_19_20 = get_tuition_books_19_20_value(soup, 'Books and supplies')
    books_supplies_19_20 = money_to_int(raw_books_supplies_19_20)

    raw_books_supplies = get_tuition_books_20_21_value(soup, 'Books and supplies')
    books_supplies = money_to_int(raw_books_supplies)
    
    #Get number of full-time and part-time faculty
    raw_ft_faculty = get_school_value_one_next(soup, 'Total faculty')
    ft_faculty = number_to_int(raw_ft_faculty)
    
    raw_pt_faculty = get_school_value_two_nexts(soup, 'Total faculty')
    pt_faculty = number_to_int(raw_pt_faculty)

    #Get financial aid metrics
    raw_avg_amt_grant_schol_aid_beg_under_19_20 = get_avg_amt_grant_schol_aid_beg_under_value(soup,'Grant or scholarship aid')
    avg_amt_grant_schol_aid_beg_under_19_20 = money_to_int(raw_avg_amt_grant_schol_aid_beg_under_19_20)
        
    raw_perc_grant_schol_aid_beg_under_19_20 = get_school_value_two_nexts(soup,'Grant or scholarship aid')
    perc_grant_schol_aid_beg_under_19_20 = percent_to_float(raw_perc_grant_schol_aid_beg_under_19_20)
    
    raw_avg_amt_grant_schol_aid_all_under_19_20 = get_avg_amt_grant_schol_aid_all_under_value(soup, 'All Undergraduate Students')
    avg_amt_grant_schol_aid_all_under_19_20 = money_to_int(raw_avg_amt_grant_schol_aid_all_under_19_20)
    
    #Get enrollment metrics
    raw_total_enroll = get_school_value_one_next(soup, 'Total enrollment')
    total_enroll = number_to_int(raw_total_enroll)
    
    raw_under_enroll = get_school_value_one_next(soup, 'Undergraduate enrollment')
    under_enroll = number_to_int(raw_under_enroll)
    
    raw_under_trans_enroll = get_school_value_one_next(soup, 'Undergraduate transfer-in enrollment')
    under_trans_enroll = number_to_int(raw_under_trans_enroll)
    
    raw_grad_enroll = get_school_value_one_next(soup, 'Graduate enrollment')
    grad_enroll = number_to_int(raw_grad_enroll)
    
    #Get admissions metrics - # of applicants and % admitted
    raw_under_apps = get_school_value_one_next(soup, 'Number of applicants')
    under_apps = number_to_int(raw_under_apps)
    
    raw_perc_admitted = get_school_value_one_next(soup, 'Percent admitted')
    perc_admitted = percent_to_float(raw_perc_admitted)
    
    #Get admissions metrics - SAT and ACT
    raw_sat_scores_submit = get_school_value_one_next(soup, 'SAT')
    sat_scores_submit = number_to_int(raw_sat_scores_submit)
    
    raw_perc_sat_scores_submit = get_school_value_two_nexts(soup, 'SAT')
    perc_sat_scores_submit = percent_to_float(raw_perc_sat_scores_submit)
    
    raw_act_scores_submit = get_school_value_one_next(soup, 'ACT')
    act_scores_submit = number_to_int(raw_act_scores_submit)
    
    raw_perc_act_scores_submit = get_school_value_two_nexts(soup, 'ACT')
    perc_act_scores_submit = percent_to_float(raw_perc_act_scores_submit)
    
    raw_sat_read_write_25_perc = get_school_value_one_next(soup, 'SAT Evidence-Based Reading and Writing')
    sat_read_write_25_perc = number_to_int(raw_sat_read_write_25_perc)
    
    raw_sat_read_write_75_perc = get_school_value_two_nexts(soup, 'SAT Evidence-Based Reading and Writing')
    sat_read_write_75_perc = number_to_int(raw_sat_read_write_75_perc)
    
    raw_sat_math_25_perc = get_school_value_one_next(soup, 'SAT Math')
    sat_math_25_perc = number_to_int(raw_sat_math_25_perc)
    
    raw_sat_math_75_perc = get_school_value_two_nexts(soup, 'SAT Math')
    sat_math_75_perc = number_to_int(raw_sat_math_75_perc)
    
    raw_act_comp_25_perc = get_school_value_one_next(soup, 'ACT Composite')
    act_comp_25_perc = number_to_int(raw_act_comp_25_perc)
    
    raw_act_comp_75_perc = get_school_value_two_nexts(soup, 'ACT Composite')
    act_comp_75_perc = number_to_int(raw_act_comp_75_perc)
    
    raw_act_eng_25_perc = get_school_value_one_next(soup, 'ACT English')
    act_eng_25_perc = number_to_int(raw_act_eng_25_perc)
    
    raw_act_eng_75_perc = get_school_value_two_nexts(soup, 'ACT English')
    act_eng_75_perc = number_to_int(raw_act_eng_75_perc)
    
    raw_act_math_25_perc = get_school_value_one_next(soup, 'ACT Math')
    act_math_25_perc = number_to_int(raw_act_math_25_perc)
    
    raw_act_math_75_perc = get_school_value_two_nexts(soup, 'ACT Math')
    act_math_75_perc = number_to_int(raw_act_math_75_perc)
    
    #Create school dictionary and return
    school_dict = dict(zip(headers, [school,
                                    raw_stu_fac_ratio,
                                    tuition_fees_19_20,
                                    books_supplies_19_20, 
                                    tuition_fees,
                                    books_supplies,
                                    ft_faculty,
                                    pt_faculty,
                                    avg_amt_grant_schol_aid_beg_under_19_20,
                                    perc_grant_schol_aid_beg_under_19_20,
                                    avg_amt_grant_schol_aid_all_under_19_20,
                                    total_enroll,
                                    under_enroll, 
                                    under_trans_enroll, 
                                    grad_enroll,
                                    under_apps,
                                    perc_admitted,
                                    sat_scores_submit,
                                    perc_sat_scores_submit,
                                    act_scores_submit,
                                    perc_act_scores_submit,
                                    sat_read_write_25_perc,
                                    sat_read_write_75_perc,
                                    sat_math_25_perc,
                                    sat_math_75_perc,
                                    act_comp_25_perc,
                                    act_comp_75_perc,
                                    act_eng_25_perc,
                                    act_eng_75_perc,
                                    act_math_25_perc,
                                    act_math_75_perc]))

    return school_dict

In [893]:
private_schools_page_info_list = []

for link in schools_df.link_stub:
    private_schools_page_info_list.append(get_school_dict(link))

In [894]:
private_schools_page_info_list[:5]

[{'school': 'Alaska Bible College',
  'raw_stu_to_fac_ratio': '6 to 1',
  'tuition_fees_19_20': 9700,
  'books_supplies_19_20': 400,
  'tuition_fees': 9700,
  'books_supplies': 400,
  'ft_faculty': 0,
  'pt_faculty': 7,
  'avg_amt_grant_schol_aid_beg_under_19_20': 11015,
  'perc_grant_schol_aid_beg_under_19_20': 100.0,
  'avg_amt_grant_schol_aid_all_under_19_20': 6902,
  'total_enroll': None,
  'under_enroll': None,
  'under_trans_enroll': 3,
  'grad_enroll': None,
  'under_apps': None,
  'perc_admitted': None,
  'sat_scores_submit': None,
  'perc_sat_scores_submit': None,
  'act_scores_submit': None,
  'perc_act_scores_submit': None,
  'sat_read_write_25_perc': None,
  'sat_read_write_75_perc': None,
  'sat_math_25_perc': None,
  'sat_math_75_perc': None,
  'act_comp_25_perc': None,
  'act_comp_75_perc': None,
  'act_eng_25_perc': None,
  'act_eng_75_perc': None,
  'act_math_25_perc': None,
  'act_math_75_perc': None},
 {'school': 'Alaska Pacific University',
  'raw_stu_to_fac_ratio':

In [895]:
private_schools_page_info = pd.DataFrame(private_schools_page_info_list)  #convert list of dict to df
private_schools_page_info.set_index('school', inplace=True)

In [896]:
under_apps_not_na = private_schools_page_info.loc[~private_schools_page_info.under_apps.isna()]

337

In [897]:
private_schools_page_info.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1332 entries, Alaska Bible College to Yale University
Data columns (total 30 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   raw_stu_to_fac_ratio                     1332 non-null   object 
 1   tuition_fees_19_20                       1261 non-null   float64
 2   books_supplies_19_20                     1209 non-null   float64
 3   tuition_fees                             1264 non-null   float64
 4   books_supplies                           1214 non-null   float64
 5   ft_faculty                               1316 non-null   float64
 6   pt_faculty                               1316 non-null   float64
 7   avg_amt_grant_schol_aid_beg_under_19_20  1281 non-null   float64
 8   perc_grant_schol_aid_beg_under_19_20     1320 non-null   float64
 9   avg_amt_grant_schol_aid_all_under_19_20  1318 non-null   float64
 10  total_enroll           

In [574]:
# loop through each school's page to extract the student-to-faculty ratio and save the value in a list

#stu_fac_ratio_ls = []

#for school_url in schools.values():
   # response_school = requests.get(school_url)
   # soup_school = BeautifulSoup(response_school.text, "lxml")
   # stu_fac_ratio = soup_school.find('table', class_='layouttab').find_all('td')[-1].text
   # stu_fac_ratio_ls.append(stu_fac_ratio)
   # time.sleep(3)

In [576]:
#stu_fac_ratio_ls[]

1332

In [577]:
#len(schools)

1332

In [None]:
#time.sleep(3)

In [None]:
# loop through each school's page to grab the html from that page and save it in a list

#stu_fac_ratio_ls = []

#for school_url in schools.values():
   # response_school = requests.get(school_url)
   # soup_school = BeautifulSoup(response_school.text, "lxml")
   # stu_fac_ratio = soup_school.find('table', class_='layouttab').find_all('td')[-1].text
   # stu_fac_ratio_ls.append(stu_fac_ratio)
   # time.sleep(3)

In [None]:
# code for looping through one state's url and replacing the state with a new one

#state_abrev = ['TX', 'NY', 'FL']
#ca_string = 'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2'

#for abrev in state_abrev:
   # ca_list = list(ca_string)
   # idx_1 = ca_list.index('=')
   # idx_2 = ca_list.index('&')
    #print(''.join(ca_list[:idx_1+1]) + abrev + ''.join(ca_list[idx_2:]))

In [523]:
# code for looping through the multiple pages of a state's urls and replacing the state with a new one

#state_abrev = ['TX', 'NY', 'FL']
#ca_schools_search_tables_urls = [
   # 'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2',
   # 'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=2',
   # 'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=3',
   # 'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=4',
    #'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=5',
    #'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=6',
    #'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=7',
  #  'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=8']

#for abrev in state_abrev:
  #  for url in ca_schools_search_tables_urls:
   #     idx_1 = url.index('=')
   #     idx_2 = url.index('&')
  #      print(''.join(url[:idx_1+1]) + abrev + ''.join(url[idx_2:]))